New src/hotspot/cpu/riscv/stubGenerator

   1 /*
   2  * Copyright (c) 2003, 2026, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2025, Red Hat Inc. All rights reserved.
   4  * Copyright (c) 2020, 2025, Huawei Technologies Co., Ltd. All rights reserved.
   5  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   6  *
   7  * This code is free software; you can redistribute it and/or modify it
   8  * under the terms of the GNU General Public License version 2 only, as
   9  * published by the Free Software Foundation.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  *
  25  */
  26 
  27 #include "asm/macroAssembler.hpp"
  28 #include "asm/macroAssembler.inline.hpp"
  29 #include "compiler/oopMap.hpp"
  30 #include "gc/shared/barrierSet.hpp"
  31 #include "gc/shared/barrierSetAssembler.hpp"
  32 #include "interpreter/interpreter.hpp"
  33 #include "memory/universe.hpp"
  34 #include "nativeInst_riscv.hpp"
  35 #include "oops/instanceOop.hpp"
  36 #include "oops/method.hpp"
  37 #include "oops/objArrayKlass.hpp"
  38 #include "oops/oop.inline.hpp"
  39 #include "prims/methodHandles.hpp"
  40 #include "prims/upcallLinker.hpp"
  41 #include "runtime/continuation.hpp"
  42 #include "runtime/continuationEntry.inline.hpp"
  43 #include "runtime/frame.inline.hpp"
  44 #include "runtime/handles.inline.hpp"
  45 #include "runtime/javaThread.hpp"
  46 #include "runtime/sharedRuntime.hpp"
  47 #include "runtime/stubCodeGenerator.hpp"
  48 #include "runtime/stubRoutines.hpp"
  49 #include "utilities/align.hpp"
  50 #include "utilities/powerOfTwo.hpp"
  51 #ifdef COMPILER2
  52 #include "opto/runtime.hpp"
  53 #endif
  54 
  55 // Declaration and definition of StubGenerator (no .hpp file).
  56 // For a more detailed description of the stub routine structure
  57 // see the comment in stubRoutines.hpp
  58 
  59 #undef __
     // Shorthand used by every stub generator below: "__ foo(...)" expands to
     // "_masm->foo(...)".
  60 #define __ _masm->
  61
     // BLOCK_COMMENT(str) expands to nothing when PRODUCT is defined; otherwise
     // it calls block_comment(str) through _masm.
  62 #ifdef PRODUCT
  63 #define BLOCK_COMMENT(str) /* nothing */
  64 #else
  65 #define BLOCK_COMMENT(str) __ block_comment(str)
  66 #endif
  67
     // BIND(label) is written as "__ BIND(label);": it binds the label and then
     // issues a BLOCK_COMMENT made of the label's name followed by ":".
     // NOTE(review): it expands to two statements, so it is not safe as the sole
     // body of an unbraced if/else.
  68 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  69 
  70 // Stub Code definitions
  71 
  72 class StubGenerator: public StubCodeGenerator {
  73  private:
  74 
  75 #ifdef PRODUCT
     // PRODUCT builds: counter updates compile away to a no-op expression.
  76 #define inc_counter_np(counter) ((void)0)
  77 #else
     // Emits an incrementw on the address of the given counter.
  78   void inc_counter_np_(uint& counter) {
  79     __ incrementw(ExternalAddress((address)&counter));
  80   }
     // Non-PRODUCT builds: issue a block comment naming the counter expression,
     // then emit the increment itself.
     // NOTE(review): this form expands to two statements and already ends in
     // ';', unlike the PRODUCT form -- avoid using it as an unbraced if/else body.
  81 #define inc_counter_np(counter) \
  82   BLOCK_COMMENT("inc_counter " #counter); \
  83   inc_counter_np_(counter);
  84 #endif
  85 
  86   // Call stubs are used to call Java from C
  87   //
  88   // Arguments:
  89   //    c_rarg0:   call wrapper address                   address
  90   //    c_rarg1:   result                                 address
  91   //    c_rarg2:   result type                            BasicType
  92   //    c_rarg3:   method                                 Method*
  93   //    c_rarg4:   (interpreter) entry point              address
  94   //    c_rarg5:   parameters                             intptr_t*
  95   //    c_rarg6:   parameter size (in words)              int
  96   //    c_rarg7:   thread                                 Thread*
  97   //
  98   // There is no return from the stub itself as any Java result
  99   // is written to result
 100   //
 101   // we save x1 (ra) as the return PC at the base of the frame and
 102   // link x8 (fp) below it as the frame pointer installing sp (x2)
 103   // into fp.
 104   //
 105   // we save x10-x17, which accounts for all the c arguments.
 106   //
 107   // TODO: strictly do we need to save them all? they are treated as
 108   // volatile by C so could we omit saving the ones we are going to
 109   // place in global registers (thread? method?) or those we only use
 110   // during setup of the Java call?
 111   //
 112   // we don't need to save x5 which C uses as an indirect result location
 113   // return register.
 114   //
 115   // we don't need to save x6-x7 and x28-x31 which both C and Java treat as
 116   // volatile
 117   //
 118   // we save x9, x18-x27, f8-f9, and f18-f27 which Java uses as temporary
 119   // registers and C expects to be callee-save
 120   //
 121   // so the stub frame looks like this when we enter Java code
 122   //
 123   //     [ return_from_Java     ] <--- sp
 124   //     [ argument word n      ]
 125   //      ...
 126   // -35 [ argument word 1      ]
 127   // -34 [ saved FRM in Floating-point Control and Status Register ] <--- sp_after_call
 128   // -33 [ saved f27            ]
 129   // -32 [ saved f26            ]
 130   // -31 [ saved f25            ]
 131   // -30 [ saved f24            ]
 132   // -29 [ saved f23            ]
 133   // -28 [ saved f22            ]
 134   // -27 [ saved f21            ]
 135   // -26 [ saved f20            ]
 136   // -25 [ saved f19            ]
 137   // -24 [ saved f18            ]
 138   // -23 [ saved f9             ]
 139   // -22 [ saved f8             ]
 140   // -21 [ saved x27            ]
 141   // -20 [ saved x26            ]
 142   // -19 [ saved x25            ]
 143   // -18 [ saved x24            ]
 144   // -17 [ saved x23            ]
 145   // -16 [ saved x22            ]
 146   // -15 [ saved x21            ]
 147   // -14 [ saved x20            ]
 148   // -13 [ saved x19            ]
 149   // -12 [ saved x18            ]
 150   // -11 [ saved x9             ]
 151   // -10 [ call wrapper   (x10) ]
 152   //  -9 [ result         (x11) ]
 153   //  -8 [ result type    (x12) ]
 154   //  -7 [ method         (x13) ]
 155   //  -6 [ entry point    (x14) ]
 156   //  -5 [ parameters     (x15) ]
 157   //  -4 [ parameter size (x16) ]
 158   //  -3 [ thread         (x17) ]
 159   //  -2 [ saved fp       (x8)  ]
 160   //  -1 [ saved ra       (x1)  ]
 161   //   0 [                      ] <--- fp == saved sp (x2)
 162 
 163   // Call stub stack layout word offsets from fp
       //
       // Each value is a word index relative to fp; generate_call_stub scales it
       // by wordSize when it builds the Address of a slot. The numbering matches
       // the frame diagram above: -1 holds the saved ra, -2 the saved fp, and
       // sp_after_call_off (-34) is the lowest slot of the fixed save area.
 164   enum call_stub_layout {
 165     sp_after_call_off  = -34,
 166
         // The saved rounding mode (frm) occupies the slot at sp_after_call_off.
 167     frm_off            = sp_after_call_off,
         // Floating-point registers f27..f18, f9, f8.
 168     f27_off            = -33,
 169     f26_off            = -32,
 170     f25_off            = -31,
 171     f24_off            = -30,
 172     f23_off            = -29,
 173     f22_off            = -28,
 174     f21_off            = -27,
 175     f20_off            = -26,
 176     f19_off            = -25,
 177     f18_off            = -24,
 178     f9_off             = -23,
 179     f8_off             = -22,
 180
         // Integer registers x27..x18, x9.
 181     x27_off            = -21,
 182     x26_off            = -20,
 183     x25_off            = -19,
 184     x24_off            = -18,
 185     x23_off            = -17,
 186     x22_off            = -16,
 187     x21_off            = -15,
 188     x20_off            = -14,
 189     x19_off            = -13,
 190     x18_off            = -12,
 191     x9_off             = -11,
 192
         // The eight incoming C arguments (x10..x17 in the frame diagram above).
 193     call_wrapper_off   = -10,
 194     result_off         = -9,
 195     result_type_off    = -8,
 196     method_off         = -7,
 197     entry_point_off    = -6,
 198     parameters_off     = -5,
 199     parameter_size_off = -4,
 200     thread_off         = -3,
         // NOTE(review): fp_f is the saved-fp slot (-2 in the diagram above) and
         // is the only entry without the _off suffix -- confirm whether any other
         // code depends on this name before renaming it.
 201     fp_f               = -2,
 202     retaddr_off        = -1,
 203   };
 204 
 205   address generate_call_stub(address& return_address) {
         // Emits the call stub and returns the address of its first instruction.
         // The pc immediately following the jalr into the Java entry point is
         // written to return_address.
         //
         // Generation-time check: the fixed part of the frame laid out here must
         // agree with the constants declared in frame.
 206     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 207            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 208            "adjust this code");
 209
 210     StubId stub_id = StubId::stubgen_call_stub_id;
 211     StubCodeMark mark(this, stub_id);
 212     address start = __ pc();
 213
         // fp-relative addresses of the save slots; the enum offsets are in words
         // and are scaled to bytes here.
         // NOTE(review): sp_after_call is not referenced again in this function;
         // the addi instructions below use sp_after_call_off directly.
 214     const Address sp_after_call (fp, sp_after_call_off  * wordSize);
 215
 216     const Address frm_save      (fp, frm_off           * wordSize);
 217     const Address call_wrapper  (fp, call_wrapper_off   * wordSize);
 218     const Address result        (fp, result_off         * wordSize);
 219     const Address result_type   (fp, result_type_off    * wordSize);
 220     const Address method        (fp, method_off         * wordSize);
 221     const Address entry_point   (fp, entry_point_off    * wordSize);
 222     const Address parameters    (fp, parameters_off     * wordSize);
 223     const Address parameter_size(fp, parameter_size_off * wordSize);
 224
 225     const Address thread        (fp, thread_off         * wordSize);
 226
 227     const Address f27_save      (fp, f27_off            * wordSize);
 228     const Address f26_save      (fp, f26_off            * wordSize);
 229     const Address f25_save      (fp, f25_off            * wordSize);
 230     const Address f24_save      (fp, f24_off            * wordSize);
 231     const Address f23_save      (fp, f23_off            * wordSize);
 232     const Address f22_save      (fp, f22_off            * wordSize);
 233     const Address f21_save      (fp, f21_off            * wordSize);
 234     const Address f20_save      (fp, f20_off            * wordSize);
 235     const Address f19_save      (fp, f19_off            * wordSize);
 236     const Address f18_save      (fp, f18_off            * wordSize);
 237     const Address f9_save       (fp, f9_off             * wordSize);
 238     const Address f8_save       (fp, f8_off             * wordSize);
 239
 240     const Address x27_save      (fp, x27_off            * wordSize);
 241     const Address x26_save      (fp, x26_off            * wordSize);
 242     const Address x25_save      (fp, x25_off            * wordSize);
 243     const Address x24_save      (fp, x24_off            * wordSize);
 244     const Address x23_save      (fp, x23_off            * wordSize);
 245     const Address x22_save      (fp, x22_off            * wordSize);
 246     const Address x21_save      (fp, x21_off            * wordSize);
 247     const Address x20_save      (fp, x20_off            * wordSize);
 248     const Address x19_save      (fp, x19_off            * wordSize);
 249     const Address x18_save      (fp, x18_off            * wordSize);
 250
 251     const Address x9_save       (fp, x9_off             * wordSize);
 252
 253     // stub code
 254
         // NOTE(review): riscv_entry is not referenced again in this function.
 255     address riscv_entry = __ pc();
 256
 257     // set up frame and move sp to end of save area
 258     __ enter();
 259     __ addi(sp, fp, sp_after_call_off * wordSize);
 260
 261     // save register parameters and Java temporary/global registers
 262     // n.b. we save thread even though it gets installed in
 263     // xthread because we want to sanity check tp later
 264     __ sd(c_rarg7, thread);
         // The parameter count is stored as a 32-bit word (sw); every other
         // argument slot is written with a 64-bit sd.
 265     __ sw(c_rarg6, parameter_size);
 266     __ sd(c_rarg5, parameters);
 267     __ sd(c_rarg4, entry_point);
 268     __ sd(c_rarg3, method);
 269     __ sd(c_rarg2, result_type);
 270     __ sd(c_rarg1, result);
 271     __ sd(c_rarg0, call_wrapper);
 272
 273     __ sd(x9, x9_save);
 274
 275     __ sd(x18, x18_save);
 276     __ sd(x19, x19_save);
 277     __ sd(x20, x20_save);
 278     __ sd(x21, x21_save);
 279     __ sd(x22, x22_save);
 280     __ sd(x23, x23_save);
 281     __ sd(x24, x24_save);
 282     __ sd(x25, x25_save);
 283     __ sd(x26, x26_save);
 284     __ sd(x27, x27_save);
 285
 286     __ fsd(f8,  f8_save);
 287     __ fsd(f9,  f9_save);
 288     __ fsd(f18, f18_save);
 289     __ fsd(f19, f19_save);
 290     __ fsd(f20, f20_save);
 291     __ fsd(f21, f21_save);
 292     __ fsd(f22, f22_save);
 293     __ fsd(f23, f23_save);
 294     __ fsd(f24, f24_save);
 295     __ fsd(f25, f25_save);
 296     __ fsd(f26, f26_save);
 297     __ fsd(f27, f27_save);
 298
         // Save the caller's rounding mode so it can be put back on exit.
 299     __ frrm(t0);
 300     __ sd(t0, frm_save);
 301     // Set frm to the state we need. We do want Round to Nearest. We
 302     // don't want non-IEEE rounding modes.
         // The guarantee makes the beqz below a valid "already rne" test, so the
         // fsrmi is only executed when the incoming mode is something else.
 303     Label skip_fsrmi;
 304     guarantee(__ RoundingMode::rne == 0, "must be");
 305     __ beqz(t0, skip_fsrmi);
 306     __ fsrmi(__ RoundingMode::rne);
 307     __ bind(skip_fsrmi);
 308
 309     // install Java thread in global register now we have saved
 310     // whatever value it held
 311     __ mv(xthread, c_rarg7);
 312
 313     // And method
 314     __ mv(xmethod, c_rarg3);
 315
 316     // set up the heapbase register
 317     __ reinit_heapbase();
 318
 319 #ifdef ASSERT
 320     // make sure we have no pending exceptions
 321     {
 322       Label L;
 323       __ ld(t0, Address(xthread, in_bytes(Thread::pending_exception_offset())));
 324       __ beqz(t0, L);
 325       __ stop("StubRoutines::call_stub: entered with pending exception");
 326       __ BIND(L);
 327     }
 328 #endif
 329     // pass parameters if any
         // esp takes the value sp has at the end of the save area; sp itself is
         // then lowered by (parameter count) words and rounded down to a multiple
         // of 2 * wordSize.
 330     __ mv(esp, sp);
 331     __ slli(t0, c_rarg6, LogBytesPerWord);
 332     __ sub(t0, sp, t0); // Move SP out of the way
 333     __ andi(sp, t0, -2 * wordSize);
 334
 335     BLOCK_COMMENT("pass parameters if any");
 336     Label parameters_done;
 337     // parameter count is still in c_rarg6
 338     // and parameter pointer identifying param 1 is in c_rarg5
 339     __ beqz(c_rarg6, parameters_done);
 340
         // Each iteration loads one word from the parameter array (walking upwards
         // from c_rarg5), pushes it with push_reg and counts c_rarg6 down; the
         // loop ends once the count is no longer positive.
         // NOTE(review): push_reg(t0) presumably pushes onto esp, which is why esp
         // was initialised from sp above -- confirm against MacroAssembler.
 341     address loop = __ pc();
 342     __ ld(t0, Address(c_rarg5, 0));
 343     __ addi(c_rarg5, c_rarg5, wordSize);
 344     __ subi(c_rarg6, c_rarg6, 1);
 345     __ push_reg(t0);
 346     __ bgtz(c_rarg6, loop);
 347
 348     __ BIND(parameters_done);
 349
 350     // call Java entry -- passing Method*, and current sp
 351     //      xmethod: Method*
 352     //      x19_sender_sp: sender sp
 353     BLOCK_COMMENT("call Java function");
 354     __ mv(x19_sender_sp, sp);
 355     __ jalr(c_rarg4);
 356
 357     // save current address for use by exception handling code
 358
 359     return_address = __ pc();
 360
 361     // store result depending on type (everything that is not
 362     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 363     // n.b. this assumes Java returns an integral result in x10
 364     // and a floating result in j_farg0
         // j_rarg2 <- result address, j_rarg1 <- result type, both reloaded from
         // the frame. T_OBJECT branches to the same label as T_LONG, so both are
         // written with a 64-bit sd.
 365     __ ld(j_rarg2, result);
 366     Label is_long, is_float, is_double, exit;
 367     __ ld(j_rarg1, result_type);
 368     __ mv(t0, (u1)T_OBJECT);
 369     __ beq(j_rarg1, t0, is_long);
 370     __ mv(t0, (u1)T_LONG);
 371     __ beq(j_rarg1, t0, is_long);
 372     __ mv(t0, (u1)T_FLOAT);
 373     __ beq(j_rarg1, t0, is_float);
 374     __ mv(t0, (u1)T_DOUBLE);
 375     __ beq(j_rarg1, t0, is_double);
 376
 377     // handle T_INT case
 378     __ sw(x10, Address(j_rarg2));
 379
         // The is_long / is_float / is_double handlers emitted after the ret jump
         // back here once they have stored the result.
 380     __ BIND(exit);
 381
 382     // pop parameters
 383     __ addi(esp, fp, sp_after_call_off * wordSize);
 384
 385 #ifdef ASSERT
 386     // verify that threads correspond
         // S is reached when xthread differs from the thread saved on entry;
         // otherwise the value produced by get_thread must match xthread as well.
 387     {
 388       Label L, S;
 389       __ ld(t0, thread);
 390       __ bne(xthread, t0, S);
 391       __ get_thread(t0);
 392       __ beq(xthread, t0, L);
 393       __ BIND(S);
 394       __ stop("StubRoutines::call_stub: threads must correspond");
 395       __ BIND(L);
 396     }
 397 #endif
 398
 399     __ pop_cont_fastpath(xthread);
 400
 401     // restore callee-save registers
 402     __ fld(f27, f27_save);
 403     __ fld(f26, f26_save);
 404     __ fld(f25, f25_save);
 405     __ fld(f24, f24_save);
 406     __ fld(f23, f23_save);
 407     __ fld(f22, f22_save);
 408     __ fld(f21, f21_save);
 409     __ fld(f20, f20_save);
 410     __ fld(f19, f19_save);
 411     __ fld(f18, f18_save);
 412     __ fld(f9,  f9_save);
 413     __ fld(f8,  f8_save);
 414
 415     __ ld(x27, x27_save);
 416     __ ld(x26, x26_save);
 417     __ ld(x25, x25_save);
 418     __ ld(x24, x24_save);
 419     __ ld(x23, x23_save);
 420     __ ld(x22, x22_save);
 421     __ ld(x21, x21_save);
 422     __ ld(x20, x20_save);
 423     __ ld(x19, x19_save);
 424     __ ld(x18, x18_save);
 425
 426     __ ld(x9, x9_save);
 427
 428     // restore frm
         // frm is only written when the current mode differs from the saved one.
 429     Label skip_fsrm;
 430     __ ld(t0, frm_save);
 431     __ frrm(t1);
 432     __ beq(t0, t1, skip_fsrm);
 433     __ fsrm(t0);
 434     __ bind(skip_fsrm);
 435
         // Reload the incoming C argument registers from their frame slots.
 436     __ ld(c_rarg0, call_wrapper);
 437     __ ld(c_rarg1, result);
 438     __ ld(c_rarg2, result_type);
 439     __ ld(c_rarg3, method);
 440     __ ld(c_rarg4, entry_point);
 441     __ ld(c_rarg5, parameters);
         // NOTE(review): this slot was written with a 32-bit sw on entry but is
         // read back with a 64-bit ld, so the upper half of c_rarg6 comes from
         // bytes this stub never stored -- confirm no caller relies on c_rarg6
         // after the stub returns.
 442     __ ld(c_rarg6, parameter_size);
 443     __ ld(c_rarg7, thread);
 444
 445     // leave frame and return to caller
 446     __ leave();
 447     __ ret();
 448
 449     // handle return types different from T_INT
 450
 451     __ BIND(is_long);
 452     __ sd(x10, Address(j_rarg2, 0));
 453     __ j(exit);
 454
 455     __ BIND(is_float);
 456     __ fsw(j_farg0, Address(j_rarg2, 0), t0);
 457     __ j(exit);
 458
 459     __ BIND(is_double);
 460     __ fsd(j_farg0, Address(j_rarg2, 0), t0);
 461     __ j(exit);
 462
 463     return start;
 464   }
 465 
 466   // Return point for a Java call if there's an exception thrown in
 467   // Java code.  The exception is caught and transformed into a
 468   // pending exception stored in JavaThread that can be tested from
 469   // within the VM.
 470   //
 471   // Note: Usually the parameters are removed by the callee. In case
 472   // of an exception crossing an activation frame boundary, that is
 473   // not the case if the callee is compiled code => need to setup the
 474   // sp.
 475   //
 476   // x10: exception oop
 477
 478   address generate_catch_exception() {
         // Returns the address of the first instruction of the emitted stub.
 479     StubId stub_id = StubId::stubgen_catch_exception_id;
 480     StubCodeMark mark(this, stub_id);
 481     address start = __ pc();
 482
 483     // same as in generate_call_stub():
 484     const Address thread(fp, thread_off * wordSize);
 485
 486 #ifdef ASSERT
 487     // verify that threads correspond
         // S is reached when xthread differs from the thread slot in the frame;
         // otherwise the value produced by get_thread must match xthread as well.
 488     {
 489       Label L, S;
 490       __ ld(t0, thread);
 491       __ bne(xthread, t0, S);
 492       __ get_thread(t0);
 493       __ beq(xthread, t0, L);
 494       __ bind(S);
 495       __ stop("StubRoutines::catch_exception: threads must correspond");
 496       __ bind(L);
 497     }
 498 #endif
 499
 500     // set pending exception
 501     __ verify_oop(x10);
 502
         // Store the exception oop in the thread, together with this source
         // file's name and the line number below as the recorded origin. The
         // line number is stored as a 32-bit word.
 503     __ sd(x10, Address(xthread, Thread::pending_exception_offset()));
 504     __ mv(t0, (address)__FILE__);
 505     __ sd(t0, Address(xthread, Thread::exception_file_offset()));
 506     __ mv(t0, (int)__LINE__);
 507     __ sw(t0, Address(xthread, Thread::exception_line_offset()));
 508
 509     // complete return to VM
         // The jump target is read at generation time, hence the assert that the
         // call stub has already been generated.
 510     assert(StubRoutines::_call_stub_return_address != nullptr,
 511            "_call_stub_return_address must have been generated before");
 512     __ j(RuntimeAddress(StubRoutines::_call_stub_return_address));
 513
 514     return start;
 515   }
 516 
 517   // Continuation point for runtime calls returning with a pending
 518   // exception.  The pending exception check happened in the runtime
 519   // or native call stub.  The pending exception in Thread is
 520   // converted into a Java-level exception.
 521   //
 522   // Contract with Java-level exception handlers:
 523   // x10: exception
 524   // x13: throwing pc
 525   //
 526   // NOTE: At entry of this stub, exception-pc must be in RA !!
 527
 528   // NOTE: this is always used as a jump target within generated code
 529   // so it just needs to be generated code with no x86 prolog
 530
 531   address generate_forward_exception() {
         // Returns the address of the first instruction of the emitted stub.
 532     StubId stub_id = StubId::stubgen_forward_exception_id;
 533     StubCodeMark mark(this, stub_id);
 534     address start = __ pc();
 535
 536     // Upon entry, RA points to the return address returning into
 537     // Java (interpreted or compiled) code; i.e., the return address
 538     // becomes the throwing pc.
 539     //
 540     // Arguments pushed before the runtime call are still on the stack
 541     // but the exception handler will reset the stack pointer ->
 542     // ignore them.  A potential result in registers can be ignored as
 543     // well.
 544
 545 #ifdef ASSERT
 546     // make sure this code is only executed if there is a pending exception
 547     {
 548       Label L;
 549       __ ld(t0, Address(xthread, Thread::pending_exception_offset()));
 550       __ bnez(t0, L);
 551       __ stop("StubRoutines::forward exception: no pending exception (1)");
 552       __ bind(L);
 553     }
 554 #endif
 555
 556     // compute exception handler into x9
 557
 558     // call the VM to find the handler address associated with the
 559     // caller address. pass thread in x10 and caller pc (ret address)
 560     // in x11. n.b. the caller pc is in ra, unlike x86 where it is on
 561     // the stack.
 562     __ mv(c_rarg1, ra);
 563     // ra will be trashed by the VM call so we move it to x9
 564     // (callee-saved) because we also need to pass it to the handler
 565     // returned by this call.
 566     __ mv(x9, ra);
 567     BLOCK_COMMENT("call exception_handler_for_return_address");
 568     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 569                          SharedRuntime::exception_handler_for_return_address),
 570                     xthread, c_rarg1);
 571     // we should not really care that ra is no longer the callee
 572     // address. we saved the value the handler needs in x9 so we can
 573     // just copy it to x13. however, the C2 handler will push its own
 574     // frame and then calls into the VM and the VM code asserts that
 575     // the PC for the frame above the handler belongs to a compiled
 576     // Java method. So, we restore ra here to satisfy that assert.
 577     __ mv(ra, x9);
 578     // setup x10 & x13 & clear pending exception
         // Order matters: x9 still holds the throwing pc when it is copied to
         // x13, and x10 is copied into x9 (the handler to jump to) before x10 is
         // overwritten with the pending exception.
 579     __ mv(x13, x9);
 580     __ mv(x9, x10);
 581     __ ld(x10, Address(xthread, Thread::pending_exception_offset()));
 582     __ sd(zr, Address(xthread, Thread::pending_exception_offset()));
 583
 584 #ifdef ASSERT
 585     // make sure exception is set
 586     {
 587       Label L;
 588       __ bnez(x10, L);
 589       __ stop("StubRoutines::forward exception: no pending exception (2)");
 590       __ bind(L);
 591     }
 592 #endif
 593
 594     // continue at exception handler
 595     // x10: exception
 596     // x13: throwing pc
 597     // x9: exception handler
 598     __ verify_oop(x10);
 599     __ jr(x9);
 600
 601     return start;
 602   }
 603 
 604   // Non-destructive plausibility checks for oops
 605   //
 606   // Arguments:
 607   //    x10: oop to verify
 608   //    t0: error message
 609   //
 610   // Stack after saving c_rarg3:
 611   //    [tos + 0]: saved c_rarg3
 612   //    [tos + 1]: saved c_rarg2
 613   //    [tos + 2]: saved ra
 614   //    [tos + 3]: saved t1
 615   //    [tos + 4]: saved x10
 616   //    [tos + 5]: saved t0
       //
       // NOTE(review): this stub itself only pushes c_rarg2 and c_rarg3; the
       // entries from [tos + 2] upwards are presumably pushed by the caller --
       // confirm against the code that calls this stub.
 617   address generate_verify_oop() {
 618
 619     StubId stub_id = StubId::stubgen_verify_oop_id;
 620     StubCodeMark mark(this, stub_id);
 621     address start = __ pc();
 622
 623     Label exit, error;
 624
 625     __ push_reg(RegSet::of(c_rarg2, c_rarg3), sp); // save c_rarg2 and c_rarg3
 626
         // Bump the counter at verify_oop_count_addr() by one.
         // NOTE(review): the counter is read and written with 64-bit ld/sd --
         // confirm this matches the declared width of the counter.
 627     __ la(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 628     __ ld(c_rarg3, Address(c_rarg2));
 629     __ addi(c_rarg3, c_rarg3, 1);
 630     __ sd(c_rarg3, Address(c_rarg2));
 631
 632     // object is in x10
 633     // make sure object is 'reasonable'
 634     __ beqz(x10, exit); // if obj is null it is OK
 635
         // The actual check is delegated to the active barrier set's assembler,
         // which is handed c_rarg2 and c_rarg3 (already saved above) and the
         // error label.
 636     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
 637     bs_asm->check_oop(_masm, x10, c_rarg2, c_rarg3, error);
 638
 639     // return if everything seems ok
 640     __ bind(exit);
 641
 642     __ pop_reg(RegSet::of(c_rarg2, c_rarg3), sp);  // pop c_rarg2 and c_rarg3
 643     __ ret();
 644
 645     // handle errors
 646     __ bind(error);
 647     __ pop_reg(RegSet::of(c_rarg2, c_rarg3), sp); // pop c_rarg2 and c_rarg3
 648
         // Push the register range x0..x31 so its address can be passed as the
         // third argument.
 649     __ push_reg(RegSet::range(x0, x31), sp);
 650     // debug64(char* msg, int64_t pc, int64_t regs[])
 651     __ mv(c_rarg0, t0);             // pass address of error message
 652     __ mv(c_rarg1, ra);             // pass return address
 653     __ mv(c_rarg2, sp);             // pass address of regs on stack
 654 #ifndef PRODUCT
 655     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 656 #endif
         // NOTE(review): the block comment text says "debug" but the call target
         // is MacroAssembler::debug64.
 657     BLOCK_COMMENT("call MacroAssembler::debug");
 658     __ rt_call(CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
         // An ebreak follows the call; no return sequence is emitted on this path.
 659     __ ebreak();
 660
 661     return start;
 662   }
 663 
 664   // The inner part of zero_words().
 665   //
 666   // Inputs:
 667   // x28: the HeapWord-aligned base address of an array to zero.
 668   // x29: the count in HeapWords, x29 > 0.
 669   //
 670   // Returns x28 and x29, adjusted for the caller to clear.
 671   // x28: the base address of the tail of words left to clear.
 672   // x29: the number of words in the tail.
 673   //      x29 < MacroAssembler::zero_words_block_size.
 674
 675   address generate_zero_blocks() {
         // Returns the address of the first instruction of the emitted stub.
         // x30 and x31 are used as scratch registers by the emitted code.
 676     Label done;
 677
 678     const Register base = x28, cnt = x29, tmp1 = x30, tmp2 = x31;
 679
 680     __ align(CodeEntryAlignment);
 681     StubId stub_id = StubId::stubgen_zero_blocks_id;
 682     StubCodeMark mark(this, stub_id);
 683     address start = __ pc();
 684
         // Optional first phase, emitted only when UseBlockZeroing is set: hand
         // the range to zero_dcache_blocks unless the word count is below
         // low_limit. low_limit is computed in bytes and divided by wordSize so
         // it can be compared with cnt, which is in words.
 685     if (UseBlockZeroing) {
 686       int zicboz_block_size = VM_Version::zicboz_block_size.value();
 687       // Ensure count >= 2 * zicboz_block_size so that it still deserves
 688       // a cbo.zero after alignment.
 689       Label small;
 690       int low_limit = MAX2(2 * zicboz_block_size, (int)BlockZeroingLowLimit) / wordSize;
 691       __ mv(tmp1, low_limit);
 692       __ blt(cnt, tmp1, small);
 693       __ zero_dcache_blocks(base, cnt, tmp1, tmp2);
 694       __ bind(small);
 695     }
 696
 697     {
 698       // Clear the remaining blocks.
           // The generator loop below unrolls zero_words_block_size stores of zr
           // into the body of the emitted loop; the emitted loop runs while at
           // least one full block remains, leaving fewer than
           // zero_words_block_size words in cnt for the caller.
 699       Label loop;
 700       __ mv(tmp1, MacroAssembler::zero_words_block_size);
 701       __ blt(cnt, tmp1, done);
 702       __ bind(loop);
 703       for (int i = 0; i < MacroAssembler::zero_words_block_size; i++) {
 704         __ sd(zr, Address(base, i * wordSize));
 705       }
 706       __ addi(base, base, MacroAssembler::zero_words_block_size * wordSize);
 707       __ subi(cnt, cnt, MacroAssembler::zero_words_block_size);
 708       __ bge(cnt, tmp1, loop);
 709       __ bind(done);
 710     }
 711
 712     __ ret();
 713
 714     return start;
 715   }
 716 
 717   typedef enum {
         // The numeric values are significant: generate_copy_longs multiplies
         // wordSize by the direction to obtain a signed per-word step.
 718     copy_forwards = 1,
 719     copy_backwards = -1
 720   } copy_direction;
 721 
 722   // Bulk copy of blocks of 8 words.
 723   //
 724   // count is a count of words.
 725   //
 726   // Precondition: count >= 8
 727   //
 728   // Postconditions:
 729   //
 730   // The least significant bit of count contains the remaining count
 731   // of words to copy.  The rest of count is trash.
 732   //
 733   // s and d are adjusted to point to the remaining words to copy
 734   //
       // stub_id selects the direction: stubgen_copy_byte_f_id copies forwards,
       // stubgen_copy_byte_b_id copies backwards; any other id hits
       // ShouldNotReachHere(). The emitted code uses x13-x17, x7, x28, x29 and t0
       // as scratch registers. Returns the address of the first instruction of
       // the emitted stub.
 735   address generate_copy_longs(StubId stub_id, Register s, Register d, Register count) {
         // NOTE(review): type is assigned in the switch below but never read in
         // this function.
 736     BasicType type;
 737     copy_direction direction;
 738     switch (stub_id) {
 739     case StubId::stubgen_copy_byte_f_id:
 740       direction = copy_forwards;
 741       type = T_BYTE;
 742       break;
 743     case StubId::stubgen_copy_byte_b_id:
 744       direction = copy_backwards;
 745       type = T_BYTE;
 746       break;
 747     default:
 748       ShouldNotReachHere();
 749     }
         // unit is the signed byte step per word (+wordSize forwards, -wordSize
         // backwards). All bulk accesses use offsets 1*unit .. 8*unit, so for the
         // forward case s and d are first moved back by bias (one word) to make
         // offset 1*unit address the first word.
 750     int unit = wordSize * direction;
 751     int bias = wordSize;
 752
 753     const Register tmp_reg0 = x13, tmp_reg1 = x14, tmp_reg2 = x15, tmp_reg3 = x16,
 754       tmp_reg4 = x17, tmp_reg5 = x7, tmp_reg6 = x28, tmp_reg7 = x29;
 755
         // NOTE(review): stride is declared but not used anywhere in this function.
 756     const Register stride = x30;
 757
 758     assert_different_registers(t0, tmp_reg0, tmp_reg1, tmp_reg2, tmp_reg3,
 759       tmp_reg4, tmp_reg5, tmp_reg6, tmp_reg7);
 760     assert_different_registers(s, d, count, t0);
 761
 762     Label again, drain;
 763     StubCodeMark mark(this, stub_id);
 764     __ align(CodeEntryAlignment);
 765     address start = __ pc();
 766
 767     if (direction == copy_forwards) {
 768       __ sub(s, s, bias);
 769       __ sub(d, d, bias);
 770     }
 771
 772 #ifdef ASSERT
 773     // Make sure we are never given < 8 words
         // NOTE(review): the stop() message below spells the function name
         // "genrate_copy_longs"; it is runtime text and is left unchanged here.
 774     {
 775       Label L;
 776
 777       __ mv(t0, 8);
 778       __ bge(count, t0, L);
 779       __ stop("genrate_copy_longs called with < 8 words");
 780       __ bind(L);
 781     }
 782 #endif
 783
         // Prime the pipeline: load the first 8 words before entering the loop.
 784     __ ld(tmp_reg0, Address(s, 1 * unit));
 785     __ ld(tmp_reg1, Address(s, 2 * unit));
 786     __ ld(tmp_reg2, Address(s, 3 * unit));
 787     __ ld(tmp_reg3, Address(s, 4 * unit));
 788     __ ld(tmp_reg4, Address(s, 5 * unit));
 789     __ ld(tmp_reg5, Address(s, 6 * unit));
 790     __ ld(tmp_reg6, Address(s, 7 * unit));
 791     __ ld(tmp_reg7, Address(s, 8 * unit));
 792     __ addi(s, s, 8 * unit);
 793
         // count -= 16: a negative result means fewer than 8 further words are
         // available, so skip the loop and just store what was loaded.
 794     __ subi(count, count, 16);
 795     __ bltz(count, drain);
 796
         // Loop body: store the 8 words loaded previously, load the next 8, and
         // advance both pointers. count drops by 8 per iteration.
 797     __ bind(again);
 798
 799     __ sd(tmp_reg0, Address(d, 1 * unit));
 800     __ sd(tmp_reg1, Address(d, 2 * unit));
 801     __ sd(tmp_reg2, Address(d, 3 * unit));
 802     __ sd(tmp_reg3, Address(d, 4 * unit));
 803     __ sd(tmp_reg4, Address(d, 5 * unit));
 804     __ sd(tmp_reg5, Address(d, 6 * unit));
 805     __ sd(tmp_reg6, Address(d, 7 * unit));
 806     __ sd(tmp_reg7, Address(d, 8 * unit));
 807
 808     __ ld(tmp_reg0, Address(s, 1 * unit));
 809     __ ld(tmp_reg1, Address(s, 2 * unit));
 810     __ ld(tmp_reg2, Address(s, 3 * unit));
 811     __ ld(tmp_reg3, Address(s, 4 * unit));
 812     __ ld(tmp_reg4, Address(s, 5 * unit));
 813     __ ld(tmp_reg5, Address(s, 6 * unit));
 814     __ ld(tmp_reg6, Address(s, 7 * unit));
 815     __ ld(tmp_reg7, Address(s, 8 * unit));
 816
 817     __ addi(s, s, 8 * unit);
 818     __ addi(d, d, 8 * unit);
 819
 820     __ subi(count, count, 8);
 821     __ bgez(count, again);
 822
 823     // Drain
         // Store the last 8 words that were loaded but not yet written.
 824     __ bind(drain);
 825
 826     __ sd(tmp_reg0, Address(d, 1 * unit));
 827     __ sd(tmp_reg1, Address(d, 2 * unit));
 828     __ sd(tmp_reg2, Address(d, 3 * unit));
 829     __ sd(tmp_reg3, Address(d, 4 * unit));
 830     __ sd(tmp_reg4, Address(d, 5 * unit));
 831     __ sd(tmp_reg5, Address(d, 6 * unit));
 832     __ sd(tmp_reg6, Address(d, 7 * unit));
 833     __ sd(tmp_reg7, Address(d, 8 * unit));
 834     __ addi(d, d, 8 * unit);
 835
         // Only multiples of 8 have been subtracted from count, so its low three
         // bits still equal those of the original word count. The two tests below
         // pass 2 and 1 to test_bit and copy 4 and 2 further words respectively
         // (presumably test_bit takes a bit index, i.e. the 4s and 2s bits --
         // confirm against MacroAssembler). The 1s bit is left for the caller.
 836     {
 837       Label L1, L2;
 838       __ test_bit(t0, count, 2);
 839       __ beqz(t0, L1);
 840
 841       __ ld(tmp_reg0, Address(s, 1 * unit));
 842       __ ld(tmp_reg1, Address(s, 2 * unit));
 843       __ ld(tmp_reg2, Address(s, 3 * unit));
 844       __ ld(tmp_reg3, Address(s, 4 * unit));
 845       __ addi(s, s, 4 * unit);
 846
 847       __ sd(tmp_reg0, Address(d, 1 * unit));
 848       __ sd(tmp_reg1, Address(d, 2 * unit));
 849       __ sd(tmp_reg2, Address(d, 3 * unit));
 850       __ sd(tmp_reg3, Address(d, 4 * unit));
 851       __ addi(d, d, 4 * unit);
 852
 853       __ bind(L1);
 854
           // Undo the one-word bias applied on entry for the forward case, so s
           // and d address the next word directly from here on.
 855       if (direction == copy_forwards) {
 856         __ addi(s, s, bias);
 857         __ addi(d, d, bias);
 858       }
 859
 860       __ test_bit(t0, count, 1);
 861       __ beqz(t0, L2);
           // Backwards: step the pointers down two words first, then copy the
           // pair at offsets 0 and wordSize. Forwards: copy the pair at offsets
           // 0 and wordSize, then step the pointers up two words.
 862       if (direction == copy_backwards) {
 863         __ addi(s, s, 2 * unit);
 864         __ ld(tmp_reg0, Address(s));
 865         __ ld(tmp_reg1, Address(s, wordSize));
 866         __ addi(d, d, 2 * unit);
 867         __ sd(tmp_reg0, Address(d));
 868         __ sd(tmp_reg1, Address(d, wordSize));
 869       } else {
 870         __ ld(tmp_reg0, Address(s));
 871         __ ld(tmp_reg1, Address(s, wordSize));
 872         __ addi(s, s, 2 * unit);
 873         __ sd(tmp_reg0, Address(d));
 874         __ sd(tmp_reg1, Address(d, wordSize));
 875         __ addi(d, d, 2 * unit);
 876       }
 877       __ bind(L2);
 878     }
 879
 880     __ ret();
 881
 882     return start;
 883   }
 884 
 885   typedef void (MacroAssembler::*copy_insn)(Register Rd, const Address &adr, Register temp); // pointer-to-member type for MacroAssembler methods with this signature
 886 
 887   void copy_memory_v(Register s, Register d, Register count, int step) {
         // Vector (RVV) flavour of copy_memory: emits a loop that copies count
         // elements of |step| bytes each from s to d. A negative step requests a
         // backward copy, i.e. the highest-addressed elements are moved first.
         // Scratch registers used by the emitted code: x30, x31, x14, x15, x16, x17,
         // v0 (with register grouping m8) and, for backward copies, t0.
 888     bool is_backward = step < 0;
 889     int granularity = g_uabs(step);
 890 
 891     const Register src = x30, dst = x31, vl = x14, cnt = x15, tmp1 = x16, tmp2 = x17;
 892     assert_different_registers(s, d, cnt, vl, tmp1, tmp2);
 893     Assembler::SEW sew = Assembler::elembytes_to_sew(granularity);
 894     Label loop_forward, loop_backward, done;
 895 
         // Work on copies so that s, d and count keep their incoming values; the
         // backward loop below addresses memory relative to the original s and d.
 896     __ mv(dst, d);
 897     __ mv(src, s);
 898     __ mv(cnt, count);
 899 
 900     __ bind(loop_forward);
         // Ask for a vector length (in elements) for the cnt elements still to copy.
 901     __ vsetvli(vl, cnt, sew, Assembler::m8);
 902     if (is_backward) {
           // More than one chunk is left: copy the chunk at the high end first.
           // When vl == cnt the remaining elements fit in one chunk and the
           // forward body below finishes the copy.
 903       __ bne(vl, cnt, loop_backward);
 904     }
 905 
 906     __ vlex_v(v0, src, sew);
 907     __ sub(cnt, cnt, vl);
         // Convert vl from elements to bytes; sew is used directly as the shift amount.
 908     if (sew != Assembler::e8) {
 909       // when sew == e8 (i.e., elem size is 1 byte), slli R, R, 0 is a nop and unnecessary
 910       __ slli(vl, vl, sew);
 911     }
 912     __ add(src, src, vl);
 913 
 914     __ vsex_v(v0, dst, sew);
 915     __ add(dst, dst, vl);
 916     __ bnez(cnt, loop_forward);
 917 
 918     if (is_backward) {
           // The forward body fell through with cnt == 0: skip the backward chunk code.
 919       __ j(done);
 920 
 921       __ bind(loop_backward);
           // t0 = byte offset of the last vl of the cnt remaining elements.
 922       __ sub(t0, cnt, vl);
 923       if (sew != Assembler::e8) {
 924         // when sew == e8 (i.e., elem size is 1 byte), slli R, R, 0 is a nop and unnecessary
 925         __ slli(t0, t0, sew);
 926       }
 927       __ add(tmp1, s, t0);
 928       __ vlex_v(v0, tmp1, sew);
 929       __ add(tmp2, d, t0);
 930       __ vsex_v(v0, tmp2, sew);
 931       __ sub(cnt, cnt, vl);
           // Re-enter at loop_forward so that vsetvli is re-evaluated for the new cnt.
 932       __ bnez(cnt, loop_forward);
 933       __ bind(done);
 934     }
 935   }
 936 
 937   // All-singing all-dancing memory copy.
 938   //
 939   // Copy count units of memory from s to d.  The size of a unit is
 940   // step, which can be positive or negative depending on the direction
 941   // of copy.
 942   //
 943   void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
 944                    Register s, Register d, Register count, int step) {
 945     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
         // Use the vector loop when RVV is enabled, unless this is a reference copy
         // and the barrier set assembler does not support RVV array copies.
 946     if (UseRVV && (!is_reference_type(type) || bs_asm->supports_rvv_arraycopy())) {
 947       return copy_memory_v(s, d, count, step);
 948     }
 949 
 950     bool is_backwards = step < 0;
 951     int granularity = g_uabs(step);
 952 
         // Scalar path. src/dst are the running pointers, cnt is the number of bytes
         // still to copy, tmp3..tmp6 carry the data and gct1..gct3 are temporaries
         // handed to the barrier set assembler.
 953     const Register src = x30, dst = x31, cnt = x15, tmp3 = x16, tmp4 = x17, tmp5 = x14, tmp6 = x13;
 954     const Register gct1 = x28, gct2 = x29, gct3 = t2;
 955 
 956     Label same_aligned;
 957     Label copy_big, copy32_loop, copy8_loop, copy_small, done;
 958 
 959     // The size of copy32_loop body increases significantly with ZGC GC barriers.
 960     // Need conditional far branches to reach a point beyond the loop in this case.
 961     bool is_far = UseZGC;
 962 
 963     __ beqz(count, done, is_far);
         // cnt = count scaled to bytes.
 964     __ slli(cnt, count, exact_log2(granularity));
 965     if (is_backwards) {
           // Backward copy: start one past the end and pre-decrement before each access.
 966       __ add(src, s, cnt);
 967       __ add(dst, d, cnt);
 968     } else {
 969       __ mv(src, s);
 970       __ mv(dst, d);
 971     }
 972 
 973     if (is_aligned) {
           // Pick the widest loop the byte count allows: >= 32, >= 8, otherwise
           // element by element.
 974       __ subi(t0, cnt, 32);
 975       __ bgez(t0, copy32_loop);
 976       __ subi(t0, cnt, 8);
 977       __ bgez(t0, copy8_loop, is_far);
 978       __ j(copy_small);
 979     } else {
           // Fewer than 16 bytes: not worth aligning, copy element by element.
 980       __ mv(t0, 16);
 981       __ blt(cnt, t0, copy_small, is_far);
 982 
           // If src and dst differ in their low three address bits they can never be
           // 8-byte aligned at the same time: copy element by element.
 983       __ xorr(t0, src, dst);
 984       __ andi(t0, t0, 0b111);
 985       __ bnez(t0, copy_small, is_far);
 986 
           // Copy single elements until src (and therefore dst) is 8-byte aligned.
 987       __ bind(same_aligned);
 988       __ andi(t0, src, 0b111);
 989       __ beqz(t0, copy_big);
 990       if (is_backwards) {
 991         __ addi(src, src, step);
 992         __ addi(dst, dst, step);
 993       }
 994       bs_asm->copy_load_at(_masm, decorators, type, granularity, tmp3, Address(src), gct1);
 995       bs_asm->copy_store_at(_masm, decorators, type, granularity, Address(dst), tmp3, gct1, gct2, gct3);
 996       if (!is_backwards) {
 997         __ addi(src, src, step);
 998         __ addi(dst, dst, step);
 999       }
1000       __ subi(cnt, cnt, granularity);
1001       __ beqz(cnt, done, is_far);
1002       __ j(same_aligned);
1003 
1004       __ bind(copy_big);
           // Aligned now; fewer than 32 bytes left go straight to the 8-byte loop.
1005       __ mv(t0, 32);
1006       __ blt(cnt, t0, copy8_loop, is_far);
1007     }
1008 
         // Main loop: 32 bytes per iteration; entered only with cnt >= 32.
1009     __ bind(copy32_loop);
1010     if (is_backwards) {
1011       __ subi(src, src, wordSize * 4);
1012       __ subi(dst, dst, wordSize * 4);
1013     }
1014     // we first load 32 bytes, then write it, so the direction here doesn't matter
1015     bs_asm->copy_load_at(_masm, decorators, type, 8, tmp3, Address(src),     gct1);
1016     bs_asm->copy_load_at(_masm, decorators, type, 8, tmp4, Address(src, 8),  gct1);
1017     bs_asm->copy_load_at(_masm, decorators, type, 8, tmp5, Address(src, 16), gct1);
1018     bs_asm->copy_load_at(_masm, decorators, type, 8, tmp6, Address(src, 24), gct1);
1019 
1020     bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst),     tmp3, gct1, gct2, gct3);
1021     bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst, 8),  tmp4, gct1, gct2, gct3);
1022     bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst, 16), tmp5, gct1, gct2, gct3);
1023     bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst, 24), tmp6, gct1, gct2, gct3);
1024 
1025     if (!is_backwards) {
1026       __ addi(src, src, wordSize * 4);
1027       __ addi(dst, dst, wordSize * 4);
1028     }
         // t0 = (cnt after this iteration) - 32, computed from the old cnt.
1029     __ subi(t0, cnt, 32 + wordSize * 4);
1030     __ subi(cnt, cnt, wordSize * 4);
1031     __ bgez(t0, copy32_loop); // cnt >= 32, do next loop
1032 
1033     __ beqz(cnt, done); // if that's all - done
1034 
1035     __ subi(t0, cnt, 8); // if not - copy the remainder
1036     __ bltz(t0, copy_small); // cnt < 8, go to copy_small, else fall through to copy8_loop
1037 
         // 8 bytes per iteration; entered only with cnt >= 8.
1038     __ bind(copy8_loop);
1039     if (is_backwards) {
1040       __ subi(src, src, wordSize);
1041       __ subi(dst, dst, wordSize);
1042     }
1043     bs_asm->copy_load_at(_masm, decorators, type, 8, tmp3, Address(src), gct1);
1044     bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst), tmp3, gct1, gct2, gct3);
1045 
1046     if (!is_backwards) {
1047       __ addi(src, src, wordSize);
1048       __ addi(dst, dst, wordSize);
1049     }
         // t0 = (cnt after this iteration) - 8, computed from the old cnt.
1050     __ subi(t0, cnt, 8 + wordSize);
1051     __ subi(cnt, cnt, wordSize);
1052     __ bgez(t0, copy8_loop); // cnt >= 8, do next loop
1053 
1054     __ beqz(cnt, done); // if that's all - done
1055 
         // Tail loop: one element (granularity bytes) per iteration until cnt <= 0.
1056     __ bind(copy_small);
1057     if (is_backwards) {
1058       __ addi(src, src, step);
1059       __ addi(dst, dst, step);
1060     }
1061 
1062     bs_asm->copy_load_at(_masm, decorators, type, granularity, tmp3, Address(src), gct1);
1063     bs_asm->copy_store_at(_masm, decorators, type, granularity, Address(dst), tmp3, gct1, gct2, gct3);
1064 
1065     if (!is_backwards) {
1066       __ addi(src, src, step);
1067       __ addi(dst, dst, step);
1068     }
1069     __ subi(cnt, cnt, granularity);
1070     __ bgtz(cnt, copy_small);
1071 
1072     __ bind(done);
1073   }
1074 
1075   // Scan over array at a for count oops, verifying each one.
1076   // Preserves a and count, clobbers t0, t1 and temp.
1077   void verify_oop_array(size_t size, Register a, Register count, Register temp) {
         // size is the element size in bytes: wordSize selects full-width oops, any
         // other value is treated as 32-bit narrow oops. temp receives each loaded
         // element and is overwritten.
1078     Label loop, end;
         // t1 = running byte offset into the array, t0 = total size in bytes.
1079     __ mv(t1, zr);
1080     __ slli(t0, count, exact_log2(size));
1081     __ bind(loop);
1082     __ bgeu(t1, t0, end);
1083 
1084     __ add(temp, a, t1);
1085     if (size == (size_t)wordSize) {
1086       __ ld(temp, Address(temp, 0));
1087       __ verify_oop(temp);
1088     } else {
1089       __ lwu(temp, Address(temp, 0));
1090       __ decode_heap_oop(temp); // calls verify_oop
1091     }
1092     __ add(t1, t1, size);
1093     __ j(loop);
1094     __ bind(end);
1095   }
1096 
1097   // Arguments:
1098   //   stub_id - is used to name the stub and identify all details of
1099   //             how to perform the copy.
1100   //
1101   //   nopush_entry - is assigned to the stub's post push entry point
1102   //                  unless it is null
1103   //
1104   // Inputs:
1105   //   c_rarg0   - source array address
1106   //   c_rarg1   - destination array address
1107   //   c_rarg2   - element count, treated as ssize_t, can be zero
1108   //
1109   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1110   // the hardware handle it.  The two dwords within qwords that span
1111   // cache line boundaries will still be loaded and stored atomically.
1112   //
1113   // Side Effects: nopush_entry is set to the (post push) entry point
1114   //               so it can be used by the corresponding conjoint
1115   //               copy method
1116   //
1117   address generate_disjoint_copy(StubId stub_id, address* nopush_entry) {
         // Decode stub_id into the element size in bytes, whether the stub is the
         // aligned ("arrayof") flavour, whether the elements are oops and whether the
         // destination is uninitialized.
1118     size_t size;
1119     bool aligned;
1120     bool is_oop;
1121     bool dest_uninitialized;
1122     switch (stub_id) {
1123     case StubId::stubgen_jbyte_disjoint_arraycopy_id:
1124       size = sizeof(jbyte);
1125       aligned = false;
1126       is_oop = false;
1127       dest_uninitialized = false;
1128       break;
1129     case StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id:
1130       size = sizeof(jbyte);
1131       aligned = true;
1132       is_oop = false;
1133       dest_uninitialized = false;
1134       break;
1135     case StubId::stubgen_jshort_disjoint_arraycopy_id:
1136       size = sizeof(jshort);
1137       aligned = false;
1138       is_oop = false;
1139       dest_uninitialized = false;
1140       break;
1141     case StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id:
1142       size = sizeof(jshort);
1143       aligned = true;
1144       is_oop = false;
1145       dest_uninitialized = false;
1146       break;
1147     case StubId::stubgen_jint_disjoint_arraycopy_id:
1148       size = sizeof(jint);
1149       aligned = false;
1150       is_oop = false;
1151       dest_uninitialized = false;
1152       break;
1153     case StubId::stubgen_arrayof_jint_disjoint_arraycopy_id:
1154       size = sizeof(jint);
1155       aligned = true;
1156       is_oop = false;
1157       dest_uninitialized = false;
1158       break;
1159     case StubId::stubgen_jlong_disjoint_arraycopy_id:
1160       // since this is always aligned we can (should!) use the same
1161       // stub as for case arrayof_jlong_disjoint_arraycopy
1162       ShouldNotReachHere();
1163       break;
1164     case StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id:
1165       size = sizeof(jlong);
1166       aligned = true;
1167       is_oop = false;
1168       dest_uninitialized = false;
1169       break;
         // All four oop variants: the element size follows UseCompressedOops, and the
         // copy is flagged aligned only when oops are uncompressed (8 bytes wide).
1170     case StubId::stubgen_oop_disjoint_arraycopy_id:
1171       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1172       aligned = !UseCompressedOops;
1173       is_oop = true;
1174       dest_uninitialized = false;
1175       break;
1176     case StubId::stubgen_arrayof_oop_disjoint_arraycopy_id:
1177       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1178       aligned = !UseCompressedOops;
1179       is_oop = true;
1180       dest_uninitialized = false;
1181       break;
1182     case StubId::stubgen_oop_disjoint_arraycopy_uninit_id:
1183       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1184       aligned = !UseCompressedOops;
1185       is_oop = true;
1186       dest_uninitialized = true;
1187       break;
1188     case StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id:
1189       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1190       aligned = !UseCompressedOops;
1191       is_oop = true;
1192       dest_uninitialized = true;
1193       break;
1194     default:
1195       ShouldNotReachHere();
1196       break;
1197     }
1198 
1199     const Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1200     RegSet saved_reg = RegSet::of(s, d, count);
1201     __ align(CodeEntryAlignment);
1202     StubCodeMark mark(this, stub_id);
1203     address start = __ pc();
1204     __ enter();
1205 
         // Publish the address just past enter() as the post-push entry point.
1206     if (nopush_entry != nullptr) {
1207      *nopush_entry = __ pc();
1208       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1209       BLOCK_COMMENT("Entry:");
1210     }
1211 
1212     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1213     if (dest_uninitialized) {
1214       decorators |= IS_DEST_UNINITIALIZED;
1215     }
1216     if (aligned) {
1217       decorators |= ARRAYCOPY_ALIGNED;
1218     }
1219 
1220     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1221     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
1222 
1223     if (is_oop) {
1224       // save regs before copy_memory
           // (d and count are read again by verify_oop_array and arraycopy_epilogue)
1225       __ push_reg(RegSet::of(d, count), sp);
1226     }
1227 
1228     {
1229       // UnsafeMemoryAccess page error: continue after unsafe access
           // The entry is added only for non-oop copies that are either unaligned or
           // jlong-sized.
1230       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1231       UnsafeMemoryAccessMark umam(this, add_entry, true);
           // A positive step makes copy_memory copy forwards.
1232       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
1233     }
1234 
1235     if (is_oop) {
1236       __ pop_reg(RegSet::of(d, count), sp);
1237       if (VerifyOops) {
1238         verify_oop_array(size, d, count, t2);
1239       }
1240     }
1241 
1242     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, t0);
1243 
1244     __ leave();
1245     __ mv(x10, zr); // return 0
1246     __ ret();
1247     return start;
1248   }
1249 
1250   // Arguments:
1251   //   stub_id - is used to name the stub and identify all details of
1252   //             how to perform the copy.
1253   //
1254   //   nooverlap_target - identifies the (post push) entry for the
1255   //             corresponding disjoint copy routine which can be
1256   //             jumped to if the ranges do not actually overlap
1257   //
1258   //   nopush_entry - is assigned to the stub's post push entry point
1259   //                 unless it is null
1260   //
1261   // Inputs:
1262   //   c_rarg0   - source array address
1263   //   c_rarg1   - destination array address
1264   //   c_rarg2   - element count, treated as ssize_t, can be zero
1265   //
1266   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1267   // the hardware handle it.  The two dwords within qwords that span
1268   // cache line boundaries will still be loaded and stored atomically.
1269   //
1270   // Side Effects:
1271   //   nopush_entry is set to the no-overlap entry point so it can be
1272   //   used by some other conjoint copy method
1273   //
1274   address generate_conjoint_copy(StubId stub_id, address nooverlap_target, address *nopush_entry) {
1275     const Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1276     RegSet saved_regs = RegSet::of(s, d, count);
         // Decode stub_id into the element size in bytes, whether the stub is the
         // aligned ("arrayof") flavour, whether the elements are oops and whether the
         // destination is uninitialized.
1277     int size;
1278     bool aligned;
1279     bool is_oop;
1280     bool dest_uninitialized;
1281     switch (stub_id) {
1282     case StubId::stubgen_jbyte_arraycopy_id:
1283       size = sizeof(jbyte);
1284       aligned = false;
1285       is_oop = false;
1286       dest_uninitialized = false;
1287       break;
1288     case StubId::stubgen_arrayof_jbyte_arraycopy_id:
1289       size = sizeof(jbyte);
1290       aligned = true;
1291       is_oop = false;
1292       dest_uninitialized = false;
1293       break;
1294     case StubId::stubgen_jshort_arraycopy_id:
1295       size = sizeof(jshort);
1296       aligned = false;
1297       is_oop = false;
1298       dest_uninitialized = false;
1299       break;
1300     case StubId::stubgen_arrayof_jshort_arraycopy_id:
1301       size = sizeof(jshort);
1302       aligned = true;
1303       is_oop = false;
1304       dest_uninitialized = false;
1305       break;
1306     case StubId::stubgen_jint_arraycopy_id:
1307       size = sizeof(jint);
1308       aligned = false;
1309       is_oop = false;
1310       dest_uninitialized = false;
1311       break;
1312     case StubId::stubgen_arrayof_jint_arraycopy_id:
1313       size = sizeof(jint);
1314       aligned = true;
1315       is_oop = false;
1316       dest_uninitialized = false;
1317       break;
1318     case StubId::stubgen_jlong_arraycopy_id:
1319       // since this is always aligned we can (should!) use the same
1320       // stub as for case arrayof_jlong_arraycopy
1321       ShouldNotReachHere();
1322       break;
1323     case StubId::stubgen_arrayof_jlong_arraycopy_id:
1324       size = sizeof(jlong);
1325       aligned = true;
1326       is_oop = false;
1327       dest_uninitialized = false;
1328       break;
         // All four oop variants: the element size follows UseCompressedOops, and the
         // copy is flagged aligned only when oops are uncompressed (8 bytes wide).
1329     case StubId::stubgen_oop_arraycopy_id:
1330       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1331       aligned = !UseCompressedOops;
1332       is_oop = true;
1333       dest_uninitialized = false;
1334       break;
1335     case StubId::stubgen_arrayof_oop_arraycopy_id:
1336       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1337       aligned = !UseCompressedOops;
1338       is_oop = true;
1339       dest_uninitialized = false;
1340       break;
1341     case StubId::stubgen_oop_arraycopy_uninit_id:
1342       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1343       aligned = !UseCompressedOops;
1344       is_oop = true;
1345       dest_uninitialized = true;
1346       break;
1347     case StubId::stubgen_arrayof_oop_arraycopy_uninit_id:
1348       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1349       aligned = !UseCompressedOops;
1350       is_oop = true;
1351       dest_uninitialized = true;
1352       break;
1353     default:
1354       ShouldNotReachHere();
1355     }
1356 
1357     StubCodeMark mark(this, stub_id);
1358     address start = __ pc();
1359     __ enter();
1360 
         // Publish the address just past enter() as the post-push entry point.
1361     if (nopush_entry != nullptr) {
1362       *nopush_entry = __ pc();
1363       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1364       BLOCK_COMMENT("Entry:");
1365     }
1366 
1367     // use fwd copy when (d-s) above_equal (count*size)
         // The comparison is unsigned, so d below s (d - s wraps to a large value)
         // also takes the forward path. Only s <= d < s + count*size continues to
         // the backward copy below.
1368     __ sub(t0, d, s);
1369     __ slli(t1, count, exact_log2(size));
1370     Label L_continue;
1371     __ bltu(t0, t1, L_continue);
1372     __ j(RuntimeAddress(nooverlap_target));
1373     __ bind(L_continue);
1374 
1375     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1376     if (dest_uninitialized) {
1377       decorators |= IS_DEST_UNINITIALIZED;
1378     }
1379     if (aligned) {
1380       decorators |= ARRAYCOPY_ALIGNED;
1381     }
1382 
1383     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1384     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
1385 
1386     if (is_oop) {
1387       // save regs before copy_memory
           // (d and count are read again by verify_oop_array and arraycopy_epilogue)
1388       __ push_reg(RegSet::of(d, count), sp);
1389     }
1390 
1391     {
1392       // UnsafeMemoryAccess page error: continue after unsafe access
           // The entry is added only for non-oop copies that are either unaligned or
           // jlong-sized.
1393       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1394       UnsafeMemoryAccessMark umam(this, add_entry, true);
           // A negative step makes copy_memory copy backwards (high to low addresses).
1395       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
1396     }
1397 
1398     if (is_oop) {
1399       __ pop_reg(RegSet::of(d, count), sp);
1400       if (VerifyOops) {
1401         verify_oop_array(size, d, count, t2);
1402       }
1403     }
1404     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, t0);
1405     __ leave();
1406     __ mv(x10, zr); // return 0
1407     __ ret();
1408     return start;
1409   }
1410 
1411   // Helper for generating a dynamic type check.
1412   // Smashes t0, t1.
1413   void generate_type_check(Register sub_klass,
1414                            Register super_check_offset,
1415                            Register super_klass,
1416                            Register result, // not referenced in the body below
1417                            Register tmp1,
1418                            Register tmp2,
1419                            Label& L_success) {
         // Emits a subtype check of sub_klass against super_klass: control reaches
         // L_success when the check passes and falls out of the emitted code (at
         // L_miss) when it fails.
1420     assert_different_registers(sub_klass, super_check_offset, super_klass);
1421 
1422     BLOCK_COMMENT("type_check:");
1423 
1424     Label L_miss;
1425 
         // Fast path first; tmp1 and tmp2 are handed to the slow path only.
1426     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, nullptr, super_check_offset);
1427     __ check_klass_subtype_slow_path(sub_klass, super_klass, tmp1, tmp2, &L_success, nullptr);
1428 
1429     // Fall through on failure!
1430     __ BIND(L_miss);
1431   }
1432 
1433   //
1434   //  Generate checkcasting array copy stub
1435   //
1436   //  Input:
1437   //    c_rarg0   - source array address
1438   //    c_rarg1   - destination array address
1439   //    c_rarg2   - element count, treated as ssize_t, can be zero
1440   //    c_rarg3   - size_t ckoff (super_check_offset)
1441   //    c_rarg4   - oop ckval (super_klass)
1442   //
1443   //  Output:
1444   //    x10 ==  0  -  success
1445   //    x10 == -1^K - failure, where K is partial transfer count
1446   //
1447   address generate_checkcast_copy(StubId stub_id, address* nopush_entry) {
         // stub_id selects whether the destination is treated as uninitialized;
         // if nopush_entry is non-null it receives the address just past enter().
1448     bool dest_uninitialized;
1449     switch (stub_id) {
1450     case StubId::stubgen_checkcast_arraycopy_id:
1451       dest_uninitialized = false;
1452       break;
1453     case StubId::stubgen_checkcast_arraycopy_uninit_id:
1454       dest_uninitialized = true;
1455       break;
1456     default:
1457       ShouldNotReachHere();
1458     }
1459 
1460     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1461 
1462     // Input registers (after setup_arg_regs)
1463     const Register from        = c_rarg0;   // source array address
1464     const Register to          = c_rarg1;   // destination array address
1465     const Register count       = c_rarg2;   // elements count
1466     const Register ckoff       = c_rarg3;   // super_check_offset
1467     const Register ckval       = c_rarg4;   // super_klass
1468 
1469     RegSet wb_pre_saved_regs   = RegSet::range(c_rarg0, c_rarg4);
1470 
1471     // Registers used as temps (x7, x9, x18, x19 are pushed on entry and popped on exit)
1472     const Register count_save  = x19;       // orig elements count
1473     const Register start_to    = x18;       // destination array start address
1474     const Register copied_oop  = x7;        // actual oop copied
1475     const Register r9_klass    = x9;        // oop._klass
1476 
1477     // Registers used as gc temps (x15, x16, x17 are save-on-call)
1478     const Register gct1 = x15, gct2 = x16, gct3 = x17;
1479 
1480     //---------------------------------------------------------------
1481     // Assembler stub will be used for this call to arraycopy
1482     // if the two arrays are subtypes of Object[] but the
1483     // destination array type is not equal to or a supertype
1484     // of the source type.  Each element must be separately
1485     // checked.
1486 
1487     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1488                                copied_oop, r9_klass, count_save);
1489 
1490     __ align(CodeEntryAlignment);
1491     StubCodeMark mark(this, stub_id);
1492     address start = __ pc();
1493 
1494     __ enter(); // required for proper stackwalking of RuntimeStub frame
1495 
1496     // Caller of this entry point must set up the argument registers.
1497     if (nopush_entry != nullptr) {
1498       *nopush_entry = __ pc();
1499       BLOCK_COMMENT("Entry:");
1500     }
1501 
1502     // Empty array:  Nothing to do
         // (count is zero here, so L_done returns 0 without touching the stack)
1503     __ beqz(count, L_done);
1504 
1505     __ push_reg(RegSet::of(x7, x9, x18, x19), sp);
1506 
1507 #ifdef ASSERT
1508     BLOCK_COMMENT("assert consistent ckoff/ckval");
1509     // The ckoff and ckval must be mutually consistent,
1510     // even though caller generates both.
1511     { Label L;
1512       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1513       __ lwu(start_to, Address(ckval, sco_offset));
1514       __ beq(ckoff, start_to, L);
1515       __ stop("super_check_offset inconsistent");
1516       __ bind(L);
1517     }
1518 #endif //ASSERT
1519 
1520     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
1521     if (dest_uninitialized) {
1522       decorators |= IS_DEST_UNINITIALIZED;
1523     }
1524 
1525     bool is_oop = true;
1526     int element_size = UseCompressedOops ? 4 : 8;
1527 
1528     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1529     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
1530 
1531     // save the original count
1532     __ mv(count_save, count);
1533 
1534     // Copy from low to high addresses
1535     __ mv(start_to, to);              // Save destination array start address
1536     __ j(L_load_element);
1537 
1538     // ======== begin loop ========
1539     // (Loop is rotated; its entry is L_load_element.)
1540     // Loop control:
1541     //   for count to 0 do
1542     //     copied_oop = load_heap_oop(from++)
1543     //     ... generate_type_check ...
1544     //     store_heap_oop(to++, copied_oop)
1545     //   end
1546 
1547     __ align(OptoLoopAlignment);
1548 
1549     __ BIND(L_store_element);
1550     bs->copy_store_at(_masm, decorators, T_OBJECT, element_size,
1551                       Address(to, 0), copied_oop,
1552                       gct1, gct2, gct3);
1553     __ addi(to, to, UseCompressedOops ? 4 : 8);
1554     __ subi(count, count, 1);
         // All elements stored: count is 0, which is also the success return value.
1555     __ beqz(count, L_do_card_marks);
1556 
1557     // ======== loop entry is here ========
1558     __ BIND(L_load_element);
1559     bs->copy_load_at(_masm, decorators, T_OBJECT, element_size,
1560                      copied_oop, Address(from, 0),
1561                      gct1);
1562     __ addi(from, from, UseCompressedOops ? 4 : 8);
         // A null element needs no type check and is stored directly.
1563     __ beqz(copied_oop, L_store_element);
1564 
1565     __ load_klass(r9_klass, copied_oop);// query the object klass
1566 
1567     BLOCK_COMMENT("type_check:");
1568     generate_type_check(r9_klass, /*sub_klass*/
1569                         ckoff,    /*super_check_offset*/
1570                         ckval,    /*super_klass*/
1571                         x10,      /*result*/
1572                         gct1,     /*tmp1*/
1573                         gct2,     /*tmp2*/
1574                         L_store_element);
1575 
1576     // Fall through on failure!
1577 
1578     // ======== end loop ========
1579 
1580     // It was a real error; we must depend on the caller to finish the job.
1581     // Register count = remaining oops, count_save = total oops.
1582     // Emit GC store barriers for the oops we have copied and report
1583     // their number to the caller.
1584 
1585     __ sub(count, count_save, count);     // K = partially copied oop count
1586     __ xori(count, count, -1);            // report (-1^K) to caller
         // Skip the epilogue when nothing was stored (K == 0). count now holds ~K,
         // which is never zero for K >= 0, so test ~K + 1 == -K instead and keep
         // the return value in count intact.
         __ addi(t0, count, 1);
1587     __ beqz(t0, L_done_pop);
1588 
1589     __ BIND(L_do_card_marks);
1590     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, t0);
1591 
1592     __ bind(L_done_pop);
1593     __ pop_reg(RegSet::of(x7, x9, x18, x19), sp);
1594     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1595 
1596     __ bind(L_done);
         // x10 = 0 on success, or -1^K after a failed type check.
1597     __ mv(x10, count);
1598     __ leave();
1599     __ ret();
1600 
1601     return start;
1602   }
1603 
1604   // Perform range checks on the proposed arraycopy.
1605   // Kills temp, but nothing else.
1606   // Also, clean the sign bits of src_pos and dst_pos.
1607   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1608                               Register src_pos, // source position (c_rarg1)
1609                               Register dst,     // destination array oop (c_rarg2)
1610                               Register dst_pos, // destination position (c_rarg3)
1611                               Register length,
1612                               Register temp,
1613                               Label& L_failed) {
1614     BLOCK_COMMENT("arraycopy_range_checks:");
1615 
         // t0 holds the array length being checked against, so temp must be distinct.
1616     assert_different_registers(t0, temp);
1617 
1618     // if [src_pos + length > arrayOop(src)->length()] then FAIL
         // The sum is formed with a 32-bit add and compared unsigned against the
         // zero-extended array length.
1619     __ lwu(t0, Address(src, arrayOopDesc::length_offset_in_bytes()));
1620     __ addw(temp, length, src_pos);
1621     __ bgtu(temp, t0, L_failed);
1622 
1623     // if [dst_pos + length > arrayOop(dst)->length()] then FAIL
1624     __ lwu(t0, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1625     __ addw(temp, length, dst_pos);
1626     __ bgtu(temp, t0, L_failed);
1627 
1628     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1629     __ zext(src_pos, src_pos, 32);
1630     __ zext(dst_pos, dst_pos, 32);
1631 
1632     BLOCK_COMMENT("arraycopy_range_checks done");
1633   }
1634 
1635   address generate_unsafecopy_common_error_exit() {
         // Emits a small exit sequence (x10 = 0, leave, ret) and returns its address.
1636     address start = __ pc();
1637     __ mv(x10, 0);
1638     __ leave();
1639     __ ret();
1640     return start;
1641   }
1642 
1643   //
1644   //  Generate 'unsafe' set memory stub
1645   //  Though just as safe as the other stubs, it takes an unscaled
1646   //  size_t (# bytes) argument instead of an element count.
1647   //
1648   //  Input:
1649   //    c_rarg0   - destination array address
1650   //    c_rarg1   - byte count (size_t)
1651   //    c_rarg2   - byte value
1652   //
1653   address generate_unsafe_setmemory() {
1654     __ align(CodeEntryAlignment);
1655     StubId stub_id = StubId::stubgen_unsafe_setmemory_id;
1656     StubCodeMark mark(this, stub_id);
1657     address start = __ pc();
1658 
1659     // bump this on entry, not on exit:
1660     // inc_counter_np(SharedRuntime::_unsafe_set_memory_ctr);
1661 
1662     Label L_fill_elements;
1663 
1664     const Register dest = c_rarg0;
1665     const Register count = c_rarg1;
1666     const Register value = c_rarg2;
1667     const Register cnt_words = x28; // temp register
1668     const Register tmp_reg   = x29; // temp register
1669 
1670     // Mark remaining code as such which performs Unsafe accesses.
1671     UnsafeMemoryAccessMark umam(this, true, false);
1672 
1673     __ enter(); // required for proper stackwalking of RuntimeStub frame
1674 
1675     // if count < 8, jump to L_fill_elements
         // (the tail code there only uses byte stores, so value needs no widening)
1676     __ mv(tmp_reg, 8); // 8 bytes fill by element
1677     __ bltu(count, tmp_reg, L_fill_elements);
1678 
1679     // Propagate byte to 64-bit width
1680     // 8 bit -> 16 bit
1681     __ zext(value, value, 8);
1682     __ slli(tmp_reg, value, 8);
1683     __ orr(value, value, tmp_reg);
1684     // 16 bit -> 32 bit
1685     __ slli(tmp_reg, value, 16);
1686     __ orr(value, value, tmp_reg);
1687     // 32 bit -> 64 bit
1688     __ slli(tmp_reg, value, 32);
1689     __ orr(value, value, tmp_reg);
1690 
1691     // Align destination address at 8 bytes address boundary.
         // At most 1 + 2 + 4 = 7 bytes are stored here, which count >= 8 covers.
1692     Label L_skip_align1, L_skip_align2, L_skip_align4;
1693     // One byte misalignment happens.
1694     __ test_bit(tmp_reg, dest, 0);
1695     __ beqz(tmp_reg, L_skip_align1);
1696     __ sb(value, Address(dest, 0));
1697     __ addi(dest, dest, 1);
1698     __ subi(count, count, 1);
1699 
1700     __ bind(L_skip_align1);
1701     // Two bytes misalignment happens.
1702     __ test_bit(tmp_reg, dest, 1);
1703     __ beqz(tmp_reg, L_skip_align2);
1704     __ sh(value, Address(dest, 0));
1705     __ addi(dest, dest, 2);
1706     __ subi(count, count, 2);
1707 
1708     __ bind(L_skip_align2);
1709     // Four bytes misalignment happens.
1710     __ test_bit(tmp_reg, dest, 2);
1711     __ beqz(tmp_reg, L_skip_align4);
1712     __ sw(value, Address(dest, 0));
1713     __ addi(dest, dest, 4);
1714     __ subi(count, count, 4);
1715     __ bind(L_skip_align4);
1716 
1717     //  Fill large chunks
         // cnt_words = count / 8; count is reduced to the leftover count % 8 bytes.
1718     __ srli(cnt_words, count, 3); // number of words
1719     __ slli(tmp_reg, cnt_words, 3);
1720     __ sub(count, count, tmp_reg);
1721     {
           // NOTE(review): the tail code below assumes fill_words leaves dest just
           // past the filled words - confirm against MacroAssembler::fill_words.
1722       __ fill_words(dest, cnt_words, value);
1723     }
1724 
1725     // Handle fills of less than 8 bytes
         // Bits 2, 1 and 0 of count select a 4-, 2- and 1-byte tail respectively;
         // byte stores are used throughout.
1726     __ bind(L_fill_elements);
1727     Label L_fill_2, L_fill_1, L_exit;
1728     __ test_bit(tmp_reg, count, 2);
1729     __ beqz(tmp_reg, L_fill_2);
1730     __ sb(value, Address(dest, 0));
1731     __ sb(value, Address(dest, 1));
1732     __ sb(value, Address(dest, 2));
1733     __ sb(value, Address(dest, 3));
1734     __ addi(dest, dest, 4);
1735 
1736     __ bind(L_fill_2);
1737     __ test_bit(tmp_reg, count, 1);
1738     __ beqz(tmp_reg, L_fill_1);
1739     __ sb(value, Address(dest, 0));
1740     __ sb(value, Address(dest, 1));
1741     __ addi(dest, dest, 2);
1742 
1743     __ bind(L_fill_1);
1744     __ test_bit(tmp_reg, count, 0);
1745     __ beqz(tmp_reg, L_exit);
1746     __ sb(value, Address(dest, 0));
1747 
1748     __ bind(L_exit);
1749     __ leave();
1750     __ ret();
1751 
1752     return start;
1753   }
1754 
1755   //
1756   //  Generate 'unsafe' array copy stub
1757   //  Though just as safe as the other stubs, it takes an unscaled
1758   //  size_t argument instead of an element count.
1759   //
1760   //  Input:
1761   //    c_rarg0   - source array address
1762   //    c_rarg1   - destination array address
1763   //    c_rarg2   - byte count, treated as ssize_t, can be zero
1764   //
1765   // Examines the alignment of the operands and dispatches
1766   // to a long, int, short, or byte copy loop.
1767   //
       // Generator arguments:
       //   byte_copy_entry, short_copy_entry, int_copy_entry, long_copy_entry
       //     - jump targets for the respective element sizes; all four must be
       //       non-null (asserted below).
       //
       // Returns the start address of the generated stub.
       //
1768   address generate_unsafe_copy(address byte_copy_entry,
1769                                address short_copy_entry,
1770                                address int_copy_entry,
1771                                address long_copy_entry) {
1772     assert_cond(byte_copy_entry != nullptr && short_copy_entry != nullptr &&
1773                 int_copy_entry != nullptr && long_copy_entry != nullptr);
1774     Label L_long_aligned, L_int_aligned, L_short_aligned;
1775     const Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1776 
1777     __ align(CodeEntryAlignment);
1778     StubId stub_id = StubId::stubgen_unsafe_arraycopy_id;
1779     StubCodeMark mark(this, stub_id);
1780     address start = __ pc();
1781     __ enter(); // required for proper stackwalking of RuntimeStub frame
1782 
1783     // bump this on entry, not on exit:
1784     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1785 
         // t0 = s | d | count: a low-order bit that is set in any of the three
         // operands is also set in t0, so one test covers all of them.
1786     __ orr(t0, s, d);
1787     __ orr(t0, t0, count);
1788 
         // Pick the widest element size to which source, destination and byte
         // count are all aligned, testing from widest to narrowest.
1789     __ andi(t0, t0, BytesPerLong - 1);
1790     __ beqz(t0, L_long_aligned);
1791     __ andi(t0, t0, BytesPerInt - 1);
1792     __ beqz(t0, L_int_aligned);
1793     __ test_bit(t0, t0, 0);
1794     __ beqz(t0, L_short_aligned);
         // Some operand is odd: copy byte-wise; count is already a byte count.
         // NOTE(review): every dispatch below is a jump, and this stub emits no
         // leave()/ret() of its own, so the target is presumably expected to
         // tear down the frame pushed by enter() above - confirm against the
         // entry points passed in by the caller.
1795     __ j(RuntimeAddress(byte_copy_entry));
1796 
         // For the wider element sizes the byte count is scaled down to an
         // element count before jumping.
1797     __ BIND(L_short_aligned);
1798     __ srli(count, count, LogBytesPerShort);  // size => short_count
1799     __ j(RuntimeAddress(short_copy_entry));
1800     __ BIND(L_int_aligned);
1801     __ srli(count, count, LogBytesPerInt);    // size => int_count
1802     __ j(RuntimeAddress(int_copy_entry));
1803     __ BIND(L_long_aligned);
1804     __ srli(count, count, LogBytesPerLong);   // size => long_count
1805     __ j(RuntimeAddress(long_copy_entry));
1806 
1807     return start;
1808   }
1809 
1810   //
1811   //  Generate generic array copy stubs
1812   //
1813   //  Input:
1814   //    c_rarg0    -  src oop
1815   //    c_rarg1    -  src_pos (32-bits)
1816   //    c_rarg2    -  dst oop
1817   //    c_rarg3    -  dst_pos (32-bits)
1818   //    c_rarg4    -  element count (32-bits)
1819   //
1820   //  Output:
1821   //    x10 ==  0  -  success
1822   //    x10 == -1^K - failure, where K is partial transfer count
1823   //
       //  Generator arguments:
       //    byte/short/int/long_copy_entry - jump targets for primitive arrays,
       //                                     selected by log2 element size
       //    oop_copy_entry                 - jump target for object arrays whose
       //                                     src and dst klasses are identical
       //    checkcast_copy_entry           - jump target for object arrays whose
       //                                     klasses differ and the static
       //                                     subtype check does not succeed
       //    All six must be non-null (asserted below).
       //
       //  The stub itself only validates its arguments and marshals them into
       //  c_rarg0..c_rarg4 for the selected entry, which it reaches with a
       //  jump. If a check fails it returns -1 in x10 from L_failed.
       //
       //  Returns the start address of the generated stub.
       //
1824   address generate_generic_copy(address byte_copy_entry, address short_copy_entry,
1825                                 address int_copy_entry, address oop_copy_entry,
1826                                 address long_copy_entry, address checkcast_copy_entry) {
1827     assert_cond(byte_copy_entry != nullptr && short_copy_entry != nullptr &&
1828                 int_copy_entry != nullptr && oop_copy_entry != nullptr &&
1829                 long_copy_entry != nullptr && checkcast_copy_entry != nullptr);
1830     Label L_failed, L_failed_0, L_objArray;
1831     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
1832 
1833     // Input registers
1834     const Register src        = c_rarg0;  // source array oop
1835     const Register src_pos    = c_rarg1;  // source position
1836     const Register dst        = c_rarg2;  // destination array oop
1837     const Register dst_pos    = c_rarg3;  // destination position
1838     const Register length     = c_rarg4;
1839 
1840     // Registers used as temps
1841     const Register dst_klass = c_rarg5;
1842 
1843     __ align(CodeEntryAlignment);
1844 
1845     StubId stub_id = StubId::stubgen_generic_arraycopy_id;
1846     StubCodeMark mark(this, stub_id);
1847 
1848     address start = __ pc();
1849 
1850     __ enter(); // required for proper stackwalking of RuntimeStub frame
1851 
1852     // bump this on entry, not on exit:
1853     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
1854 
1855     //-----------------------------------------------------------------------
1856     // Assembler stub will be used for this call to arraycopy
1857     // if the following conditions are met:
1858     //
1859     // (1) src and dst must not be null.
1860     // (2) src_pos must not be negative.
1861     // (3) dst_pos must not be negative.
1862     // (4) length  must not be negative.
1863     // (5) src klass and dst klass should be the same and not null.
1864     // (6) src and dst should be arrays.
1865     // (7) src_pos + length must not exceed length of src.
1866     // (8) dst_pos + length must not exceed length of dst.
1867     //
1868 
1869     // if src is null then return -1
1870     __ beqz(src, L_failed);
1871 
1872     // if [src_pos < 0] then return -1
1873     __ sext(t0, src_pos, 32);
1874     __ bltz(t0, L_failed);
1875 
1876     // if dst is null then return -1
1877     __ beqz(dst, L_failed);
1878 
1879     // if [dst_pos < 0] then return -1
1880     __ sext(t0, dst_pos, 32);
1881     __ bltz(t0, L_failed);
1882 
1883     // registers used as temp
1884     const Register scratch_length    = x28; // elements count to copy
1885     const Register scratch_src_klass = x29; // array klass
1886     const Register lh                = x30; // layout helper
1887 
1888     // if [length < 0] then return -1
1889     __ sext(scratch_length, length, 32); // length (elements count, 32-bits value)
1890     __ bltz(scratch_length, L_failed);
1891 
1892     __ load_narrow_klass(scratch_src_klass, src);
1893 #ifdef ASSERT
1894     {
1895       BLOCK_COMMENT("assert klasses not null {");
1896       Label L1, L2;
1897       __ bnez(scratch_src_klass, L2);   // it is broken if klass is null
1898       __ bind(L1);
1899       __ stop("broken null klass");
1900       __ bind(L2);
1901       __ load_narrow_klass(t0, dst);
1902       __ beqz(t0, L1);     // this would be broken also
1903       BLOCK_COMMENT("} assert klasses not null done");
1904     }
1905 #endif
1906     __ decode_klass_not_null(scratch_src_klass, t0);
1907 
1908     // Load layout helper (32-bits)
1909     //
1910     //  |array_tag|     | header_size | element_type |     |log2_element_size|
1911     // 32        30    24            16              8     2                 0
1912     //
1913     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
1914     //
1915 
1916     const int lh_offset = in_bytes(Klass::layout_helper_offset());
1917 
1918     // Handle objArrays completely differently...
1919     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
1920     __ lw(lh, Address(scratch_src_klass, lh_offset));
1921     __ mv(t0, objArray_lh);
1922     __ beq(lh, t0, L_objArray);
1923 
1924     // if [src->klass() != dst->klass()] then return -1
1925     __ load_klass(t1, dst);
1926     __ bne(t1, scratch_src_klass, L_failed);
1927 
1928     // Check for flat inline type array -> return -1
1929     __ test_flat_array_oop(src, t1, L_failed);
1930 
1931     // Check for null-free (non-flat) inline type array -> handle as object array
1932     __ test_null_free_array_oop(src, t1, L_objArray);
1933 
1934     // if [!src->is_Array()] then return -1
1935     // i.e. (lh >= 0)
1936     __ bgez(lh, L_failed);
1937 
1938     // At this point, it is known to be a typeArray (array_tag 0x3).
1939 #ifdef ASSERT
1940     {
1941       BLOCK_COMMENT("assert primitive array {");
1942       Label L;
1943       __ mv(t1, (int32_t)(Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift));
1944       __ bge(lh, t1, L);
1945       __ stop("must be a primitive array");
1946       __ bind(L);
1947       BLOCK_COMMENT("} assert primitive array done");
1948     }
1949 #endif
1950 
1951     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
1952                            t1, L_failed);
1953 
1954     // TypeArrayKlass
1955     //
1956     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize)
1957     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize)
1958     //
1959 
1960     const Register t0_offset = t0;    // array offset
1961     const Register x30_elsize = lh;   // element size
1962 
1963     // Get array_header_in_bytes()
         // The header_size field is isolated by shifting out the bits above it
         // and then shifting the field down to bit 0.
1964     int lh_header_size_width = exact_log2(Klass::_lh_header_size_mask + 1);
1965     int lh_header_size_msb = Klass::_lh_header_size_shift + lh_header_size_width;
1966     __ slli(t0_offset, lh, XLEN - lh_header_size_msb);          // left shift to remove 24 ~ 32;
1967     __ srli(t0_offset, t0_offset, XLEN - lh_header_size_width); // array_offset
1968 
1969     __ add(src, src, t0_offset);           // src array offset
1970     __ add(dst, dst, t0_offset);           // dst array offset
1971     BLOCK_COMMENT("choose copy loop based on element size");
1972 
1973     // next registers should be set before the jump to corresponding stub
1974     const Register from     = c_rarg0;  // source array address
1975     const Register to       = c_rarg1;  // destination array address
1976     const Register count    = c_rarg2;  // elements count
1977 
1978     // 'from', 'to', 'count' registers should be set in such order
1979     // since they are the same as 'src', 'src_pos', 'dst'.
1980 
1981     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
1982 
1983     // The possible values of elsize are 0-3, i.e. exact_log2(element
1984     // size in bytes).  We do a simple bitwise binary search.
1985   __ BIND(L_copy_bytes);
         // bit 1 set => elsize is 2 or 3 (int or long)
1986     __ test_bit(t0, x30_elsize, 1);
1987     __ bnez(t0, L_copy_ints);
         // bit 1 clear, bit 0 set => elsize is 1 (short)
1988     __ test_bit(t0, x30_elsize, 0);
1989     __ bnez(t0, L_copy_shorts);
1990     __ add(from, src, src_pos); // src_addr
1991     __ add(to, dst, dst_pos); // dst_addr
1992     __ sext(count, scratch_length, 32); // length
1993     __ j(RuntimeAddress(byte_copy_entry));
1994 
1995   __ BIND(L_copy_shorts);
1996     __ shadd(from, src_pos, src, t0, 1); // src_addr
1997     __ shadd(to, dst_pos, dst, t0, 1); // dst_addr
1998     __ sext(count, scratch_length, 32); // length
1999     __ j(RuntimeAddress(short_copy_entry));
2000 
2001   __ BIND(L_copy_ints);
         // bit 1 set, bit 0 set => elsize is 3 (long)
2002     __ test_bit(t0, x30_elsize, 0);
2003     __ bnez(t0, L_copy_longs);
2004     __ shadd(from, src_pos, src, t0, 2); // src_addr
2005     __ shadd(to, dst_pos, dst, t0, 2); // dst_addr
2006     __ sext(count, scratch_length, 32); // length
2007     __ j(RuntimeAddress(int_copy_entry));
2008 
2009   __ BIND(L_copy_longs);
2010 #ifdef ASSERT
2011     {
2012       BLOCK_COMMENT("assert long copy {");
2013       Label L;
2014       __ andi(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> x30_elsize
2015       __ sext(lh, lh, 32);
2016       __ mv(t0, LogBytesPerLong);
2017       __ beq(x30_elsize, t0, L);
2018       __ stop("must be long copy, but elsize is wrong");
2019       __ bind(L);
2020       BLOCK_COMMENT("} assert long copy done");
2021     }
2022 #endif
2023     __ shadd(from, src_pos, src, t0, 3); // src_addr
2024     __ shadd(to, dst_pos, dst, t0, 3); // dst_addr
2025     __ sext(count, scratch_length, 32); // length
2026     __ j(RuntimeAddress(long_copy_entry));
2027 
2028     // ObjArrayKlass
2029   __ BIND(L_objArray);
2030     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2031 
2032     Label L_plain_copy, L_checkcast_copy;
2033     // test array classes for subtyping
2034     __ load_klass(t2, dst);
2035     __ bne(scratch_src_klass, t2, L_checkcast_copy); // usual case is exact equality
2036 
2037     // Identically typed arrays can be copied without element-wise checks.
2038     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2039                            t1, L_failed);
2040 
2041     __ shadd(from, src_pos, src, t0, LogBytesPerHeapOop);
2042     __ addi(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2043     __ shadd(to, dst_pos, dst, t0, LogBytesPerHeapOop);
2044     __ addi(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2045     __ sext(count, scratch_length, 32); // length
         // L_plain_copy is also the success target of the type check in the
         // checkcast path below, which arrives with from/to/count already set.
2046   __ BIND(L_plain_copy);
2047     __ j(RuntimeAddress(oop_copy_entry));
2048 
2049   __ BIND(L_checkcast_copy);
2050     // live at this point:  scratch_src_klass, scratch_length, t2 (dst_klass)
2051     {
2052       // Before looking at dst.length, make sure dst is also an objArray.
           // The load must sign-extend (lw, exactly as for src above): an array
           // layout helper has its top bit set, and it is compared with a full
           // 64-bit bne against objArray_lh, a negative jint that mv is expected
           // to materialize sign-extended. A zero-extending load would never
           // compare equal and would send every such copy to L_failed.
2053       __ lw(t0, Address(t2, lh_offset));
2054       __ mv(t1, objArray_lh);
2055       __ bne(t0, t1, L_failed);
2056 
2057       // It is safe to examine both src.length and dst.length.
2058       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2059                              t2, L_failed);
2060 
           // t2 was handed to arraycopy_range_checks as its temp, so the dst
           // klass is loaded again, this time into dst_klass.
2061       __ load_klass(dst_klass, dst); // reload
2062 
2063       // Marshal the base address arguments now, freeing registers.
2064       __ shadd(from, src_pos, src, t0, LogBytesPerHeapOop);
2065       __ addi(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2066       __ shadd(to, dst_pos, dst, t0, LogBytesPerHeapOop);
2067       __ addi(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2068       __ sext(count, length, 32); // length (reloaded)
2069       const Register sco_temp = c_rarg3; // this register is free now
2070       assert_different_registers(from, to, count, sco_temp,
2071                                  dst_klass, scratch_src_klass);
2072 
2073       // Generate the type check.
2074       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2075       __ lwu(sco_temp, Address(dst_klass, sco_offset));
2076 
2077       // Smashes t0, t1
2078       generate_type_check(scratch_src_klass, sco_temp, dst_klass, noreg, noreg, noreg, L_plain_copy);
2079 
2080       // Fetch destination element klass from the ObjArrayKlass header.
2081       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2082       __ ld(dst_klass, Address(dst_klass, ek_offset));
2083       __ lwu(sco_temp, Address(dst_klass, sco_offset));
2084 
2085       // the checkcast_copy loop needs two extra arguments:
2086       assert(c_rarg3 == sco_temp, "#3 already in place");
2087       // Set up arguments for checkcast_copy_entry.
2088       __ mv(c_rarg4, dst_klass);  // dst.klass.element_klass
2089       __ j(RuntimeAddress(checkcast_copy_entry));
2090     }
2091 
2092   __ BIND(L_failed);
2093     __ mv(x10, -1);
2094     __ leave();   // required for proper stackwalking of RuntimeStub frame
2095     __ ret();
2096 
2097     return start;
2098   }
2099 
2100   //
2101   // Generate stub for array fill. If "aligned" is true, the
2102   // "to" address is assumed to be heapword aligned.
2103   //
2104   // Arguments for generated stub:
2105   //   to:    c_rarg0
2106   //   value: c_rarg1
2107   //   count: c_rarg2 treated as signed
2108   //
       // Generator argument:
       //   stub_id - one of the six {,arrayof_}j{byte,short,int}_fill ids; it
       //             selects the element type and, for the arrayof ids, the
       //             "aligned" variant that skips the alignment prologue.
       //
       // Returns the start address of the generated stub.
       //
       // NOTE(review): the short-array test below uses an unsigned compare
       // (bltu), so a negative count would not take the element-wise path -
       // confirm that callers never pass one.
       //
2109   address generate_fill(StubId stub_id) {
2110     BasicType t;
2111     bool aligned;
2112 
2113     switch (stub_id) {
2114     case StubId::stubgen_jbyte_fill_id:
2115       t = T_BYTE;
2116       aligned = false;
2117       break;
2118     case StubId::stubgen_jshort_fill_id:
2119       t = T_SHORT;
2120       aligned = false;
2121       break;
2122     case StubId::stubgen_jint_fill_id:
2123       t = T_INT;
2124       aligned = false;
2125       break;
2126     case StubId::stubgen_arrayof_jbyte_fill_id:
2127       t = T_BYTE;
2128       aligned = true;
2129       break;
2130     case StubId::stubgen_arrayof_jshort_fill_id:
2131       t = T_SHORT;
2132       aligned = true;
2133       break;
2134     case StubId::stubgen_arrayof_jint_fill_id:
2135       t = T_INT;
2136       aligned = true;
2137       break;
2138     default:
2139       ShouldNotReachHere();
2140     };
2141 
2142     __ align(CodeEntryAlignment);
2143     StubCodeMark mark(this, stub_id);
2144     address start = __ pc();
2145 
2146     BLOCK_COMMENT("Entry:");
2147 
2148     const Register to        = c_rarg0;  // destination array address
2149     const Register value     = c_rarg1;  // value
2150     const Register count     = c_rarg2;  // elements count
2151 
2152     const Register bz_base   = x28;      // base for block_zero routine; not referenced below
2153     const Register cnt_words = x29;      // temp register
2154     const Register tmp_reg   = t1;
2155 
2156     __ enter();
2157 
2158     Label L_fill_elements;
2159 
         // shift is log2 of the element size in bytes. count is in elements, so
         // (8 >> shift) is the number of elements that make up 8 bytes.
2160     int shift = -1;
2161     switch (t) {
2162       case T_BYTE:
2163         shift = 0;
2164         // Short arrays (< 8 bytes) fill by element
2165         __ mv(tmp_reg, 8 >> shift);
2166         __ bltu(count, tmp_reg, L_fill_elements);
2167 
2168         // Zero extend value
2169         // 8 bit -> 16 bit
2170         __ zext(value, value, 8);
2171         __ slli(tmp_reg, value, 8);
2172         __ orr(value, value, tmp_reg);
2173 
2174         // 16 bit -> 32 bit
2175         __ slli(tmp_reg, value, 16);
2176         __ orr(value, value, tmp_reg);
2177         break;
2178       case T_SHORT:
2179         shift = 1;
2180         // Short arrays (< 8 bytes) fill by element
2181         __ mv(tmp_reg, 8 >> shift);
2182         __ bltu(count, tmp_reg, L_fill_elements);
2183 
2184         // Zero extend value
2185         // 16 bit -> 32 bit
2186         __ zext(value, value, 16);
2187         __ slli(tmp_reg, value, 16);
2188         __ orr(value, value, tmp_reg);
2189         break;
2190       case T_INT:
2191         shift = 2;
2192         // Short arrays (< 8 bytes) fill by element
2193         __ mv(tmp_reg, 8 >> shift);
2194         __ bltu(count, tmp_reg, L_fill_elements);
2195         break;
2196       default: ShouldNotReachHere();
2197     }
2198 
2199     // Align destination address at 8 bytes address boundary.
         // The cases fall through on purpose: each store fixes one address bit
         // (1, then 2, then 4 bytes) and a wider element type enters further
         // down. count is reduced by the number of elements each store covers.
2200     Label L_skip_align1, L_skip_align2, L_skip_align4;
2201     if (!aligned) {
2202       switch (t) {
2203         case T_BYTE:
2204           // One byte misalignment happens only for byte arrays.
2205           __ test_bit(tmp_reg, to, 0);
2206           __ beqz(tmp_reg, L_skip_align1);
2207           __ sb(value, Address(to, 0));
2208           __ addi(to, to, 1);
2209           __ subiw(count, count, 1);
2210           __ bind(L_skip_align1);
2211           // Fallthrough
2212         case T_SHORT:
2213           // Two bytes misalignment happens only for byte and short (char) arrays.
2214           __ test_bit(tmp_reg, to, 1);
2215           __ beqz(tmp_reg, L_skip_align2);
2216           __ sh(value, Address(to, 0));
2217           __ addi(to, to, 2);
2218           __ subiw(count, count, 2 >> shift);
2219           __ bind(L_skip_align2);
2220           // Fallthrough
2221         case T_INT:
2222           // Align to 8 bytes, we know we are 4 byte aligned to start.
2223           __ test_bit(tmp_reg, to, 2);
2224           __ beqz(tmp_reg, L_skip_align4);
2225           __ sw(value, Address(to, 0));
2226           __ addi(to, to, 4);
2227           __ subiw(count, count, 4 >> shift);
2228           __ bind(L_skip_align4);
2229           break;
2230         default: ShouldNotReachHere();
2231       }
2232     }
2233 
2234     //
2235     //  Fill large chunks
2236     //
2237     __ srliw(cnt_words, count, 3 - shift); // number of words
2238 
2239     // 32 bit -> 64 bit
2240     __ zext(value, value, 32);
2241     __ slli(tmp_reg, value, 32);
2242     __ orr(value, value, tmp_reg);
2243 
         // count -= cnt_words << (3 - shift): the elements left over once the
         // whole 8-byte words have been accounted for.
2244     __ slli(tmp_reg, cnt_words, 3 - shift);
2245     __ subw(count, count, tmp_reg);
2246     {
2247       __ fill_words(to, cnt_words, value);
2248     }
2249 
2250     // Handle copies less than 8 bytes.
2251     // Address may not be heapword aligned.
         // This tail is reached both after the word fill and directly from the
         // short-array branches above (where value has not been replicated).
         // Each set bit of the remaining count selects one group of stores.
2252     Label L_fill_1, L_fill_2, L_exit;
2253     __ bind(L_fill_elements);
2254     switch (t) {
2255       case T_BYTE:
             // bit 2 of count: four bytes
2256         __ test_bit(tmp_reg, count, 2);
2257         __ beqz(tmp_reg, L_fill_2);
2258         __ sb(value, Address(to, 0));
2259         __ sb(value, Address(to, 1));
2260         __ sb(value, Address(to, 2));
2261         __ sb(value, Address(to, 3));
2262         __ addi(to, to, 4);
2263 
             // bit 1 of count: two bytes
2264         __ bind(L_fill_2);
2265         __ test_bit(tmp_reg, count, 1);
2266         __ beqz(tmp_reg, L_fill_1);
2267         __ sb(value, Address(to, 0));
2268         __ sb(value, Address(to, 1));
2269         __ addi(to, to, 2);
2270 
             // bit 0 of count: one byte
2271         __ bind(L_fill_1);
2272         __ test_bit(tmp_reg, count, 0);
2273         __ beqz(tmp_reg, L_exit);
2274         __ sb(value, Address(to, 0));
2275         break;
2276       case T_SHORT:
             // bit 1 of count: two shorts
2277         __ test_bit(tmp_reg, count, 1);
2278         __ beqz(tmp_reg, L_fill_2);
2279         __ sh(value, Address(to, 0));
2280         __ sh(value, Address(to, 2));
2281         __ addi(to, to, 4);
2282 
             // bit 0 of count: one short
2283         __ bind(L_fill_2);
2284         __ test_bit(tmp_reg, count, 0);
2285         __ beqz(tmp_reg, L_exit);
2286         __ sh(value, Address(to, 0));
2287         break;
2288       case T_INT:
             // Fewer than two ints remain here, so at most one store is needed.
2289         __ beqz(count, L_exit);
2290         __ sw(value, Address(to, 0));
2291         break;
2292       default: ShouldNotReachHere();
2293     }
2294     __ bind(L_exit);
2295     __ leave();
2296     __ ret();
2297 
2298     return start;
2299   }
2300 
2301   void generate_arraycopy_stubs() {
         // Generates the array copy, fill and unsafe set-memory stubs and
         // publishes their entry points in StubRoutines. The order of the
         // statements matters: each generator call may overwrite nopush_entry,
         // which is saved into its StubRoutines field immediately afterwards
         // and then handed to a later generator as a jump target.
         //
2302     // Some copy stubs publish a normal entry and then a 2nd 'fallback'
2303     // entry immediately following their stack push. This can be used
2304     // as a post-push branch target for compatible stubs when they
2305     // identify a special case that can be handled by the fallback
2306     // stub e.g. a disjoint copy stub may be used as a special case
2307     // fallback for its compatible conjoint copy stub.
2308     //
2309     // A no push entry is always returned in the following local and
2310     // then published by assigning to the appropriate entry field in
2311     // class StubRoutines. The entry value is then passed to the
2312     // generator for the compatible stub. That means the entry must be
2313     // listed when saving to/restoring from the AOT cache, ensuring
2314     // that the inter-stub jumps are noted at AOT-cache save and
2315     // relocated at AOT cache load.
2316     address nopush_entry = nullptr;
2317 
2318     // generate the common exit first so later stubs can rely on it if
2319     // they want an UnsafeMemoryAccess exit non-local to the stub
2320     StubRoutines::_unsafecopy_common_exit = generate_unsafecopy_common_error_exit();
2321     // register the stub as the default exit with class UnsafeMemoryAccess
2322     UnsafeMemoryAccess::set_common_exit_stub_pc(StubRoutines::_unsafecopy_common_exit);
2323 
2324     // generate and publish riscv-specific bulk copy routines first
2325     // so we can call them from other copy stubs
2326     StubRoutines::riscv::_copy_byte_f = generate_copy_longs(StubId::stubgen_copy_byte_f_id, c_rarg0, c_rarg1, t1);
2327     StubRoutines::riscv::_copy_byte_b = generate_copy_longs(StubId::stubgen_copy_byte_b_id, c_rarg0, c_rarg1, t1);
2328 
2329     StubRoutines::riscv::_zero_blocks = generate_zero_blocks();
2330 
2331     //*** jbyte
2332     // Always need aligned and unaligned versions
2333     StubRoutines::_jbyte_disjoint_arraycopy          = generate_disjoint_copy(StubId::stubgen_jbyte_disjoint_arraycopy_id, &nopush_entry);
2334     // disjoint nopush entry is needed by conjoint copy
2335     StubRoutines::_jbyte_disjoint_arraycopy_nopush  = nopush_entry;
2336     StubRoutines::_jbyte_arraycopy                   = generate_conjoint_copy(StubId::stubgen_jbyte_arraycopy_id, StubRoutines::_jbyte_disjoint_arraycopy_nopush, &nopush_entry);
2337     // conjoint nopush entry is needed by generic/unsafe copy
2338     StubRoutines::_jbyte_arraycopy_nopush = nopush_entry;
2339     StubRoutines::_arrayof_jbyte_disjoint_arraycopy  = generate_disjoint_copy(StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id, &nopush_entry);
2340     // disjoint arrayof nopush entry is needed by conjoint copy
2341     StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush  = nopush_entry;
2342     StubRoutines::_arrayof_jbyte_arraycopy           = generate_conjoint_copy(StubId::stubgen_arrayof_jbyte_arraycopy_id, StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush, nullptr);
2343 
2344     //*** jshort
2345     // Always need aligned and unaligned versions
2346     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_copy(StubId::stubgen_jshort_disjoint_arraycopy_id, &nopush_entry);
2347     // disjoint nopush entry is needed by conjoint copy
2348     StubRoutines::_jshort_disjoint_arraycopy_nopush  = nopush_entry;
2349     StubRoutines::_jshort_arraycopy                  = generate_conjoint_copy(StubId::stubgen_jshort_arraycopy_id, StubRoutines::_jshort_disjoint_arraycopy_nopush, &nopush_entry);
2350     // conjoint nopush entry is used by generic/unsafe copy
2351     StubRoutines::_jshort_arraycopy_nopush = nopush_entry;
2352     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id, &nopush_entry);
2353     // disjoint arrayof nopush entry is needed by conjoint copy
2354     StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush = nopush_entry;
2355     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jshort_arraycopy_id, StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush, nullptr);
2356 
2357     //*** jint
2358     // Aligned versions
2359     StubRoutines::_arrayof_jint_disjoint_arraycopy   = generate_disjoint_copy(StubId::stubgen_arrayof_jint_disjoint_arraycopy_id, &nopush_entry);
2360     // disjoint arrayof nopush entry is needed by conjoint copy
2361     StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush = nopush_entry;
2362     StubRoutines::_arrayof_jint_arraycopy            = generate_conjoint_copy(StubId::stubgen_arrayof_jint_arraycopy_id, StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush, nullptr);
2363     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2364     // entry_jint_arraycopy always points to the unaligned version
2365     StubRoutines::_jint_disjoint_arraycopy           = generate_disjoint_copy(StubId::stubgen_jint_disjoint_arraycopy_id, &nopush_entry);
2366     // disjoint nopush entry is needed by conjoint copy
2367     StubRoutines::_jint_disjoint_arraycopy_nopush  = nopush_entry;
2368     StubRoutines::_jint_arraycopy                  = generate_conjoint_copy(StubId::stubgen_jint_arraycopy_id, StubRoutines::_jint_disjoint_arraycopy_nopush, &nopush_entry);
2369     // conjoint nopush entry is needed by generic/unsafe copy
2370     StubRoutines::_jint_arraycopy_nopush = nopush_entry;
2371 
2372     //*** jlong
2373     // It is always aligned
2374     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id, &nopush_entry);
2375     // disjoint arrayof nopush entry is needed by conjoint copy
2376     StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush = nopush_entry;
2377     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jlong_arraycopy_id, StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush, &nopush_entry);
2378     // conjoint nopush entry is needed by generic/unsafe copy
2379     StubRoutines::_jlong_arraycopy_nopush = nopush_entry;
2380     // disjoint normal/nopush and conjoint normal entries are not
2381     // generated since the arrayof versions are the same
2382     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2383     StubRoutines::_jlong_disjoint_arraycopy_nopush = StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush;
2384     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2385 
2386     //*** oops
2387     StubRoutines::_arrayof_oop_disjoint_arraycopy
2388       = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_id, &nopush_entry);
2389     // disjoint arrayof nopush entry is needed by conjoint copy
2390     StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush = nopush_entry;
2391     StubRoutines::_arrayof_oop_arraycopy
2392       = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush, &nopush_entry);
2393     // conjoint arrayof nopush entry is needed by generic copy
2394     StubRoutines::_oop_arraycopy_nopush = nopush_entry;
2395     // Aligned versions without pre-barriers
2396     StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2397       = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id, &nopush_entry);
2398     // disjoint arrayof+uninit nopush entry is needed by conjoint copy
2399     StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush = nopush_entry;
2400 
2401     // note that we don't need a returned nopush entry because the
2402     // generic/unsafe copy does not cater for uninit arrays.
2403     StubRoutines::_arrayof_oop_arraycopy_uninit
2404       = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_uninit_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush, nullptr);
2405 
2406     // for oop copies reuse arrayof entries for non-arrayof cases
2407     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2408     StubRoutines::_oop_disjoint_arraycopy_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush;
2409     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2410     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2411     StubRoutines::_oop_disjoint_arraycopy_uninit_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush;
2412     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2413 
2414     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_id, &nopush_entry);
2415     // checkcast nopush entry is needed by generic copy
2416     StubRoutines::_checkcast_arraycopy_nopush = nopush_entry;
2417     // note that we don't need a returned nopush entry because the
2418     // generic copy does not cater for uninit arrays.
2419     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_uninit_id, nullptr);
2420 
2421 
2422     // unsafe arraycopy may fallback on conjoint stubs
2423     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy(StubRoutines::_jbyte_arraycopy_nopush,
2424                                                               StubRoutines::_jshort_arraycopy_nopush,
2425                                                               StubRoutines::_jint_arraycopy_nopush,
2426                                                               StubRoutines::_jlong_arraycopy_nopush);
2427 
2428     // generic arraycopy may fallback on conjoint stubs
2429     StubRoutines::_generic_arraycopy   = generate_generic_copy(StubRoutines::_jbyte_arraycopy_nopush,
2430                                                                StubRoutines::_jshort_arraycopy_nopush,
2431                                                                StubRoutines::_jint_arraycopy_nopush,
2432                                                                StubRoutines::_oop_arraycopy_nopush,
2433                                                                StubRoutines::_jlong_arraycopy_nopush,
2434                                                                StubRoutines::_checkcast_arraycopy_nopush);
2435 
         // Fill stubs: one per element type, in unaligned and arrayof (aligned)
         // flavours.
2436     StubRoutines::_jbyte_fill = generate_fill(StubId::stubgen_jbyte_fill_id);
2437     StubRoutines::_jshort_fill = generate_fill(StubId::stubgen_jshort_fill_id);
2438     StubRoutines::_jint_fill = generate_fill(StubId::stubgen_jint_fill_id);
2439     StubRoutines::_arrayof_jbyte_fill = generate_fill(StubId::stubgen_arrayof_jbyte_fill_id);
2440     StubRoutines::_arrayof_jshort_fill = generate_fill(StubId::stubgen_arrayof_jshort_fill_id);
2441     StubRoutines::_arrayof_jint_fill = generate_fill(StubId::stubgen_arrayof_jint_fill_id);
2442 
2443     StubRoutines::_unsafe_setmemory    = generate_unsafe_setmemory();
2444   }
2445 
2446   void aes_load_keys(const Register &key, VectorRegister *working_vregs, int rounds) {
2447     const int step = 16;
2448     for (int i = 0; i < rounds; i++) {
2449       __ vle32_v(working_vregs[i], key);
2450       // The keys are stored in little-endian array, while we need
2451       // to operate in big-endian.
2452       // So performing an endian-swap here with vrev8.v instruction
2453       __ vrev8_v(working_vregs[i], working_vregs[i]);
2454       __ addi(key, key, step);
2455     }
2456   }
2457 
2458   void aes_encrypt(const VectorRegister &res, VectorRegister *working_vregs, int rounds) {
2459     assert(rounds <= 15, "rounds should be less than or equal to working_vregs size");
2460 
2461     __ vxor_vv(res, res, working_vregs[0]);
2462     for (int i = 1; i < rounds - 1; i++) {
2463       __ vaesem_vv(res, working_vregs[i]);
2464     }
2465     __ vaesef_vv(res, working_vregs[rounds - 1]);
2466   }
2467 
2468   // Arguments:
2469   //
2470   // Inputs:
2471   //   c_rarg0   - source byte array address
2472   //   c_rarg1   - destination byte array address
2473   //   c_rarg2   - sessionKe (key) in little endian int array
2474   //
2475   address generate_aescrypt_encryptBlock() {
2476     assert(UseAESIntrinsics, "need AES instructions (Zvkned extension) support");
2477 
2478     __ align(CodeEntryAlignment);
2479     StubId stub_id = StubId::stubgen_aescrypt_encryptBlock_id;
2480     StubCodeMark mark(this, stub_id);
2481 
2482     Label L_aes128, L_aes192;
2483 
2484     const Register from        = c_rarg0;  // source array address
2485     const Register to          = c_rarg1;  // destination array address
2486     const Register key         = c_rarg2;  // key array address
2487     const Register keylen      = c_rarg3;
2488 
2489     VectorRegister working_vregs[] = {
2490       v4, v5, v6, v7, v8, v9, v10, v11,
2491       v12, v13, v14, v15, v16, v17, v18
2492     };
2493     const VectorRegister res   = v19;
2494 
2495     address start = __ pc();
2496     __ enter();
2497 
2498     __ lwu(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2499 
2500     __ vsetivli(x0, 4, Assembler::e32, Assembler::m1);
2501     __ vle32_v(res, from);
2502 
2503     __ mv(t2, 52); // key length could be only {11, 13, 15} * 4 = {44, 52, 60}
2504     __ bltu(keylen, t2, L_aes128);
2505     __ beq(keylen, t2, L_aes192);
2506     // Else we fallthrough to the biggest case (256-bit key size)
2507 
2508     // Note: the following function performs key += 15*16
2509     aes_load_keys(key, working_vregs, 15);
2510     aes_encrypt(res, working_vregs, 15);
2511     __ vse32_v(res, to);
2512     __ mv(c_rarg0, 0);
2513     __ leave();
2514     __ ret();
2515 
2516   __ bind(L_aes192);
2517     // Note: the following function performs key += 13*16
2518     aes_load_keys(key, working_vregs, 13);
2519     aes_encrypt(res, working_vregs, 13);
2520     __ vse32_v(res, to);
2521     __ mv(c_rarg0, 0);
2522     __ leave();
2523     __ ret();
2524 
2525   __ bind(L_aes128);
2526     // Note: the following function performs key += 11*16
2527     aes_load_keys(key, working_vregs, 11);
2528     aes_encrypt(res, working_vregs, 11);
2529     __ vse32_v(res, to);
2530     __ mv(c_rarg0, 0);
2531     __ leave();
2532     __ ret();
2533 
2534     return start;
2535   }
2536 
2537   void aes_decrypt(const VectorRegister &res, VectorRegister *working_vregs, int rounds) {
2538     assert(rounds <= 15, "rounds should be less than or equal to working_vregs size");
2539 
2540     __ vxor_vv(res, res, working_vregs[rounds - 1]);
2541     for (int i = rounds - 2; i > 0; i--) {
2542       __ vaesdm_vv(res, working_vregs[i]);
2543     }
2544     __ vaesdf_vv(res, working_vregs[0]);
2545   }
2546 
2547   // Arguments:
2548   //
2549   // Inputs:
2550   //   c_rarg0   - source byte array address
2551   //   c_rarg1   - destination byte array address
2552   //   c_rarg2   - sessionKe (key) in little endian int array
2553   //
2554   address generate_aescrypt_decryptBlock() {
2555     assert(UseAESIntrinsics, "need AES instructions (Zvkned extension) support");
2556 
2557     __ align(CodeEntryAlignment);
2558     StubId stub_id = StubId::stubgen_aescrypt_decryptBlock_id;
2559     StubCodeMark mark(this, stub_id);
2560 
2561     Label L_aes128, L_aes192;
2562 
2563     const Register from        = c_rarg0;  // source array address
2564     const Register to          = c_rarg1;  // destination array address
2565     const Register key         = c_rarg2;  // key array address
2566     const Register keylen      = c_rarg3;
2567 
2568     VectorRegister working_vregs[] = {
2569       v4, v5, v6, v7, v8, v9, v10, v11,
2570       v12, v13, v14, v15, v16, v17, v18
2571     };
2572     const VectorRegister res   = v19;
2573 
2574     address start = __ pc();
2575     __ enter(); // required for proper stackwalking of RuntimeStub frame
2576 
2577     __ lwu(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2578 
2579     __ vsetivli(x0, 4, Assembler::e32, Assembler::m1);
2580     __ vle32_v(res, from);
2581 
2582     __ mv(t2, 52); // key length could be only {11, 13, 15} * 4 = {44, 52, 60}
2583     __ bltu(keylen, t2, L_aes128);
2584     __ beq(keylen, t2, L_aes192);
2585     // Else we fallthrough to the biggest case (256-bit key size)
2586 
2587     // Note: the following function performs key += 15*16
2588     aes_load_keys(key, working_vregs, 15);
2589     aes_decrypt(res, working_vregs, 15);
2590     __ vse32_v(res, to);
2591     __ mv(c_rarg0, 0);
2592     __ leave();
2593     __ ret();
2594 
2595   __ bind(L_aes192);
2596     // Note: the following function performs key += 13*16
2597     aes_load_keys(key, working_vregs, 13);
2598     aes_decrypt(res, working_vregs, 13);
2599     __ vse32_v(res, to);
2600     __ mv(c_rarg0, 0);
2601     __ leave();
2602     __ ret();
2603 
2604   __ bind(L_aes128);
2605     // Note: the following function performs key += 11*16
2606     aes_load_keys(key, working_vregs, 11);
2607     aes_decrypt(res, working_vregs, 11);
2608     __ vse32_v(res, to);
2609     __ mv(c_rarg0, 0);
2610     __ leave();
2611     __ ret();
2612 
2613     return start;
2614   }
2615 
2616   void cipherBlockChaining_encryptAESCrypt(int round, Register from, Register to, Register key,
2617                                            Register rvec, Register input_len) {
2618     const Register len = x29;
2619 
2620     VectorRegister working_vregs[] = {
2621       v1, v2, v3, v4, v5, v6, v7, v8,
2622       v9, v10, v11, v12, v13, v14, v15
2623     };
2624 
2625     const unsigned int BLOCK_SIZE = 16;
2626 
2627     __ mv(len, input_len);
2628     // load init rvec
2629     __ vsetivli(x0, 4, Assembler::e32, Assembler::m1);
2630     __ vle32_v(v16, rvec);
2631 
2632     aes_load_keys(key, working_vregs, round);
2633     Label L_enc_loop;
2634     __ bind(L_enc_loop);
2635     // Encrypt from source by block size
2636       __ vle32_v(v17, from);
2637       __ addi(from, from, BLOCK_SIZE);
2638       __ vxor_vv(v16, v16, v17);
2639       aes_encrypt(v16, working_vregs, round);
2640       __ vse32_v(v16, to);
2641       __ addi(to, to, BLOCK_SIZE);
2642       __ subi(len, len, BLOCK_SIZE);
2643       __ bnez(len, L_enc_loop);
2644 
2645     // save current rvec and return
2646     __ vse32_v(v16, rvec);
2647     __ mv(x10, input_len);
2648     __ leave();
2649     __ ret();
2650   }
2651 
2652   // Arguments:
2653   //
2654   // Inputs:
2655   //   c_rarg0   - source byte array address
2656   //   c_rarg1   - destination byte array address
2657   //   c_rarg2   - K (key) in little endian int array
2658   //   c_rarg3   - r vector byte array address
2659   //   c_rarg4   - input length
2660   //
2661   // Output:
2662   //   x10       - input length
2663   //
2664   address generate_cipherBlockChaining_encryptAESCrypt() {
2665     assert(UseAESIntrinsics, "need AES instructions (Zvkned extension) support");
2666     __ align(CodeEntryAlignment);
2667     StubId stub_id = StubId::stubgen_cipherBlockChaining_encryptAESCrypt_id;
2668     StubCodeMark mark(this, stub_id);
2669 
2670     const Register from       = c_rarg0;
2671     const Register to         = c_rarg1;
2672     const Register key        = c_rarg2;
2673     const Register rvec       = c_rarg3;
2674     const Register input_len  = c_rarg4;
2675 
2676     const Register keylen     = x28;
2677 
2678     address start = __ pc();
2679     __ enter();
2680 
2681     Label L_aes128, L_aes192;
2682     // Compute #rounds for AES based on the length of the key array
2683     __ lwu(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2684     __ mv(t0, 52);
2685     __ bltu(keylen, t0, L_aes128);
2686     __ beq(keylen, t0, L_aes192);
2687     // Else we fallthrough to the biggest case (256-bit key size)
2688 
2689     // Note: the following function performs key += 15*16
2690     cipherBlockChaining_encryptAESCrypt(15, from, to, key, rvec, input_len);
2691 
2692     // Note: the following function performs key += 11*16
2693     __ bind(L_aes128);
2694     cipherBlockChaining_encryptAESCrypt(11, from, to, key, rvec, input_len);
2695 
2696     // Note: the following function performs key += 13*16
2697     __ bind(L_aes192);
2698     cipherBlockChaining_encryptAESCrypt(13, from, to, key, rvec, input_len);
2699 
2700     return start;
2701   }
2702 
2703   void cipherBlockChaining_decryptAESCrypt(int round, Register from, Register to, Register key,
2704                                            Register rvec, Register input_len) {
2705     const Register len = x29;
2706 
2707     VectorRegister working_vregs[] = {
2708       v1, v2, v3, v4, v5, v6, v7, v8,
2709       v9, v10, v11, v12, v13, v14, v15
2710     };
2711 
2712     const unsigned int BLOCK_SIZE = 16;
2713 
2714     __ mv(len, input_len);
2715     // load init rvec
2716     __ vsetivli(x0, 4, Assembler::e32, Assembler::m1);
2717     __ vle32_v(v16, rvec);
2718 
2719     aes_load_keys(key, working_vregs, round);
2720     Label L_dec_loop;
2721     // Decrypt from source by block size
2722     __ bind(L_dec_loop);
2723       __ vle32_v(v17, from);
2724       __ addi(from, from, BLOCK_SIZE);
2725       __ vmv_v_v(v18, v17);
2726       aes_decrypt(v17, working_vregs, round);
2727       __ vxor_vv(v17, v17, v16);
2728       __ vse32_v(v17, to);
2729       __ vmv_v_v(v16, v18);
2730       __ addi(to, to, BLOCK_SIZE);
2731       __ subi(len, len, BLOCK_SIZE);
2732       __ bnez(len, L_dec_loop);
2733 
2734     // save current rvec and return
2735     __ vse32_v(v16, rvec);
2736     __ mv(x10, input_len);
2737     __ leave();
2738     __ ret();
2739   }
2740 
2741   // Arguments:
2742   //
2743   // Inputs:
2744   //   c_rarg0   - source byte array address
2745   //   c_rarg1   - destination byte array address
2746   //   c_rarg2   - K (key) in little endian int array
2747   //   c_rarg3   - r vector byte array address
2748   //   c_rarg4   - input length
2749   //
2750   // Output:
2751   //   x10       - input length
2752   //
2753   address generate_cipherBlockChaining_decryptAESCrypt() {
2754     assert(UseAESIntrinsics, "need AES instructions (Zvkned extension) support");
2755     __ align(CodeEntryAlignment);
2756     StubId stub_id = StubId::stubgen_cipherBlockChaining_decryptAESCrypt_id;
2757     StubCodeMark mark(this, stub_id);
2758 
2759     const Register from        = c_rarg0;
2760     const Register to          = c_rarg1;
2761     const Register key         = c_rarg2;
2762     const Register rvec        = c_rarg3;
2763     const Register input_len   = c_rarg4;
2764 
2765     const Register keylen      = x28;
2766 
2767     address start = __ pc();
2768     __ enter();
2769 
2770     Label L_aes128, L_aes192, L_aes128_loop, L_aes192_loop, L_aes256_loop;
2771     // Compute #rounds for AES based on the length of the key array
2772     __ lwu(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2773     __ mv(t0, 52);
2774     __ bltu(keylen, t0, L_aes128);
2775     __ beq(keylen, t0, L_aes192);
2776     // Else we fallthrough to the biggest case (256-bit key size)
2777 
2778     // Note: the following function performs key += 15*16
2779     cipherBlockChaining_decryptAESCrypt(15, from, to, key, rvec, input_len);
2780 
2781     // Note: the following function performs key += 11*16
2782     __ bind(L_aes128);
2783     cipherBlockChaining_decryptAESCrypt(11, from, to, key, rvec, input_len);
2784 
2785     // Note: the following function performs key += 13*16
2786     __ bind(L_aes192);
2787     cipherBlockChaining_decryptAESCrypt(13, from, to, key, rvec, input_len);
2788 
2789     return start;
2790   }
2791 
2792   // Load big-endian 128-bit from memory.
2793   void be_load_counter_128(Register counter_hi, Register counter_lo, Register counter) {
2794     __ ld(counter_lo, Address(counter, 8)); // Load 128-bits from counter
2795     __ ld(counter_hi, Address(counter));
2796     __ rev8(counter_lo, counter_lo);        // Convert big-endian to little-endian
2797     __ rev8(counter_hi, counter_hi);
2798   }
2799 
2800   // Little-endian 128-bit + 64-bit -> 128-bit addition.
2801   void add_counter_128(Register counter_hi, Register counter_lo) {
2802     assert_different_registers(counter_hi, counter_lo, t0);
2803     __ addi(counter_lo, counter_lo, 1);
2804     __ seqz(t0, counter_lo);                // Check for result overflow
2805     __ add(counter_hi, counter_hi, t0);     // Add 1 if overflow otherwise 0
2806   }
2807 
2808   // Store big-endian 128-bit to memory.
2809   void be_store_counter_128(Register counter_hi, Register counter_lo, Register counter) {
2810     assert_different_registers(counter_hi, counter_lo, t0, t1);
2811     __ rev8(t0, counter_lo);                // Convert little-endian to big-endian
2812     __ rev8(t1, counter_hi);
2813     __ sd(t0, Address(counter, 8));         // Store 128-bits to counter
2814     __ sd(t1, Address(counter));
2815   }
2816 
2817   void counterMode_AESCrypt(int round, Register in, Register out, Register key, Register counter,
2818                             Register input_len,  Register saved_encrypted_ctr, Register used_ptr) {
2819     // Algorithm:
2820     //
2821     //   aes_load_keys();
2822     //   load_counter_128(counter_hi, counter_lo, counter);
2823     //
2824     //   L_next:
2825     //     if (used >= BLOCK_SIZE) goto L_main_loop;
2826     //
2827     //   L_encrypt_next:
2828     //       *out = *in ^ saved_encrypted_ctr[used]);
2829     //       out++; in++; used++; len--;
2830     //       if (len == 0) goto L_exit;
2831     //       goto L_next;
2832     //
2833     //   L_main_loop:
2834     //     if (len == 0) goto L_exit;
2835     //     saved_encrypted_ctr = aes_encrypt(counter);
2836     //
2837     //     add_counter_128(counter_hi, counter_lo);
2838     //     be_store_counter_128(counter_hi, counter_lo, counter);
2839     //     used = 0;
2840     //
2841     //     if(len < BLOCK_SIZE) goto L_encrypt_next;
2842     //
2843     //     v_in = load_16Byte(in);
2844     //     v_out = load_16Byte(out);
2845     //     v_saved_encrypted_ctr = load_16Byte(saved_encrypted_ctr);
2846     //     v_out = v_in ^ v_saved_encrypted_ctr;
2847     //     out += BLOCK_SIZE;
2848     //     in += BLOCK_SIZE;
2849     //     len -= BLOCK_SIZE;
2850     //     used = BLOCK_SIZE;
2851     //     goto L_main_loop;
2852     //
2853     //
2854     //   L_exit:
2855     //     store(used);
2856     //     result = input_len
2857     //     return result;
2858 
2859     const Register used          = x28;
2860     const Register len           = x29;
2861     const Register counter_hi    = x30;
2862     const Register counter_lo    = x31;
2863     const Register block_size    = t2;
2864 
2865     const unsigned int BLOCK_SIZE = 16;
2866 
2867     VectorRegister working_vregs[] = {
2868       v1, v2, v3, v4, v5, v6, v7, v8,
2869       v9, v10, v11, v12, v13, v14, v15
2870     };
2871 
2872     __ vsetivli(x0, 4, Assembler::e32, Assembler::m1);
2873 
2874     __ lwu(used, Address(used_ptr));
2875     __ mv(len, input_len);
2876     __ mv(block_size, BLOCK_SIZE);
2877 
2878     // load keys to working_vregs according to round
2879     aes_load_keys(key, working_vregs, round);
2880 
2881     // 128-bit big-endian load
2882     be_load_counter_128(counter_hi, counter_lo, counter);
2883 
2884     Label L_next, L_encrypt_next, L_main_loop, L_exit;
2885     // Check the last saved_encrypted_ctr used value, we fall through
2886     // to L_encrypt_next when the used value lower than block_size
2887     __ bind(L_next);
2888     __ bgeu(used, block_size, L_main_loop);
2889 
2890     // There is still data left fewer than block_size after L_main_loop
2891     // or last used, we encrypt them one by one.
2892     __ bind(L_encrypt_next);
2893     __ add(t0, saved_encrypted_ctr, used);
2894     __ lbu(t1, Address(t0));
2895     __ lbu(t0, Address(in));
2896     __ xorr(t1, t1, t0);
2897     __ sb(t1, Address(out));
2898     __ addi(in, in, 1);
2899     __ addi(out, out, 1);
2900     __ addi(used, used, 1);
2901     __ subi(len, len, 1);
2902     __ beqz(len, L_exit);
2903     __ j(L_next);
2904 
2905     // We will calculate the next saved_encrypted_ctr and encrypt the blocks of data
2906     // one by one until there is less than a full block remaining if len not zero
2907     __ bind(L_main_loop);
2908     __ beqz(len, L_exit);
2909     __ vle32_v(v16, counter);
2910 
2911     // encrypt counter according to round
2912     aes_encrypt(v16, working_vregs, round);
2913 
2914     __ vse32_v(v16, saved_encrypted_ctr);
2915 
2916     // 128-bit little-endian increment
2917     add_counter_128(counter_hi, counter_lo);
2918     // 128-bit big-endian store
2919     be_store_counter_128(counter_hi, counter_lo, counter);
2920 
2921     __ mv(used, 0);
2922     // Check if we have a full block_size
2923     __ bltu(len, block_size, L_encrypt_next);
2924 
2925     // We have one full block to encrypt at least
2926     __ vle32_v(v17, in);
2927     __ vxor_vv(v16, v16, v17);
2928     __ vse32_v(v16, out);
2929     __ add(out, out, block_size);
2930     __ add(in, in, block_size);
2931     __ sub(len, len, block_size);
2932     __ mv(used, block_size);
2933     __ j(L_main_loop);
2934 
2935     __ bind(L_exit);
2936     __ sw(used, Address(used_ptr));
2937     __ mv(x10, input_len);
2938     __ leave();
2939     __ ret();
2940   };
2941 
2942   // CTR AES crypt.
2943   // Arguments:
2944   //
2945   // Inputs:
2946   //   c_rarg0   - source byte array address
2947   //   c_rarg1   - destination byte array address
2948   //   c_rarg2   - K (key) in little endian int array
2949   //   c_rarg3   - counter vector byte array address
2950   //   c_rarg4   - input length
2951   //   c_rarg5   - saved encryptedCounter start
2952   //   c_rarg6   - saved used length
2953   //
2954   // Output:
2955   //   x10       - input length
2956   //
2957   address generate_counterMode_AESCrypt() {
2958     assert(UseAESCTRIntrinsics, "need AES instructions (Zvkned extension) and Zbb extension support");
2959 
2960     __ align(CodeEntryAlignment);
2961     StubId stub_id = StubId::stubgen_counterMode_AESCrypt_id;
2962     StubCodeMark mark(this, stub_id);
2963 
2964     const Register in                  = c_rarg0;
2965     const Register out                 = c_rarg1;
2966     const Register key                 = c_rarg2;
2967     const Register counter             = c_rarg3;
2968     const Register input_len           = c_rarg4;
2969     const Register saved_encrypted_ctr = c_rarg5;
2970     const Register used_len_ptr        = c_rarg6;
2971 
2972     const Register keylen              = c_rarg7; // temporary register
2973 
2974     const address start = __ pc();
2975     __ enter();
2976 
2977     Label L_exit;
2978     __ beqz(input_len, L_exit);
2979 
2980     Label L_aes128, L_aes192;
2981     // Compute #rounds for AES based on the length of the key array
2982     __ lwu(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2983     __ mv(t0, 52); // key length could be only {11, 13, 15} * 4 = {44, 52, 60}
2984     __ bltu(keylen, t0, L_aes128);
2985     __ beq(keylen, t0, L_aes192);
2986     // Else we fallthrough to the biggest case (256-bit key size)
2987 
2988     // Note: the following function performs crypt with key += 15*16
2989     counterMode_AESCrypt(15, in, out, key, counter, input_len, saved_encrypted_ctr, used_len_ptr);
2990 
2991     // Note: the following function performs crypt with key += 13*16
2992     __ bind(L_aes192);
2993     counterMode_AESCrypt(13, in, out, key, counter, input_len, saved_encrypted_ctr, used_len_ptr);
2994 
2995     // Note: the following function performs crypt with key += 11*16
2996     __ bind(L_aes128);
2997     counterMode_AESCrypt(11, in, out, key, counter, input_len, saved_encrypted_ctr, used_len_ptr);
2998 
2999     __ bind(L_exit);
3000     __ mv(x10, input_len);
3001     __ leave();
3002     __ ret();
3003 
3004     return start;
3005   }
3006 
3007   void ghash_loop(Register state, Register subkeyH, Register data, Register blocks,
3008                   VectorRegister vtmp1, VectorRegister vtmp2, VectorRegister vtmp3) {
3009     VectorRegister partial_hash = vtmp1;
3010     VectorRegister hash_subkey  = vtmp2;
3011     VectorRegister cipher_text  = vtmp3;
3012 
3013     const unsigned int BLOCK_SIZE = 16;
3014 
3015     __ vsetivli(x0, 2, Assembler::e64, Assembler::m1);
3016     __ vle64_v(hash_subkey, subkeyH);
3017     __ vrev8_v(hash_subkey, hash_subkey);
3018     __ vle64_v(partial_hash, state);
3019     __ vrev8_v(partial_hash, partial_hash);
3020 
3021     __ vsetivli(x0, 4, Assembler::e32, Assembler::m1);
3022     Label L_ghash_loop;
3023     __ bind(L_ghash_loop);
3024       __ vle32_v(cipher_text, data);
3025       __ addi(data, data, BLOCK_SIZE);
3026       __ vghsh_vv(partial_hash, hash_subkey, cipher_text);
3027       __ subi(blocks, blocks, 1);
3028       __ bnez(blocks, L_ghash_loop);
3029 
3030     __ vsetivli(x0, 2, Assembler::e64, Assembler::m1);
3031     __ vrev8_v(partial_hash, partial_hash);
3032     __ vse64_v(partial_hash, state);
3033   }
3034 
3035   /**
3036    *  Arguments:
3037    *
3038    *  Input:
3039    *  c_rarg0   - current state address
3040    *  c_rarg1   - H key address
3041    *  c_rarg2   - data address
3042    *  c_rarg3   - number of blocks
3043    *
3044    *  Output:
3045    *  Updated state at c_rarg0
3046    */
3047   address generate_ghash_processBlocks() {
3048     assert(UseGHASHIntrinsics, "need GHASH instructions (Zvkg extension) and Zvbb support");
3049 
3050     __ align(CodeEntryAlignment);
3051     StubId stub_id = StubId::stubgen_ghash_processBlocks_id;
3052     StubCodeMark mark(this, stub_id);
3053 
3054     address start = __ pc();
3055     __ enter();
3056 
3057     Register state   = c_rarg0;
3058     Register subkeyH = c_rarg1;
3059     Register data    = c_rarg2;
3060     Register blocks  = c_rarg3;
3061 
3062     VectorRegister vtmp1 = v1;
3063     VectorRegister vtmp2 = v2;
3064     VectorRegister vtmp3 = v3;
3065 
3066     ghash_loop(state, subkeyH, data, blocks, vtmp1, vtmp2, vtmp3);
3067 
3068     __ leave();
3069     __ ret();
3070 
3071     return start;
3072   }
3073 
3074   // code for comparing 8 characters of strings with Latin1 and Utf16 encoding
3075   void compare_string_8_x_LU(Register tmpL, Register tmpU,
3076                              Register strL, Register strU, Label& DIFF) {
3077     const Register tmp = x30, tmpLval = x12;
3078 
3079     int base_offset = arrayOopDesc::base_offset_in_bytes(T_BYTE);
3080     assert((base_offset % (UseCompactObjectHeaders ? 4 : 8)) == 0, "Must be");
3081 
3082 #ifdef ASSERT
3083     if (AvoidUnalignedAccesses) {
3084       Label align_ok;
3085       __ andi(t0, strL, 0x7);
3086       __ beqz(t0, align_ok);
3087       __ stop("bad alignment");
3088       __ bind(align_ok);
3089     }
3090 #endif
3091     __ ld(tmpLval, Address(strL));
3092     __ addi(strL, strL, wordSize);
3093 
3094     // compare first 4 characters
3095     __ load_long_misaligned(tmpU, Address(strU), tmp, (base_offset % 8) != 0 ? 4 : 8);
3096     __ addi(strU, strU, wordSize);
3097     __ inflate_lo32(tmpL, tmpLval);
3098     __ xorr(tmp, tmpU, tmpL);
3099     __ bnez(tmp, DIFF);
3100 
3101     // compare second 4 characters
3102     __ load_long_misaligned(tmpU, Address(strU), tmp, (base_offset % 8) != 0 ? 4 : 8);
3103     __ addi(strU, strU, wordSize);
3104     __ inflate_hi32(tmpL, tmpLval);
3105     __ xorr(tmp, tmpU, tmpL);
3106     __ bnez(tmp, DIFF);
3107   }
3108 
3109   // x10  = result
3110   // x11  = str1
3111   // x12  = cnt1
3112   // x13  = str2
3113   // x14  = cnt2
3114   // x28  = tmp1
3115   // x29  = tmp2
3116   // x30  = tmp3
3117   address generate_compare_long_string_different_encoding(StubId stub_id) {
3118     bool isLU;
3119     switch (stub_id) {
3120     case StubId::stubgen_compare_long_string_LU_id:
3121       isLU = true;
3122       break;
3123     case StubId::stubgen_compare_long_string_UL_id:
3124       isLU = false;
3125       break;
3126     default:
3127       ShouldNotReachHere();
3128     };
3129     __ align(CodeEntryAlignment);
3130     StubCodeMark mark(this, stub_id);
3131     address entry = __ pc();
3132     Label SMALL_LOOP, TAIL, LOAD_LAST, DONE, CALCULATE_DIFFERENCE;
3133     const Register result = x10, str1 = x11, str2 = x13, cnt2 = x14,
3134                    tmp1 = x28, tmp2 = x29, tmp3 = x30, tmp4 = x12;
3135 
3136     int base_offset = arrayOopDesc::base_offset_in_bytes(T_BYTE);
3137     assert((base_offset % (UseCompactObjectHeaders ? 4 : 8)) == 0, "Must be");
3138 
3139     Register strU = isLU ? str2 : str1,
3140              strL = isLU ? str1 : str2,
3141              tmpU = isLU ? tmp2 : tmp1, // where to keep U for comparison
3142              tmpL = isLU ? tmp1 : tmp2; // where to keep L for comparison
3143 
3144     if (AvoidUnalignedAccesses && (base_offset % 8) != 0) {
3145       // Load 4 bytes from strL to make sure main loop is 8-byte aligned
3146       // cnt2 is >= 68 here, no need to check it for >= 0
3147       __ lwu(tmpL, Address(strL));
3148       __ addi(strL, strL, wordSize / 2);
3149       __ load_long_misaligned(tmpU, Address(strU), tmp4, (base_offset % 8) != 0 ? 4 : 8);
3150       __ addi(strU, strU, wordSize);
3151       __ inflate_lo32(tmp3, tmpL);
3152       __ mv(tmpL, tmp3);
3153       __ xorr(tmp3, tmpU, tmpL);
3154       __ bnez(tmp3, CALCULATE_DIFFERENCE);
3155       __ subi(cnt2, cnt2, wordSize / 2);
3156     }
3157 
3158     // we are now 8-bytes aligned on strL when AvoidUnalignedAccesses is true
3159     __ subi(cnt2, cnt2, wordSize * 2);
3160     __ bltz(cnt2, TAIL);
3161     __ bind(SMALL_LOOP); // smaller loop
3162       __ subi(cnt2, cnt2, wordSize * 2);
3163       compare_string_8_x_LU(tmpL, tmpU, strL, strU, CALCULATE_DIFFERENCE);
3164       compare_string_8_x_LU(tmpL, tmpU, strL, strU, CALCULATE_DIFFERENCE);
3165       __ bgez(cnt2, SMALL_LOOP);
3166       __ addi(t0, cnt2, wordSize * 2);
3167       __ beqz(t0, DONE);
3168     __ bind(TAIL);  // 1..15 characters left
3169       // Aligned access. Load bytes in portions - 4, 2, 1.
3170 
3171       __ addi(t0, cnt2, wordSize);
3172       __ addi(cnt2, cnt2, wordSize * 2); // amount of characters left to process
3173       __ bltz(t0, LOAD_LAST);
3174       // remaining characters are greater than or equals to 8, we can do one compare_string_8_x_LU
3175       compare_string_8_x_LU(tmpL, tmpU, strL, strU, CALCULATE_DIFFERENCE);
3176       __ subi(cnt2, cnt2, wordSize);
3177       __ beqz(cnt2, DONE);  // no character left
3178       __ bind(LOAD_LAST);   // cnt2 = 1..7 characters left
3179 
3180       __ subi(cnt2, cnt2, wordSize); // cnt2 is now an offset in strL which points to last 8 bytes
3181       __ slli(t0, cnt2, 1);     // t0 is now an offset in strU which points to last 16 bytes
3182       __ add(strL, strL, cnt2); // Address of last 8 bytes in Latin1 string
3183       __ add(strU, strU, t0);   // Address of last 16 bytes in UTF-16 string
3184       __ load_int_misaligned(tmpL, Address(strL), t0, false);
3185       __ load_long_misaligned(tmpU, Address(strU), t0, 2);
3186       __ inflate_lo32(tmp3, tmpL);
3187       __ mv(tmpL, tmp3);
3188       __ xorr(tmp3, tmpU, tmpL);
3189       __ bnez(tmp3, CALCULATE_DIFFERENCE);
3190 
3191       __ addi(strL, strL, wordSize / 2); // Address of last 4 bytes in Latin1 string
3192       __ addi(strU, strU, wordSize);   // Address of last 8 bytes in UTF-16 string
3193       __ load_int_misaligned(tmpL, Address(strL), t0, false);
3194       __ load_long_misaligned(tmpU, Address(strU), t0, 2);
3195       __ inflate_lo32(tmp3, tmpL);
3196       __ mv(tmpL, tmp3);
3197       __ xorr(tmp3, tmpU, tmpL);
3198       __ bnez(tmp3, CALCULATE_DIFFERENCE);
3199       __ j(DONE); // no character left
3200 
3201       // Find the first different characters in the longwords and
3202       // compute their difference.
3203     __ bind(CALCULATE_DIFFERENCE);
3204       // count bits of trailing zero chars
3205       __ ctzc_bits(tmp4, tmp3);
3206       __ srl(tmp1, tmp1, tmp4);
3207       __ srl(tmp2, tmp2, tmp4);
3208       __ zext(tmp1, tmp1, 16);
3209       __ zext(tmp2, tmp2, 16);
3210       __ sub(result, tmp1, tmp2);
3211     __ bind(DONE);
3212       __ ret();
3213     return entry;
3214   }
3215 
3216   address generate_method_entry_barrier() {
         // Emits the method entry barrier stub and returns its entry address.
         // The stub calls BarrierSetNMethod::nmethod_stub_entry_barrier() as a leaf
         // VM call. When that call returns zero the stub simply returns to its
         // caller; otherwise it takes the deoptimize path at the bottom, which
         // reloads sp, fp, ra and a jump target from four reserved stack words.
3217     __ align(CodeEntryAlignment);
3218     StubId stub_id = StubId::stubgen_method_entry_barrier_id;
3219     StubCodeMark mark(this, stub_id);
3220 
3221     Label deoptimize_label;
3222 
3223     address start = __ pc();
3224 
3225     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
3226 
3227     if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
           // Copy the 32-bit global patching epoch into the thread-local word located
           // 4 bytes past the thread's disarmed guard value (t1 is used as scratch).
3228       BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
3229       Address thread_epoch_addr(xthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4);
3230       __ la(t1, ExternalAddress(bs_asm->patching_epoch_addr()));
3231       __ lwu(t1, t1);
3232       __ sw(t1, thread_epoch_addr);
3233       // There are two ways this can work:
3234       // - The writer did system icache shootdown after the instruction stream update.
3235       //   Hence do nothing.
3236       // - The writer trusts us to make sure our icache is in sync before entering.
3237       //   Hence use cmodx fence (fence.i, may change).
3238       if (UseCtxFencei) {
3239         __ cmodx_fence();
3240       }
3241       __ membar(__ LoadLoad);
3242     }
3243 
3244     __ set_last_Java_frame(sp, fp, ra);
3245 
3246     __ enter();
         // t1 = sp + wordSize, taken right after enter(); it becomes the single
         // argument of the runtime call below (presumably the address of the saved
         // return address slot -- confirm against enter()).
3247     __ addi(t1, sp, wordSize);
3248 
         // Reserve four words. The deoptimize path reads them back from
         // sp + 0 .. sp + 3 * wordSize (presumably filled in by the runtime
         // call -- confirm against nmethod_stub_entry_barrier).
3249     __ subi(sp, sp, 4 * wordSize);
3250 
3251     __ push_call_clobbered_registers();
3252 
3253     __ mv(c_rarg0, t1);
3254     __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
3255 
3256     __ reset_last_Java_frame(true);
3257 
         // Keep the call's result (x10) in t0 so it is still available after
         // pop_call_clobbered_registers().
3258     __ mv(t0, x10);
3259 
3260     __ pop_call_clobbered_registers();
3261 
         // Non-zero result: take the deoptimize path.
3262     __ bnez(t0, deoptimize_label);
3263 
         // Zero result: tear down the frame and return to the caller.
3264     __ leave();
3265     __ ret();
3266 
3267     __ BIND(deoptimize_label);
3268 
         // Load the new sp (via t0), fp, ra and the jump target (t1) from the four
         // reserved words, then switch sp and jump.
3269     __ ld(t0, Address(sp, 0));
3270     __ ld(fp, Address(sp, wordSize));
3271     __ ld(ra, Address(sp, wordSize * 2));
3272     __ ld(t1, Address(sp, wordSize * 3));
3273 
3274     __ mv(sp, t0);
3275     __ jr(t1);
3276 
3277     return start;
3278   }
3279 
3280   // x10  = result
3281   // x11  = str1
3282   // x12  = cnt1 (reused below as a scratch register for str2 data)
3283   // x13  = str2
3284   // x14  = cnt2
3285   // x28  = tmp1
3286   // x29  = tmp2
3287   // x30  = tmp3
3288   // x7   = tmp4, x31 = tmp5 (both saved on the stack here and restored on exit)
       //
       // Generates the long-string compare stub for two strings with the same
       // encoding: stubgen_compare_long_string_LL_id (1-byte characters) or
       // stubgen_compare_long_string_UU_id (2-byte characters). Returns the entry
       // address of the generated code.
       //
       // tmp1/tmp2 are compared before this stub ever loads them, so they are
       // expected to hold the first 8 bytes of str1/str2 on entry. 'result' is only
       // written when a differing character is found; on the all-equal path it is
       // left untouched (presumably preset by the caller -- confirm).
3289   address generate_compare_long_string_same_encoding(StubId stub_id) {
3290     bool isLL;
3291     switch (stub_id) {
3292     case StubId::stubgen_compare_long_string_LL_id:
3293       isLL = true;
3294       break;
3295     case StubId::stubgen_compare_long_string_UU_id:
3296       isLL = false;
3297       break;
3298     default:
3299       ShouldNotReachHere();
3300     };
3301     __ align(CodeEntryAlignment);
3302     StubCodeMark mark(this, stub_id);
3303     address entry = __ pc();
3304     Label SMALL_LOOP, CHECK_LAST, DIFF2, TAIL,
3305           LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF;
3306     const Register result = x10, str1 = x11, cnt1 = x12, str2 = x13, cnt2 = x14,
3307                    tmp1 = x28, tmp2 = x29, tmp3 = x30, tmp4 = x7, tmp5 = x31;
3308     RegSet spilled_regs = RegSet::of(tmp4, tmp5);
3309 
3310     // cnt1/cnt2 contains amount of characters to compare. cnt1 can be re-used
3311     // update cnt2 counter with already loaded 8 bytes
3312     __ subi(cnt2, cnt2, wordSize / (isLL ? 1 : 2));
3313     // update pointers, because of previous read
3314     __ addi(str1, str1, wordSize);
3315     __ addi(str2, str2, wordSize);
3316     // less than 16 bytes left?
3317     __ subi(cnt2, cnt2, isLL ? 16 : 8);
3318     __ push_reg(spilled_regs, sp);
3319     __ bltz(cnt2, TAIL);
         // Main loop: each iteration loads the next 16 bytes of both strings and
         // compares the 8 bytes loaded previously (tmp1/tmp2) plus the first 8 bytes
         // loaded now (tmp5/cnt1). The second 8 bytes stay in tmp1/tmp2, still
         // uncompared, for the next iteration or for the tail.
3320     __ bind(SMALL_LOOP);
3321       // compare 16 bytes of strings with same encoding
3322       __ ld(tmp5, Address(str1));
3323       __ addi(str1, str1, 8);
3324       __ xorr(tmp4, tmp1, tmp2);
3325       __ ld(cnt1, Address(str2));
3326       __ addi(str2, str2, 8);
3327       __ bnez(tmp4, DIFF);
3328       __ ld(tmp1, Address(str1));
3329       __ addi(str1, str1, 8);
3330       __ xorr(tmp4, tmp5, cnt1);
3331       __ ld(tmp2, Address(str2));
3332       __ addi(str2, str2, 8);
3333       __ bnez(tmp4, DIFF2);
3334 
3335       __ subi(cnt2, cnt2, isLL ? 16 : 8);
3336       __ bgez(cnt2, SMALL_LOOP);
3337     __ bind(TAIL);
           // Undo the loop bias. Zero means nothing is left to load and only the
           // pending tmp1/tmp2 pair still has to be compared.
3338       __ addi(cnt2, cnt2, isLL ? 16 : 8);
3339       __ beqz(cnt2, LAST_CHECK_AND_LENGTH_DIFF);
3340       __ subi(cnt2, cnt2, isLL ? 8 : 4);
3341       __ blez(cnt2, CHECK_LAST);
           // More than 8 bytes remain: compare the pending pair and load one more
           // full 8-byte word from each string.
3342       __ xorr(tmp4, tmp1, tmp2);
3343       __ bnez(tmp4, DIFF);
3344       __ ld(tmp1, Address(str1));
3345       __ addi(str1, str1, 8);
3346       __ ld(tmp2, Address(str2));
3347       __ addi(str2, str2, 8);
3348       __ subi(cnt2, cnt2, isLL ? 8 : 4);
3349     __ bind(CHECK_LAST);
3350       if (!isLL) {
3351         __ add(cnt2, cnt2, cnt2); // now in bytes
3352       }
3353       __ xorr(tmp4, tmp1, tmp2);
3354       __ bnez(tmp4, DIFF);
           // cnt2 is <= 0 here, so adding it steps both pointers back; the final
           // (possibly misaligned) 8-byte loads then end at the last remaining
           // character and may re-read bytes that were already compared.
3355       __ add(str1, str1, cnt2);
3356       __ load_long_misaligned(tmp5, Address(str1), tmp3, isLL ? 1 : 2);
3357       __ add(str2, str2, cnt2);
3358       __ load_long_misaligned(cnt1, Address(str2), tmp3, isLL ? 1 : 2);
3359       __ xorr(tmp4, tmp5, cnt1);
3360       __ beqz(tmp4, LENGTH_DIFF);
3361       // Find the first different characters in the longwords and
3362       // compute their difference.
           // DIFF2: the differing words are in tmp5 (str1) and cnt1 (str2).
3363     __ bind(DIFF2);
3364       // count bits of trailing zero chars
3365       __ ctzc_bits(tmp3, tmp4, isLL);
3366       __ srl(tmp5, tmp5, tmp3);
3367       __ srl(cnt1, cnt1, tmp3);
3368       if (isLL) {
3369         __ zext(tmp5, tmp5, 8);
3370         __ zext(cnt1, cnt1, 8);
3371       } else {
3372         __ zext(tmp5, tmp5, 16);
3373         __ zext(cnt1, cnt1, 16);
3374       }
3375       __ sub(result, tmp5, cnt1);
3376       __ j(LENGTH_DIFF);
           // DIFF: same as DIFF2, but the differing words are in tmp1 (str1) and
           // tmp2 (str2).
3377     __ bind(DIFF);
3378       // count bits of trailing zero chars
3379       __ ctzc_bits(tmp3, tmp4, isLL);
3380       __ srl(tmp1, tmp1, tmp3);
3381       __ srl(tmp2, tmp2, tmp3);
3382       if (isLL) {
3383         __ zext(tmp1, tmp1, 8);
3384         __ zext(tmp2, tmp2, 8);
3385       } else {
3386         __ zext(tmp1, tmp1, 16);
3387         __ zext(tmp2, tmp2, 16);
3388       }
3389       __ sub(result, tmp1, tmp2);
3390       __ j(LENGTH_DIFF);
3391     __ bind(LAST_CHECK_AND_LENGTH_DIFF);
3392       __ xorr(tmp4, tmp1, tmp2);
3393       __ bnez(tmp4, DIFF);
         // Common exit: restore tmp4/tmp5 and return. 'result' is unchanged when
         // no differing character was found.
3394     __ bind(LENGTH_DIFF);
3395       __ pop_reg(spilled_regs, sp);
3396       __ ret();
3397     return entry;
3398   }
3399 
3400   void generate_compare_long_strings() {
         // Generates the four long-string compare stubs and records their entry
         // points. The order of the calls below is the order in which the stubs
         // are emitted: LL, UU, LU, UL.
         const StubId ll_id = StubId::stubgen_compare_long_string_LL_id;
         const StubId uu_id = StubId::stubgen_compare_long_string_UU_id;
         const StubId lu_id = StubId::stubgen_compare_long_string_LU_id;
         const StubId ul_id = StubId::stubgen_compare_long_string_UL_id;

         // Both strings use the same encoding.
3401     StubRoutines::riscv::_compare_long_string_LL = generate_compare_long_string_same_encoding(ll_id);
3402     StubRoutines::riscv::_compare_long_string_UU = generate_compare_long_string_same_encoding(uu_id);
         // The two strings use different encodings.
3403     StubRoutines::riscv::_compare_long_string_LU = generate_compare_long_string_different_encoding(lu_id);
3404     StubRoutines::riscv::_compare_long_string_UL = generate_compare_long_string_different_encoding(ul_id);
3405   }
3406 
3407   // x10 result (accumulated into, never zeroed here; set to -1 when there is no match)
3408   // x11 src (haystack)
3409   // x12 src count
3410   // x13 pattern (needle)
3411   // x14 pattern count
       //
       // Generates one linear indexof stub, selected by stub_id:
       //   ..._ll_id: needle and haystack both use 1-byte characters
       //   ..._ul_id: 1-byte needle characters, 2-byte haystack characters
       //   ..._uu_id: needle and haystack both use 2-byte characters
       // Returns the entry address of the generated code.
       //
       // x20-x25 and x28-x29 are saved on entry and restored at DONE.
       // NOTE(review): 'result' is only ever incremented/adjusted before DONE, so it
       // presumably has to be zero on entry -- confirm against the caller.
3412   address generate_string_indexof_linear(StubId stub_id)
3413   {
3414     bool needle_isL;
3415     bool haystack_isL;
3416     switch (stub_id) {
3417     case StubId::stubgen_string_indexof_linear_ll_id:
3418       needle_isL = true;
3419       haystack_isL = true;
3420       break;
3421     case StubId::stubgen_string_indexof_linear_ul_id:
3422       needle_isL = true;
3423       haystack_isL = false;
3424       break;
3425     case StubId::stubgen_string_indexof_linear_uu_id:
3426       needle_isL = false;
3427       haystack_isL = false;
3428       break;
3429     default:
3430       ShouldNotReachHere();
3431     };
3432 
3433     __ align(CodeEntryAlignment);
3434     StubCodeMark mark(this, stub_id);
3435     address entry = __ pc();
3436 
3437     int needle_chr_size = needle_isL ? 1 : 2;
3438     int haystack_chr_size = haystack_isL ? 1 : 2;
3439     int needle_chr_shift = needle_isL ? 0 : 1;
3440     int haystack_chr_shift = haystack_isL ? 0 : 1;
3441     bool isL = needle_isL && haystack_isL;
3442     // parameters
3443     Register result = x10, haystack = x11, haystack_len = x12, needle = x13, needle_len = x14;
3444     // temporary registers
3445     Register mask1 = x20, match_mask = x21, first = x22, trailing_zeros = x23, mask2 = x24, tmp = x25;
3446     // redefinitions
3447     Register ch1 = x28, ch2 = x29;
3448     RegSet spilled_regs = RegSet::range(x20, x25) + RegSet::range(x28, x29);
3449 
3450     __ push_reg(spilled_regs, sp);
3451 
3452     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
3453           L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
3454           L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
3455           L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
3456           L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
3457           L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
3458 
         // Preload the first 8 bytes of the needle and of the haystack.
3459     __ ld(ch1, Address(needle));
3460     __ ld(ch2, Address(haystack));
3461     // src.length - pattern.length
3462     __ sub(haystack_len, haystack_len, needle_len);
3463 
3464     // first is needle[0]
3465     __ zext(first, ch1, needle_isL ? 8 : 16);
3466 
3467     uint64_t mask0101 = UCONST64(0x0101010101010101);
3468     uint64_t mask0001 = UCONST64(0x0001000100010001);
3469     __ mv(mask1, haystack_isL ? mask0101 : mask0001);
         // Multiplying by 0x01..01 / 0x0001..0001 replicates needle[0] into every
         // haystack-sized character lane of 'first'.
3470     __ mul(first, first, mask1);
3471     uint64_t mask7f7f = UCONST64(0x7f7f7f7f7f7f7f7f);
3472     uint64_t mask7fff = UCONST64(0x7fff7fff7fff7fff);
3473     __ mv(mask2, haystack_isL ? mask7f7f : mask7fff);
3474     if (needle_isL != haystack_isL) {
           // Mixed encodings: keep the raw needle bytes in tmp; they are inflated
           // into ch1 further down.
3475       __ mv(tmp, ch1);
3476     }
         // haystack_len = (src count - pattern count) - (characters per word - 1).
         // A non-positive value means every candidate start position lies inside
         // the first haystack word that was already loaded into ch2.
3477     __ subi(haystack_len, haystack_len, wordSize / haystack_chr_size - 1);
3478     __ blez(haystack_len, L_SMALL);
3479 
3480     if (needle_isL != haystack_isL) {
3481       __ inflate_lo32(ch1, tmp, match_mask, trailing_zeros);
3482     }
3483     // xorr, sub, orr, notr, andr
3484     // compare and set match_mask[i] with 0x80/0x8000 (Latin1/UTF16) if ch2[i] == first[i]
3485     // eg:
3486     // first:        aa aa aa aa aa aa aa aa
3487     // ch2:          aa aa li nx jd ka aa aa
3488     // match_mask:   80 80 00 00 00 00 80 80
3489     __ compute_match_mask(ch2, first, match_mask, mask1, mask2);
3490 
3491     // search first char of needle, if success, goto L_HAS_ZERO;
3492     __ bnez(match_mask, L_HAS_ZERO);
3493     __ subi(haystack_len, haystack_len, wordSize / haystack_chr_size);
3494     __ addi(result, result, wordSize / haystack_chr_size);
3495     __ addi(haystack, haystack, wordSize);
3496     __ bltz(haystack_len, L_POST_LOOP);
3497 
         // Main scan: one haystack word per iteration, looking for needle[0].
3498     __ bind(L_LOOP);
3499     __ ld(ch2, Address(haystack));
3500     __ compute_match_mask(ch2, first, match_mask, mask1, mask2);
3501     __ bnez(match_mask, L_HAS_ZERO);
3502 
3503     __ bind(L_LOOP_PROCEED);
3504     __ subi(haystack_len, haystack_len, wordSize / haystack_chr_size);
3505     __ addi(haystack, haystack, wordSize);
3506     __ addi(result, result, wordSize / haystack_chr_size);
3507     __ bgez(haystack_len, L_LOOP);
3508 
3509     __ bind(L_POST_LOOP);
3510     __ mv(ch2, -wordSize / haystack_chr_size);
3511     __ ble(haystack_len, ch2, NOMATCH); // no extra characters to check
         // A partial word of candidate positions remains. Turn the negative
         // haystack_len into a positive bit count, which L_SMALL_PROCEED uses to
         // mask off the lanes that are not valid start positions.
3512     __ ld(ch2, Address(haystack));
3513     __ slli(haystack_len, haystack_len, LogBitsPerByte + haystack_chr_shift);
3514     __ neg(haystack_len, haystack_len);
3515     __ xorr(ch2, first, ch2);
3516     __ sub(match_mask, ch2, mask1);
3517     __ orr(ch2, ch2, mask2);
3518     __ mv(trailing_zeros, -1); // all bits set
3519     __ j(L_SMALL_PROCEED);
3520 
3521     __ align(OptoLoopAlignment);
         // L_SMALL: all candidate start positions are in the first loaded word.
3522     __ bind(L_SMALL);
3523     __ slli(haystack_len, haystack_len, LogBitsPerByte + haystack_chr_shift);
3524     __ neg(haystack_len, haystack_len);
3525     if (needle_isL != haystack_isL) {
3526       __ inflate_lo32(ch1, tmp, match_mask, trailing_zeros);
3527     }
3528     __ xorr(ch2, first, ch2);
3529     __ sub(match_mask, ch2, mask1);
3530     __ orr(ch2, ch2, mask2);
3531     __ mv(trailing_zeros, -1); // all bits set
3532 
3533     __ bind(L_SMALL_PROCEED);
3534     __ srl(trailing_zeros, trailing_zeros, haystack_len); // mask. zeroes on useless bits.
3535     __ notr(ch2, ch2);
3536     __ andr(match_mask, match_mask, ch2);
3537     __ andr(match_mask, match_mask, trailing_zeros); // clear useless bits and check
3538     __ beqz(match_mask, NOMATCH);
3539 
3540     __ bind(L_SMALL_HAS_ZERO_LOOP);
3541     // count bits of trailing zero chars
3542     __ ctzc_bits(trailing_zeros, match_mask, haystack_isL, ch2, tmp);
3543     __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15);
3544     __ mv(ch2, wordSize / haystack_chr_size);
         // A needle no longer than one word of characters is handled by the
         // single word compare at L_SMALL_CMP_LOOP_LAST_CMP2.
3545     __ ble(needle_len, ch2, L_SMALL_CMP_LOOP_LAST_CMP2);
3546     __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL);
3547     __ mv(trailing_zeros, wordSize / haystack_chr_size);
3548     __ bne(ch1, ch2, L_SMALL_CMP_LOOP_NOMATCH);
3549 
         // Compare the rest of the needle one character at a time;
         // trailing_zeros is reused as the character index.
3550     __ bind(L_SMALL_CMP_LOOP);
3551     __ shadd(first, trailing_zeros, needle, first, needle_chr_shift);
3552     __ shadd(ch2, trailing_zeros, haystack, ch2, haystack_chr_shift);
3553     needle_isL ? __ lbu(first, Address(first)) : __ lhu(first, Address(first));
3554     haystack_isL ? __ lbu(ch2, Address(ch2)) : __ lhu(ch2, Address(ch2));
3555     __ addi(trailing_zeros, trailing_zeros, 1);
3556     __ bge(trailing_zeros, needle_len, L_SMALL_CMP_LOOP_LAST_CMP);
3557     __ beq(first, ch2, L_SMALL_CMP_LOOP);
3558 
3559     __ bind(L_SMALL_CMP_LOOP_NOMATCH);
3560     __ beqz(match_mask, NOMATCH);
3561     // count bits of trailing zero chars
3562     __ ctzc_bits(trailing_zeros, match_mask, haystack_isL, tmp, ch2);
3563     __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15);
3564     __ addi(result, result, 1);
3565     __ addi(haystack, haystack, haystack_chr_size);
3566     __ j(L_SMALL_HAS_ZERO_LOOP);
3567 
3568     __ align(OptoLoopAlignment);
3569     __ bind(L_SMALL_CMP_LOOP_LAST_CMP);
3570     __ bne(first, ch2, L_SMALL_CMP_LOOP_NOMATCH);
3571     __ j(DONE);
3572 
3573     __ align(OptoLoopAlignment);
3574     __ bind(L_SMALL_CMP_LOOP_LAST_CMP2);
3575     __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL);
3576     __ bne(ch1, ch2, L_SMALL_CMP_LOOP_NOMATCH);
3577     __ j(DONE);
3578 
3579     __ align(OptoLoopAlignment);
         // L_HAS_ZERO: the current haystack word contains at least one occurrence
         // of needle[0] (reached from the main scan).
3580     __ bind(L_HAS_ZERO);
3581     // count bits of trailing zero chars
3582     __ ctzc_bits(trailing_zeros, match_mask, haystack_isL, tmp, ch2);
3583     __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15);
3584     __ slli(needle_len, needle_len, BitsPerByte * wordSize / 2);
3585     __ orr(haystack_len, haystack_len, needle_len); // pack needle_len into the upper 32 bits of haystack_len
3586     __ subi(result, result, 1); // array index from 0, so result -= 1
3587 
3588     __ bind(L_HAS_ZERO_LOOP);
         // needle_len is free to use as scratch from here on; the real needle
         // length is read back from the upper half of haystack_len with srli.
3589     __ mv(needle_len, wordSize / haystack_chr_size);
3590     __ srli(ch2, haystack_len, BitsPerByte * wordSize / 2);
3591     __ bge(needle_len, ch2, L_CMP_LOOP_LAST_CMP2);
3592     // load next 8 bytes from haystack, and increase result index
3593     __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL);
3594     __ addi(result, result, 1);
3595     __ mv(trailing_zeros, wordSize / haystack_chr_size);
3596     __ bne(ch1, ch2, L_CMP_LOOP_NOMATCH);
3597 
3598     // compare one char
3599     __ bind(L_CMP_LOOP);
3600     __ shadd(needle_len, trailing_zeros, needle, needle_len, needle_chr_shift);
3601     needle_isL ? __ lbu(needle_len, Address(needle_len)) : __ lhu(needle_len, Address(needle_len));
3602     __ shadd(ch2, trailing_zeros, haystack, ch2, haystack_chr_shift);
3603     haystack_isL ? __ lbu(ch2, Address(ch2)) : __ lhu(ch2, Address(ch2));
3604     __ addi(trailing_zeros, trailing_zeros, 1); // next char index
3605     __ srli(tmp, haystack_len, BitsPerByte * wordSize / 2);
3606     __ bge(trailing_zeros, tmp, L_CMP_LOOP_LAST_CMP);
3607     __ beq(needle_len, ch2, L_CMP_LOOP);
3608 
3609     __ bind(L_CMP_LOOP_NOMATCH);
3610     __ beqz(match_mask, L_HAS_ZERO_LOOP_NOMATCH);
3611     // count bits of trailing zero chars
3612     __ ctzc_bits(trailing_zeros, match_mask, haystack_isL, needle_len, ch2);
3613     __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15);
3614     __ addi(haystack, haystack, haystack_chr_size);
3615     __ j(L_HAS_ZERO_LOOP);
3616 
3617     __ align(OptoLoopAlignment);
3618     __ bind(L_CMP_LOOP_LAST_CMP);
3619     __ bne(needle_len, ch2, L_CMP_LOOP_NOMATCH);
3620     __ j(DONE);
3621 
3622     __ align(OptoLoopAlignment);
3623     __ bind(L_CMP_LOOP_LAST_CMP2);
3624     __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL);
3625     __ addi(result, result, 1);
3626     __ bne(ch1, ch2, L_CMP_LOOP_NOMATCH);
3627     __ j(DONE);
3628 
3629     __ align(OptoLoopAlignment);
3630     __ bind(L_HAS_ZERO_LOOP_NOMATCH);
3631     // 1) Restore "result" index. Index was wordSize/haystack_chr_size * N until
3632     // L_HAS_ZERO block. Byte octet was analyzed in L_HAS_ZERO_LOOP,
3633     // so, result was increased at max by wordSize/haystack_chr_size - 1, so,
3634     // respective high bit wasn't changed. L_LOOP_PROCEED will increase
3635     // result by analyzed characters value, so, we can just reset lower bits
3636     // in result here. Clear 2 lower bits for UU/UL and 3 bits for LL
3637     // 2) restore needle_len and haystack_len values from "compressed" haystack_len
3638     // 3) advance haystack value to represent next haystack octet. result & 7/3 is
3639     // index of last analyzed substring inside current octet. So, haystack is at
3640     // respective start address. We need to advance it to next octet
3641     __ andi(match_mask, result, wordSize / haystack_chr_size - 1);
3642     __ srli(needle_len, haystack_len, BitsPerByte * wordSize / 2);
3643     __ andi(result, result, haystack_isL ? -8 : -4);
3644     __ slli(tmp, match_mask, haystack_chr_shift);
3645     __ sub(haystack, haystack, tmp);
3646     __ sext(haystack_len, haystack_len, 32);
3647     __ j(L_LOOP_PROCEED);
3648 
3649     __ align(OptoLoopAlignment);
3650     __ bind(NOMATCH);
3651     __ mv(result, -1);
3652 
         // Common exit: restore the spilled registers and return.
3653     __ bind(DONE);
3654     __ pop_reg(spilled_regs, sp);
3655     __ ret();
3656     return entry;
3657   }
3658 
3659   void generate_string_indexof_stubs()
3660   {
         // Generates the three linear indexof stubs and records their entry points.
         // The stubs are emitted in the order of the calls below: ll, uu, ul.
         const StubId ll_id = StubId::stubgen_string_indexof_linear_ll_id;
         const StubId uu_id = StubId::stubgen_string_indexof_linear_uu_id;
         const StubId ul_id = StubId::stubgen_string_indexof_linear_ul_id;

3661     StubRoutines::riscv::_string_indexof_linear_ll = generate_string_indexof_linear(ll_id);
3662     StubRoutines::riscv::_string_indexof_linear_uu = generate_string_indexof_linear(uu_id);
3663     StubRoutines::riscv::_string_indexof_linear_ul = generate_string_indexof_linear(ul_id);
3664   }
3665 
3666 #ifdef COMPILER2
3667   void generate_lookup_secondary_supers_table_stub() {
         // Emits Klass::SECONDARY_SUPERS_TABLE_SIZE stubs back to back, one per
         // table slot, and records each entry point in
         // StubRoutines::_lookup_secondary_supers_table_stubs[slot]. Every stub is
         // enter(); lookup_secondary_supers_table_const(... slot ...); leave(); ret().
3668     StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_id;
3669     StubCodeMark mark(this, stub_id);
3670 
         // Fixed register assignment handed to lookup_secondary_supers_table_const().
3671     const Register
3672       r_super_klass  = x10,
3673       r_array_base   = x11,
3674       r_array_length = x12,
3675       r_array_index  = x13,
3676       r_sub_klass    = x14,
3677       result         = x15,
3678       r_bitmap       = x16;
3679 
3680     for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
           // Entry point of the stub specialised for this slot.
3681       StubRoutines::_lookup_secondary_supers_table_stubs[slot] = __ pc();
3683       __ enter();
3684       __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass, result,
3685                                              r_array_base, r_array_length, r_array_index,
3686                                              r_bitmap, slot, /*stub_is_near*/true);
3687       __ leave();
3688       __ ret();
3689     }
3690   }
3691 
3692   // Slow path for UseSecondarySupersTable: a stub that consists of the macro
       // assembler's lookup_secondary_supers_table_slow_path() followed by a return.
       // Returns the stub's entry address.
3693   address generate_lookup_secondary_supers_table_slow_path_stub() {
3694     const StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_slow_path_id;
3695     StubCodeMark mark(this, stub_id);
3696 
3697     const address start = __ pc();

         // Argument registers of the stub.
3698     const Register r_super_klass = x10;
3699     const Register r_array_base  = x11;
3700     const Register r_array_index = x13;
3701     const Register result        = x15;
3702     const Register r_bitmap      = x16;
         // Scratch register for the lookup.
3703     const Register scratch       = x12;
3706 
3707     __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, result, scratch);
3708     __ ret();
3709 
3710     return start;
3711   }
3712 
3713   address generate_mulAdd()
3714   {
         // Stub around the macro assembler's mul_add(); returns the stub's entry
         // address. The operands are taken from x10..x14 and x28 serves as scratch.
3715     __ align(CodeEntryAlignment);
3716     const StubId stub_id = StubId::stubgen_mulAdd_id;
3717     StubCodeMark mark(this, stub_id);
3718 
3719     const address entry = __ pc();
3720 
3721     const Register dst = x10, src = x11, dst_off = x12,
3722                    count = x13, factor = x14, scratch = x28;
3727 
3728     BLOCK_COMMENT("Entry:");
3729     __ enter();
3730     __ mul_add(dst, src, dst_off, count, factor, scratch);
3731     __ leave();
3732     __ ret();
3733 
3734     return entry;
3735   }
3736 
3737   /**
3738    *  Stub around the macro assembler's multiply_to_len().
3739    *  Returns the stub's entry address.
3740    *
3741    *  Input:
3742    *    c_rarg0 (x10)  - x address
3743    *    c_rarg1 (x11)  - x length
3744    *    c_rarg2 (x12)  - y address
3745    *    c_rarg3 (x13)  - y length
3746    *    c_rarg4 (x14)  - z address
        */
3747   address generate_multiplyToLen()
3748   {
3749     __ align(CodeEntryAlignment);
3750     const StubId stub_id = StubId::stubgen_multiplyToLen_id;
3751     StubCodeMark mark(this, stub_id);
3752     const address entry = __ pc();
3753 
         // Operands, in argument-register order.
3754     const Register x_ptr = x10, x_cnt = x11, y_ptr = x12, y_cnt = x13, z_ptr = x14;
         // Eight scratch registers handed to multiply_to_len().
3755     const Register scratch0 = x15, scratch1 = x16, scratch2 = x17, scratch3 = x7,
3756                    scratch4 = x28, scratch5 = x29, scratch6 = x30, scratch7 = x31;
3768 
3769     BLOCK_COMMENT("Entry:");
         // enter()/leave() are required for proper stackwalking of the RuntimeStub frame.
3770     __ enter();
3771     __ multiply_to_len(x_ptr, x_cnt, y_ptr, y_cnt, z_ptr,
                            scratch0, scratch1, scratch2, scratch3,
                            scratch4, scratch5, scratch6, scratch7);
3772     __ leave();
3773     __ ret();
3774 
3775     return entry;
3776   }
3777 
3778   address generate_squareToLen()
3779   {
         // Squaring stub built on multiply_to_len(): the x operand (x10/x11) is
         // copied into the y operand registers (x14/x15) and the product is written
         // through z (x12). Returns the stub's entry address.
3780     __ align(CodeEntryAlignment);
3781     const StubId stub_id = StubId::stubgen_squareToLen_id;
3782     StubCodeMark mark(this, stub_id);
3783     const address entry = __ pc();
3784 
3785     const Register x_ptr = x10, x_cnt = x11, z_ptr = x12;
         // Second operand; loaded with a copy of x_ptr / x_cnt below.
3788     const Register y_ptr = x14, y_cnt = x15;
3790 
         // scratch0 is x13 (zlen, unused).
3791     const Register scratch0 = x13, scratch1 = x16, scratch2 = x17, scratch3 = x7,
3792                    scratch4 = x28, scratch5 = x29, scratch6 = x30, scratch7 = x31;
3799 
3800     BLOCK_COMMENT("Entry:");
3801     __ enter();
3802     __ mv(y_ptr, x_ptr);
3803     __ mv(y_cnt, x_cnt);
3804     __ multiply_to_len(x_ptr, x_cnt, y_ptr, y_cnt, z_ptr,
                            scratch0, scratch1, scratch2, scratch3,
                            scratch4, scratch5, scratch6, scratch7);
3805     __ leave();
3806     __ ret();
3807 
3808     return entry;
3809   }
3810 
3811   // Arguments:
3812   //
3813   // Input:
3814   //   c_rarg0   - newArr address
3815   //   c_rarg1   - oldArr address
3816   //   c_rarg2   - newIdx
3817   //   c_rarg3   - shiftCount
3818   //   c_rarg4   - numIter
3819   //
       // Generates the vectorized left-shift worker stub and returns its entry
       // address. For each of the numIter 32-bit elements the loop stores
       //   new[newIdx + i] = (old[i] << shiftCount) | (old[i + 1] >> (32 - shiftCount))
       // c_rarg5, t0, t1 and the vector registers loaded below (v0, v4; register
       // groups, since LMUL is m4) are used as temporaries.
3820   address generate_bigIntegerLeftShift() {
3821     __ align(CodeEntryAlignment);
3822     StubId stub_id = StubId::stubgen_bigIntegerLeftShiftWorker_id;
3823     StubCodeMark mark(this, stub_id);
3824     address entry = __ pc();
3825 
3826     Label loop, exit;
3827 
3828     Register newArr        = c_rarg0;
3829     Register oldArr        = c_rarg1;
3830     Register newIdx        = c_rarg2;
3831     Register shiftCount    = c_rarg3;
3832     Register numIter       = c_rarg4;
3833 
3834     Register shiftRevCount = c_rarg5;
3835     Register oldArrNext    = t1;
3836 
         // Nothing to do for zero iterations.
3837     __ beqz(numIter, exit);
         // newArr += newIdx * 4 (t0 is the scratch register of shadd).
3838     __ shadd(newArr, newIdx, newArr, t0, 2);
3839 
         // shiftRevCount = 32 - shiftCount
3840     __ mv(shiftRevCount, 32);
3841     __ sub(shiftRevCount, shiftRevCount, shiftCount);
3842 
3843     __ bind(loop);
         // oldArrNext (t1) is recomputed at the top of every iteration, so the
         // shadd calls at the bottom may use t1 as their scratch register.
3844     __ addi(oldArrNext, oldArr, 4);
         // t0 = number of 32-bit elements handled in this iteration.
3845     __ vsetvli(t0, numIter, Assembler::e32, Assembler::m4);
3846     __ vle32_v(v0, oldArr);
3847     __ vle32_v(v4, oldArrNext);
3848     __ vsll_vx(v0, v0, shiftCount);
3849     __ vsrl_vx(v4, v4, shiftRevCount);
3850     __ vor_vv(v0, v0, v4);
3851     __ vse32_v(v0, newArr);
         // Consume t0 elements: decrement the counter and advance both pointers
         // by t0 * 4 bytes.
3852     __ sub(numIter, numIter, t0);
3853     __ shadd(oldArr, t0, oldArr, t1, 2);
3854     __ shadd(newArr, t0, newArr, t1, 2);
3855     __ bnez(numIter, loop);
3856 
3857     __ bind(exit);
3858     __ ret();
3859 
3860     return entry;
3861   }
3862 
3863   // Arguments:
3864   //
3865   // Input:
3866   //   c_rarg0   - newArr address
3867   //   c_rarg1   - oldArr address
3868   //   c_rarg2   - newIdx
3869   //   c_rarg3   - shiftCount
3870   //   c_rarg4   - numIter
3871   //
       // Generates the vectorized right-shift worker stub and returns its entry
       // address. The loop walks from the highest index down to 0 and stores
       //   new[newIdx + i] = (old[i + 1] >> shiftCount) | (old[i] << (32 - shiftCount))
       // c_rarg5, c_rarg6, t0, t1 and the vector registers loaded below (v0, v4;
       // register groups, since LMUL is m4) are used as temporaries.
3872   address generate_bigIntegerRightShift() {
3873     __ align(CodeEntryAlignment);
3874     StubId stub_id = StubId::stubgen_bigIntegerRightShiftWorker_id;
3875     StubCodeMark mark(this, stub_id);
3876     address entry = __ pc();
3877 
3878     Label loop, exit;
3879 
3880     Register newArr        = c_rarg0;
3881     Register oldArr        = c_rarg1;
3882     Register newIdx        = c_rarg2;
3883     Register shiftCount    = c_rarg3;
3884     Register numIter       = c_rarg4;
         // idx aliases numIter: the number of elements still to be processed, which
         // is also the index of the first element of the chunk handled next.
3885     Register idx           = numIter;
3886 
3887     Register shiftRevCount = c_rarg5;
3888     Register oldArrNext    = c_rarg6;
         // newArrCur shares t0 with the vsetvli result; see the loop body.
3889     Register newArrCur     = t0;
3890     Register oldArrCur     = t1;
3891 
         // Nothing to do for zero iterations.
3892     __ beqz(idx, exit);
         // newArr += newIdx * 4 (t0 is the scratch register of shadd).
3893     __ shadd(newArr, newIdx, newArr, t0, 2);
3894 
         // shiftRevCount = 32 - shiftCount
3895     __ mv(shiftRevCount, 32);
3896     __ sub(shiftRevCount, shiftRevCount, shiftCount);
3897 
3898     __ bind(loop);
         // t0 = number of 32-bit elements handled in this iteration. It is only
         // needed as a scalar for the following sub; after that t0 is overwritten
         // with newArrCur.
3899     __ vsetvli(t0, idx, Assembler::e32, Assembler::m4);
3900     __ sub(idx, idx, t0);
         // oldArrNext = &old[idx], newArrCur = &new[idx], oldArrCur = &old[idx + 1]
3901     __ shadd(oldArrNext, idx, oldArr, t1, 2);
3902     __ shadd(newArrCur, idx, newArr, t1, 2);
3903     __ addi(oldArrCur, oldArrNext, 4);
3904     __ vle32_v(v0, oldArrCur);
3905     __ vle32_v(v4, oldArrNext);
3906     __ vsrl_vx(v0, v0, shiftCount);
3907     __ vsll_vx(v4, v4, shiftRevCount);
3908     __ vor_vv(v0, v0, v4);
3909     __ vse32_v(v0, newArrCur);
3910     __ bnez(idx, loop);
3911 
3912     __ bind(exit);
3913     __ ret();
3914 
3915     return entry;
3916   }
3917 #endif
3918 
3919 #ifdef COMPILER2
3920   class MontgomeryMultiplyGenerator : public MacroAssembler {
3921 
3922     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
3923       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2, Ri, Rj;
3924 
3925     RegSet _toSave;
3926     bool _squaring;
3927 
3928   public:
3929     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
3930       : MacroAssembler(as->code()), _squaring(squaring) {
3931 
3932       // Register allocation
3933 
3934       RegSetIterator<Register> regs = RegSet::range(x10, x26).begin();
3935       Pa_base = *regs;       // Argument registers
3936       if (squaring) {
3937         Pb_base = Pa_base;
3938       } else {
3939         Pb_base = *++regs;
3940       }
3941       Pn_base = *++regs;
3942       Rlen= *++regs;
3943       inv = *++regs;
3944       Pm_base = *++regs;
3945 
3946                         // Working registers:
3947       Ra =  *++regs;    // The current digit of a, b, n, and m.
3948       Rb =  *++regs;
3949       Rm =  *++regs;
3950       Rn =  *++regs;
3951 
3952       Pa =  *++regs;      // Pointers to the current/next digit of a, b, n, and m.
3953       Pb =  *++regs;
3954       Pm =  *++regs;
3955       Pn =  *++regs;
3956 
3957       tmp0 =  *++regs;    // Three registers which form a
3958       tmp1 =  *++regs;    // triple-precision accumuator.
3959       tmp2 =  *++regs;
3960 
3961       Ri =  x6;         // Inner and outer loop indexes.
3962       Rj =  x7;
3963 
3964       Rhi_ab = x28;     // Product registers: low and high parts
3965       Rlo_ab = x29;     // of a*b and m*n.
3966       Rhi_mn = x30;
3967       Rlo_mn = x31;
3968 
3969       // x18 and up are callee-saved.
3970       _toSave = RegSet::range(x18, *regs) + Pm_base;
3971     }
3972 
3973   private:
3974     void save_regs() {
           // Push every register collected in _toSave, using sp as the stack pointer.
3975       this->push_reg(_toSave, sp);
3976     }
3977 
3978     void restore_regs() {
           // Pop the registers collected in _toSave; counterpart of save_regs().
3979       this->pop_reg(_toSave, sp);
3980     }
3981 
3982     template <typename T>
3983     void unroll_2(Register count, T block) {
           // Emits a loop, unrolled by two, that runs 'block' exactly 'count' times
           // (zero times when count is 0). 'block' is a pointer to a no-argument
           // member function of this generator; it is invoked twice at generation
           // time so its code appears twice in the loop body. An odd count enters
           // the loop at the second copy. Clobbers t0 and destroys 'count'.
3984       Label loop, end, odd;
3985       beqz(count, end);
3986       test_bit(t0, count, 0);
3987       bnez(t0, odd);
3988       align(16);
3989       bind(loop);
3990       (this->*block)();
3991       bind(odd);
3992       (this->*block)();
3993       subi(count, count, 2);
3994       bgtz(count, loop);
3995       bind(end);
3996     }
3997 
3998     template <typename T>
3999     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
           // Same as the two-argument unroll_2(), for a member function taking
           // (d, s, tmp): emits a loop, unrolled by two, that runs 'block' exactly
           // 'count' times. 'tmp' holds the parity test of 'count' and is also
           // passed on to 'block'. Clobbers 'tmp' and destroys 'count'.
4000       Label loop, end, odd;
4001       beqz(count, end);
4002       test_bit(tmp, count, 0);
4003       bnez(tmp, odd);
4004       align(16);
4005       bind(loop);
4006       (this->*block)(d, s, tmp);
4007       bind(odd);
4008       (this->*block)(d, s, tmp);
4009       subi(count, count, 2);
4010       bgtz(count, loop);
4011       bind(end);
4012     }
4013 
4014     void pre1(RegisterOrConstant i) {
           // Sets up the four digit pointers, loads the first digit through each of
           // them and zeroes the pending m*n product. 'i' is an index in words and
           // may be a register or a constant. Clobbers t0.
4015       block_comment("pre1");
4016       // Pa = Pa_base;
4017       // Pb = Pb_base + i;
4018       // Pm = Pm_base;
4019       // Pn = Pn_base + i;
4020       // Ra = *Pa;
4021       // Rb = *Pb;
4022       // Rm = *Pm;
4023       // Rn = *Pn;
           // t0 = i scaled to a byte offset (i << LogBytesPerWord).
4024       if (i.is_register()) {
4025         slli(t0, i.as_register(), LogBytesPerWord);
4026       } else {
4027         mv(t0, i.as_constant());
4028         slli(t0, t0, LogBytesPerWord);
4029       }
4030 
4031       mv(Pa, Pa_base);
4032       add(Pb, Pb_base, t0);
4033       mv(Pm, Pm_base);
4034       add(Pn, Pn_base, t0);
4035 
4036       ld(Ra, Address(Pa));
4037       ld(Rb, Address(Pb));
4038       ld(Rm, Address(Pm));
4039       ld(Rn, Address(Pn));
4040 
4041       // Zero the m*n result.
4042       mv(Rhi_mn, zr);
4043       mv(Rlo_mn, zr);
4044     }
4045 
4046     // The core multiply-accumulate step of a Montgomery
4047     // multiplication.  The idea is to schedule operations as a
4048     // pipeline so that instructions with long latencies (loads and
4049     // multiplies) have time to complete before their results are
4050     // used.  This most benefits in-order implementations of the
4051     // architecture but out-of-order ones also benefit.
         //
         // Each call multiplies the current a/b and m/n digits, advances Pa and Pm
         // by one word, steps Pb and Pn back by one word, and reloads Ra, Rb, Rm
         // and Rn. The m*n product computed here is left in Rhi_mn:Rlo_mn and is
         // only accumulated by the next step() or by post1().
4052     void step() {
4053       block_comment("step");
4054       // MACC(Ra, Rb, tmp0, tmp1, tmp2);
4055       // Ra = *++Pa;
4056       // Rb = *--Pb;
4057       mulhu(Rhi_ab, Ra, Rb);
4058       mul(Rlo_ab, Ra, Rb);
4059       addi(Pa, Pa, wordSize);
4060       ld(Ra, Address(Pa));
4061       subi(Pb, Pb, wordSize);
4062       ld(Rb, Address(Pb));
4063       acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2); // The pending m*n from the
4064                                             // previous iteration.
4065       // MACC(Rm, Rn, tmp0, tmp1, tmp2);
4066       // Rm = *++Pm;
4067       // Rn = *--Pn;
4068       mulhu(Rhi_mn, Rm, Rn);
4069       mul(Rlo_mn, Rm, Rn);
4070       addi(Pm, Pm, wordSize);
4071       ld(Rm, Address(Pm));
4072       subi(Pn, Pn, wordSize);
4073       ld(Rn, Address(Pn));
           // Accumulate a*b now; the m*n product computed above stays pending.
4074       acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2);
4075     }
4076 
4077     void post1() {
           // Finishes one column of the accumulation: adds the last a*b product and
           // the pending m*n product, computes and stores the new m digit
           // (tmp0 * inv) through Pm, folds in the high half of that digit times Rn,
           // and shifts the accumulator down by one word. Clobbers t0.
4078       block_comment("post1");
4079 
4080       // MACC(Ra, Rb, tmp0, tmp1, tmp2);
4081       // (Unlike step(), Pa/Pb are not advanced and Ra/Rb are not
4082       // reloaded here.)
4083       mulhu(Rhi_ab, Ra, Rb);
4084       mul(Rlo_ab, Ra, Rb);
4085       acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2);  // The pending m*n
4086       acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2);
4087 
4088       // *Pm = Rm = tmp0 * inv;
4089       mul(Rm, tmp0, inv);
4090       sd(Rm, Address(Pm));
4091 
4092       // MACC(Rm, Rn, tmp0, tmp1, tmp2);
4093       // tmp0 = tmp1; tmp1 = tmp2; tmp2 = 0;
4094       mulhu(Rhi_mn, Rm, Rn);
4095 
4096 #ifndef PRODUCT
4097       // assert(m[i] * n[0] + tmp0 == 0, "broken Montgomery multiply");
4098       {
4099         mul(Rlo_mn, Rm, Rn);
4100         add(Rlo_mn, tmp0, Rlo_mn);
4101         Label ok;
4102         beqz(Rlo_mn, ok);
4103         stop("broken Montgomery multiply");
4104         bind(ok);
4105       }
4106 #endif
4107       // We have very carefully set things up so that
4108       // m[i]*n[0] + tmp0 == 0 (mod b), so we don't have to calculate
4109       // the lower half of Rm * Rn because we know the result already:
4110       // it must be -tmp0.  tmp0 + (-tmp0) must generate a carry iff
4111       // tmp0 != 0.  So, rather than do a mul and a cad we just set
4112       // the carry flag iff tmp0 is nonzero.
4113       //
4114       // mul(Rlo_mn, Rm, Rn);
4115       // cad(zr, tmp0, Rlo_mn);
           // t0 = (tmp0 - 1 <u tmp0), i.e. 1 iff tmp0 != 0 (0 - 1 wraps to all ones).
4116       subi(t0, tmp0, 1);
4117       sltu(t0, t0, tmp0); // Set carry iff tmp0 is nonzero
           // Shift the accumulator down one word while adding the high product
           // half and the carry in t0 (presumably cadc/adc are add-with-carry
           // helpers using t0 as the carry -- confirm).
4118       cadc(tmp0, tmp1, Rhi_mn, t0);
4119       adc(tmp1, tmp2, zr, t0);
4120       mv(tmp2, zr);
4121     }
4122 
4123     void pre2(Register i, Register len) {
           // Emits the pointer setup for one iteration of the second outer
           // loop (the callers run it for i in [len, 2*len)).  Pa and Pm start
           // at element i-len of their arrays, Pb and Pn at element len; each
           // is then stepped once (Pa/Pm up, Pb/Pn down) while the first
           // operands are loaded.  Clobbers Rj and uses Ra as a temporary
           // before Ra receives its real value.
4124       block_comment("pre2");
4125       // Pa = Pa_base + i-len;
4126       // Pb = Pb_base + len;
4127       // Pm = Pm_base + i-len;
4128       // Pn = Pn_base + len;
4129 
4130       sub(Rj, i, len);
4131       // Rj == i-len
4132 
4133       // Ra as temp register
4134       slli(Ra, Rj, LogBytesPerWord);
4135       add(Pa, Pa_base, Ra);
4136       add(Pm, Pm_base, Ra);
4137       slli(Ra, len, LogBytesPerWord);
4138       add(Pb, Pb_base, Ra);
4139       add(Pn, Pn_base, Ra);
4140 
4141       // Ra = *++Pa;
4142       // Rb = *--Pb;
4143       // Rm = *++Pm;
4144       // Rn = *--Pn;
4145       addi(Pa, Pa, wordSize);
4146       ld(Ra, Address(Pa));
4147       subi(Pb, Pb, wordSize);
4148       ld(Rb, Address(Pb));
4149       addi(Pm, Pm, wordSize);
4150       ld(Rm, Address(Pm));
4151       subi(Pn, Pn, wordSize);
4152       ld(Rn, Address(Pn));
4153 
           // No m*n product is pending at the start of the inner loop, so the
           // first acc() of the pending pair must add zero.
4154       mv(Rhi_mn, zr);
4155       mv(Rlo_mn, zr);
4156     }
4157 
4158     void post2(Register i, Register len) {
           // Emits the tail of one iteration of the second outer loop: adds
           // the low half of the pending m*n product into tmp0, stores tmp0 as
           // result word Pm_base[i-len], then adds the high half and shifts
           // the accumulator down by one word.  Clobbers Rj and t0.
4159       block_comment("post2");
4160       sub(Rj, i, len);
4161 
4162       cad(tmp0, tmp0, Rlo_mn, t0); // The pending m*n, low part
4163 
4164       // As soon as we know the least significant digit of our result,
4165       // store it.
4166       // Pm_base[i-len] = tmp0;
4167       // Rj as temp register
4168       slli(Rj, Rj, LogBytesPerWord);
4169       add(Rj, Pm_base, Rj);
4170       sd(tmp0, Address(Rj));
4171 
4172       // tmp0 = tmp1; tmp1 = tmp2; tmp2 = 0;
           // t0 still holds the carry produced by the cad above; the address
           // arithmetic in between does not write t0.
4173       cadc(tmp0, tmp1, Rhi_mn, t0); // The pending m*n, high part
4174       adc(tmp1, tmp2, zr, t0);
4175       mv(tmp2, zr);
4176     }
4177 
4178     // A carry in tmp0 after Montgomery multiplication means that we
4179     // should subtract multiples of n from our result in m.  We'll
4180     // keep doing that until there is no carry.
4181     void normalize(Register len) {
           // Emits the final reduction: while the carry word tmp0 is non-zero,
           // subtract n from m over len 64-bit words and fold the outgoing
           // carry/borrow back into tmp0.
           //
           // RISC-V has no carry flag, so every word computes
           //   m[i] + carry_in + ~n[i]
           // with the carry kept in t0 (1 == no borrow).  Either addition can
           // wrap, so the carry out of each one is captured separately.  At
           // most one of them can be set (if the first wraps, Rm is 0 and the
           // second cannot), hence the plain add that combines them.  Testing
           // only the second addition loses the carry when m[i] is all ones
           // and carry_in is 1, which would report a borrow that never
           // happened.
           //
           // Clobbers Rm, Rn, Ra, t0, tmp1 and tmp2.  Ra serves as scratch for
           // the intermediate carry; both generate_multiply and
           // generate_square overwrite Ra right after calling normalize().
4182       block_comment("normalize");
4183       // while (tmp0)
4184       //   tmp0 = sub(Pm_base, Pn_base, tmp0, len);
4185       Label loop, post, again;
4186       Register cnt = tmp1, i = tmp2; // Re-use registers; we're done with them now
4187       beqz(tmp0, post); {
4188         bind(again); {
4189           mv(i, zr);
4190           mv(cnt, len);
4191           slli(Rn, i, LogBytesPerWord);
4192           add(Rm, Pm_base, Rn);
4193           ld(Rm, Address(Rm));
4194           add(Rn, Pn_base, Rn);
4195           ld(Rn, Address(Rn));
4196           mv(t0, 1); // set carry flag, i.e. no borrow
4197           align(16);
4198           bind(loop); {
4199             notr(Rn, Rn);
4200             add(Rm, Rm, t0);
                 sltu(Ra, Rm, t0);  // carry out of m[i] + carry_in (t0 is 0 or 1)
4201             add(Rm, Rm, Rn);
4202             sltu(t0, Rm, Rn);  // carry out of adding ~n[i]
                 add(t0, t0, Ra);   // combined carry out; the two are never both set
4203             slli(Rn, i, LogBytesPerWord); // Rn as temp register
4204             add(Rn, Pm_base, Rn);
4205             sd(Rm, Address(Rn));
4206             addi(i, i, 1);
                 // The next pair is loaded before cnt is tested, so the last
                 // pass also loads element len of both arrays; those values
                 // are never used.
4207             slli(Rn, i, LogBytesPerWord);
4208             add(Rm, Pm_base, Rn);
4209             ld(Rm, Address(Rm));
4210             add(Rn, Pn_base, Rn);
4211             ld(Rn, Address(Rn));
4212             subi(cnt, cnt, 1);
4213           } bnez(cnt, loop);
               // tmp0 = tmp0 - borrow, where borrow == 1 - t0.
4214           subi(tmp0, tmp0, 1);
4215           add(tmp0, tmp0, t0);
4216         } bnez(tmp0, again);
4217       } bind(post);
4218     }
4219 
4220     // Move memory at s to d, reversing words.
4221     //    Increments d to end of copied memory
4222     //    Destroys tmp1, tmp2
4223     //    Preserves len
4224     //    Leaves s pointing to the address which was in d at start
4225     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
           // Emits a copy of len 64-bit words from s to d in reverse word
           // order; reverse1() does the per-word work.
           // NOTE(review): the asserts keep both temporaries below x28,
           // presumably because helpers used here may clobber x28 and above;
           // confirm against shadd/unroll_2.
4226       assert(tmp1->encoding() < x28->encoding(), "register corruption");
4227       assert(tmp2->encoding() < x28->encoding(), "register corruption");
4228 
           // Presumably s += len << LogBytesPerWord, i.e. s moves to the end
           // of the source so that reverse1() can walk it backwards; confirm
           // against shadd's definition.
4229       shadd(s, len, s, tmp1, LogBytesPerWord);
           // unroll_2 receives a copy of the count in tmp1, so len itself is
           // preserved.
4230       mv(tmp1, len);
4231       unroll_2(tmp1,  &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
           // d has advanced past the copy; stepping back len words gives the
           // address d had on entry, which is what s is left pointing at.
4232       slli(tmp1, len, LogBytesPerWord);
4233       sub(s, d, tmp1);
4234     }
4235     // [63...0] -> [31...0][63...32]
4236     void reverse1(Register d, Register s, Register tmp) {
           // Emits the copy of a single word for reverse(): step s back one
           // word, load, rotate by 32 bits so the two 32-bit halves swap
           // places, store at d and step d forward one word.  t0 is handed to
           // ror as its extra register argument.
4237       subi(s, s, wordSize);
4238       ld(tmp, Address(s));
4239       ror(tmp, tmp, 32, t0);
4240       sd(tmp, Address(d));
4241       addi(d, d, wordSize);
4242     }
4243 
4244     void step_squaring() {
           // Emits step() followed by one more accumulation of the same a*b
           // product, so that product is added into tmp0..tmp2 twice.  step()
           // leaves Rhi_ab:Rlo_ab intact after its own acc(), which is what
           // makes the repeat valid.
4245       // An extra ACC
4246       step();
4247       acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2);
4248     }
4249 
4250     void last_squaring(Register i) {
           // Emits a run-time test on bit 0 of i: when the bit is clear (i is
           // even) one more Ra * Rb product is computed and accumulated once;
           // when it is set the block is skipped.  Pa and Pb are not advanced
           // here.  Clobbers t0.
4251       Label dont;
4252       // if ((i & 1) == 0) {
4253       test_bit(t0, i, 0);
4254       bnez(t0, dont); {
4255         // MACC(Ra, Rb, tmp0, tmp1, tmp2);
4256         // Ra = *++Pa;
4257         // Rb = *--Pb;
4258         mulhu(Rhi_ab, Ra, Rb);
4259         mul(Rlo_ab, Ra, Rb);
4260         acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2);
4261       } bind(dont);
4262     }
4263 
4264     void extra_step_squaring() {
           // Emits the m*n half of a step on its own: accumulates the pending
           // m*n product, starts the next one and advances Pm (up) and Pn
           // (down) while reloading Rm and Rn.  The a*b registers are left
           // untouched.
4265       acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2);  // The pending m*n
4266 
4267       // MACC(Rm, Rn, tmp0, tmp1, tmp2);
4268       // Rm = *++Pm;
4269       // Rn = *--Pn;
4270       mulhu(Rhi_mn, Rm, Rn);
4271       mul(Rlo_mn, Rm, Rn);
4272       addi(Pm, Pm, wordSize);
4273       ld(Rm, Address(Pm));
4274       subi(Pn, Pn, wordSize);
4275       ld(Rn, Address(Pn));
4276     }
4277 
4278     void post1_squaring() {
           // Squaring counterpart of post1(): only the pending m*n product is
           // accumulated here (the a*b products were already handled by
           // step_squaring()/last_squaring()).  The rest is the same: compute
           // m[i] = tmp0 * inv, store it through Pm, add the high half of
           // m[i]*n and shift the accumulator down by one word.
4279       acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2);  // The pending m*n
4280 
4281       // *Pm = Rm = tmp0 * inv;
4282       mul(Rm, tmp0, inv);
4283       sd(Rm, Address(Pm));
4284 
4285       // MACC(Rm, Rn, tmp0, tmp1, tmp2);
4286       // tmp0 = tmp1; tmp1 = tmp2; tmp2 = 0;
4287       mulhu(Rhi_mn, Rm, Rn);
4288 
4289 #ifndef PRODUCT
4290       // assert(m[i] * n[0] + tmp0 == 0, "broken Montgomery multiply");
           // Only non-product builds compute the low half of Rm * Rn, and only
           // to check that it cancels tmp0.
4291       {
4292         mul(Rlo_mn, Rm, Rn);
4293         add(Rlo_mn, tmp0, Rlo_mn);
4294         Label ok;
4295         beqz(Rlo_mn, ok); {
4296           stop("broken Montgomery multiply");
4297         } bind(ok);
4298       }
4299 #endif
4300       // We have very carefully set things up so that
4301       // m[i]*n[0] + tmp0 == 0 (mod b), so we don't have to calculate
4302       // the lower half of Rm * Rn because we know the result already:
4303       // it must be -tmp0.  tmp0 + (-tmp0) must generate a carry iff
4304       // tmp0 != 0.  So, rather than do a mul and a cad we just set
4305       // the carry flag iff tmp0 is nonzero.
4306       //
4307       // mul(Rlo_mn, Rm, Rn);
4308       // cad(zr, tmp0, Rlo_mn);
           // tmp0 - 1 wraps to all ones only when tmp0 == 0, so the unsigned
           // compare (tmp0 - 1) < tmp0 yields 1 exactly when tmp0 != 0.
4309       subi(t0, tmp0, 1);
4310       sltu(t0, t0, tmp0); // Set carry iff tmp0 is nonzero
4311       cadc(tmp0, tmp1, Rhi_mn, t0);
4312       adc(tmp1, tmp2, zr, t0);
4313       mv(tmp2, zr);
4314     }
4315 
4316     // use t0 as carry
4317     void acc(Register Rhi, Register Rlo,
4318              Register tmp0, Register tmp1, Register tmp2) {
           // Emits a three-word accumulate: the two-word value Rhi:Rlo is added
           // into tmp2:tmp1:tmp0, with t0 carrying between the words.
           // NOTE(review): cad/cadc/adc are defined elsewhere; from their use
           // here they look like "add, produce carry", "add with carry in,
           // produce carry" and "add with carry in" respectively; confirm.
4319       cad(tmp0, tmp0, Rlo, t0);
4320       cadc(tmp1, tmp1, Rhi, t0);
4321       adc(tmp2, tmp2, zr, t0);
4322     }
4323 
4324   public:
4325     /**
4326      * Fast Montgomery multiplication.  The derivation of the
4327      * algorithm is in A Cryptographic Library for the Motorola
4328      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
4329      *
4330      * Arguments:
4331      *
4332      * Inputs for multiplication:
4333      *   c_rarg0   - int array elements a
4334      *   c_rarg1   - int array elements b
4335      *   c_rarg2   - int array elements n (the modulus)
4336      *   c_rarg3   - int length
4337      *   c_rarg4   - int inv
4338      *   c_rarg5   - int array elements m (the result)
4339      *
4340      * Inputs for squaring:
4341      *   c_rarg0   - int array elements a
4342      *   c_rarg1   - int array elements n (the modulus)
4343      *   c_rarg2   - int length
4344      *   c_rarg3   - int inv
4345      *   c_rarg4   - int array elements m (the result)
4346      *
4347      */
4348     address generate_multiply() {
           // Emits the Montgomery multiply stub and returns the address of its
           // aligned entry point.  The failure path (argh) is emitted ahead of
           // the entry so it stays out of the fall-through code.
4349       Label argh, nothing;
4350       bind(argh);
4351       stop("MontgomeryMultiply total_allocation must be <= 8192");
4352 
4353       align(CodeEntryAlignment);
4354       address entry = pc();
4355 
           // A zero length returns immediately, before any frame is built.
4356       beqz(Rlen, nothing);
4357 
4358       enter();
4359 
4360       // Make room.
           // Lengths above 512 ints are rejected; otherwise Rlen * 16 bytes
           // (4 * sizeof(jint) per element) are reserved below sp, which is
           // then aligned down to a two-word boundary.  512 * 16 matches the
           // 8192 quoted in the stop message.
4361       mv(Ra, 512);
4362       bgt(Rlen, Ra, argh);
4363       slli(Ra, Rlen, exact_log2(4 * sizeof(jint)));
4364       sub(Ra, sp, Ra);
4365       andi(sp, Ra, -2 * wordSize);
4366 
4367       srliw(Rlen, Rlen, 1);  // length in longwords = len/2
4368 
4369       {
4370         // Copy input args, reversing as we go.  We use Ra as a
4371         // temporary variable.
             // Ra is the running destination; each reverse() leaves the
             // corresponding *_base register pointing at its reversed copy.
4372         reverse(Ra, Pa_base, Rlen, Ri, Rj);
4373         if (!_squaring)
4374           reverse(Ra, Pb_base, Rlen, Ri, Rj);
4375         reverse(Ra, Pn_base, Rlen, Ri, Rj);
4376       }
4377 
4378       // Push all call-saved registers and also Pm_base which we'll need
4379       // at the end.
4380       save_regs();
4381 
4382 #ifndef PRODUCT
4383       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
4384       {
4385         ld(Rn, Address(Pn_base));
4386         mul(Rlo_mn, Rn, inv);
4387         mv(t0, -1);
4388         Label ok;
4389         beq(Rlo_mn, t0, ok);
4390         stop("broken inverse in Montgomery multiply");
4391         bind(ok);
4392       }
4393 #endif
4394 
           // The working result buffer starts where the reversed copies end.
4395       mv(Pm_base, Ra);
4396 
           // Clear the three-word accumulator.
4397       mv(tmp0, zr);
4398       mv(tmp1, zr);
4399       mv(tmp2, zr);
4400 
4401       block_comment("for (int i = 0; i < len; i++) {");
4402       mv(Ri, zr); {
4403         Label loop, end;
             // The loop is bottom-tested; this guard skips it when len == 0.
4404         bge(Ri, Rlen, end);
4405 
4406         bind(loop);
4407         pre1(Ri);
4408 
4409         block_comment("  for (j = i; j; j--) {"); {
4410           mv(Rj, Ri);
4411           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
4412         } block_comment("  } // j");
4413 
4414         post1();
4415         addiw(Ri, Ri, 1);
4416         blt(Ri, Rlen, loop);
4417         bind(end);
4418         block_comment("} // i");
4419       }
4420 
4421       block_comment("for (int i = len; i < 2*len; i++) {");
4422       mv(Ri, Rlen); {
4423         Label loop, end;
             // t0 = 2 * len is the loop bound; it is recomputed before the
             // bottom test because the loop body uses t0 as a carry register.
4424         slli(t0, Rlen, 1);
4425         bge(Ri, t0, end);
4426 
4427         bind(loop);
4428         pre2(Ri, Rlen);
4429 
4430         block_comment("  for (j = len*2-i-1; j; j--) {"); {
4431           slliw(Rj, Rlen, 1);
4432           subw(Rj, Rj, Ri);
4433           subiw(Rj, Rj, 1);
4434           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
4435         } block_comment("  } // j");
4436 
4437         post2(Ri, Rlen);
4438         addiw(Ri, Ri, 1);
4439         slli(t0, Rlen, 1);
4440         blt(Ri, t0, loop);
4441         bind(end);
4442       }
4443       block_comment("} // i");
4444 
4445       normalize(Rlen);
4446 
4447       mv(Ra, Pm_base);  // Save Pm_base in Ra
4448       restore_regs();  // Restore caller's Pm_base
4449 
4450       // Copy our result into caller's Pm_base
           // The words are reversed again on the way out, like the inputs
           // were on the way in.
4451       reverse(Pm_base, Ra, Rlen, Ri, Rj);
4452 
4453       leave();
4454       bind(nothing);
4455       ret();
4456 
4457       return entry;
4458     }
4459 
4460     /**
4461      *
4462      * Arguments:
4463      *
4464      * Inputs:
4465      *   c_rarg0   - int array elements a
4466      *   c_rarg1   - int array elements n (the modulus)
4467      *   c_rarg2   - int length
4468      *   c_rarg3   - int inv
4469      *   c_rarg4   - int array elements m (the result)
4470      *
4471      */
4472     address generate_square() {
           // Emits the Montgomery squaring stub and returns the address of its
           // aligned entry point.  The structure mirrors generate_multiply(),
           // but each outer iteration runs step_squaring() (which accumulates
           // every a*b product twice), then last_squaring(), then the
           // remaining m*n products via extra_step_squaring().
           // NOTE(review): unlike generate_multiply() there is no early exit
           // for a zero length and no debug check of inv here; confirm that
           // is intended.
4473       Label argh;
4474       bind(argh);
4475       stop("MontgomeryMultiply total_allocation must be <= 8192");
4476 
4477       align(CodeEntryAlignment);
4478       address entry = pc();
4479 
4480       enter();
4481 
4482       // Make room.
           // Lengths above 512 ints are rejected; otherwise Rlen * 16 bytes
           // (4 * sizeof(jint) per element) are reserved below sp, which is
           // then aligned down to a two-word boundary.
4483       mv(Ra, 512);
4484       bgt(Rlen, Ra, argh);
4485       slli(Ra, Rlen, exact_log2(4 * sizeof(jint)));
4486       sub(Ra, sp, Ra);
4487       andi(sp, Ra, -2 * wordSize);
4488 
4489       srliw(Rlen, Rlen, 1);  // length in longwords = len/2
4490 
4491       {
4492         // Copy input args, reversing as we go.  We use Ra as a
4493         // temporary variable.
             // Ra is the running destination; each reverse() leaves the
             // corresponding *_base register pointing at its reversed copy.
4494         reverse(Ra, Pa_base, Rlen, Ri, Rj);
4495         reverse(Ra, Pn_base, Rlen, Ri, Rj);
4496       }
4497 
4498       // Push all call-saved registers and also Pm_base which we'll need
4499       // at the end.
4500       save_regs();
4501 
           // The working result buffer starts where the reversed copies end.
4502       mv(Pm_base, Ra);
4503 
           // Clear the three-word accumulator.
4504       mv(tmp0, zr);
4505       mv(tmp1, zr);
4506       mv(tmp2, zr);
4507 
4508       block_comment("for (int i = 0; i < len; i++) {");
4509       mv(Ri, zr); {
4510         Label loop, end;
             // The bound is tested at the top here and again at the bottom.
4511         bind(loop);
4512         bge(Ri, Rlen, end);
4513 
4514         pre1(Ri);
4515 
4516         block_comment("for (j = (i+1)/2; j; j--) {"); {
4517           addi(Rj, Ri, 1);
4518           srliw(Rj, Rj, 1);
4519           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
4520         } block_comment("  } // j");
4521 
4522         last_squaring(Ri);
4523 
4524         block_comment("  for (j = i/2; j; j--) {"); {
4525           srliw(Rj, Ri, 1);
4526           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
4527         } block_comment("  } // j");
4528 
4529         post1_squaring();
4530         addi(Ri, Ri, 1);
4531         blt(Ri, Rlen, loop);
4532 
4533         bind(end);
4534         block_comment("} // i");
4535       }
4536 
4537       block_comment("for (int i = len; i < 2*len; i++) {");
4538       mv(Ri, Rlen); {
4539         Label loop, end;
4540         bind(loop);
             // t0 = 2 * len is the loop bound; it is recomputed before every
             // test because the loop body uses t0 as a carry register.
4541         slli(t0, Rlen, 1);
4542         bge(Ri, t0, end);
4543 
4544         pre2(Ri, Rlen);
4545 
4546         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
4547           slli(Rj, Rlen, 1);
4548           sub(Rj, Rj, Ri);
4549           subi(Rj, Rj, 1);
4550           srliw(Rj, Rj, 1);
4551           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
4552         } block_comment("  } // j");
4553 
4554         last_squaring(Ri);
4555 
4556         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
4557           slli(Rj, Rlen, 1);
4558           sub(Rj, Rj, Ri);
4559           srliw(Rj, Rj, 1);
4560           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
4561         } block_comment("  } // j");
4562 
4563         post2(Ri, Rlen);
4564         addi(Ri, Ri, 1);
4565         slli(t0, Rlen, 1);
4566         blt(Ri, t0, loop);
4567 
4568         bind(end);
4569         block_comment("} // i");
4570       }
4571 
4572       normalize(Rlen);
4573 
4574       mv(Ra, Pm_base);  // Save Pm_base in Ra
4575       restore_regs();  // Restore caller's Pm_base
4576 
4577       // Copy our result into caller's Pm_base
           // The words are reversed again on the way out, like the inputs
           // were on the way in.
4578       reverse(Pm_base, Ra, Rlen, Ri, Rj);
4579 
4580       leave();
4581       ret();
4582 
4583       return entry;
4584     }
4585   };
4586 
4587 #endif // COMPILER2
4588 
4589   address generate_cont_thaw(Continuation::thaw_kind kind) {
         // Emits the thaw sequence for the given kind and returns the address
         // of its first instruction.  The two flags below are evaluated at
         // generation time and decide which instructions are emitted; they
         // are not run-time tests in the stub.
4590     bool return_barrier = Continuation::is_thaw_return_barrier(kind);
4591     bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);
4592 
4593     address start = __ pc();
4594 
         // For the return-barrier kinds sp is reloaded from the thread's
         // cont_entry field; otherwise sp is expected to hold that value
         // already, which the debug check below verifies.
4595     if (return_barrier) {
4596       __ ld(sp, Address(xthread, JavaThread::cont_entry_offset()));
4597     }
4598 
4599 #ifndef PRODUCT
4600     {
4601       Label OK;
4602       __ ld(t0, Address(xthread, JavaThread::cont_entry_offset()));
4603       __ beq(sp, t0, OK);
4604       __ stop("incorrect sp");
4605       __ bind(OK);
4606     }
4607 #endif
4608 
4609     if (return_barrier) {
4610       // preserve possible return value from a method returning to the return barrier
           // f10 and x10 are both spilled, covering a floating-point or an
           // integer/reference result.
4611       __ subi(sp, sp, 2 * wordSize);
4612       __ fsd(f10, Address(sp, 0 * wordSize));
4613       __ sd(x10, Address(sp, 1 * wordSize));
4614     }
4615 
4616     __ mv(c_rarg1, (return_barrier ? 1 : 0));
4617     __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), xthread, c_rarg1);
4618     __ mv(t1, x10); // x10 contains the size of the frames to thaw, 0 if overflow or no more frames
4619 
4620     if (return_barrier) {
4621       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
4622       __ ld(x10, Address(sp, 1 * wordSize));
4623       __ fld(f10, Address(sp, 0 * wordSize));
4624       __ addi(sp, sp, 2 * wordSize);
4625     }
4626 
4627 #ifndef PRODUCT
4628     {
4629       Label OK;
4630       __ ld(t0, Address(xthread, JavaThread::cont_entry_offset()));
4631       __ beq(sp, t0, OK);
4632       __ stop("incorrect sp");
4633       __ bind(OK);
4634     }
4635 #endif
4636 
4637     Label thaw_success;
4638     // t1 contains the size of the frames to thaw, 0 if overflow or no more frames
4639     __ bnez(t1, thaw_success);
4640     __ j(RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry()));
4641     __ bind(thaw_success);
4642 
4643     // make room for the thawed frames
4644     __ sub(t0, sp, t1);
4645     __ andi(sp, t0, -16); // align
4646 
4647     if (return_barrier) {
4648       // save original return value -- again
4649       __ subi(sp, sp, 2 * wordSize);
4650       __ fsd(f10, Address(sp, 0 * wordSize));
4651       __ sd(x10, Address(sp, 1 * wordSize));
4652     }
4653 
4654     // If we want, we can templatize thaw by kind, and have three different entries
4655     __ mv(c_rarg1, kind);
4656 
4657     __ call_VM_leaf(Continuation::thaw_entry(), xthread, c_rarg1);
4658     __ mv(t1, x10); // x10 is the sp of the yielding frame
4659 
4660     if (return_barrier) {
4661       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
4662       __ ld(x10, Address(sp, 1 * wordSize));
4663       __ fld(f10, Address(sp, 0 * wordSize));
4664       __ addi(sp, sp, 2 * wordSize);
4665     } else {
4666       __ mv(x10, zr); // return 0 (success) from doYield
4667     }
4668 
4669     // we're now on the yield frame (which is in an address above us b/c sp has been pushed down)
4670     __ mv(fp, t1);
4671     __ subi(sp, t1, 2 * wordSize); // now pointing to fp spill
4672 
4673     if (return_barrier_exception) {
4674       __ ld(c_rarg1, Address(fp, -1 * wordSize)); // return address
4675       __ verify_oop(x10);
4676       __ mv(x9, x10); // save return value containing the exception oop in callee-saved x9
4677 
4678       __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), xthread, c_rarg1);
4679 
4680       // see OptoRuntime::generate_exception_blob: x10 -- exception oop, x13 -- exception pc
4681 
4682       __ mv(x11, x10); // the exception handler
4683       __ mv(x10, x9); // restore return value containing the exception oop
4684       __ verify_oop(x10);
4685 
           // After leave(), ra is copied into x13 as the exception pc and
           // control transfers to the handler returned by the call above.
4686       __ leave();
4687       __ mv(x13, ra);
4688       __ jr(x11); // the exception handler
4689     } else {
4690       // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
4691       __ leave();
4692       __ ret();
4693     }
4694 
4695     return start;
4696   }
4697 
4698   address generate_cont_thaw() {
         // Generates the stubgen_cont_thaw_id stub (thaw kind thaw_top) and
         // returns its start address, or nullptr without emitting anything
         // when continuations are disabled.
4699     if (!Continuations::enabled()) return nullptr;
4700 
4701     StubId stub_id = StubId::stubgen_cont_thaw_id;
4702     StubCodeMark mark(this, stub_id);
         // start is sampled after the StubCodeMark has been constructed; the
         // address returned by the worker below is not used.
4703     address start = __ pc();
4704     generate_cont_thaw(Continuation::thaw_top);
4705     return start;
4706   }
4707 
4708   address generate_cont_returnBarrier() {
         // Generates the stubgen_cont_returnBarrier_id stub (thaw kind
         // thaw_return_barrier) and returns its start address, or nullptr
         // without emitting anything when continuations are disabled.
4709     if (!Continuations::enabled()) return nullptr;
4710 
4711     // TODO: will probably need multiple return barriers depending on return type
4712     StubId stub_id = StubId::stubgen_cont_returnBarrier_id;
4713     StubCodeMark mark(this, stub_id);
         // start is sampled after the StubCodeMark has been constructed; the
         // address returned by the worker below is not used.
4714     address start = __ pc();
4715 
4716     generate_cont_thaw(Continuation::thaw_return_barrier);
4717 
4718     return start;
4719   }
4720 
4721   address generate_cont_returnBarrier_exception() {
         // Generates the stubgen_cont_returnBarrierExc_id stub (thaw kind
         // thaw_return_barrier_exception) and returns its start address, or
         // nullptr without emitting anything when continuations are disabled.
4722     if (!Continuations::enabled()) return nullptr;
4723 
4724     StubId stub_id = StubId::stubgen_cont_returnBarrierExc_id;
4725     StubCodeMark mark(this, stub_id);
         // start is sampled after the StubCodeMark has been constructed; the
         // address returned by the worker below is not used.
4726     address start = __ pc();
4727 
4728     generate_cont_thaw(Continuation::thaw_return_barrier_exception);
4729 
4730     return start;
4731   }
4732 
4733   address generate_cont_preempt_stub() {
         // Generates the stubgen_cont_preempt_id stub and returns its start
         // address, or nullptr when continuations are disabled.  The stub
         // resets sp to the thread's continuation entry and then either
         // returns through the enterSpecial frame or, if the thread's
         // preemption_cancelled byte is set, clears that byte and jumps to
         // the thaw call pc.
4734     if (!Continuations::enabled()) return nullptr;
4735     StubId stub_id = StubId::stubgen_cont_preempt_id;
4736     StubCodeMark mark(this, stub_id);
4737     address start = __ pc();
4738 
4739     __ reset_last_Java_frame(true);
4740 
4741     // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap.
4742     __ ld(sp, Address(xthread, JavaThread::cont_entry_offset()));
4743 
4744     Label preemption_cancelled;
4745     __ lbu(t0, Address(xthread, JavaThread::preemption_cancelled_offset()));
4746     __ bnez(t0, preemption_cancelled);
4747 
4748     // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount.
4749     SharedRuntime::continuation_enter_cleanup(_masm);
4750     __ leave();
4751     __ ret();
4752 
4753     // We acquired the monitor after freezing the frames so call thaw to continue execution.
4754     __ bind(preemption_cancelled);
4755     __ sb(zr, Address(xthread, JavaThread::preemption_cancelled_offset()));
         // fp is set to sp + ContinuationEntry::size() + two words before
         // jumping.  The jump target is read from memory at run time, not
         // baked into the stub as an immediate.
4756     __ la(fp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size() + 2 * wordSize)));
4757     __ la(t1, ExternalAddress(ContinuationEntry::thaw_call_pc_address()));
4758     __ ld(t1, Address(t1));
4759     __ jr(t1);
4760 
4761     return start;
4762   }
4763 
4764 #ifdef COMPILER2
4765 
4766 #undef __
4767 #define __ this->
4768 
4769   class Sha2Generator : public MacroAssembler {
4770     StubCodeGenerator* _cgen;
4771    public:
4772       Sha2Generator(MacroAssembler* masm, StubCodeGenerator* cgen) : MacroAssembler(masm->code()), _cgen(cgen) {} // Builds this assembler from masm->code() and remembers cgen in _cgen.
4773       address generate_sha256_implCompress(StubId stub_id) {
             // SHA-256 entry point: delegates to the shared generator with
             // the e32 element width.
4774         return generate_sha2_implCompress(Assembler::e32, stub_id);
4775       }
4776       address generate_sha512_implCompress(StubId stub_id) {
             // SHA-512 entry point: delegates to the shared generator with
             // the e64 element width.
4777         return generate_sha2_implCompress(Assembler::e64, stub_id);
4778       }
4779    private:
4780 
4781     void vleXX_v(Assembler::SEW vset_sew, VectorRegister vr, Register sr) {
           // Emits vle32_v for e32 and vle64_v for every other element width;
           // vr and sr are passed straight through to the chosen emitter.
4782       if (vset_sew == Assembler::e32) __ vle32_v(vr, sr);
4783       else                            __ vle64_v(vr, sr);
4784     }
4785 
4786     void vseXX_v(Assembler::SEW vset_sew, VectorRegister vr, Register sr) {
           // Emits vse32_v for e32 and vse64_v for every other element width;
           // vr and sr are passed straight through to the chosen emitter.
4787       if (vset_sew == Assembler::e32) __ vse32_v(vr, sr);
4788       else                            __ vse64_v(vr, sr);
4789     }
4790 
4791     // Overview of the logic in each "quad round".
4792     //
4793     // The code below repeats 16/20 times the logic implementing four rounds
4794     // of the SHA-256/512 core loop as documented by NIST. 16/20 "quad rounds"
4795     // are needed to implement the 64/80 single rounds.
4796     //
4797     //    // Load four word (u32/64) constants (K[t+3], K[t+2], K[t+1], K[t+0])
4798     //    // Output:
4799     //    //   vTmp1 = {K[t+3], K[t+2], K[t+1], K[t+0]}
4800     //    vl1reXX.v vTmp1, ofs
4801     //
4802     //    // Increment word constant address by stride (16/32 bytes, 4*4B/8B, 128b/256b)
4803     //    addi ofs, ofs, 16/32
4804     //
4805     //    // Add constants to message schedule words:
4806     //    //  Input
4807     //    //    vTmp1 = {K[t+3], K[t+2], K[t+1], K[t+0]}
4808     //    //    vW0 = {W[t+3], W[t+2], W[t+1], W[t+0]}; // Vt0 = W[3:0];
4809     //    //  Output
4810     //    //    vTmp0 = {W[t+3]+K[t+3], W[t+2]+K[t+2], W[t+1]+K[t+1], W[t+0]+K[t+0]}
4811     //    vadd.vv vTmp0, vTmp1, vW0
4812     //
4813     //    //  2 rounds of working variables updates.
4814     //    //     vState1[t+4] <- vState1[t], vState0[t], vTmp0[t]
4815     //    //  Input:
4816     //    //    vState1 = {c[t],d[t],g[t],h[t]}   " = vState1[t] "
4817     //    //    vState0 = {a[t],b[t],e[t],f[t]}
4818     //    //    vTmp0 = {W[t+3]+K[t+3], W[t+2]+K[t+2], W[t+1]+K[t+1], W[t+0]+K[t+0]}
4819     //    //  Output:
4820     //    //    vState1 = {f[t+2],e[t+2],b[t+2],a[t+2]}  " = vState0[t+2] "
4821     //    //        = {h[t+4],g[t+4],d[t+4],c[t+4]}  " = vState1[t+4] "
4822     //    vsha2cl.vv vState1, vState0, vTmp0
4823     //
4824     //    //  2 rounds of working variables updates.
4825     //    //     vState0[t+4] <- vState0[t], vState0[t+2], vTmp0[t]
4826     //    //  Input
4827     //    //   vState0 = {a[t],b[t],e[t],f[t]}       " = vState0[t] "
4828     //    //       = {h[t+2],g[t+2],d[t+2],c[t+2]}   " = vState1[t+2] "
4829     //    //   vState1 = {f[t+2],e[t+2],b[t+2],a[t+2]}   " = vState0[t+2] "
4830     //    //   vTmp0 = {W[t+3]+K[t+3], W[t+2]+K[t+2], W[t+1]+K[t+1], W[t+0]+K[t+0]}
4831     //    //  Output:
4832     //    //   vState0 = {f[t+4],e[t+4],b[t+4],a[t+4]}   " = vState0[t+4] "
4833     //    vsha2ch.vv vState0, vState1, vTmp0
4834     //
4835     //    // Combine 2QW into 1QW
4836     //    //
4837     //    // To generate the next 4 words, "new_vW0"/"vTmp0" from vW0-vW3, vsha2ms needs
4838     //    //     vW0[0..3], vW1[0], vW2[1..3], vW3[0, 2..3]
4839     //    // and it can only take 3 vectors as inputs. Hence we need to combine
4840     //    // vW1[0] and vW2[1..3] in a single vector.
4841     //    //
4842     //    // vmerge Vt4, Vt1, Vt2, V0
4843     //    // Input
4844     //    //  V0 = mask // first word from vW1, 1..3 words from vW2
4845     //    //  vW2 = {Wt-8, Wt-7, Wt-6, Wt-5}
4846     //    //  vW1 = {Wt-12, Wt-11, Wt-10, Wt-9}
4847     //    // Output
4848     //    //  Vt4 = {Wt-12, Wt-7, Wt-6, Wt-5}
4849     //    vmerge.vvm vTmp0, vW2, vW1, v0
4850     //
4851     //    // Generate next Four Message Schedule Words (hence allowing for 4 more rounds)
4852     //    // Input
4853     //    //  vW0 = {W[t+ 3], W[t+ 2], W[t+ 1], W[t+ 0]}     W[ 3: 0]
4854     //    //  vW3 = {W[t+15], W[t+14], W[t+13], W[t+12]}     W[15:12]
4855     //    //  vTmp0 = {W[t+11], W[t+10], W[t+ 9], W[t+ 4]}     W[11: 9,4]
4856     //    // Output (next four message schedule words)
4857     //    //  vW0 = {W[t+19],  W[t+18],  W[t+17],  W[t+16]}  W[19:16]
4858     //    vsha2ms.vv vW0, vTmp0, vW3
4859     //
4860     // BEFORE
4861     //  vW0 - vW3 hold the message schedule words (initially the block words)
4862     //    vW0 = W[ 3: 0]   "oldest"
4863     //    vW1 = W[ 7: 4]
4864     //    vW2 = W[11: 8]
4865     //    vW3 = W[15:12]   "newest"
4866     //
4867     //  vState0 and vState1 hold the working state variables
4868     //    vState0 = {a[t],b[t],e[t],f[t]}   // initially {H5,H4,H1,H0}
4869     //    vState1 = {c[t],d[t],g[t],h[t]}   // initially {H7,H6,H3,H2}
4870     //
4871     // AFTER
4872     //  vW0 - vW3 hold the message schedule words (initially the block words)
4873     //    vW1 = W[ 7: 4]   "oldest"
4874     //    vW2 = W[11: 8]
4875     //    vW3 = W[15:12]
4876     //    vW0 = W[19:16]   "newest"
4877     //
4878     //  vState0 and vState1 hold the working state variables
4879     //    vState0 = {a[t+4],b[t+4],e[t+4],f[t+4]}
4880     //    vState1 = {c[t+4],d[t+4],g[t+4],h[t+4]}
4881     //
4882     //  The group of vectors vW0,vW1,vW2,vW3 is "rotated" by one in each quad-round,
4883     //  hence the uses of those vectors rotate in each round, and we get back to the
4884     //  initial configuration every 4 quad-rounds. We could avoid those changes at
4885     //  the cost of moving those vectors at the end of each quad-rounds.
4886     void sha2_quad_round(Assembler::SEW vset_sew, VectorRegister rot1, VectorRegister rot2, VectorRegister rot3, VectorRegister rot4,
4887                          Register scalarconst, VectorRegister vtemp, VectorRegister vtemp2, VectorRegister v_abef, VectorRegister v_cdgh,
4888                          bool gen_words = true, bool step_const = true) {
           // Emits one quad round.
           //   vset_sew      element width; selects the load width and the
           //                 constant stride (16 bytes for e32, else 32)
           //   rot1..rot4    message-schedule vectors in their current
           //                 rotation; rot1 is added to the constants and,
           //                 when gen_words is set, overwritten with the next
           //                 four schedule words
           //   scalarconst   address the round constants are loaded from
           //   vtemp, vtemp2 scratch vectors
           //   v_abef/v_cdgh the two working-state vectors, both updated
           //   gen_words     false skips the vmerge/vsha2ms pair
           //   step_const    false leaves scalarconst unchanged
4889       __ vleXX_v(vset_sew, vtemp, scalarconst);
4890       if (step_const) {
4891         __ addi(scalarconst, scalarconst, vset_sew == Assembler::e32 ? 16 : 32);
4892       }
4893       __ vadd_vv(vtemp2, vtemp, rot1);
4894       __ vsha2cl_vv(v_cdgh, v_abef, vtemp2);
4895       __ vsha2ch_vv(v_abef, v_cdgh, vtemp2);
4896       if (gen_words) {
             // NOTE(review): no mask register is named here; the merge
             // presumably relies on a mask the caller has already placed
             // in v0; confirm.
4897         __ vmerge_vvm(vtemp2, rot3, rot2);
4898         __ vsha2ms_vv(rot1, vtemp2, rot4);
4899       }
4900     }
4901 
4902     // Arguments:
4903     //
4904     // Inputs:
4905     //   c_rarg0   - byte[]  source+offset
4906     //   c_rarg1   - int[]   SHA.state
4907     //   c_rarg2   - int     offset
4908     //   c_rarg3   - int     limit
4909     //
4910     address generate_sha2_implCompress(Assembler::SEW vset_sew, StubId stub_id) {
           // Emits the SHA-2 block-compression stub selected by stub_id and returns
           // its entry address. vset_sew selects the variant (e32 -> SHA-256 tables
           // and 32-bit elements, e64 -> SHA-512 tables and 64-bit elements); the
           // asserts in the switch below check that it matches stub_id.
4911       alignas(64) static const uint32_t round_consts_256[64] = {
4912         0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
4913         0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
4914         0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
4915         0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
4916         0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
4917         0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
4918         0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
4919         0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
4920         0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
4921         0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
4922         0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
4923         0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
4924         0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
4925         0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
4926         0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
4927         0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
4928       };
4929       alignas(64) static const uint64_t round_consts_512[80] = {
4930         0x428a2f98d728ae22l, 0x7137449123ef65cdl, 0xb5c0fbcfec4d3b2fl,
4931         0xe9b5dba58189dbbcl, 0x3956c25bf348b538l, 0x59f111f1b605d019l,
4932         0x923f82a4af194f9bl, 0xab1c5ed5da6d8118l, 0xd807aa98a3030242l,
4933         0x12835b0145706fbel, 0x243185be4ee4b28cl, 0x550c7dc3d5ffb4e2l,
4934         0x72be5d74f27b896fl, 0x80deb1fe3b1696b1l, 0x9bdc06a725c71235l,
4935         0xc19bf174cf692694l, 0xe49b69c19ef14ad2l, 0xefbe4786384f25e3l,
4936         0x0fc19dc68b8cd5b5l, 0x240ca1cc77ac9c65l, 0x2de92c6f592b0275l,
4937         0x4a7484aa6ea6e483l, 0x5cb0a9dcbd41fbd4l, 0x76f988da831153b5l,
4938         0x983e5152ee66dfabl, 0xa831c66d2db43210l, 0xb00327c898fb213fl,
4939         0xbf597fc7beef0ee4l, 0xc6e00bf33da88fc2l, 0xd5a79147930aa725l,
4940         0x06ca6351e003826fl, 0x142929670a0e6e70l, 0x27b70a8546d22ffcl,
4941         0x2e1b21385c26c926l, 0x4d2c6dfc5ac42aedl, 0x53380d139d95b3dfl,
4942         0x650a73548baf63del, 0x766a0abb3c77b2a8l, 0x81c2c92e47edaee6l,
4943         0x92722c851482353bl, 0xa2bfe8a14cf10364l, 0xa81a664bbc423001l,
4944         0xc24b8b70d0f89791l, 0xc76c51a30654be30l, 0xd192e819d6ef5218l,
4945         0xd69906245565a910l, 0xf40e35855771202al, 0x106aa07032bbd1b8l,
4946         0x19a4c116b8d2d0c8l, 0x1e376c085141ab53l, 0x2748774cdf8eeb99l,
4947         0x34b0bcb5e19b48a8l, 0x391c0cb3c5c95a63l, 0x4ed8aa4ae3418acbl,
4948         0x5b9cca4f7763e373l, 0x682e6ff3d6b2b8a3l, 0x748f82ee5defb2fcl,
4949         0x78a5636f43172f60l, 0x84c87814a1f0ab72l, 0x8cc702081a6439ecl,
4950         0x90befffa23631e28l, 0xa4506cebde82bde9l, 0xbef9a3f7b2c67915l,
4951         0xc67178f2e372532bl, 0xca273eceea26619cl, 0xd186b8c721c0c207l,
4952         0xeada7dd6cde0eb1el, 0xf57d4f7fee6ed178l, 0x06f067aa72176fbal,
4953         0x0a637dc5a2c898a6l, 0x113f9804bef90dael, 0x1b710b35131c471bl,
4954         0x28db77f523047d84l, 0x32caab7b40c72493l, 0x3c9ebe0a15c9bebcl,
4955         0x431d67c49c100d4cl, 0x4cc5d4becb3e42b6l, 0x597f299cfc657e2al,
4956         0x5fcb6fab3ad6faecl, 0x6c44198c4a475817l
4957       };
4958       const int const_add = vset_sew == Assembler::e32 ? 16 : 32; // bytes in one 4-element group: 4*4 (e32) or 4*8 (e64)
4959 
4960       bool multi_block;
4961       switch (stub_id) {
4962       case StubId::stubgen_sha256_implCompress_id:
4963         assert (vset_sew == Assembler::e32, "wrong macroassembler for stub");
4964         multi_block = false;
4965         break;
4966       case StubId::stubgen_sha256_implCompressMB_id:
4967         assert (vset_sew == Assembler::e32, "wrong macroassembler for stub");
4968         multi_block = true;
4969         break;
4970       case StubId::stubgen_sha512_implCompress_id:
4971         assert (vset_sew == Assembler::e64, "wrong macroassembler for stub");
4972         multi_block = false;
4973         break;
4974       case StubId::stubgen_sha512_implCompressMB_id:
4975         assert (vset_sew == Assembler::e64, "wrong macroassembler for stub");
4976         multi_block = true;
4977         break;
4978       default:
4979         ShouldNotReachHere();
4980       };
4981       __ align(CodeEntryAlignment);
4982       StubCodeMark mark(_cgen, stub_id);
4983       address start = __ pc();
4984 
4985       Register buf   = c_rarg0;
4986       Register state = c_rarg1;
4987       Register ofs   = c_rarg2;
4988       Register limit = c_rarg3;
4989       Register consts =  t2; // caller saved
4990       Register state_c = x28; // caller saved
4991       VectorRegister vindex = v2;
4992       VectorRegister vW0 = v4;
4993       VectorRegister vW1 = v6;
4994       VectorRegister vW2 = v8;
4995       VectorRegister vW3 = v10;
4996       VectorRegister vState0 = v12;
4997       VectorRegister vState1 = v14;
4998       VectorRegister vHash0  = v16;
4999       VectorRegister vHash1  = v18;
5000       VectorRegister vTmp0   = v20;
5001       VectorRegister vTmp1   = v22;
5002 
5003       Label multi_block_loop;
5004 
5005       __ enter();
5006 
5007       address constant_table = vset_sew == Assembler::e32 ? (address)round_consts_256 : (address)round_consts_512;
5008       la(consts, ExternalAddress(constant_table));
5009 
5010       // Register use in this function:
5011       //
5012       // VECTORS
5013       //  vW0 - vW3 (512/1024-bits / 4*128/256 bits / 4*4*32/64 bits), hold the message
5014       //             schedule words (Wt). They start with the message block
5015       //             content (W0 to W15), then further words in the message
5016       //             schedule generated via vsha2ms from previous Wt.
5017       //   Initially:
5018       //     vW0 = W[  3:0] = { W3,  W2,  W1,  W0}
5019       //     vW1 = W[  7:4] = { W7,  W6,  W5,  W4}
5020       //     vW2 = W[ 11:8] = {W11, W10,  W9,  W8}
5021       //     vW3 = W[15:12] = {W15, W14, W13, W12}
5022       //
5023       //  vState0 - vState1 hold the working state variables (a, b, ..., h)
5024       //    vState0 = {f[t],e[t],b[t],a[t]}
5025       //    vState1 = {h[t],g[t],d[t],c[t]}
5026       //   Initially:
5027       //    vState0 = {H5i-1, H4i-1, H1i-1 , H0i-1}
5028       //    vState1 = {H7i-1, H6i-1, H3i-1 , H2i-1}
5029       //
5030       //  v0 = masks for vrgather/vmerge. Single value during the 16 rounds.
5031       //
5032       //  vTmp0 = temporary, Wt+Kt
5033       //  vTmp1 = temporary, Kt
5034       //
5035       //  vHash0/vHash1 = hold the initial values of the hash, byte-swapped.
5036       //
5037       // During most of the function the vector state is configured so that each
5038       // vector is interpreted as containing four 32/64 bits (e32/e64) elements (128/256 bits).
5039 
5040       // vsha2ch/vsha2cl uses EGW of 4*SEW.
5041       // SHA256 SEW = e32, EGW = 128-bits
5042       // SHA512 SEW = e64, EGW = 256-bits
5043       //
5044       // VLEN is required to be at least 128.
5045       // For the case of VLEN=128 and SHA512 we need LMUL=2 to work with 4*e64 (EGW = 256)
5046       //
5047       // m1/m2: LMUL=1 or LMUL=2 (m2 only for the VLEN=128 SHA512 case above)
5048       // ta: tail agnostic (don't care about those lanes)
5049       // ma: mask agnostic (don't care about those lanes)
5050       // x0 is not written, we know the number of vector elements.
5051 
5052       if (vset_sew == Assembler::e64 && MaxVectorSize == 16) { // SHA512 and VLEN = 128
5053         __ vsetivli(x0, 4, vset_sew, Assembler::m2, Assembler::ma, Assembler::ta);
5054       } else {
5055         __ vsetivli(x0, 4, vset_sew, Assembler::m1, Assembler::ma, Assembler::ta);
5056       }
5057 
           // Four byte offsets packed one per byte, least-significant byte first:
           // 0x14,0x10,0x04,0x00 (e32) or 0x28,0x20,0x08,0x00 (e64). They are used as
           // the 8-bit indices of the indexed load/store of the state below.
5058       int64_t indexes = vset_sew == Assembler::e32 ? 0x00041014ul : 0x00082028ul;
5059       __ li(t0, indexes);
5060       __ vmv_v_x(vindex, t0);
5061 
5062       // Step-over a,b, so we are pointing to c.
5063       // const_add is equal to 4x state variable, div by 2 is thus 2, a,b
5064       __ addi(state_c, state, const_add/2);
5065 
5066       // Use index-load to get {f,e,b,a},{h,g,d,c}
5067       __ vluxei8_v(vState0, state, vindex);
5068       __ vluxei8_v(vState1, state_c, vindex);
5069 
           // The state is loaded once, before the loop label: in multi-block mode it
           // stays in vState0/vState1 across iterations and is stored only at the end.
5070       __ bind(multi_block_loop);
5071 
5072       // Capture the initial H values in vHash0 and vHash1 to allow for computing
5073       // the resulting H', since H' = H+{a',b',c',...,h'}.
5074       __ vmv_v_v(vHash0, vState0);
5075       __ vmv_v_v(vHash1, vState1);
5076 
5077       // Load the 512/1024-bits of the message block in vW0-vW3 and perform
5078       // an endian swap on each 4/8 bytes element.
5079       //
5080       // If Zvkb is not implemented one can use vrgather
5081       // with an index sequence to byte-swap.
5082       //  sequence = [3 2 1 0   7 6 5 4  11 10 9 8   15 14 13 12]
5083       //   <https://oeis.org/A004444> gives us "N ^ 3" as a nice formula to generate
5084       //  this sequence. 'vid' gives us the N.
5085       __ vleXX_v(vset_sew, vW0, buf);
5086       __ vrev8_v(vW0, vW0);
5087       __ addi(buf, buf, const_add);
5088       __ vleXX_v(vset_sew, vW1, buf);
5089       __ vrev8_v(vW1, vW1);
5090       __ addi(buf, buf, const_add);
5091       __ vleXX_v(vset_sew, vW2, buf);
5092       __ vrev8_v(vW2, vW2);
5093       __ addi(buf, buf, const_add);
5094       __ vleXX_v(vset_sew, vW3, buf);
5095       __ vrev8_v(vW3, vW3);
5096       __ addi(buf, buf, const_add); // buf now points at the next message block
5097 
5098       // Set v0 up for the vmerge that replaces the first word (idx==0)
5099       __ vid_v(v0);
5100       __ vmseq_vi(v0, v0, 0x0);  // v0.mask[i] = (i == 0 ? 1 : 0)
5101 
5102       VectorRegister rotation_regs[] = {vW0, vW1, vW2, vW3};
5103       int rot_pos = 0;
5104       // Quad-round #0 (+0, vW0->vW1->vW2->vW3) ... #11 (+3, vW3->vW0->vW1->vW2)
           // (numbering above is for SHA-256; for SHA-512 qr_end is 16, i.e. #0 ... #15)
5105       const int qr_end = vset_sew == Assembler::e32 ? 12 : 16;
5106       for (int i = 0; i < qr_end; i++) {
5107         sha2_quad_round(vset_sew,
5108                    rotation_regs[(rot_pos + 0) & 0x3],
5109                    rotation_regs[(rot_pos + 1) & 0x3],
5110                    rotation_regs[(rot_pos + 2) & 0x3],
5111                    rotation_regs[(rot_pos + 3) & 0x3],
5112                    consts,
5113                    vTmp1, vTmp0, vState0, vState1);
5114         ++rot_pos;
5115       }
5116       // Quad-round #12 (+0, vW0->vW1->vW2->vW3) ... #15 (+3, vW3->vW0->vW1->vW2)
5117       // Note that we stop generating new message schedule words (Wt, vW0-13)
5118       // as we already generated all the words we end up consuming (i.e., W[63:60]).
           // (SHA-256 numbering; for SHA-512 these are the last four of 20 quad-rounds.)
           // The two trailing arguments are false and "not the last quad-round";
           // NOTE(review): sha2_quad_round is not visible here - presumably they
           // disable message-schedule generation and the final constant-pointer advance.
5119       const int qr_c_end = qr_end + 4;
5120       for (int i = qr_end; i < qr_c_end; i++) {
5121         sha2_quad_round(vset_sew,
5122                    rotation_regs[(rot_pos + 0) & 0x3],
5123                    rotation_regs[(rot_pos + 1) & 0x3],
5124                    rotation_regs[(rot_pos + 2) & 0x3],
5125                    rotation_regs[(rot_pos + 3) & 0x3],
5126                    consts,
5127                    vTmp1, vTmp0, vState0, vState1, false, i < (qr_c_end-1));
5128         ++rot_pos;
5129       }
5130 
5131       //--------------------------------------------------------------------------------
5132       // Compute the updated hash value H'
5133       //   H' = H + {h',g',...,b',a'}
5134       //      = {h,g,...,b,a} + {h',g',...,b',a'}
5135       //      = {h+h',g+g',...,b+b',a+a'}
5136 
5137       // H' = H+{a',b',c',...,h'}
5138       __ vadd_vv(vState0, vHash0, vState0);
5139       __ vadd_vv(vState1, vHash1, vState1);
5140 
5141       if (multi_block) {
             // 240 = 15 * 16 and 608 = 19 * 32, i.e. (quad-rounds - 1) * const_add.
             // NOTE(review): presumably this rewinds consts to the start of the table
             // after the advances done inside sha2_quad_round - confirm there.
5142         int total_adds = vset_sew == Assembler::e32 ? 240 : 608;
5143         __ subi(consts, consts, total_adds);
5144         __ addi(ofs, ofs, vset_sew == Assembler::e32 ? 64 : 128); // one message block: 64 or 128 bytes
5145         __ ble(ofs, limit, multi_block_loop); // keep going while ofs <= limit
5146         __ mv(c_rarg0, ofs); // return ofs
5147       }
5148 
5149       // Store H[0..7] = {a,b,c,d,e,f,g,h} from
5150       //  vState0 = {f,e,b,a}
5151       //  vState1 = {h,g,d,c}
5152       __ vsuxei8_v(vState0, state,   vindex);
5153       __ vsuxei8_v(vState1, state_c, vindex);
5154 
5155       __ leave();
5156       __ ret();
5157 
5158       return start;
5159     }
5160   };
5161 
5162 #undef __
5163 #define __ _masm->
5164 
5165   // Set of L 64-bit registers that mirror a contiguous memory area of 8 * L
5166   // bytes. Each 64-bit register holds two consecutive 32-bit integers: the
       // even-indexed one in its low half, the odd-indexed one in its high half.
5167   template <uint L>
5168   class RegCache {
5169   private:
5170     MacroAssembler *_masm;
5171     Register _regs[L];
5172 
5173   public:
         // Takes exactly L registers from rs; they are assigned to cache slots
         // 0..L-1 in the iteration order of the RegSet.
5174     RegCache(MacroAssembler *masm, RegSet rs): _masm(masm) {
5175       assert(rs.size() == L, "%u registers are used to cache %u 4-byte data", rs.size(), 2 * L);
5176       auto it = rs.begin();
5177       for (auto &r: _regs) {
5178         r = *it;
5179         ++it;
5180       }
5181     }
5182 
5183     // generate load for the i'th register: an 8-byte ld from base + 8 * i
5184     void gen_load(uint i, Register base) {
5185       assert(i < L, "invalid i: %u", i);
5186       __ ld(_regs[i], Address(base, 8 * i));
5187     }
5188 
5189     // add i'th 32-bit integer to dest (i counts 32-bit words, so i < 2 * L).
         // For odd i, rtmp is overwritten with the shifted-down upper half.
5190     void add_u32(const Register dest, uint i, const Register rtmp = t0) {
5191       assert(i < 2 * L, "invalid i: %u", i);
5192 
5193       if (is_even(i)) {
5194         // Use the bottom 32 bits. No need to mask off the top 32 bits
5195         // as addw will do the right thing.
5196         __ addw(dest, dest, _regs[i / 2]);
5197       } else {
5198         // Use the top 32 bits by right-shifting them.
5199         __ srli(rtmp, _regs[i / 2], 32);
5200         __ addw(dest, dest, rtmp);
5201       }
5202     }
5203   };
5204 
5205   typedef RegCache<8> BufRegCache; // 8 registers = 16 32-bit words = 64 bytes
5206 
5207   // Common tail of the four MD5 step functions:
5208   //   a += value + x + ac;
       //   a = Integer.rotateLeft(a, s) + b;
       // where x is the k'th 32-bit word held in reg_cache, ac is the constant t
       // and value is the already computed result of the round function.
       // c and d are accepted for a uniform signature but are not used here.
5209   void m5_FF_GG_HH_II_epilogue(BufRegCache& reg_cache,
5210                                Register a, Register b, Register c, Register d,
5211                                int k, int s, int t,
5212                                Register value) {
5213     // a += ac
         // (t1 is handed to the immediate form of addw - presumably as scratch
         // for materializing the 32-bit constant; TODO confirm in MacroAssembler)
5214     __ addw(a, a, t, t1);
5215 
5216     // a += x;
         // (uses add_u32's default temporary, t0, when k is odd)
5217     reg_cache.add_u32(a, k);
5218     // a += value;
5219     __ addw(a, a, value);
5220 
5221     // a = Integer.rotateLeft(a, s) + b;
5222     __ rolw(a, a, s);
5223     __ addw(a, a, b);
5224   }
5225 
5226   // MD5 round-1 step:
5227   //   a += ((b & c) | ((~b) & d)) + x + ac;
       //   a = Integer.rotateLeft(a, s) + b;
       // k selects the message word x in reg_cache, s is the rotate amount and
       // t is the additive constant ac. rtmp1 and rtmp2 are overwritten.
5228   void md5_FF(BufRegCache& reg_cache,
5229               Register a, Register b, Register c, Register d,
5230               int k, int s, int t,
5231               Register rtmp1, Register rtmp2) {
5232     // rtmp1 = b & c
5233     __ andr(rtmp1, b, c);
5234 
5235     // rtmp2 = (~b) & d
5236     __ andn(rtmp2, d, b);
5237 
5238     // rtmp1 = (b & c) | ((~b) & d)
5239     __ orr(rtmp1, rtmp1, rtmp2);
5240 
5241     m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, rtmp1);
5242   }
5243 
5244   // MD5 round-2 step:
5245   //   a += ((b & d) | (c & (~d))) + x + ac;
       //   a = Integer.rotateLeft(a, s) + b;
       // k selects the message word x in reg_cache, s is the rotate amount and
       // t is the additive constant ac. rtmp1 and rtmp2 are overwritten.
5246   void md5_GG(BufRegCache& reg_cache,
5247               Register a, Register b, Register c, Register d,
5248               int k, int s, int t,
5249               Register rtmp1, Register rtmp2) {
5250     // rtmp1 = b & d
5251     __ andr(rtmp1, b, d);
5252 
5253     // rtmp2 = c & (~d)
5254     __ andn(rtmp2, c, d);
5255 
5256     // rtmp1 = (b & d) | (c & (~d))
5257     __ orr(rtmp1, rtmp1, rtmp2);
5258 
5259     m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, rtmp1);
5260   }
5261 
5262   // MD5 round-3 step:
5263   //   a += ((b ^ c) ^ d) + x + ac;
       //   a = Integer.rotateLeft(a, s) + b;
       // k selects the message word x in reg_cache, s is the rotate amount and
       // t is the additive constant ac. rtmp1 and rtmp2 are overwritten.
5264   void md5_HH(BufRegCache& reg_cache,
5265               Register a, Register b, Register c, Register d,
5266               int k, int s, int t,
5267               Register rtmp1, Register rtmp2) {
5268     // rtmp2 = b ^ c, then rtmp1 = (b ^ c) ^ d
5269     __ xorr(rtmp2, b, c);
5270     __ xorr(rtmp1, rtmp2, d);
5271 
5272     m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, rtmp1);
5273   }
5274 
5275   // MD5 round-4 step:
5276   //   a += (c ^ (b | (~d))) + x + ac;
       //   a = Integer.rotateLeft(a, s) + b;
       // k selects the message word x in reg_cache, s is the rotate amount and
       // t is the additive constant ac. rtmp1 and rtmp2 are overwritten.
5277   void md5_II(BufRegCache& reg_cache,
5278               Register a, Register b, Register c, Register d,
5279               int k, int s, int t,
5280               Register rtmp1, Register rtmp2) {
5281     // rtmp2 = b | (~d), then rtmp1 = c ^ (b | (~d))
5282     __ orn(rtmp2, b, d);
5283     __ xorr(rtmp1, c, rtmp2);
5284 
5285     m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, rtmp1);
5286   }
5287 
5288   // Arguments:
5289   //
5290   // Inputs:
5291   //   c_rarg0   - byte[]  source+offset
5292   //   c_rarg1   - int[]   state (the four 32-bit MD5 state words)
5293   //   c_rarg2   - int     offset  (multi_block == True)
5294   //   c_rarg3   - int     limit   (multi_block == True)
5295   //
       // Output:
       //   c_rarg0   - int     updated offset  (multi_block == True)
       //
5296   // Registers:
5297   //    x0   zero  (zero)
5298   //    x1     ra  (return address)
5299   //    x2     sp  (stack pointer)
5300   //    x3     gp  (global pointer)
5301   //    x4     tp  (thread pointer)
5302   //    x5     t0  (tmp register)
5303   //    x6     t1  (tmp register)
5304   //    x7     t2  state0
5305   //    x8  fp/s0  (frame pointer)
5306   //    x9     s1
5307   //   x10     a0  rtmp1 / c_rarg0
5308   //   x11     a1  rtmp2 / c_rarg1
5309   //   x12     a2  a     / c_rarg2
5310   //   x13     a3  b     / c_rarg3
5311   //   x14     a4  c
5312   //   x15     a5  d
5313   //   x16     a6  buf
5314   //   x17     a7  state
5315   //   x18     s2  ofs     [saved-reg]  (multi_block == True)
5316   //   x19     s3  limit   [saved-reg]  (multi_block == True)
5317   //   x20     s4  state1  [saved-reg]
5318   //   x21     s5  state2  [saved-reg]
5319   //   x22     s6  state3  [saved-reg]
5320   //   x23     s7
5321   //   x24     s8  buf0    [saved-reg]
5322   //   x25     s9  buf1    [saved-reg]
5323   //   x26    s10  buf2    [saved-reg]
5324   //   x27    s11  buf3    [saved-reg]
5325   //   x28     t3  buf4
5326   //   x29     t4  buf5
5327   //   x30     t5  buf6
5328   //   x31     t6  buf7
5329   address generate_md5_implCompress(StubId stub_id) {
         // Emits the MD5 compression stub (single-block or multi-block, chosen by
         // stub_id) and returns its entry address.
5330     __ align(CodeEntryAlignment);
5331     bool multi_block;
5332     switch (stub_id) {
5333     case StubId::stubgen_md5_implCompress_id:
5334       multi_block = false;
5335       break;
5336     case StubId::stubgen_md5_implCompressMB_id:
5337       multi_block = true;
5338       break;
5339     default:
5340       ShouldNotReachHere();
5341     };
5342     StubCodeMark mark(this, stub_id);
5343     address start = __ pc();
5344 
5345     // rotation constants (Sxy: round x, position y within each group of four steps)
5346     const int S11 = 7;
5347     const int S12 = 12;
5348     const int S13 = 17;
5349     const int S14 = 22;
5350     const int S21 = 5;
5351     const int S22 = 9;
5352     const int S23 = 14;
5353     const int S24 = 20;
5354     const int S31 = 4;
5355     const int S32 = 11;
5356     const int S33 = 16;
5357     const int S34 = 23;
5358     const int S41 = 6;
5359     const int S42 = 10;
5360     const int S43 = 15;
5361     const int S44 = 21;
5362 
5363     const int64_t mask32 = 0xffffffff; // keeps the low 32 bits when the state is packed for write-back
5364 
5365     Register buf_arg   = c_rarg0; // a0
5366     Register state_arg = c_rarg1; // a1
5367     Register ofs_arg   = c_rarg2; // a2
5368     Register limit_arg = c_rarg3; // a3
5369 
5370     // we'll copy the args to these registers to free up a0-a3
5371     // to use for other values manipulated by instructions
5372     // that can be compressed
5373     Register buf       = x16; // a6
5374     Register state     = x17; // a7
5375     Register ofs       = x18; // s2
5376     Register limit     = x19; // s3
5377 
5378     // using x12->15 to allow compressed instructions
5379     Register a         = x12; // a2
5380     Register b         = x13; // a3
5381     Register c         = x14; // a4
5382     Register d         = x15; // a5
5383 
5384     Register state0    =  x7; // t2
5385     Register state1    = x20; // s4
5386     Register state2    = x21; // s5
5387     Register state3    = x22; // s6
5388 
5389     // using x10->x11 to allow compressed instructions
5390     Register rtmp1     = x10; // a0
5391     Register rtmp2     = x11; // a1
5392 
         // Eight registers cache the 64-byte message block; only the four s-registers
         // among them need to be saved and restored around the stub.
5393     RegSet reg_cache_saved_regs = RegSet::of(x24, x25, x26, x27); // s8, s9, s10, s11
5394     RegSet reg_cache_regs;
5395     reg_cache_regs += reg_cache_saved_regs;
5396     reg_cache_regs += RegSet::of(t3, t4, t5, t6);
5397     BufRegCache reg_cache(_masm, reg_cache_regs);
5398 
5399     RegSet saved_regs;
5400     if (multi_block) {
5401       saved_regs += RegSet::of(ofs, limit);
5402     }
5403     saved_regs += RegSet::of(state1, state2, state3);
5404     saved_regs += reg_cache_saved_regs;
5405 
5406     __ push_reg(saved_regs, sp);
5407 
5408     __ mv(buf, buf_arg);
5409     __ mv(state, state_arg);
5410     if (multi_block) {
5411       __ mv(ofs, ofs_arg);
5412       __ mv(limit, limit_arg);
5413     }
5414 
5415     // To minimize the number of memory operations, read the four 4-byte
5416     // state values in pairs, with a single ld each, and split every pair
5417     // into 2 registers.
5418     //
5419     // The core MD5 algorithm works on 32-bit words, so the code below
5420     // does not care about the upper 32 bits of state[x]. Based on this
5421     // observation, the upper 32 bits of state0/state2 are simply left
5422     // as loaded instead of being zeroed with extra instructions; they
5423     // are masked off only when the state is written back at the end
5424     // of the stub.
5425     __ ld(state0, Address(state));
5426     __ srli(state1, state0, 32);
5427     __ ld(state2, Address(state, 8));
5428     __ srli(state3, state2, 32);
5429 
5430     Label md5_loop;
5431     __ BIND(md5_loop);
5432 
5433     __ mv(a, state0);
5434     __ mv(b, state1);
5435     __ mv(c, state2);
5436     __ mv(d, state3);
5437 
5438     // Round 1
         // Each gen_load fetches the next 8 bytes (two message words) of the block
         // right before the first step that consumes them.
5439     reg_cache.gen_load(0, buf);
5440     md5_FF(reg_cache, a, b, c, d,  0, S11, 0xd76aa478, rtmp1, rtmp2);
5441     md5_FF(reg_cache, d, a, b, c,  1, S12, 0xe8c7b756, rtmp1, rtmp2);
5442     reg_cache.gen_load(1, buf);
5443     md5_FF(reg_cache, c, d, a, b,  2, S13, 0x242070db, rtmp1, rtmp2);
5444     md5_FF(reg_cache, b, c, d, a,  3, S14, 0xc1bdceee, rtmp1, rtmp2);
5445     reg_cache.gen_load(2, buf);
5446     md5_FF(reg_cache, a, b, c, d,  4, S11, 0xf57c0faf, rtmp1, rtmp2);
5447     md5_FF(reg_cache, d, a, b, c,  5, S12, 0x4787c62a, rtmp1, rtmp2);
5448     reg_cache.gen_load(3, buf);
5449     md5_FF(reg_cache, c, d, a, b,  6, S13, 0xa8304613, rtmp1, rtmp2);
5450     md5_FF(reg_cache, b, c, d, a,  7, S14, 0xfd469501, rtmp1, rtmp2);
5451     reg_cache.gen_load(4, buf);
5452     md5_FF(reg_cache, a, b, c, d,  8, S11, 0x698098d8, rtmp1, rtmp2);
5453     md5_FF(reg_cache, d, a, b, c,  9, S12, 0x8b44f7af, rtmp1, rtmp2);
5454     reg_cache.gen_load(5, buf);
5455     md5_FF(reg_cache, c, d, a, b, 10, S13, 0xffff5bb1, rtmp1, rtmp2);
5456     md5_FF(reg_cache, b, c, d, a, 11, S14, 0x895cd7be, rtmp1, rtmp2);
5457     reg_cache.gen_load(6, buf);
5458     md5_FF(reg_cache, a, b, c, d, 12, S11, 0x6b901122, rtmp1, rtmp2);
5459     md5_FF(reg_cache, d, a, b, c, 13, S12, 0xfd987193, rtmp1, rtmp2);
5460     reg_cache.gen_load(7, buf);
5461     md5_FF(reg_cache, c, d, a, b, 14, S13, 0xa679438e, rtmp1, rtmp2);
5462     md5_FF(reg_cache, b, c, d, a, 15, S14, 0x49b40821, rtmp1, rtmp2);
5463 
5464     // Round 2
         // From here on all 16 message words are served from reg_cache; no more loads.
5465     md5_GG(reg_cache, a, b, c, d,  1, S21, 0xf61e2562, rtmp1, rtmp2);
5466     md5_GG(reg_cache, d, a, b, c,  6, S22, 0xc040b340, rtmp1, rtmp2);
5467     md5_GG(reg_cache, c, d, a, b, 11, S23, 0x265e5a51, rtmp1, rtmp2);
5468     md5_GG(reg_cache, b, c, d, a,  0, S24, 0xe9b6c7aa, rtmp1, rtmp2);
5469     md5_GG(reg_cache, a, b, c, d,  5, S21, 0xd62f105d, rtmp1, rtmp2);
5470     md5_GG(reg_cache, d, a, b, c, 10, S22, 0x02441453, rtmp1, rtmp2);
5471     md5_GG(reg_cache, c, d, a, b, 15, S23, 0xd8a1e681, rtmp1, rtmp2);
5472     md5_GG(reg_cache, b, c, d, a,  4, S24, 0xe7d3fbc8, rtmp1, rtmp2);
5473     md5_GG(reg_cache, a, b, c, d,  9, S21, 0x21e1cde6, rtmp1, rtmp2);
5474     md5_GG(reg_cache, d, a, b, c, 14, S22, 0xc33707d6, rtmp1, rtmp2);
5475     md5_GG(reg_cache, c, d, a, b,  3, S23, 0xf4d50d87, rtmp1, rtmp2);
5476     md5_GG(reg_cache, b, c, d, a,  8, S24, 0x455a14ed, rtmp1, rtmp2);
5477     md5_GG(reg_cache, a, b, c, d, 13, S21, 0xa9e3e905, rtmp1, rtmp2);
5478     md5_GG(reg_cache, d, a, b, c,  2, S22, 0xfcefa3f8, rtmp1, rtmp2);
5479     md5_GG(reg_cache, c, d, a, b,  7, S23, 0x676f02d9, rtmp1, rtmp2);
5480     md5_GG(reg_cache, b, c, d, a, 12, S24, 0x8d2a4c8a, rtmp1, rtmp2);
5481 
5482     // Round 3
5483     md5_HH(reg_cache, a, b, c, d,  5, S31, 0xfffa3942, rtmp1, rtmp2);
5484     md5_HH(reg_cache, d, a, b, c,  8, S32, 0x8771f681, rtmp1, rtmp2);
5485     md5_HH(reg_cache, c, d, a, b, 11, S33, 0x6d9d6122, rtmp1, rtmp2);
5486     md5_HH(reg_cache, b, c, d, a, 14, S34, 0xfde5380c, rtmp1, rtmp2);
5487     md5_HH(reg_cache, a, b, c, d,  1, S31, 0xa4beea44, rtmp1, rtmp2);
5488     md5_HH(reg_cache, d, a, b, c,  4, S32, 0x4bdecfa9, rtmp1, rtmp2);
5489     md5_HH(reg_cache, c, d, a, b,  7, S33, 0xf6bb4b60, rtmp1, rtmp2);
5490     md5_HH(reg_cache, b, c, d, a, 10, S34, 0xbebfbc70, rtmp1, rtmp2);
5491     md5_HH(reg_cache, a, b, c, d, 13, S31, 0x289b7ec6, rtmp1, rtmp2);
5492     md5_HH(reg_cache, d, a, b, c,  0, S32, 0xeaa127fa, rtmp1, rtmp2);
5493     md5_HH(reg_cache, c, d, a, b,  3, S33, 0xd4ef3085, rtmp1, rtmp2);
5494     md5_HH(reg_cache, b, c, d, a,  6, S34, 0x04881d05, rtmp1, rtmp2);
5495     md5_HH(reg_cache, a, b, c, d,  9, S31, 0xd9d4d039, rtmp1, rtmp2);
5496     md5_HH(reg_cache, d, a, b, c, 12, S32, 0xe6db99e5, rtmp1, rtmp2);
5497     md5_HH(reg_cache, c, d, a, b, 15, S33, 0x1fa27cf8, rtmp1, rtmp2);
5498     md5_HH(reg_cache, b, c, d, a,  2, S34, 0xc4ac5665, rtmp1, rtmp2);
5499 
5500     // Round 4
5501     md5_II(reg_cache, a, b, c, d,  0, S41, 0xf4292244, rtmp1, rtmp2);
5502     md5_II(reg_cache, d, a, b, c,  7, S42, 0x432aff97, rtmp1, rtmp2);
5503     md5_II(reg_cache, c, d, a, b, 14, S43, 0xab9423a7, rtmp1, rtmp2);
5504     md5_II(reg_cache, b, c, d, a,  5, S44, 0xfc93a039, rtmp1, rtmp2);
5505     md5_II(reg_cache, a, b, c, d, 12, S41, 0x655b59c3, rtmp1, rtmp2);
5506     md5_II(reg_cache, d, a, b, c,  3, S42, 0x8f0ccc92, rtmp1, rtmp2);
5507     md5_II(reg_cache, c, d, a, b, 10, S43, 0xffeff47d, rtmp1, rtmp2);
5508     md5_II(reg_cache, b, c, d, a,  1, S44, 0x85845dd1, rtmp1, rtmp2);
5509     md5_II(reg_cache, a, b, c, d,  8, S41, 0x6fa87e4f, rtmp1, rtmp2);
5510     md5_II(reg_cache, d, a, b, c, 15, S42, 0xfe2ce6e0, rtmp1, rtmp2);
5511     md5_II(reg_cache, c, d, a, b,  6, S43, 0xa3014314, rtmp1, rtmp2);
5512     md5_II(reg_cache, b, c, d, a, 13, S44, 0x4e0811a1, rtmp1, rtmp2);
5513     md5_II(reg_cache, a, b, c, d,  4, S41, 0xf7537e82, rtmp1, rtmp2);
5514     md5_II(reg_cache, d, a, b, c, 11, S42, 0xbd3af235, rtmp1, rtmp2);
5515     md5_II(reg_cache, c, d, a, b,  2, S43, 0x2ad7d2bb, rtmp1, rtmp2);
5516     md5_II(reg_cache, b, c, d, a,  9, S44, 0xeb86d391, rtmp1, rtmp2);
5517 
         // Fold this block's result into the running state (32-bit adds).
5518     __ addw(state0, state0, a);
5519     __ addw(state1, state1, b);
5520     __ addw(state2, state2, c);
5521     __ addw(state3, state3, d);
5522 
5523     if (multi_block) {
5524       __ addi(buf, buf, 64);
5525       __ addi(ofs, ofs, 64);
5526       // if (ofs <= limit) goto md5_loop
5527       __ bge(limit, ofs, md5_loop);
5528       __ mv(c_rarg0, ofs); // return ofs
5529     }
5530 
5531     // to minimize the number of memory operations:
5532     // write back the 4 state 4-byte values in pairs, with a single sd
         // (low word masked to 32 bits, high word shifted into the upper half)
5533     __ mv(t0, mask32);
5534     __ andr(state0, state0, t0);
5535     __ slli(state1, state1, 32);
5536     __ orr(state0, state0, state1);
5537     __ sd(state0, Address(state));
5538     __ andr(state2, state2, t0);
5539     __ slli(state3, state3, 32);
5540     __ orr(state2, state2, state3);
5541     __ sd(state2, Address(state, 8));
5542 
5543     __ pop_reg(saved_regs, sp);
5544     __ ret();
5545 
5546     return (address) start;
5547   }
5548 
5549   /**
5550    * Perform the quarter round calculations on values contained within four vector registers.
5551    *
5552    * @param aVec the SIMD register containing only the "a" values
5553    * @param bVec the SIMD register containing only the "b" values
5554    * @param cVec the SIMD register containing only the "c" values
5555    * @param dVec the SIMD register containing only the "d" values
5556    * @param tmp_vr temporary vector register that holds intermediate values; it is
        *               passed to every rotate below, so its contents are not preserved.
5557    */
5558   void chacha20_quarter_round(VectorRegister aVec, VectorRegister bVec,
5559                           VectorRegister cVec, VectorRegister dVec, VectorRegister tmp_vr) {
5560     // a += b, d ^= a, d <<<= 16
5561     __ vadd_vv(aVec, aVec, bVec);
5562     __ vxor_vv(dVec, dVec, aVec);
5563     __ vrole32_vi(dVec, 16, tmp_vr);
5564 
5565     // c += d, b ^= c, b <<<= 12
5566     __ vadd_vv(cVec, cVec, dVec);
5567     __ vxor_vv(bVec, bVec, cVec);
5568     __ vrole32_vi(bVec, 12, tmp_vr);
5569 
5570     // a += b, d ^= a, d <<<= 8
5571     __ vadd_vv(aVec, aVec, bVec);
5572     __ vxor_vv(dVec, dVec, aVec);
5573     __ vrole32_vi(dVec, 8, tmp_vr);
5574 
5575     // c += d, b ^= c, b <<<= 7
5576     __ vadd_vv(cVec, cVec, dVec);
5577     __ vxor_vv(bVec, bVec, cVec);
5578     __ vrole32_vi(bVec, 7, tmp_vr);
5579   }
5580 
5581   /**
5582    * int com.sun.crypto.provider.ChaCha20Cipher.implChaCha20Block(int[] initState, byte[] result)
5583    *
5584    *  Input arguments:
5585    *  c_rarg0   - state, the starting state
5586    *  c_rarg1   - key_stream, the array that will hold the result of the ChaCha20 block function
5587    *
        *  Output:
        *  c_rarg0   - number of key stream bytes produced (64 per block)
        *
5588    *  Implementation Note:
5589    *   Parallelization is achieved by loading individual state elements into vectors for N blocks.
5590    *   N depends on single vector register length.
5591    */
5592   address generate_chacha20Block() {
5593     Label L_Rounds;
5594 
5595     __ align(CodeEntryAlignment);
5596     StubId stub_id = StubId::stubgen_chacha20Block_id;
5597     StubCodeMark mark(this, stub_id);
5598     address start = __ pc();
5599     __ enter();
5600 
5601     const int states_len = 16; // number of 32-bit words in the state
5602     const int step = 4;        // bytes per state word
5603     const Register state = c_rarg0;
5604     const Register key_stream = c_rarg1;
5605     const Register tmp_addr = t0;
5606     const Register length = t1;
5607 
5608     // Organize vector registers in an array that facilitates
5609     // putting repetitive opcodes into loop structures below.
5610     const VectorRegister work_vrs[16] = {
5611       v0, v1, v2,  v3,  v4,  v5,  v6,  v7,
5612       v8, v9, v10, v11, v12, v13, v14, v15
5613     };
5614     const VectorRegister tmp_vr = v16;
5615     const VectorRegister counter_vr = v17;
5616 
5617     {
5618       // Put 16 here, as com.sun.crypto.provider.ChaCha20Cipher.KS_MAX_LEN is 1024
5619       // at the Java level (16 blocks of 64 bytes). length receives the number of
           // 32-bit elements per vector that vsetivli sets, i.e. the block count N.
5620       __ vsetivli(length, 16, Assembler::e32, Assembler::m1);
5621     }
5622 
5623     // Load from source state.
5624     // Every element in source state is duplicated to all elements in the corresponding vector.
         // (strided load with a zero stride register)
5625     __ mv(tmp_addr, state);
5626     for (int i = 0; i < states_len; i += 1) {
5627       __ vlse32_v(work_vrs[i], tmp_addr, zr);
5628       __ addi(tmp_addr, tmp_addr, step);
5629     }
5630     // Adjust counter for every individual block.
         // (vid gives element i the value i, so block i uses counter + i)
5631     __ vid_v(counter_vr);
5632     __ vadd_vv(work_vrs[12], work_vrs[12], counter_vr);
5633 
5634     // Perform 10 iterations of the 8 quarter round set
5635     {
5636       const Register loop = t2; // share t2 with other non-overlapping usages.
5637       __ mv(loop, 10);
5638       __ BIND(L_Rounds);
5639 
           // four column quarter rounds
5640       chacha20_quarter_round(work_vrs[0], work_vrs[4], work_vrs[8],  work_vrs[12], tmp_vr);
5641       chacha20_quarter_round(work_vrs[1], work_vrs[5], work_vrs[9],  work_vrs[13], tmp_vr);
5642       chacha20_quarter_round(work_vrs[2], work_vrs[6], work_vrs[10], work_vrs[14], tmp_vr);
5643       chacha20_quarter_round(work_vrs[3], work_vrs[7], work_vrs[11], work_vrs[15], tmp_vr);
5644 
           // four diagonal quarter rounds
5645       chacha20_quarter_round(work_vrs[0], work_vrs[5], work_vrs[10], work_vrs[15], tmp_vr);
5646       chacha20_quarter_round(work_vrs[1], work_vrs[6], work_vrs[11], work_vrs[12], tmp_vr);
5647       chacha20_quarter_round(work_vrs[2], work_vrs[7], work_vrs[8],  work_vrs[13], tmp_vr);
5648       chacha20_quarter_round(work_vrs[3], work_vrs[4], work_vrs[9],  work_vrs[14], tmp_vr);
5649 
5650       __ subi(loop, loop, 1);
5651       __ bnez(loop, L_Rounds);
5652     }
5653 
5654     // Add the original state into the end working state.
5655     // We do this by first duplicating every element in source state array to the corresponding
5656     // vector, then adding it to the post-loop working state.
5657     __ mv(tmp_addr, state);
5658     for (int i = 0; i < states_len; i += 1) {
5659       __ vlse32_v(tmp_vr, tmp_addr, zr);
5660       __ addi(tmp_addr, tmp_addr, step);
5661       __ vadd_vv(work_vrs[i], work_vrs[i], tmp_vr);
5662     }
5663     // Add the counter overlay onto work_vrs[12] at the end.
5664     __ vadd_vv(work_vrs[12], work_vrs[12], counter_vr);
5665 
5666     // Store result to key stream.
5667     {
5668       const Register stride = t2; // share t2 with other non-overlapping usages.
5669       // Every block occupies 64 bytes, so we use 64 as stride of the vector store.
           // Word i of every block is written by one strided store; key_stream is
           // advanced by one word per iteration and is therefore modified here.
5670       __ mv(stride, 64);
5671       for (int i = 0; i < states_len; i += 1) {
5672         __ vsse32_v(work_vrs[i], key_stream, stride);
5673         __ addi(key_stream, key_stream, step);
5674       }
5675     }
5676 
5677     // Return length of output key_stream
         // (length << 6 == number of blocks * 64 bytes)
5678     __ slli(c_rarg0, length, 6);
5679 
5680     __ leave();
5681     __ ret();
5682 
5683     return (address) start;
5684   }
5685 
5686 
5687   // ------------------------ SHA-1 intrinsic ------------------------
5688 
5689   // K't =
5690   //    5a827999, 0  <= t <= 19
5691   //    6ed9eba1, 20 <= t <= 39
5692   //    8f1bbcdc, 40 <= t <= 59
5693   //    ca62c1d6, 60 <= t <= 79
       //
       // Emits a load of K't into cur_k only on the first round of each 20-round
       // group (round 0, 20, 40, 60); for every other round nothing is emitted.
       // NOTE(review): this presumably relies on the caller leaving cur_k untouched
       // within a group - confirm at the call site.
5694   void sha1_prepare_k(Register cur_k, int round) {
5695     assert(round >= 0 && round < 80, "must be");
5696 
5697     static const int64_t ks[] = {0x5a827999, 0x6ed9eba1, 0x8f1bbcdc, 0xca62c1d6};
5698     if ((round % 20) == 0) {
5699       __ mv(cur_k, ks[round/20]);
5700     }
5701   }
5702 
5703   // W't =
5704   //    M't,                                      0 <=  t <= 15
5705   //    ROTL'1(W't-3 ^ W't-8 ^ W't-14 ^ W't-16),  16 <= t <= 79
5706   void sha1_prepare_w(Register cur_w, Register ws[], Register buf, int round) { // emits code leaving W't for `round` in the low 32 bits of cur_w; clobbers t0/t1 for round >= 16
5707     assert(round >= 0 && round < 80, "must be");
5708 
5709     if (round < 16) {
5710       // in the first 16 rounds, in ws[], every register contains 2 W't, e.g.
5711       //   in ws[0], high part contains W't-0, low part contains W't-1,
5712       //   in ws[1], high part contains W't-2, low part contains W't-3,
5713       //   ...
5714       //   in ws[7], high part contains W't-14, low part contains W't-15.
5715 
5716       if ((round % 2) == 0) {
5717         __ ld(ws[round/2], Address(buf, (round/2) * 8)); // one 64-bit load fetches the words for this round and the next one
5718         // reverse bytes, as SHA-1 is defined in big-endian.
5719         __ revb(ws[round/2], ws[round/2]);
5720         __ srli(cur_w, ws[round/2], 32); // even round: use the high word
5721       } else {
5722         __ mv(cur_w, ws[round/2]); // odd round: use the low word of the register loaded in the previous round (upper bits are not cleared)
5723       }
5724 
5725       return;
5726     }
5727 
5728     if ((round % 2) == 0) {
5729       int idx = 16; // position of the new word relative to the 16-word window starting at ws[0]
5730       // W't = ROTL'1(W't-3 ^ W't-8 ^ W't-14 ^ W't-16),  16 <= t <= 79
5731       __ srli(t1, ws[(idx-8)/2], 32);
5732       __ xorr(t0, ws[(idx-3)/2], t1);
5733 
5734       __ srli(t1, ws[(idx-14)/2], 32);
5735       __ srli(cur_w, ws[(idx-16)/2], 32);
5736       __ xorr(cur_w, cur_w, t1);
5737 
5738       __ xorr(cur_w, cur_w, t0);
5739       __ rolw(cur_w, cur_w, 1, t0);
5740 
5741       // copy the cur_w value to the higher 32 bits of ws[8].
5742       // now, valid w't values are at:
5743       //  w0:       ws[0]'s lower 32 bits
5744       //  w1 ~ w14: ws[1] ~ ws[7]
5745       //  w15:      ws[8]'s higher 32 bits
5746       __ slli(ws[idx/2], cur_w, 32);
5747 
5748       return;
5749     }
5750 
5751     int idx = 17; // odd round: the window still starts at ws[0], so the new word sits one position further
5752     // W't = ROTL'1(W't-3 ^ W't-8 ^ W't-14 ^ W't-16),  16 <= t <= 79
5753     __ srli(t1, ws[(idx-3)/2], 32);
5754     __ xorr(t0, t1, ws[(idx-8)/2]);
5755 
5756     __ xorr(cur_w, ws[(idx-16)/2], ws[(idx-14)/2]);
5757 
5758     __ xorr(cur_w, cur_w, t0);
5759     __ rolw(cur_w, cur_w, 1, t0);
5760 
5761     // copy the cur_w value to the lower 32 bits of ws[8]
5762     __ zext(cur_w, cur_w, 32);
5763     __ orr(ws[idx/2], ws[idx/2], cur_w);
5764 
5765     // shift the w't registers, so they start from ws[0] again.
5766     // (this only renames the entries of the caller's ws[] array at stub-generation time; no instructions are emitted)
5767     // now, valid w't values are at:
5768     //  w0 ~ w15: ws[0] ~ ws[7]
5769     Register ws_0 = ws[0];
5770     for (int i = 0; i < 16/2; i++) {
5771       ws[i] = ws[i+1];
5772     }
5773     ws[8] = ws_0; // the register that held the two oldest words becomes the new auxiliary slot
5774   }
5774 
5775   // f't(x, y, z) =
5776   //    Ch(x, y, z)     = (x & y) ^ (~x & z)            , 0  <= t <= 19
5777   //    Parity(x, y, z) = x ^ y ^ z                     , 20 <= t <= 39
5778   //    Maj(x, y, z)    = (x & y) ^ (x & z) ^ (y & z)   , 40 <= t <= 59
5779   //    Parity(x, y, z) = x ^ y ^ z                     , 60 <= t <= 79
5780   void sha1_f(Register dst, Register x, Register y, Register z, int round) { // emits f't(x, y, z) into dst; the function is selected by `round` at stub-generation time
5781     assert(round >= 0 && round < 80, "must be");
5782     assert_different_registers(dst, x, y, z, t0, t1); // t0/t1 are used as scratch below
5783 
5784     if (round < 20) {
5785       // (x & y) ^ (~x & z)
5786       __ andr(t0, x, y); // clobbers t0
5787       __ andn(dst, z, x);
5788       __ xorr(dst, dst, t0);
5789     } else if (round >= 40 && round < 60) {
5790       // (x & y) ^ (x & z) ^ (y & z)
5791       __ andr(t0, x, y); // clobbers t0
5792       __ andr(t1, x, z); // clobbers t1
5793       __ andr(dst, y, z);
5794       __ xorr(dst, dst, t0);
5795       __ xorr(dst, dst, t1);
5796     } else {
5797       // x ^ y ^ z (rounds 20-39 and 60-79; no scratch register needed)
5798       __ xorr(dst, x, y);
5799       __ xorr(dst, dst, z);
5800     }
5801   }
5802 
5803   // T = ROTL'5(a) + f't(b, c, d) + e + K't + W't
5804   // e = d
5805   // d = c
5806   // c = ROTL'30(b)
5807   // b = a
5808   // a = T
5809   void sha1_process_round(Register a, Register b, Register c, Register d, Register e,
5810                           Register cur_k, Register cur_w, Register tmp, int round) { // emits one SHA-1 round; cur_w, tmp and t0 are clobbered, cur_k is only read
5811     assert(round >= 0 && round < 80, "must be");
5812     assert_different_registers(a, b, c, d, e, cur_w, cur_k, tmp, t0);
5813 
5814     // T = ROTL'5(a) + f't(b, c, d) + e + K't + W't
5815 
5816     // cur_w will be recalculated at the beginning of each round,
5817     // so, we can reuse it as a temp register here.
5818     Register tmp2 = cur_w;
5819 
5820     // reuse e as a temporary register, as we will mv new value into it later
5821     Register tmp3 = e;
5822     __ add(tmp2, cur_k, tmp2); // tmp2 = K't + W't
5823     __ add(tmp3, tmp3, tmp2);  // tmp3 = e + K't + W't
5824     __ rolw(tmp2, a, 5, t0);   // tmp2 = ROTL'5(a)
5825 
5826     sha1_f(tmp, b, c, d, round); // tmp = f't(b, c, d)
5827 
5828     __ add(tmp2, tmp2, tmp);   // tmp2 = ROTL'5(a) + f't(b, c, d)
5829     __ add(tmp2, tmp2, tmp3);  // tmp2 = T
5830 
5831     // e = d
5832     // d = c
5833     // c = ROTL'30(b)
5834     // b = a
5835     // a = T
5836     __ mv(e, d);
5837     __ mv(d, c);
5838 
5839     __ rolw(c, b, 30);
5840     __ mv(b, a);
5841     __ mv(a, tmp2);
5842   }
5843 
5844   // H(i)0 = a + H(i-1)0
5845   // H(i)1 = b + H(i-1)1
5846   // H(i)2 = c + H(i-1)2
5847   // H(i)3 = d + H(i-1)3
5848   // H(i)4 = e + H(i-1)4
5849   void sha1_calculate_im_hash(Register a, Register b, Register c, Register d, Register e,
5850                               Register prev_ab, Register prev_cd, Register prev_e) { // prev_ab/prev_cd hold two saved words each (low word / high word) as packed by sha1_preserve_prev_abcde
5851     assert_different_registers(a, b, c, d, e, prev_ab, prev_cd, prev_e);
5852 
5853     __ add(a, a, prev_ab);         // low word of prev_ab is the saved a; the upper 32 bits of the sum are not cleared here
5854     __ srli(prev_ab, prev_ab, 32); // move the saved b down to the low word (prev_ab is consumed)
5855     __ add(b, b, prev_ab);
5856 
5857     __ add(c, c, prev_cd);         // low word of prev_cd is the saved c
5858     __ srli(prev_cd, prev_cd, 32); // move the saved d down to the low word (prev_cd is consumed)
5859     __ add(d, d, prev_cd);
5860 
5861     __ add(e, e, prev_e);
5862   }
5863 
5864   void sha1_preserve_prev_abcde(Register a, Register b, Register c, Register d, Register e,
5865                                 Register prev_ab, Register prev_cd, Register prev_e) { // saves a..e: a/b are packed into prev_ab, c/d into prev_cd, e is copied to prev_e; clobbers t0
5866     assert_different_registers(a, b, c, d, e, prev_ab, prev_cd, prev_e, t0);
5867 
5868     __ slli(t0, b, 32);           // b goes to the high word
5869     __ zext(prev_ab, a, 32);      // a goes to the low word, upper bits cleared
5870     __ orr(prev_ab, prev_ab, t0); // prev_ab = (b << 32) | a
5871 
5872     __ slli(t0, d, 32);           // d goes to the high word
5873     __ zext(prev_cd, c, 32);      // c goes to the low word, upper bits cleared
5874     __ orr(prev_cd, prev_cd, t0); // prev_cd = (d << 32) | c
5875 
5876     __ mv(prev_e, e);
5877   }
5878 
5879   // Intrinsic for:
5880   //   void sun.security.provider.SHA.implCompress0(byte[] buf, int ofs)
5881   //   void sun.security.provider.DigestBase.implCompressMultiBlock0(byte[] b, int ofs, int limit)
5882   //
5883   // Arguments:
5884   //
5885   // Inputs:
5886   //   c_rarg0: byte[]  src array + offset
5887   //   c_rarg1: int[]   SHA.state
5888   //   - - - - - - below are only for implCompressMultiBlock0 - - - - - -
5889   //   c_rarg2: int     offset
5890   //   c_rarg3: int     limit
5891   //
5892   // Outputs:
5893   //   - - - - - - below are only for implCompressMultiBlock0 - - - - - -
5894   //   c_rarg0: int offset, when (multi_block == true)
5895   //
5896   address generate_sha1_implCompress(StubId stub_id) {
5897       bool multi_block; // selected by stub_id: single-block or multi-block variant of the stub
5898       switch (stub_id) {
5899       case StubId::stubgen_sha1_implCompress_id:
5900         multi_block = false;
5901         break;
5902       case StubId::stubgen_sha1_implCompressMB_id:
5903         multi_block = true;
5904         break;
5905       default:
5906         ShouldNotReachHere();
5907       };
5908     __ align(CodeEntryAlignment);
5909     StubCodeMark mark(this, stub_id);
5910 
5911     address start = __ pc();
5912     __ enter();
5913 
5914     RegSet saved_regs = RegSet::range(x18, x27); // x18 - x27 are used below for ws[3..8], cur_k, cur_w, prev_ab and prev_cd
5915     if (multi_block) {
5916       // use x9 as src below.
5917       saved_regs += RegSet::of(x9);
5918     }
5919     __ push_reg(saved_regs, sp);
5920 
5921     // c_rarg0 - c_rarg3: x10 - x13
5922     Register buf    = c_rarg0;
5923     Register state  = c_rarg1;
5924     Register offset = c_rarg2;
5925     Register limit  = c_rarg3;
5926     // use src to contain the original start point of the array.
5927     Register src    = x9;
5928 
5929     if (multi_block) {
5930       __ sub(limit, limit, offset);
5931       __ add(limit, limit, buf);   // limit = buf + (limit - offset), i.e. limit expressed as an address
5932       __ sub(src, buf, offset);    // src = buf - offset, so that (buf - src) is the offset returned at the end
5933     }
5934 
5935     // [args-reg]:  x14 - x17
5936     // [temp-reg]:  x28 - x31
5937     // [saved-reg]: x18 - x27
5938 
5939     // h0/1/2/3/4
5940     const Register a = x14, b = x15, c = x16, d = x17, e = x28;
5941     // w0, w1, ... w15
5942     // put two adjacent w's in one register:
5943     //    one at high word part, another at low word part
5944     // at different round (even or odd), w't value reside in different items in ws[].
5945     // w0 ~ w15, either reside in
5946     //    ws[0] ~ ws[7], where
5947     //      w0 at higher 32 bits of ws[0],
5948     //      w1 at lower 32 bits of ws[0],
5949     //      ...
5950     //      w14 at higher 32 bits of ws[7],
5951     //      w15 at lower 32 bits of ws[7].
5952     // or, reside in
5953     //    w0:       ws[0]'s lower 32 bits
5954     //    w1 ~ w14: ws[1] ~ ws[7]
5955     //    w15:      ws[8]'s higher 32 bits
5956     Register ws[9] = {x29, x30, x31, x18,
5957                       x19, x20, x21, x22,
5958                       x23}; // auxiliary register for calculating w's value
5959     // current k't's value
5960     const Register cur_k = x24;
5961     // current w't's value
5962     const Register cur_w = x25;
5963     // values of a, b, c, d, e at the start of the current block (saved before the 80 rounds, added back after them)
5964     const Register prev_ab = x26, prev_cd = x27;
5965     const Register prev_e = offset; // reuse offset/c_rarg2
5966 
5967     // load 5 words state into a, b, c, d, e.
5968     //
5969     // To minimize the number of memory operations, we apply following
5970     // optimization: read the states (a/b/c/d) of 4-byte values in pairs,
5971     // with a single ld, and split them into 2 registers.
5972     //
5973     // And, as the core algorithm of SHA-1 works on 32-bits words, so
5974     // in the following code, it does not care about the content of
5975     // higher 32-bits in a/b/c/d/e. Based on this observation,
5976     // we can apply further optimization, which is to just ignore the
5977     // higher 32-bits in a/c/e, rather than set the higher
5978     // 32-bits of a/c/e to zero explicitly with extra instructions.
5979     __ ld(a, Address(state, 0));
5980     __ srli(b, a, 32);
5981     __ ld(c, Address(state, 8));
5982     __ srli(d, c, 32);
5983     __ lw(e, Address(state, 16));
5984 
5985     Label L_sha1_loop;
5986     if (multi_block) {
5987       __ BIND(L_sha1_loop); // one iteration of this loop processes one 64-byte block
5988     }
5989 
5990     sha1_preserve_prev_abcde(a, b, c, d, e, prev_ab, prev_cd, prev_e);
5991 
5992     for (int round = 0; round < 80; round++) { // fully unrolled at stub-generation time: 80 rounds are emitted back to back
5993       // prepare K't value
5994       sha1_prepare_k(cur_k, round);
5995 
5996       // prepare W't value
5997       sha1_prepare_w(cur_w, ws, buf, round);
5998 
5999       // one round process
6000       sha1_process_round(a, b, c, d, e, cur_k, cur_w, t2, round);
6001     }
6002 
6003     // compute the intermediate hash value
6004     sha1_calculate_im_hash(a, b, c, d, e, prev_ab, prev_cd, prev_e);
6005 
6006     if (multi_block) {
6007       int64_t block_bytes = 16 * 4; // 16 words of 4 bytes
6008       __ addi(buf, buf, block_bytes);
6009 
6010       __ bge(limit, buf, L_sha1_loop, /* is_far */ true); // keep going while buf <= limit
6011     }
6012 
6013     // store back the state.
6014     __ zext(a, a, 32); // the upper 32 bits of a/c were never cleared, so clear them before packing
6015     __ slli(b, b, 32);
6016     __ orr(a, a, b);
6017     __ sd(a, Address(state, 0));
6018     __ zext(c, c, 32);
6019     __ slli(d, d, 32);
6020     __ orr(c, c, d);
6021     __ sd(c, Address(state, 8));
6022     __ sw(e, Address(state, 16));
6023 
6024     // return offset
6025     if (multi_block) {
6026       __ sub(c_rarg0, buf, src);
6027     }
6028 
6029     __ pop_reg(saved_regs, sp);
6030 
6031     __ leave();
6032     __ ret();
6033 
6034     return (address) start;
6035   }
6036 
6037   /**
6038    * vector registers:
6039    *   input VectorRegister's:  inputV1-V3, for m2 they could be v2, v4, v6, for m1 they could be v1, v2, v3
6040    *   index VectorRegister's:  idxV1-V4, for m2 they could be v8, v10, v12, v14, for m1 they could be v4, v5, v6, v7
6041    *   output VectorRegister's: outputV1-V4, for m2 they could be v16, v18, v20, v22, for m1 they could be v8, v9, v10, v11
6042    *
6043    * NOTE: each field will occupy a vector register group
6044    */
6045   void base64_vector_encode_round(Register src, Register dst, Register codec,
6046                     Register size, Register stepSrc, Register stepDst,
6047                     VectorRegister inputV1, VectorRegister inputV2, VectorRegister inputV3,
6048                     VectorRegister idxV1, VectorRegister idxV2, VectorRegister idxV3, VectorRegister idxV4,
6049                     VectorRegister outputV1, VectorRegister outputV2, VectorRegister outputV3, VectorRegister outputV4,
6050                     Assembler::LMUL lmul) { // emits one vector encode round: 3 source bytes per element in, 4 encoded bytes per element out; src and dst are advanced
6051     // set vector register type/len
6052     __ vsetvli(x0, size, Assembler::e8, lmul);
6053 
6054     // segmented load src into v registers: mem(src) => vr(3)
6055     __ vlseg3e8_v(inputV1, src);
6056 
6057     // src = src + register_group_len_bytes * 3
6058     __ add(src, src, stepSrc);
6059 
6060     // encoding
6061     //   1. compute index into lookup table: vr(3) => vr(4)
6062     __ vsrl_vi(idxV1, inputV1, 2);     // idx1 = byte0[7:2]
6063 
6064     __ vsrl_vi(idxV2, inputV2, 2);
6065     __ vsll_vi(inputV1, inputV1, 6);   // inputV1 is overwritten; it is not needed afterwards
6066     __ vor_vv(idxV2, idxV2, inputV1);
6067     __ vsrl_vi(idxV2, idxV2, 2);       // idx2 = byte0[1:0] : byte1[7:4]
6068 
6069     __ vsrl_vi(idxV3, inputV3, 4);
6070     __ vsll_vi(inputV2, inputV2, 4);   // inputV2 is overwritten; it is not needed afterwards
6071     __ vor_vv(idxV3, inputV2, idxV3);
6072     __ vsrl_vi(idxV3, idxV3, 2);       // idx3 = byte1[3:0] : byte2[7:6]
6073 
6074     __ vsll_vi(idxV4, inputV3, 2);
6075     __ vsrl_vi(idxV4, idxV4, 2);       // idx4 = byte2[5:0]
6076 
6077     //   2. indexed load: vr(4) => vr(4)
6078     __ vluxei8_v(outputV1, codec, idxV1);
6079     __ vluxei8_v(outputV2, codec, idxV2);
6080     __ vluxei8_v(outputV3, codec, idxV3);
6081     __ vluxei8_v(outputV4, codec, idxV4);
6082 
6083     // segmented store encoded data in v registers back to dst: vr(4) => mem(dst)
6084     __ vsseg4e8_v(outputV1, dst);
6085 
6086     // dst = dst + register_group_len_bytes * 4
6087     __ add(dst, dst, stepDst);
6088   }
6089 
6090   /**
6091    *  void j.u.Base64.Encoder.encodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL)
6092    *
6093    *  Input arguments:
6094    *  c_rarg0   - src, source array
6095    *  c_rarg1   - sp, src start offset
6096    *  c_rarg2   - sl, src end offset
6097    *  c_rarg3   - dst, dest array
6098    *  c_rarg4   - dp, dst start offset
6099    *  c_rarg5   - isURL, Base64 or URL character set
6100    */
6101   address generate_base64_encodeBlock() {
6102     alignas(64) static const char toBase64[64] = { // 6-bit value -> character, standard alphabet
6103       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
6104       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
6105       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
6106       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
6107       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
6108     };
6109 
6110     alignas(64) static const char toBase64URL[64] = { // same as toBase64 except for the last two characters ('-' and '_')
6111       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
6112       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
6113       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
6114       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
6115       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
6116     };
6117 
6118     __ align(CodeEntryAlignment);
6119     StubId stub_id = StubId::stubgen_base64_encodeBlock_id;
6120     StubCodeMark mark(this, stub_id);
6121     address start = __ pc();
6122     __ enter();
6123 
6124     Register src    = c_rarg0;
6125     Register soff   = c_rarg1;
6126     Register send   = c_rarg2;
6127     Register dst    = c_rarg3;
6128     Register doff   = c_rarg4;
6129     Register isURL  = c_rarg5;
6130 
6131     Register codec  = c_rarg6;
6132     Register length = c_rarg7; // total length of src data in bytes
6133 
6134     Label ProcessData, Exit;
6135 
6136     // length should be multiple of 3
6137     __ sub(length, send, soff);
6138     // real src/dst to process data
6139     __ add(src, src, soff);
6140     __ add(dst, dst, doff);
6141 
6142     // load the codec base address
6143     __ la(codec, ExternalAddress((address) toBase64));
6144     __ beqz(isURL, ProcessData);
6145     __ la(codec, ExternalAddress((address) toBase64URL));
6146     __ BIND(ProcessData);
6147 
6148     // vector version
6149     if (UseRVV) {
6150       Label ProcessM2, ProcessM1, ProcessScalar;
6151 
6152       // soff/send/doff/isURL are no longer needed at this point, so their registers are reused
6153       Register size      = soff;
6154       Register stepSrcM1 = send;
6155       Register stepSrcM2 = doff;
6156       Register stepDst   = isURL;
6157 
6158       __ mv(size, MaxVectorSize * 2);        // elements requested per m2 round
6159       __ mv(stepSrcM1, MaxVectorSize * 3);   // src bytes consumed per m1 round (3 per element)
6160       __ slli(stepSrcM2, stepSrcM1, 1);      // src bytes consumed per m2 round (twice the m1 step)
6161       __ mv(stepDst, MaxVectorSize * 2 * 4); // dst bytes produced per m2 round (4 per element)
6162 
6163       __ blt(length, stepSrcM2, ProcessM1);
6164 
6165       __ BIND(ProcessM2);
6166       base64_vector_encode_round(src, dst, codec,
6167                     size, stepSrcM2, stepDst,
6168                     v2, v4, v6,         // inputs
6169                     v8, v10, v12, v14,  // indexes
6170                     v16, v18, v20, v22, // outputs
6171                     Assembler::m2);
6172 
6173       __ sub(length, length, stepSrcM2);
6174       __ bge(length, stepSrcM2, ProcessM2);
6175 
6176       __ BIND(ProcessM1);
6177       __ blt(length, stepSrcM1, ProcessScalar);
6178 
6179       __ srli(size, size, 1);       // halve the element count for the single m1 round
6180       __ srli(stepDst, stepDst, 1); // and the dst step accordingly
6181       base64_vector_encode_round(src, dst, codec,
6182                     size, stepSrcM1, stepDst,
6183                     v1, v2, v3,         // inputs
6184                     v4, v5, v6, v7,     // indexes
6185                     v8, v9, v10, v11,   // outputs
6186                     Assembler::m1);
6187       __ sub(length, length, stepSrcM1);
6188 
6189       __ BIND(ProcessScalar);
6190     }
6191 
6192     // scalar version
6193     {
6194       Register byte1 = soff, byte0 = send, byte2 = doff;
6195       Register combined24Bits = isURL;
6196 
6197       __ beqz(length, Exit);
6198 
6199       Label ScalarLoop;
6200       __ BIND(ScalarLoop);
6201       {
6202         // plain:   [byte0[7:0] : byte1[7:0] : byte2[7:0]] =>
6203         // encoded: [byte0[7:2] : byte0[1:0]+byte1[7:4] : byte1[3:0]+byte2[7:6] : byte2[5:0]]
6204 
6205         // load 3 bytes src data
6206         __ lbu(byte0, Address(src, 0));
6207         __ lbu(byte1, Address(src, 1));
6208         __ lbu(byte2, Address(src, 2));
6209         __ addi(src, src, 3);
6210 
6211         // construct 24 bits from 3 bytes
6212         __ slliw(byte0, byte0, 16);
6213         __ slliw(byte1, byte1, 8);
6214         __ orr(combined24Bits, byte0, byte1);
6215         __ orr(combined24Bits, combined24Bits, byte2);
6216 
6217         // get codec index and encode (i.e. load from codec by index)
6218         __ slliw(byte0, combined24Bits, 8);
6219         __ srliw(byte0, byte0, 26);  // bits [23:18]
6220         __ add(byte0, codec, byte0);
6221         __ lbu(byte0, byte0);
6222 
6223         __ slliw(byte1, combined24Bits, 14);
6224         __ srliw(byte1, byte1, 26);  // bits [17:12]
6225         __ add(byte1, codec, byte1);
6226         __ lbu(byte1, byte1);
6227 
6228         __ slliw(byte2, combined24Bits, 20);
6229         __ srliw(byte2, byte2, 26);  // bits [11:6]
6230         __ add(byte2, codec, byte2);
6231         __ lbu(byte2, byte2);
6232 
6233         __ andi(combined24Bits, combined24Bits, 0x3f); // bits [5:0]
6234         __ add(combined24Bits, codec, combined24Bits);
6235         __ lbu(combined24Bits, combined24Bits);
6236 
6237         // store 4 bytes encoded data
6238         __ sb(byte0, Address(dst, 0));
6239         __ sb(byte1, Address(dst, 1));
6240         __ sb(byte2, Address(dst, 2));
6241         __ sb(combined24Bits, Address(dst, 3));
6242 
6243         __ subi(length, length, 3);
6244         __ addi(dst, dst, 4);
6245         // loop back
6246         __ bnez(length, ScalarLoop);
6247       }
6248     }
6249 
6250     __ BIND(Exit);
6251 
6252     __ leave();
6253     __ ret();
6254 
6255     return (address) start;
6256   }
6256 
6257   /**
6258    * vector registers:
6259    * input VectorRegister's:  inputV1-V4, for m2 they could be v2, v4, v6, v8, for m1 they could be v1, v2, v3, v4
6260    * index VectorRegister's:  idxV1-V4, for m2 they could be v10, v12, v14, v16, for m1 they could be v5, v6, v7, v8
6261    * output VectorRegister's: outputV1-V3, for m2 they could be v18, v20, v22, for m1 they could be v9, v10, v11
6262    *
6263    * NOTE: each field will occupy a single vector register group
6264    */
6265   void base64_vector_decode_round(Register src, Register dst, Register codec,
6266                     Register size, Register stepSrc, Register stepDst, Register failedIdx,
6267                     VectorRegister inputV1, VectorRegister inputV2, VectorRegister inputV3, VectorRegister inputV4,
6268                     VectorRegister idxV1, VectorRegister idxV2, VectorRegister idxV3, VectorRegister idxV4,
6269                     VectorRegister outputV1, VectorRegister outputV2, VectorRegister outputV3,
6270                     Assembler::LMUL lmul) { // emits one vector decode round; failedIdx is left < 0 on success, otherwise it is the index of the first invalid element. v0 is used as the mask register
6271     // set vector register type/len
6272     __ vsetvli(x0, size, Assembler::e8, lmul, Assembler::ma, Assembler::ta);
6273 
6274     // segmented load src into v registers: mem(src) => vr(4)
6275     __ vlseg4e8_v(inputV1, src);
6276 
6277     // src = src + register_group_len_bytes * 4
6278     __ add(src, src, stepSrc);
6279 
6280     // decoding
6281     //   1. indexed load: vr(4) => vr(4)
6282     __ vluxei8_v(idxV1, codec, inputV1);
6283     __ vluxei8_v(idxV2, codec, inputV2);
6284     __ vluxei8_v(idxV3, codec, inputV3);
6285     __ vluxei8_v(idxV4, codec, inputV4);
6286 
6287     //   2. check wrong data
6288     __ vor_vv(outputV1, idxV1, idxV2);   // outputV1/outputV2 are only scratch here; they are recomputed in step 3
6289     __ vor_vv(outputV2, idxV3, idxV4);
6290     __ vor_vv(outputV1, outputV1, outputV2);
6291     __ vmseq_vi(v0, outputV1, -1);       // mask of elements where the OR of the four looked-up values is all ones
6292     __ vfirst_m(failedIdx, v0);
6293     Label NoFailure, FailureAtIdx0;
6294     // valid value can only be -1 when < 0
6295     __ bltz(failedIdx, NoFailure);
6296     // when the first data (at index 0) fails, no need to process data anymore
6297     __ beqz(failedIdx, FailureAtIdx0);
6298     __ vsetvli(x0, failedIdx, Assembler::e8, lmul, Assembler::mu, Assembler::tu); // only process the elements before the first invalid one
6299     __ slli(stepDst, failedIdx, 1);
6300     __ add(stepDst, failedIdx, stepDst); // stepDst = failedIdx * 3; note this overwrites the caller's stepDst register
6301     __ BIND(NoFailure);
6302 
6303     //   3. compute the decoded data: vr(4) => vr(3)
6304     __ vsll_vi(idxV1, idxV1, 2);
6305     __ vsrl_vi(outputV1, idxV2, 4);
6306     __ vor_vv(outputV1, outputV1, idxV1);
6307 
6308     __ vsll_vi(idxV2, idxV2, 4);
6309     __ vsrl_vi(outputV2, idxV3, 2);
6310     __ vor_vv(outputV2, outputV2, idxV2);
6311 
6312     __ vsll_vi(idxV3, idxV3, 6);
6313     __ vor_vv(outputV3, idxV4, idxV3);
6314 
6315     // segmented store decoded data in v registers back to dst: vr(3) => mem(dst)
6316     __ vsseg3e8_v(outputV1, dst);
6317 
6318     // dst = dst + register_group_len_bytes * 3
6319     __ add(dst, dst, stepDst);
6320     __ BIND(FailureAtIdx0); // reached directly when element 0 is invalid: nothing is stored and dst is not advanced
6321   }
6322 
6323   /**
6324    * int j.u.Base64.Decoder.decodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL, boolean isMIME)
6325    *
6326    *  Input arguments:
6327    *  c_rarg0   - src, source array
6328    *  c_rarg1   - sp, src start offset
6329    *  c_rarg2   - sl, src end offset
6330    *  c_rarg3   - dst, dest array
6331    *  c_rarg4   - dp, dst start offset
6332    *  c_rarg5   - isURL, Base64 or URL character set
6333    *  c_rarg6   - isMIME, Decoding MIME block
6334    */
6335   address generate_base64_decodeBlock() {
6336 
6337     static const uint8_t fromBase64[256] = { // character -> 6-bit value; 255 marks a character outside the standard alphabet
6338         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6339         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6340         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
6341         52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6342         255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
6343         15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u, 255u,
6344         255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
6345         41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
6346         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6347         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6348         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6349         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6350         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6351         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6352         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6353         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6354     };
6355 
6356     static const uint8_t fromBase64URL[256] = { // URL-safe variant: '-' maps to 62 and '_' maps to 63; 255 marks a character outside the alphabet
6357         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6358         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6359         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
6360         52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6361         255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
6362         15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,  63u,
6363         255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
6364         41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
6365         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6366         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6367         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6368         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6369         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6370         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6371         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6372         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6373     };
6374 
6375     __ align(CodeEntryAlignment);
6376     StubId stub_id = StubId::stubgen_base64_decodeBlock_id;
6377     StubCodeMark mark(this, stub_id);
6378     address start = __ pc();
6379     __ enter();
6380 
6381     Register src    = c_rarg0;
6382     Register soff   = c_rarg1;
6383     Register send   = c_rarg2;
6384     Register dst    = c_rarg3;
6385     Register doff   = c_rarg4;
6386     Register isURL  = c_rarg5;
6387     Register isMIME = c_rarg6;
6388 
6389     Register codec     = c_rarg7;
6390     Register dstBackup = t6;
6391     Register length    = t3;     // total length of src data in bytes
6392 
6393     Label ProcessData, Exit;
6394     Label ProcessScalar, ScalarLoop;
6395 
6396     // passed in length (send - soff) is guaranteed to be > 4,
6397     // and in this intrinsic we only process data of length in multiple of 4,
6398     // it's not guaranteed to be multiple of 4 by java level, so do it explicitly
6399     __ sub(length, send, soff);
6400     __ andi(length, length, -4); // round length down to a multiple of 4
6401     // real src/dst to process data
6402     __ add(src, src, soff);
6403     __ add(dst, dst, doff);
6404     // backup of dst, used to calculate the return value at exit
6405     __ mv(dstBackup, dst);
6406 
6407     // load the codec base address
6408     __ la(codec, ExternalAddress((address) fromBase64));
6409     __ beqz(isURL, ProcessData);
6410     __ la(codec, ExternalAddress((address) fromBase64URL));
6411     __ BIND(ProcessData);
6412 
6413     // vector version
6414     if (UseRVV) {
6415       // for MIME case, it has a default length limit of 76 which could be
6416       // different(smaller) from (send - soff), so in MIME case, we go through
6417       // the scalar code path directly.
6418       __ bnez(isMIME, ScalarLoop);
6419 
6420       Label ProcessM1, ProcessM2;
6421 
6422       // soff/send/doff/isURL are no longer needed at this point, so their registers are reused
6423       Register failedIdx = soff;
6424       Register stepSrcM1 = send;
6425       Register stepSrcM2 = doff;
6426       Register stepDst   = isURL;
6427       Register size      = t4;
6428 
6429       __ mv(size, MaxVectorSize * 2);        // elements requested per m2 round
6430       __ mv(stepSrcM1, MaxVectorSize * 4);   // src bytes consumed per m1 round (4 per element)
6431       __ slli(stepSrcM2, stepSrcM1, 1);      // src bytes consumed per m2 round (twice the m1 step)
6432       __ mv(stepDst, MaxVectorSize * 2 * 3); // dst bytes produced per m2 round (3 per element)
6433 
6434       __ blt(length, stepSrcM2, ProcessM1);
6435 
6436 
6437       // Assembler::m2
6438       __ BIND(ProcessM2);
6439       base64_vector_decode_round(src, dst, codec,
6440                     size, stepSrcM2, stepDst, failedIdx,
6441                     v2, v4, v6, v8,      // inputs
6442                     v10, v12, v14, v16,  // indexes
6443                     v18, v20, v22,       // outputs
6444                     Assembler::m2);
6445       __ sub(length, length, stepSrcM2);
6446 
6447       // error check
6448       // valid value of failedIdx can only be -1 when < 0
6449       __ bgez(failedIdx, Exit);
6450 
6451       __ bge(length, stepSrcM2, ProcessM2);
6452 
6453 
6454       // Assembler::m1
6455       __ BIND(ProcessM1);
6456       __ blt(length, stepSrcM1, ProcessScalar);
6457 
6458       __ srli(size, size, 1);       // halve the element count for the single m1 round
6459       __ srli(stepDst, stepDst, 1); // and the dst step accordingly
6460       base64_vector_decode_round(src, dst, codec,
6461                     size, stepSrcM1, stepDst, failedIdx,
6462                     v1, v2, v3, v4,      // inputs
6463                     v5, v6, v7, v8,      // indexes
6464                     v9, v10, v11,        // outputs
6465                     Assembler::m1);
6466       __ sub(length, length, stepSrcM1);
6467 
6468       // error check
6469       // valid value of failedIdx can only be -1 when < 0
6470       __ bgez(failedIdx, Exit);
6471 
6472       __ BIND(ProcessScalar);
6473       __ beqz(length, Exit);
6474     }
6475 
6476     // scalar version
6477     {
6478       Register byte0 = soff, byte1 = send, byte2 = doff, byte3 = isURL;
6479       Register combined32Bits = t4;
6480 
6481       // encoded:   [byte0[5:0] : byte1[5:0] : byte2[5:0] : byte3[5:0]] =>
6482       // plain:     [byte0[5:0]+byte1[5:4] : byte1[3:0]+byte2[5:2] : byte2[1:0]+byte3[5:0]]
6483       __ BIND(ScalarLoop);
6484 
6485       // load 4 bytes encoded src data
6486       __ lbu(byte0, Address(src, 0));
6487       __ lbu(byte1, Address(src, 1));
6488       __ lbu(byte2, Address(src, 2));
6489       __ lbu(byte3, Address(src, 3));
6490       __ addi(src, src, 4);
6491 
6492       // get codec index and decode (i.e. load from codec by index)
6493       // the table entries are loaded with lb (signed), so the 255 "invalid" marker becomes -1
6494       __ add(byte0, codec, byte0);
6495       __ add(byte1, codec, byte1);
6496       __ lb(byte0, Address(byte0, 0));
6497       __ lb(byte1, Address(byte1, 0));
6498       __ add(byte2, codec, byte2);
6499       __ add(byte3, codec, byte3);
6500       __ lb(byte2, Address(byte2, 0));
6501       __ lb(byte3, Address(byte3, 0));
6502       __ slliw(byte0, byte0, 18);
6503       __ slliw(byte1, byte1, 12);
6504       __ orr(byte0, byte0, byte1);
6505       __ orr(byte0, byte0, byte3);
6506       __ slliw(byte2, byte2, 6);
6507       // For performance consideration, `combined32Bits` is constructed for 2 purposes at the same time,
6508       //  1. error check below
6509       //  2. decode below
6510       __ orr(combined32Bits, byte0, byte2);
6511 
6512       // error check: any invalid input byte makes the combined value negative
6513       __ bltz(combined32Bits, Exit);
6514 
6515       // store 3 bytes decoded data
6516       __ sraiw(byte0, combined32Bits, 16);
6517       __ sraiw(byte1, combined32Bits, 8);
6518       __ sb(byte0, Address(dst, 0));
6519       __ sb(byte1, Address(dst, 1));
6520       __ sb(combined32Bits, Address(dst, 2));
6521 
6522       __ subi(length, length, 4);
6523       __ addi(dst, dst, 3);
6524       // loop back
6525       __ bnez(length, ScalarLoop);
6526     }
6527 
6528     __ BIND(Exit);
6529     __ sub(c_rarg0, dst, dstBackup); // return the number of bytes written to dst
6530 
6531     __ leave();
6532     __ ret();
6533 
6534     return (address) start;
6535   }
6534 
6535   void adler32_process_bytes(Register buff, Register s1, Register s2, VectorRegister vtable,
6536     VectorRegister vzero, VectorRegister vbytes, VectorRegister vs1acc, VectorRegister vs2acc,
6537     Register temp0, Register temp1, Register temp2,  Register temp3,
6538     VectorRegister vtemp1, VectorRegister vtemp2, int step, Assembler::LMUL lmul) {
6539     // Emits code that folds the next `step` bytes at buff into s1/s2 (no modulo); temp2 and vtemp2 are not used here.
6540     assert((lmul == Assembler::m4 && step == 64) ||
6541            (lmul == Assembler::m2 && step == 32) ||
6542            (lmul == Assembler::m1 && step == 16),
6543            "LMUL should be aligned with step: m4 and 64, m2 and 32 or m1 and 16");
6544     // Below is function for calculating Adler32 checksum with 64-, 32- or 16-byte step. LMUL=m4, m2 or m1 is used.
6545     // Intermediate results are in vs1acc/vs2acc (v12 ... v23 as passed by generate_updateBytesAdler32). Example below is for 64-byte step case.
6546     // We use b1, b2, ..., b64 to denote the 64 bytes loaded in each iteration.
6547     // In non-vectorized code, we update s1 and s2 as:
6548     //   s1 <- s1 + b1
6549     //   s2 <- s2 + s1
6550     //   s1 <- s1 + b2
6551     //   s2 <- s2 + s1
6552     //   ...
6553     //   s1 <- s1 + b64
6554     //   s2 <- s2 + s1
6555     // Putting above assignments together, we have:
6556     //   s1_new = s1 + b1 + b2 + ... + b64
6557     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b64) =
6558     //          = s2 + s1 * 64 + (b1 * 64 + b2 * 63 + ... + b64 * 1) =
6559     //          = s2 + s1 * 64 + (b1, b2, ... b64) dot (64, 63, ... 1)
6560 
6561     __ mv(temp3, step); // temp3 holds `step` for every vsetvli below and still holds it on exit
6562     // Load data
6563     __ vsetvli(temp0, temp3, Assembler::e8, lmul);
6564     __ vle8_v(vbytes, buff);
6565     __ addi(buff, buff, step); // advance the input pointer past the bytes just loaded
6566 
6567     // Upper bound reduction sum for s1_new:
6568     // 0xFF * 64 = 0x3FC0, so:
6569     // 1. Need to do vector-widening reduction sum
6570     // 2. It is safe to perform sign-extension during vmv.x.s with 16-bits elements
6571     __ vwredsumu_vs(vs1acc, vbytes, vzero);
6572     // Multiplication for s2_new
6573     __ vwmulu_vv(vs2acc, vtable, vbytes); // 16-bit products b_i * weight_i; vtable holds the weights (step, step-1, ..., 1)
6574 
6575     // s2 = s2 + s1 * step (the multiply is done as a left shift by log2(step)); uses the old s1
6576     __ slli(temp1, s1, exact_log2(step));
6577     __ add(s2, s2, temp1);
6578 
6579     // Summing up calculated results for s2_new
6580     if (MaxVectorSize > 16) {
6581       __ vsetvli(temp0, temp3, Assembler::e16, lmul);
6582     } else {
6583       // Half of vector-widening multiplication result is in successor of vs2acc
6584       // group for vlen == 16, in which case we need to double vector register
6585       // group width in order to reduction sum all of them
6586       Assembler::LMUL lmulx2 = (lmul == Assembler::m1) ? Assembler::m2 :
6587                                (lmul == Assembler::m2) ? Assembler::m4 : Assembler::m8;
6588       __ vsetvli(temp0, temp3, Assembler::e16, lmulx2);
6589     }
6590     // Upper bound for reduction sum:
6591     // 0xFF * (64 + 63 + ... + 2 + 1) = 0x817E0 max for whole register group, so:
6592     // 1. Need to do vector-widening reduction sum
6593     // 2. It is safe to perform sign-extension during vmv.x.s with 32-bits elements
6594     __ vwredsumu_vs(vtemp1, vs2acc, vzero);
6595 
6596     // Extracting results for:
6597     // s1_new
6598     __ vmv_x_s(temp0, vs1acc); // read while the element width is still e16 (set just above)
6599     __ add(s1, s1, temp0);
6600     // s2_new
6601     __ vsetvli(temp0, temp3, Assembler::e32, Assembler::m1); // switch to e32 to read the widened dot-product sum
6602     __ vmv_x_s(temp1, vtemp1);
6603     __ add(s2, s2, temp1);
6604   }
6605 
6606   /***
6607    *  int java.util.zip.Adler32.updateBytes(int adler, byte[] b, int off, int len)
6608    *
6609    *  Arguments:
6610    *
6611    *  Inputs:
6612    *   c_rarg0   - int   adler
6613    *   c_rarg1   - byte* buff (b + off)
6614    *   c_rarg2   - int   len
6615    *
6616    *  Output:
6617    *   c_rarg0   - int adler result
6618    */
6619   address generate_updateBytesAdler32() {
6620     __ align(CodeEntryAlignment);
6621     StubId stub_id = StubId::stubgen_updateBytesAdler32_id;
6622     StubCodeMark mark(this, stub_id);
6623     address start = __ pc();
6624 
6625     Label L_nmax, L_nmax_loop, L_nmax_loop_entry, L_by16, L_by16_loop,
6626       L_by16_loop_unroll, L_by1_loop, L_do_mod, L_combine, L_by1;
6627 
6628     // Aliases (adler and s1 share c_rarg0, so s2 has to be extracted before s1 is written)
6629     Register adler  = c_rarg0;
6630     Register s1     = c_rarg0;
6631     Register s2     = c_rarg3;
6632     Register buff   = c_rarg1;
6633     Register len    = c_rarg2;
6634     Register nmax  = c_rarg4;
6635     Register base  = c_rarg5;
6636     Register count = c_rarg6;
6637     Register temp0 = t3;
6638     Register temp1 = t4;
6639     Register temp2 = t5; // only forwarded to adler32_process_bytes, which does not use it
6640     Register temp3 = t6; // adler32_process_bytes leaves its `step` value in here
6641 
6642     VectorRegister vzero = v31;
6643     VectorRegister vbytes = v8; // group: v8, v9, v10, v11
6644     VectorRegister vs1acc = v12; // group: v12, v13, v14, v15
6645     VectorRegister vs2acc = v16; // group: v16, v17, v18, v19, v20, v21, v22, v23
6646     VectorRegister vtable_64 = v24; // group: v24, v25, v26, v27
6647     VectorRegister vtable_32 = v4; // group: v4, v5
6648     VectorRegister vtable_16 = v30;
6649     VectorRegister vtemp1 = v28;
6650     VectorRegister vtemp2 = v29; // only forwarded to adler32_process_bytes, which does not use it
6651 
6652     // Max number of bytes we can process before having to take the mod
6653     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
6654     const uint64_t BASE = 0xfff1;
6655     const uint64_t NMAX = 0x15B0;
6656 
6657     // Loops steps
6658     int step_64 = 64;
6659     int step_32 = 32;
6660     int step_16 = 16;
6661     int step_1  = 1;
6662 
6663     __ enter(); // Required for proper stackwalking of RuntimeStub frame
6664     __ mv(temp1, 64);
6665     __ vsetvli(temp0, temp1, Assembler::e8, Assembler::m4);
6666 
6667     // Generating accumulation coefficients for further calculations
6668     // vtable_64:
6669     __ vid_v(vtemp1);
6670     __ vrsub_vx(vtable_64, vtemp1, temp1); // vtable_64[i] = 64 - i
6671     // vtable_64 group now contains { 0x40, 0x3f, 0x3e, ..., 0x3, 0x2, 0x1 }
6672 
6673     // vtable_32:
6674     __ mv(temp1, 32);
6675     __ vsetvli(temp0, temp1, Assembler::e8, Assembler::m2);
6676     __ vid_v(vtemp1);
6677     __ vrsub_vx(vtable_32, vtemp1, temp1); // vtable_32[i] = 32 - i
6678     // vtable_32 group now contains { 0x20, 0x1f, 0x1e, ..., 0x3, 0x2, 0x1 }
6679 
6680     __ vsetivli(temp0, 16, Assembler::e8, Assembler::m1);
6681     // vtable_16:
6682     __ mv(temp1, 16);
6683     __ vid_v(vtemp1);
6684     __ vrsub_vx(vtable_16, vtemp1, temp1); // vtable_16[i] = 16 - i
6685     // vtable_16 now contains { 0x10, 0xf, 0xe, ..., 0x3, 0x2, 0x1 }
6686 
6687     __ vmv_v_i(vzero, 0); // zero seed operand for the vwredsumu reductions in adler32_process_bytes
6688 
6689     __ mv(base, BASE);
6690     __ mv(nmax, NMAX);
6691 
6692     // s1 is initialized to the lower 16 bits of adler
6693     // s2 is initialized to the upper 16 bits of adler
6694     __ srliw(s2, adler, 16); // s2 = ((adler >> 16) & 0xffff)
6695     __ zext(s1, adler, 16); // s1 = (adler & 0xffff)
6696 
6697     // The pipelined loop needs at least 16 elements for 1 iteration
6698     // It does check this, but it is more effective to skip to the cleanup loop
6699     __ mv(temp0, step_16);
6700     __ bgeu(len, temp0, L_nmax);
6701     __ beqz(len, L_combine); // nothing to process: just recombine s1 and s2
6702 
6703     // Jumping to L_by1_loop (which expects len = remaining bytes - 1)
6704     __ subi(len, len, step_1);
6705     __ j(L_by1_loop);
6706 
6707   __ bind(L_nmax);
6708     __ sub(len, len, nmax); // len = remaining - NMAX; negative means less than a full NMAX chunk is left
6709     __ subi(count, nmax, 16); // count = NMAX - 16
6710     __ bltz(len, L_by16);
6711 
6712   // Align L_nmax loop by 64: count = NMAX - 16 - 32 = 5504 = 86 * 64
6713   __ bind(L_nmax_loop_entry);
6714     __ subi(count, count, 32);
6715 
6716   __ bind(L_nmax_loop);
6717     adler32_process_bytes(buff, s1, s2, vtable_64, vzero,
6718       vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
6719       vtemp1, vtemp2, step_64, Assembler::m4);
6720     __ subi(count, count, step_64);
6721     __ bgtz(count, L_nmax_loop); // 86 iterations of 64 bytes = 5504 bytes
6722 
6723     // 48 bytes (three 16-byte iterations' worth) of this NMAX chunk are left: do them as 32 + 16
6724     adler32_process_bytes(buff, s1, s2, vtable_32, vzero,
6725       vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
6726       vtemp1, vtemp2, step_32, Assembler::m2);
6727     adler32_process_bytes(buff, s1, s2, vtable_16, vzero,
6728       vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
6729       vtemp1, vtemp2, step_16, Assembler::m1);
6730 
6731     // s1 = s1 % BASE
6732     __ remuw(s1, s1, base);
6733     // s2 = s2 % BASE
6734     __ remuw(s2, s2, base);
6735 
6736     __ sub(len, len, nmax); // same bookkeeping as at L_nmax, for the next chunk
6737     __ subi(count, nmax, 16);
6738     __ bgez(len, L_nmax_loop_entry);
6739 
6740   __ bind(L_by16);
6741     __ add(len, len, count); // undo the NMAX bias: len = remaining bytes - 16
6742     __ bltz(len, L_by1);
6743     // Trying to unroll
6744     __ mv(temp3, step_64);
6745     __ blt(len, temp3, L_by16_loop);
6746 
6747   __ bind(L_by16_loop_unroll);
6748     adler32_process_bytes(buff, s1, s2, vtable_64, vzero,
6749       vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
6750       vtemp1, vtemp2, step_64, Assembler::m4);
6751     __ subi(len, len, step_64);
6752     // By now the temp3 should still be 64 (adler32_process_bytes reloads it with its step, 64 here)
6753     __ bge(len, temp3, L_by16_loop_unroll);
6754 
6755   __ bind(L_by16_loop);
6756     adler32_process_bytes(buff, s1, s2, vtable_16, vzero,
6757       vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
6758       vtemp1, vtemp2, step_16, Assembler::m1);
6759     __ subi(len, len, step_16);
6760     __ bgez(len, L_by16_loop);
6761 
6762   __ bind(L_by1);
6763     __ addi(len, len, 15); // len was remaining - 16; now len = remaining bytes - 1
6764     __ bltz(len, L_do_mod); // no tail bytes left
6765 
6766   __ bind(L_by1_loop);
6767     __ lbu(temp0, Address(buff, 0));
6768     __ addi(buff, buff, step_1);
6769     __ add(s1, temp0, s1);
6770     __ add(s2, s2, s1);
6771     __ subi(len, len, step_1);
6772     __ bgez(len, L_by1_loop);
6773 
6774   __ bind(L_do_mod);
6775     // s1 = s1 % BASE
6776     __ remuw(s1, s1, base);
6777     // s2 = s2 % BASE
6778     __ remuw(s2, s2, base);
6779 
6780     // Combine lower bits and higher bits
6781     // adler = s1 | (s2 << 16)
6782   __ bind(L_combine);
6783     __ slli(s2, s2, 16);
6784     __ orr(s1, s1, s2); // s1 is c_rarg0, the documented result register
6785 
6786     __ leave(); // Required for proper stackwalking of RuntimeStub frame
6787     __ ret();
6788 
6789     return start;
6790   }
6791 
6792 #endif // COMPILER2
6793 
6794   // x10 = input (float16)
6795   // f10 = result (float)
6796   // t0, t1 = temporary registers
6797   address generate_float16ToFloat() {
6798     __ align(CodeEntryAlignment);
6799     StubId stub_id = StubId::stubgen_hf2f_id;
6800     StubCodeMark mark(this, stub_id);
6801     address entry = __ pc();
6802     BLOCK_COMMENT("float16ToFloat:");
6803 
6804     FloatRegister dst = f10;
6805     Register src = x10;
6806     Label NaN_SLOW;
6807 
6808     assert(VM_Version::supports_float16_float_conversion(), "must");
6809 
6810     // On riscv, NaN needs a special process as fcvt does not work in that case.
6811     // On riscv, Inf does not need a special process as fcvt can handle it correctly.
6812     // but we consider to get the slow path to process NaN and Inf at the same time,
6813     // as both of them are rare cases, and if we try to get the slow path to handle
6814     // only NaN case it would sacrifice the performance for normal cases,
6815     // i.e. non-NaN and non-Inf cases.
6816 
6817     // check whether it's a NaN or +/- Inf.
6818     __ mv(t0, 0x7c00); // mask of the five float16 exponent bits
6819     __ andr(t1, src, t0);
6820     // jump to stub processing NaN and Inf cases (exponent bits all ones).
6821     __ beq(t0, t1, NaN_SLOW);
6822 
6823     // non-NaN or non-Inf cases, just use built-in instructions.
6824     __ fmv_h_x(dst, src);
6825     __ fcvt_s_h(dst, dst);
6826     __ ret();
6827 
6828     __ bind(NaN_SLOW);
6829     // following instructions mainly focus on NaN, as riscv does not handle
6830     // NaN well with fcvt, but the code also works for Inf at the same time.
6831 
6832     // construct a NaN in 32 bits from the NaN in 16 bits,
6833     // we need the payloads of non-canonical NaNs to be preserved.
6834     __ mv(t1, 0x7f800000); // float32 exponent bits, all ones
6835     // sign-bit was already set via sign-extension if necessary.
6836     __ slli(t0, src, 13); // move the 10-bit float16 fraction to the top of the 23-bit float32 fraction
6837     __ orr(t1, t0, t1);
6838     __ fmv_w_x(dst, t1);
6839 
6840     __ ret();
6841     return entry;
6842   }
6843 
6844   // f10 = input (float)
6845   // x10 = result (float16)
6846   // f11 = temporary float register
6847   // t0, t1 = temporary registers
6848   address generate_floatToFloat16() {
6849     __ align(CodeEntryAlignment);
6850     StubId stub_id = StubId::stubgen_f2hf_id;
6851     StubCodeMark mark(this, stub_id);
6852     address entry = __ pc();
6853     BLOCK_COMMENT("floatToFloat16:");
6854 
6855     Register dst = x10;
6856     FloatRegister src = f10, ftmp = f11;
6857     Label NaN_SLOW;
6858 
6859     assert(VM_Version::supports_float16_float_conversion(), "must");
6860 
6861     // On riscv, NaN needs a special process as fcvt does not work in that case.
6862 
6863     // check whether it's a NaN.
6864     // replace fclass with feq as performance optimization.
6865     __ feq_s(t0, src, src); // t0 == 0 only when src compares unequal to itself, i.e. src is NaN
6866     // jump to stub processing NaN cases.
6867     __ beqz(t0, NaN_SLOW);
6868 
6869     // non-NaN cases, just use built-in instructions.
6870     __ fcvt_h_s(ftmp, src);
6871     __ fmv_x_h(dst, ftmp);
6872     __ ret();
6873 
6874     __ bind(NaN_SLOW);
6875     // NaN input: delegate to the MacroAssembler helper, with t0/t1 as scratch registers
6876     __ float_to_float16_NaN(dst, src, t0, t1);
6877 
6878     __ ret();
6879     return entry;
6880   }
6881 
6882 #ifdef COMPILER2
6883 
6884 static const int64_t right_2_bits = right_n_bits(2); // mask used to keep only the lowest two bits (see poly1305_reduce)
6885 static const int64_t right_3_bits = right_n_bits(3); // mask used to keep the lowest three bits of U_2 when unpacking
6886 
6887   // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers
6888   // are represented as long[5], with BITS_PER_LIMB = 26.
6889   // Pack five 26-bit limbs into three 64-bit registers.
6890   void poly1305_pack_26(Register dest0, Register dest1, Register dest2, Register src, Register tmp1, Register tmp2) {
6891     assert_different_registers(dest0, dest1, dest2, src, tmp1, tmp2);
6892     // src points at five jlong limbs; dest2 may be noreg, in which case bits above 128 are only checked (debug builds)
6893     // The goal is to have 128-bit value in dest2:dest1:dest0
6894     __ ld(dest0, Address(src, 0));    // 26 bits in dest0
6895 
6896     __ ld(tmp1, Address(src, sizeof(jlong)));
6897     __ slli(tmp1, tmp1, 26);
6898     __ add(dest0, dest0, tmp1);       // 52 bits in dest0
6899 
6900     __ ld(tmp2, Address(src, 2 * sizeof(jlong)));
6901     __ slli(tmp1, tmp2, 52);          // low 12 bits of limb 2 fill bits 52..63 of dest0
6902     __ add(dest0, dest0, tmp1);       // dest0 is full
6903 
6904     __ srli(dest1, tmp2, 12);         // 14-bit in dest1 (the remaining high bits of limb 2)
6905 
6906     __ ld(tmp1, Address(src, 3 * sizeof(jlong)));
6907     __ slli(tmp1, tmp1, 14);
6908     __ add(dest1, dest1, tmp1);       // 40-bit in dest1
6909 
6910     __ ld(tmp1, Address(src, 4 * sizeof(jlong)));
6911     __ slli(tmp2, tmp1, 40);          // low 24 bits of limb 4 fill bits 40..63 of dest1
6912     __ add(dest1, dest1, tmp2);       // dest1 is full
6913 
6914     if (dest2->is_valid()) {
6915       __ srli(tmp1, tmp1, 24);        // what is left of limb 4 after its low 24 bits went into dest1
6916       __ mv(dest2, tmp1);               // 2 bits in dest2
6917     } else {
6918 #ifdef ASSERT
6919       Label OK;
6920       __ srli(tmp1, tmp1, 24);
6921       __ beq(zr, tmp1, OK);           // 2 bits
6922       __ stop("high bits of Poly1305 integer should be zero");
6923       __ should_not_reach_here();
6924       __ bind(OK);
6925 #endif
6926     }
6927   }
6928 
6929   // As above, but return only a 128-bit integer, packed into two
6930   // 64-bit registers.
6931   void poly1305_pack_26(Register dest0, Register dest1, Register src, Register tmp1, Register tmp2) {
6932     poly1305_pack_26(dest0, dest1, noreg, src, tmp1, tmp2); // noreg as dest2: no third register is written
6933   }
6934 
6935   // U_2:U_1:U_0: += (U_2 >> 2) * 5
6936   void poly1305_reduce(Register U_2, Register U_1, Register U_0, Register tmp1, Register tmp2) {
6937     assert_different_registers(U_2, U_1, U_0, tmp1, tmp2);
6938     // The multiply by 5 is done as two additions: x, then x << 2, where x = U_2 >> 2
6939     // First, U_2:U_1:U_0 += (U_2 >> 2)
6940     __ srli(tmp1, U_2, 2);
6941     __ cad(U_0, U_0, tmp1, tmp2); // Add tmp1 to U_0 with carry output to tmp2
6942     __ andi(U_2, U_2, right_2_bits); // Clear U_2 except for the lowest two bits
6943     __ cad(U_1, U_1, tmp2, tmp2); // Add carry to U_1 with carry output to tmp2
6944     __ add(U_2, U_2, tmp2);
6945 
6946     // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2   (tmp1 still holds the original U_2 >> 2)
6947     __ slli(tmp1, tmp1, 2);
6948     __ cad(U_0, U_0, tmp1, tmp2); // Add tmp1 to U_0 with carry output to tmp2
6949     __ cad(U_1, U_1, tmp2, tmp2); // Add carry to U_1 with carry output to tmp2
6950     __ add(U_2, U_2, tmp2);
6951   }
6952 
6953   // Poly1305, RFC 7539
6954   // void com.sun.crypto.provider.Poly1305.processMultipleBlocks(byte[] input, int offset, int length, long[] aLimbs, long[] rLimbs)
6955 
6956   // Arguments:
6957   //    c_rarg0:   input_start -- where the input is stored
6958   //    c_rarg1:   length
6959   //    c_rarg2:   acc_start -- where the output will be stored
6960   //    c_rarg3:   r_start -- where the randomly generated 128-bit key is stored
6961 
6962   // See https://loup-vaillant.fr/tutorials/poly1305-design for a
6963   // description of the tricks used to simplify and accelerate this
6964   // computation.
6965 
6966   address generate_poly1305_processBlocks() {
6967     __ align(CodeEntryAlignment);
6968     StubId stub_id = StubId::stubgen_poly1305_processBlocks_id;
6969     StubCodeMark mark(this, stub_id);
6970     address start = __ pc();
6971     __ enter();
6972     Label here; // NOTE(review): never bound or used as a branch target in this function
6973 
6974     RegSet saved_regs = RegSet::range(x18, x21); // pushed here and popped again at DONE
6975     RegSetIterator<Register> regs = (RegSet::range(x14, x31) - RegSet::range(x22, x27)).begin(); // working registers are handed out from x14..x31 minus x22..x27
6976     __ push_reg(saved_regs, sp);
6977 
6978     // Arguments
6979     const Register input_start = c_rarg0, length = c_rarg1, acc_start = c_rarg2, r_start = c_rarg3;
6980 
6981     // R_n is the 128-bit randomly-generated key, packed into two
6982     // registers. The caller passes this key to us as long[5], with
6983     // BITS_PER_LIMB = 26.
6984     const Register R_0 = *regs, R_1 = *++regs;
6985     poly1305_pack_26(R_0, R_1, r_start, t1, t2);
6986 
6987     // RR_n is (R_n >> 2) * 5
6988     const Register RR_0 = *++regs, RR_1 = *++regs;
6989     __ srli(t1, R_0, 2);
6990     __ shadd(RR_0, t1, t1, t2, 2); // presumably RR_0 = (t1 << 2) + t1 = 5 * t1, with t2 as scratch (shadd is defined elsewhere)
6991     __ srli(t1, R_1, 2);
6992     __ shadd(RR_1, t1, t1, t2, 2);
6993 
6994     // U_n is the current checksum
6995     const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs;
6996     poly1305_pack_26(U_0, U_1, U_2, acc_start, t1, t2);
6997 
6998     static constexpr int BLOCK_LENGTH = 16;
6999     Label DONE, LOOP;
7000 
7001     __ mv(t1, BLOCK_LENGTH);
7002     __ blt(length, t1, DONE); { // less than one full block: skip to the epilogue without writing acc_start
7003       __ bind(LOOP);
7004 
7005       // S_n is to be the sum of U_n and the next block of data
7006       const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs;
7007       __ ld(S_0, Address(input_start, 0));
7008       __ ld(S_1, Address(input_start, wordSize));
7009 
7010       __ cad(S_0, S_0, U_0, t1); // Add U_0 to S_0 with carry output to t1
7011       __ cadc(S_1, S_1, U_1, t1); // Add U_1 with carry to S_1 with carry output to t1
7012       __ add(S_2, U_2, t1);
7013 
7014       __ addi(S_2, S_2, 1); // adds 2^128 to the sum; presumably the per-block high bit Poly1305 appends (confirm against RFC 7539)
7015 
7016       const Register U_0HI = *++regs, U_1HI = *++regs;
7017 
7018       // NB: this logic depends on some of the special properties of
7019       // Poly1305 keys. In particular, because we know that the top
7020       // four bits of R_0 and R_1 are zero, we can add together
7021       // partial products without any risk of needing to propagate a
7022       // carry out.
7023       __ wide_mul(U_0, U_0HI, S_0, R_0);
7024       __ wide_madd(U_0, U_0HI, S_1, RR_1, t1, t2);
7025       __ wide_madd(U_0, U_0HI, S_2, RR_0, t1, t2);
7026 
7027       __ wide_mul(U_1, U_1HI, S_0, R_1);
7028       __ wide_madd(U_1, U_1HI, S_1, R_0, t1, t2);
7029       __ wide_madd(U_1, U_1HI, S_2, RR_1, t1, t2);
7030 
7031       __ andi(U_2, R_0, right_2_bits);
7032       __ mul(U_2, S_2, U_2);
7033 
7034       // Partial reduction mod 2**130 - 5
7035       __ cad(U_1, U_1, U_0HI, t1); // Add U_0HI to U_1 with carry output to t1
7036       __ adc(U_2, U_2, U_1HI, t1);
7037       // Sum is now in U_2:U_1:U_0.
7038 
7039       // U_2:U_1:U_0: += (U_2 >> 2) * 5
7040       poly1305_reduce(U_2, U_1, U_0, t1, t2);
7041 
7042       __ subi(length, length, BLOCK_LENGTH);
7043       __ addi(input_start, input_start, BLOCK_LENGTH);
7044       __ mv(t1, BLOCK_LENGTH); // t1 was clobbered as scratch above, so reload the block length
7045       __ bge(length, t1, LOOP); // a trailing partial block (< 16 bytes) is not consumed here
7046     }
7047 
7048     // Further reduce modulo 2^130 - 5
7049     poly1305_reduce(U_2, U_1, U_0, t1, t2);
7050 
7051     // Unpack the sum into five 26-bit limbs and write to memory.
7052     // First 26 bits is the first limb
7053     __ slli(t1, U_0, 38); // Take lowest 26 bits
7054     __ srli(t1, t1, 38);
7055     __ sd(t1, Address(acc_start)); // First 26-bit limb
7056 
7057     // 27-52 bits of U_0 is the second limb
7058     __ slli(t1, U_0, 12); // Take next 27-52 bits
7059     __ srli(t1, t1, 38);
7060     __ sd(t1, Address(acc_start, sizeof (jlong))); // Second 26-bit limb
7061 
7062     // Getting 53-64 bits of U_0 and 1-14 bits of U_1 in one register
7063     __ srli(t1, U_0, 52);
7064     __ slli(t2, U_1, 50);
7065     __ srli(t2, t2, 38); // the low 14 bits of U_1 end up at bit positions 12..25
7066     __ add(t1, t1, t2);
7067     __ sd(t1, Address(acc_start, 2 * sizeof (jlong))); // Third 26-bit limb
7068 
7069     // Storing 15-40 bits of U_1
7070     __ slli(t1, U_1, 24); // Already used up 14 bits
7071     __ srli(t1, t1, 38); // Clear all other bits from t1
7072     __ sd(t1, Address(acc_start, 3 * sizeof (jlong))); // Fourth 26-bit limb
7073 
7074     // Storing 41-64 bits of U_1 and first three bits from U_2 in one register
7075     __ srli(t1, U_1, 40);
7076     __ andi(t2, U_2, right_3_bits);
7077     __ slli(t2, t2, 24);
7078     __ add(t1, t1, t2);
7079     __ sd(t1, Address(acc_start, 4 * sizeof (jlong))); // Fifth 26-bit limb
7080 
7081     __ bind(DONE);
7082     __ pop_reg(saved_regs, sp);
7083     __ leave(); // Required for proper stackwalking
7084     __ ret();
7085 
7086     return start;
7087   }
7088 
7089   address generate_arrays_hashcode_powers_of_31() {
7090     assert(UseRVV, "sanity");
7091     const int lmul = 2;
7092     const int stride = MaxVectorSize / sizeof(jint) * lmul; // number of jint elements in a vector register group of `lmul` registers
7093     __ align(CodeEntryAlignment);
7094     StubCodeMark mark(this, "StubRoutines", "arrays_hashcode_powers_of_31");
7095     address start = __ pc();
7096     for (int i = stride; i >= 0; i--) { // emits a data table of stride + 1 entries: 31^stride, ..., 31^1, 31^0
7097         jint power_of_31 = 1;
7098         for (int j = i; j > 0; j--) {
7099           power_of_31 = java_multiply(power_of_31, 31); // presumably wraps like Java int multiplication (helper defined elsewhere)
7100         }
7101         __ emit_int32(power_of_31);
7102     }
7103 
7104     return start; // address of the first (highest-power) table entry
7105   }
7106 
7107 #endif // COMPILER2
7108 
7109   /**
7110    *  Arguments:
7111    *
7112    * Inputs:
7113    *   c_rarg0   - int crc
7114    *   c_rarg1   - byte* buf
7115    *   c_rarg2   - int length
7116    *
7117    * Output:
7118    *   c_rarg0   - int crc result
7119    */
7120   address generate_updateBytesCRC32() {
7121     assert(UseCRC32Intrinsics, "what are we doing here?");
7122 
7123     __ align(CodeEntryAlignment);
7124     StubId stub_id = StubId::stubgen_updateBytesCRC32_id;
7125     StubCodeMark mark(this, stub_id);
7126 
7127     address start = __ pc();
7128 
7129     // input parameters
7130     const Register crc    = c_rarg0;  // crc
7131     const Register buf    = c_rarg1;  // source java byte array address
7132     const Register len    = c_rarg2;  // length
7133 
7134     BLOCK_COMMENT("Entry:");
7135     __ enter(); // required for proper stackwalking of RuntimeStub frame
7136     // The whole computation is emitted by MacroAssembler::kernel_crc32; this stub only supplies registers and the frame
7137     __ kernel_crc32(crc, buf, len,
7138                     c_rarg3, c_rarg4, c_rarg5, c_rarg6, // tmp's for tables
7139                     c_rarg7, t2, t3, t4, t5, t6);       // misc tmps
7140 
7141     __ leave(); // required for proper stackwalking of RuntimeStub frame
7142     __ ret();
7143 
7144     return start;
7145   }
7146 
7147   // exception handler for upcall stubs
7148   address generate_upcall_stub_exception_handler() {
7149     StubId stub_id = StubId::stubgen_upcall_stub_exception_handler_id;
7150     StubCodeMark mark(this, stub_id);
7151     address start = __ pc();
7152 
7153     // Native caller has no idea how to handle exceptions,
7154     // so we just crash here. Up to callee to catch exceptions.
7155     __ verify_oop(x10); // return a exception oop in a0
7156     __ rt_call(CAST_FROM_FN_PTR(address, UpcallLinker::handle_uncaught_exception));
7157     __ should_not_reach_here(); // the runtime call above is not expected to return
7158 
7159     return start;
7160   }
7161 
7162   // load Method* target of MethodHandle
7163   // j_rarg0 = jobject receiver
7164   // xmethod = Method* result
7165   address generate_upcall_stub_load_target() {
7166 
7167     StubId stub_id = StubId::stubgen_upcall_stub_load_target_id;
7168     StubCodeMark mark(this, stub_id);
7169     address start = __ pc();
7170 
7171     __ resolve_global_jobject(j_rarg0, t0, t1);
7172       // Load target method from receiver: MethodHandle.form -> LambdaForm.vmentry -> MemberName.method -> ResolvedMethodName.vmtarget
7173     __ load_heap_oop(xmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), t0, t1);
7174     __ load_heap_oop(xmethod, Address(xmethod, java_lang_invoke_LambdaForm::vmentry_offset()), t0, t1);
7175     __ load_heap_oop(xmethod, Address(xmethod, java_lang_invoke_MemberName::method_offset()), t0, t1);
7176     __ access_load_at(T_ADDRESS, IN_HEAP, xmethod,
7177                       Address(xmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()),
7178                       noreg, noreg);
7179     __ sd(xmethod, Address(xthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized
7180 
7181     __ ret(); // no enter()/leave() pair is emitted in this stub
7182 
7183     return start;
7184   }
7185 
7186 #undef __
7187 
7188   // Initialization
7189   void generate_preuniverse_stubs() {
7190     // Intentionally empty: preuniverse stubs are not needed for riscv
7191   }
7192 
7193   void generate_initial_stubs() {
7194     // Generate initial stubs and initializes the entry points
7195 
7196     // entry points that exist in all platforms Note: This is code
7197     // that could be shared among different platforms - however the
7198     // benefit seems to be smaller than the disadvantage of having a
7199     // much more complicated generator structure. See also comment in
7200     // stubRoutines.hpp.
7201 
7202     StubRoutines::_forward_exception_entry = generate_forward_exception();
7203 
7204     if (UnsafeMemoryAccess::_table == nullptr) { // create the table only if it does not exist yet
7205       UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
7206     }
7207 
7208     StubRoutines::_call_stub_entry =
7209       generate_call_stub(StubRoutines::_call_stub_return_address);
7210 
7211     // is referenced by megamorphic call
7212     StubRoutines::_catch_exception_entry = generate_catch_exception();
7213 
7214     if (UseCRC32Intrinsics) {
7215       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
7216     }
7217     // The two float16 conversion stubs are generated together, and only when both intrinsics are available
7218     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) &&
7219         vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) {
7220       StubRoutines::_hf2f = generate_float16ToFloat();
7221       StubRoutines::_f2hf = generate_floatToFloat16();
7222     }
7223   }
7224 
7225   void generate_continuation_stubs() {
7226     // Continuation stubs: all four are generated unconditionally and published through StubRoutines
7227     StubRoutines::_cont_thaw             = generate_cont_thaw();
7228     StubRoutines::_cont_returnBarrier    = generate_cont_returnBarrier();
7229     StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
7230     StubRoutines::_cont_preempt_stub     = generate_cont_preempt_stub();
7231   }
7232 
7233   void generate_final_stubs() {
7234     // support for verify_oop (must happen after universe_init)
7235     if (VerifyOops) {
7236       StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
7237     }
7238 
7239     // arraycopy stubs used by compilers
7240     generate_arraycopy_stubs();
7241 
7242     StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
7243 
7244 #ifdef COMPILER2
7245     if (UseSecondarySupersTable) {
7246       StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
7247       if (!InlineSecondarySupersTest) {
7248         generate_lookup_secondary_supers_table_stub(); // no return value is stored here
7249       }
7250     }
7251 #endif // COMPILER2
7252 
7253     StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
7254     StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();
7255 
7256     StubRoutines::riscv::set_completed(); // last step of this function, after every stub above has been generated
7257   }
7258 
7259   void generate_compiler_stubs() {
7260 #ifdef COMPILER2
7261     if (UseMulAddIntrinsic) {
7262       StubRoutines::_mulAdd = generate_mulAdd();
7263     }
7264 
7265     if (UseMultiplyToLenIntrinsic) {
7266       StubRoutines::_multiplyToLen = generate_multiplyToLen();
7267     }
7268 
7269     if (UseSquareToLenIntrinsic) {
7270       StubRoutines::_squareToLen = generate_squareToLen();
7271     }
7272 
7273     if (UseMontgomeryMultiplyIntrinsic) {
7274       StubId stub_id = StubId::stubgen_montgomeryMultiply_id;
7275       StubCodeMark mark(this, stub_id);
7276       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
7277       StubRoutines::_montgomeryMultiply = g.generate_multiply();
7278     }
7279 
7280     if (UseMontgomerySquareIntrinsic) {
7281       StubId stub_id = StubId::stubgen_montgomerySquare_id;
7282       StubCodeMark mark(this, stub_id);
7283       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
7284       StubRoutines::_montgomerySquare = g.generate_square();
7285     }
7286 
7287     if (UseAESIntrinsics) {
7288       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
7289       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
7290       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
7291       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
7292     }
7293 
7294     if (UseAESCTRIntrinsics) {
7295       StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
7296     }
7297 
7298     if (UseGHASHIntrinsics) {
7299       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
7300     }
7301 
7302     if (UsePoly1305Intrinsics) {
7303       StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
7304     }
7305 
7306     if (UseRVV) {
7307       StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift();
7308       StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
7309     }
7310 
7311     if (UseVectorizedHashCodeIntrinsic && UseRVV) {
7312       StubRoutines::riscv::_arrays_hashcode_powers_of_31 = generate_arrays_hashcode_powers_of_31();
7313     }
7314 
7315     if (UseSHA256Intrinsics) {
7316       Sha2Generator sha2(_masm, this);
7317       StubRoutines::_sha256_implCompress   = sha2.generate_sha256_implCompress(StubId::stubgen_sha256_implCompress_id);
7318       StubRoutines::_sha256_implCompressMB = sha2.generate_sha256_implCompress(StubId::stubgen_sha256_implCompressMB_id);
7319     }
7320 
7321     if (UseSHA512Intrinsics) {
7322       Sha2Generator sha2(_masm, this);
7323       StubRoutines::_sha512_implCompress   = sha2.generate_sha512_implCompress(StubId::stubgen_sha512_implCompress_id);
7324       StubRoutines::_sha512_implCompressMB = sha2.generate_sha512_implCompress(StubId::stubgen_sha512_implCompressMB_id);
7325     }
7326 
7327     if (UseMD5Intrinsics) {
7328       StubRoutines::_md5_implCompress   = generate_md5_implCompress(StubId::stubgen_md5_implCompress_id);
7329       StubRoutines::_md5_implCompressMB = generate_md5_implCompress(StubId::stubgen_md5_implCompressMB_id);
7330     }
7331 
7332     if (UseChaCha20Intrinsics) {
7333       StubRoutines::_chacha20Block = generate_chacha20Block();
7334     }
7335 
7336     if (UseSHA1Intrinsics) {
7337       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(StubId::stubgen_sha1_implCompress_id);
7338       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(StubId::stubgen_sha1_implCompressMB_id);
7339     }
7340 
7341     if (UseBASE64Intrinsics) {
7342       StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
7343       StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
7344     }
7345 
7346     if (UseAdler32Intrinsics) {
7347       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
7348     }
7349 
7350     generate_compare_long_strings();
7351 
7352     generate_string_indexof_stubs();
7353 
7354 #endif // COMPILER2
7355   }
7356 
7357  public:
7358   StubGenerator(CodeBuffer* code, BlobId blob_id, AOTStubData* stub_data) : StubCodeGenerator(code, blob_id, stub_data) {
         // The base class receives code, blob_id and stub_data unchanged. The
         // body then selects the generate_*_stubs() routine that matches
         // blob_id; every recognised case returns straight after its call,
         // and an id with no matching case is reported through fatal().
7359     switch (blob_id) {
7360     case BlobId::stubgen_preuniverse_id:
7361       generate_preuniverse_stubs();
7362       return;
7363     case BlobId::stubgen_initial_id:
7364       generate_initial_stubs();
7365       return;
7366     case BlobId::stubgen_continuation_id:
7367       generate_continuation_stubs();
7368       return;
7369     case BlobId::stubgen_compiler_id:
7370       generate_compiler_stubs();
7371       return;
7372     case BlobId::stubgen_final_id:
7373       generate_final_stubs();
7374       return;
7375     default:
7376       fatal("unexpected blob id: %s", StubInfo::name(blob_id));
7377       return;
7378     }
7379   }
7380 }; // end class declaration
7381 
7382 void StubGenerator_generate(CodeBuffer* code, BlobId blob_id, AOTStubData* stub_data) {
       // The StubGenerator constructor calls the generate_*_stubs() routine
       // selected by blob_id, so constructing the object is the whole job;
       // the local is never used afterwards and goes out of scope on return.
7383   StubGenerator generator(code, blob_id, stub_data);
7384 }