Old src/hotspot/cpu/riscv/stubGenerator

   1 /*
   2  * Copyright (c) 2003, 2026, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2025, Red Hat Inc. All rights reserved.
   4  * Copyright (c) 2020, 2025, Huawei Technologies Co., Ltd. All rights reserved.
   5  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   6  *
   7  * This code is free software; you can redistribute it and/or modify it
   8  * under the terms of the GNU General Public License version 2 only, as
   9  * published by the Free Software Foundation.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  *
  25  */
  26 
  27 #include "asm/macroAssembler.hpp"
  28 #include "asm/macroAssembler.inline.hpp"
  29 #include "compiler/oopMap.hpp"
  30 #include "gc/shared/barrierSet.hpp"
  31 #include "gc/shared/barrierSetAssembler.hpp"
  32 #include "interpreter/interpreter.hpp"
  33 #include "memory/universe.hpp"
  34 #include "nativeInst_riscv.hpp"
  35 #include "oops/instanceOop.hpp"
  36 #include "oops/method.hpp"
  37 #include "oops/objArrayKlass.hpp"
  38 #include "oops/oop.inline.hpp"
  39 #include "prims/methodHandles.hpp"
  40 #include "prims/upcallLinker.hpp"
  41 #include "runtime/continuation.hpp"
  42 #include "runtime/continuationEntry.inline.hpp"
  43 #include "runtime/frame.inline.hpp"
  44 #include "runtime/handles.inline.hpp"
  45 #include "runtime/javaThread.hpp"
  46 #include "runtime/sharedRuntime.hpp"
  47 #include "runtime/stubCodeGenerator.hpp"
  48 #include "runtime/stubRoutines.hpp"
  49 #include "utilities/align.hpp"
  50 #include "utilities/powerOfTwo.hpp"
  51 #ifdef COMPILER2
  52 #include "opto/runtime.hpp"
  53 #endif
  54 
  55 // Declaration and definition of StubGenerator (no .hpp file).
  56 // For a more detailed description of the stub routine structure
  57 // see the comment in stubRoutines.hpp
  58 
  59 #undef __
     // Shorthand used by the stub code below: __ foo(...) expands to _masm->foo(...).
  60 #define __ _masm->
  61 
     // BLOCK_COMMENT(str) expands to nothing in PRODUCT builds; in all other
     // builds it calls block_comment(str) on the assembler.
  62 #ifdef PRODUCT
  63 #define BLOCK_COMMENT(str) /* nothing */
  64 #else
  65 #define BLOCK_COMMENT(str) __ block_comment(str)
  66 #endif
  67 
     // BIND(label) is written as __ BIND(L): it binds the label and then hands
     // the stringified label name followed by a colon to BLOCK_COMMENT.
     // Note that the expansion consists of two statements.
  68 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  69 
  70 // Stub Code definitions
  71 
  72 class StubGenerator: public StubCodeGenerator {
  73  private:
  74 
  75 #ifdef PRODUCT
     // PRODUCT builds: inc_counter_np(counter) is a no-op expression.
  76 #define inc_counter_np(counter) ((void)0)
  77 #else
     // Emits an incrementw on the memory location of the given counter.
  78   void inc_counter_np_(uint& counter) {
  79     __ incrementw(ExternalAddress((address)&counter));
  80   }
     // Non-PRODUCT builds: emit a block comment naming the counter, then the
     // increment code. The expansion is two statements and already ends with
     // a semicolon.
  81 #define inc_counter_np(counter) \
  82   BLOCK_COMMENT("inc_counter " #counter); \
  83   inc_counter_np_(counter);
  84 #endif
  85 
  86   // Call stubs are used to call Java from C
  87   //
  88   // Arguments:
  89   //    c_rarg0:   call wrapper address                   address
  90   //    c_rarg1:   result                                 address
  91   //    c_rarg2:   result type                            BasicType
  92   //    c_rarg3:   method                                 Method*
  93   //    c_rarg4:   (interpreter) entry point              address
  94   //    c_rarg5:   parameters                             intptr_t*
  95   //    c_rarg6:   parameter size (in words)              int
  96   //    c_rarg7:   thread                                 Thread*
  97   //
  98   // There is no return from the stub itself as any Java result
  99   // is written to result
 100   //
 101   // we save x1 (ra) as the return PC at the base of the frame and
 102   // link x8 (fp) below it as the frame pointer installing sp (x2)
 103   // into fp.
 104   //
 105   // we save x10-x17, which accounts for all the c arguments.
 106   //
 107   // TODO: strictly do we need to save them all? they are treated as
 108   // volatile by C so could we omit saving the ones we are going to
 109   // place in global registers (thread? method?) or those we only use
 110   // during setup of the Java call?
 111   //
 112   // we don't need to save x5 which C uses as an indirect result location
 113   // return register.
 114   //
 115   // we don't need to save x6-x7 and x28-x31 which both C and Java treat as
 116   // volatile
 117   //
 118   // we save x9, x18-x27, f8-f9, and f18-f27 which Java uses as temporary
 119   // registers and C expects to be callee-save
 120   //
 121   // so the stub frame looks like this when we enter Java code
 122   //
 123   //     [ return_from_Java     ] <--- sp
 124   //     [ argument word n      ]
 125   //      ...
 126   // -35 [ argument word 1      ]
 127   // -34 [ saved FRM in Floating-point Control and Status Register ] <--- sp_after_call
 128   // -33 [ saved f27            ]
 129   // -32 [ saved f26            ]
 130   // -31 [ saved f25            ]
 131   // -30 [ saved f24            ]
 132   // -29 [ saved f23            ]
 133   // -28 [ saved f22            ]
 134   // -27 [ saved f21            ]
 135   // -26 [ saved f20            ]
 136   // -25 [ saved f19            ]
 137   // -24 [ saved f18            ]
 138   // -23 [ saved f9             ]
 139   // -22 [ saved f8             ]
 140   // -21 [ saved x27            ]
 141   // -20 [ saved x26            ]
 142   // -19 [ saved x25            ]
 143   // -18 [ saved x24            ]
 144   // -17 [ saved x23            ]
 145   // -16 [ saved x22            ]
 146   // -15 [ saved x21            ]
 147   // -14 [ saved x20            ]
 148   // -13 [ saved x19            ]
 149   // -12 [ saved x18            ]
 150   // -11 [ saved x9             ]
 151   // -10 [ call wrapper   (x10) ]
 152   //  -9 [ result         (x11) ]
 153   //  -8 [ result type    (x12) ]
 154   //  -7 [ method         (x13) ]
 155   //  -6 [ entry point    (x14) ]
 156   //  -5 [ parameters     (x15) ]
 157   //  -4 [ parameter size (x16) ]
 158   //  -3 [ thread         (x17) ]
 159   //  -2 [ saved fp       (x8)  ]
 160   //  -1 [ saved ra       (x1)  ]
 161   //   0 [                      ] <--- fp == saved sp (x2)
 162 
 163   // Call stub stack layout word offsets from fp
 164   enum call_stub_layout {
         // All values are word offsets relative to fp; generate_call_stub()
         // multiplies them by wordSize to form fp-relative addresses.

         // Lowest slot of the register save area. sp is moved to this slot
         // right after the frame has been entered.
 165     sp_after_call_off  = -34,
 166 
         // The saved rounding mode (frm) occupies the slot at sp_after_call_off.
 167     frm_off            = sp_after_call_off,
         // Save slots for the floating-point registers f27..f18, f9 and f8.
 168     f27_off            = -33,
 169     f26_off            = -32,
 170     f25_off            = -31,
 171     f24_off            = -30,
 172     f23_off            = -29,
 173     f22_off            = -28,
 174     f21_off            = -27,
 175     f20_off            = -26,
 176     f19_off            = -25,
 177     f18_off            = -24,
 178     f9_off             = -23,
 179     f8_off             = -22,
 180 
         // Save slots for the integer registers x27..x18 and x9.
 181     x27_off            = -21,
 182     x26_off            = -20,
 183     x25_off            = -19,
 184     x24_off            = -18,
 185     x23_off            = -17,
 186     x22_off            = -16,
 187     x21_off            = -15,
 188     x20_off            = -14,
 189     x19_off            = -13,
 190     x18_off            = -12,
 191     x9_off             = -11,
 192 
         // Save slots for the eight C arguments of the stub (c_rarg0..c_rarg7).
 193     call_wrapper_off   = -10,
 194     result_off         = -9,
 195     result_type_off    = -8,
 196     method_off         = -7,
 197     entry_point_off    = -6,
 198     parameters_off     = -5,
 199     parameter_size_off = -4,
 200     thread_off         = -3,
         // Saved frame pointer and return address, directly below fp.
 201     fp_f               = -2,
 202     retaddr_off        = -1,
 203   };
 204 
 205   address generate_call_stub(address& return_address) {
         // Emits the call stub and returns the address of its first instruction.
         // The stub saves the C arguments and callee-saved registers in the
         // frame described by call_stub_layout, copies the Java parameters
         // onto the stack, calls the entry point, stores the result according
         // to the result type, restores the saved state and returns.
         //
         // return_address (out): set to the pc directly after the jalr into
         // the Java entry point.

         // The frame constants must agree with the layout used here.
 206     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 207            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 208            "adjust this code");
 209 
 210     StubId stub_id = StubId::stubgen_call_stub_id;
 211     StubCodeMark mark(this, stub_id);
 212     address start = __ pc();
 213 
         // fp-relative addresses of all save slots (word offsets scaled by wordSize).
 214     const Address sp_after_call (fp, sp_after_call_off  * wordSize);
 215 
 216     const Address frm_save      (fp, frm_off           * wordSize);
 217     const Address call_wrapper  (fp, call_wrapper_off   * wordSize);
 218     const Address result        (fp, result_off         * wordSize);
 219     const Address result_type   (fp, result_type_off    * wordSize);
 220     const Address method        (fp, method_off         * wordSize);
 221     const Address entry_point   (fp, entry_point_off    * wordSize);
 222     const Address parameters    (fp, parameters_off     * wordSize);
 223     const Address parameter_size(fp, parameter_size_off * wordSize);
 224 
 225     const Address thread        (fp, thread_off         * wordSize);
 226 
 227     const Address f27_save      (fp, f27_off            * wordSize);
 228     const Address f26_save      (fp, f26_off            * wordSize);
 229     const Address f25_save      (fp, f25_off            * wordSize);
 230     const Address f24_save      (fp, f24_off            * wordSize);
 231     const Address f23_save      (fp, f23_off            * wordSize);
 232     const Address f22_save      (fp, f22_off            * wordSize);
 233     const Address f21_save      (fp, f21_off            * wordSize);
 234     const Address f20_save      (fp, f20_off            * wordSize);
 235     const Address f19_save      (fp, f19_off            * wordSize);
 236     const Address f18_save      (fp, f18_off            * wordSize);
 237     const Address f9_save       (fp, f9_off             * wordSize);
 238     const Address f8_save       (fp, f8_off             * wordSize);
 239 
 240     const Address x27_save      (fp, x27_off            * wordSize);
 241     const Address x26_save      (fp, x26_off            * wordSize);
 242     const Address x25_save      (fp, x25_off            * wordSize);
 243     const Address x24_save      (fp, x24_off            * wordSize);
 244     const Address x23_save      (fp, x23_off            * wordSize);
 245     const Address x22_save      (fp, x22_off            * wordSize);
 246     const Address x21_save      (fp, x21_off            * wordSize);
 247     const Address x20_save      (fp, x20_off            * wordSize);
 248     const Address x19_save      (fp, x19_off            * wordSize);
 249     const Address x18_save      (fp, x18_off            * wordSize);
 250 
 251     const Address x9_save       (fp, x9_off             * wordSize);
 252 
 253     // stub code
 254 
 255     address riscv_entry = __ pc();
 256 
 257     // set up frame and move sp to end of save area
 258     __ enter();
 259     __ addi(sp, fp, sp_after_call_off * wordSize);
 260 
 261     // save register parameters and Java temporary/global registers
 262     // n.b. we save thread even though it gets installed in
 263     // xthread because we want to sanity check tp later
 264     __ sd(c_rarg7, thread);
         // NOTE(review): the parameter size is stored with sw (32 bits) here but
         // reloaded with ld (64 bits) near the end of the stub - confirm that
         // the upper half of the reloaded value is irrelevant.
 265     __ sw(c_rarg6, parameter_size);
 266     __ sd(c_rarg5, parameters);
 267     __ sd(c_rarg4, entry_point);
 268     __ sd(c_rarg3, method);
 269     __ sd(c_rarg2, result_type);
 270     __ sd(c_rarg1, result);
 271     __ sd(c_rarg0, call_wrapper);
 272 
 273     __ sd(x9, x9_save);
 274 
 275     __ sd(x18, x18_save);
 276     __ sd(x19, x19_save);
 277     __ sd(x20, x20_save);
 278     __ sd(x21, x21_save);
 279     __ sd(x22, x22_save);
 280     __ sd(x23, x23_save);
 281     __ sd(x24, x24_save);
 282     __ sd(x25, x25_save);
 283     __ sd(x26, x26_save);
 284     __ sd(x27, x27_save);
 285 
 286     __ fsd(f8,  f8_save);
 287     __ fsd(f9,  f9_save);
 288     __ fsd(f18, f18_save);
 289     __ fsd(f19, f19_save);
 290     __ fsd(f20, f20_save);
 291     __ fsd(f21, f21_save);
 292     __ fsd(f22, f22_save);
 293     __ fsd(f23, f23_save);
 294     __ fsd(f24, f24_save);
 295     __ fsd(f25, f25_save);
 296     __ fsd(f26, f26_save);
 297     __ fsd(f27, f27_save);
 298 
         // Read the current rounding mode into t0 and remember it in the frame.
 299     __ frrm(t0);
 300     __ sd(t0, frm_save);
 301     // Set frm to the state we need. We do want Round to Nearest. We
 302     // don't want non-IEEE rounding modes.
 303     Label skip_fsrmi;
         // The beqz below treats t0 == 0 as frm already being rne, which
         // relies on rne being encoded as 0.
 304     guarantee(__ RoundingMode::rne == 0, "must be");
 305     __ beqz(t0, skip_fsrmi);
 306     __ fsrmi(__ RoundingMode::rne);
 307     __ bind(skip_fsrmi);
 308 
 309     // install Java thread in global register now we have saved
 310     // whatever value it held
 311     __ mv(xthread, c_rarg7);
 312 
 313     // And method
 314     __ mv(xmethod, c_rarg3);
 315 
 316     // set up the heapbase register
 317     __ reinit_heapbase();
 318 
 319 #ifdef ASSERT
 320     // make sure we have no pending exceptions
 321     {
 322       Label L;
 323       __ ld(t0, Address(xthread, in_bytes(Thread::pending_exception_offset())));
 324       __ beqz(t0, L);
 325       __ stop("StubRoutines::call_stub: entered with pending exception");
 326       __ BIND(L);
 327     }
 328 #endif
 329     // pass parameters if any
         // esp takes the current sp; sp is then lowered by the parameter area
         // (count << LogBytesPerWord) and rounded down to a multiple of
         // 2 * wordSize.
 330     __ mv(esp, sp);
 331     __ slli(t0, c_rarg6, LogBytesPerWord);
 332     __ sub(t0, sp, t0); // Move SP out of the way
 333     __ andi(sp, t0, -2 * wordSize);
 334 
 335     BLOCK_COMMENT("pass parameters if any");
 336     Label parameters_done;
 337     // parameter count is still in c_rarg6
 338     // and parameter pointer identifying param 1 is in c_rarg5
 339     __ beqz(c_rarg6, parameters_done);
 340 
         // Copy loop: load one word from the parameter array, advance the
         // pointer, decrement the count, push the word, and repeat while the
         // count is still positive.
 341     address loop = __ pc();
 342     __ ld(t0, Address(c_rarg5, 0));
 343     __ addi(c_rarg5, c_rarg5, wordSize);
 344     __ subi(c_rarg6, c_rarg6, 1);
 345     __ push_reg(t0);
 346     __ bgtz(c_rarg6, loop);
 347 
 348     __ BIND(parameters_done);
 349 
 350     // call Java entry -- passing Method*, and current sp
 351     //      xmethod: Method*
 352     //      x19_sender_sp: sender sp
 353     BLOCK_COMMENT("call Java function");
 354     __ mv(x19_sender_sp, sp);
 355     __ jalr(c_rarg4);
 356 
 357     // save current address for use by exception handling code
 358 
 359     return_address = __ pc();
 360 
 361     // store result depending on type (everything that is not
 362     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 363     // n.b. this assumes Java returns an integral result in x10
 364     // and a floating result in j_farg0
         // j_rarg2 = result address, j_rarg1 = result type (both reloaded from
         // the frame). T_OBJECT and T_LONG share the is_long handler.
 365     __ ld(j_rarg2, result);
 366     Label is_long, is_float, is_double, exit;
 367     __ ld(j_rarg1, result_type);
 368     __ mv(t0, (u1)T_OBJECT);
 369     __ beq(j_rarg1, t0, is_long);
 370     __ mv(t0, (u1)T_LONG);
 371     __ beq(j_rarg1, t0, is_long);
 372     __ mv(t0, (u1)T_FLOAT);
 373     __ beq(j_rarg1, t0, is_float);
 374     __ mv(t0, (u1)T_DOUBLE);
 375     __ beq(j_rarg1, t0, is_double);
 376 
 377     // handle T_INT case
 378     __ sw(x10, Address(j_rarg2));
 379 
         // The out-of-line handlers emitted after the ret below jump back here.
 380     __ BIND(exit);
 381 
 382     // pop parameters
 383     __ addi(esp, fp, sp_after_call_off * wordSize);
 384 
 385 #ifdef ASSERT
 386     // verify that threads correspond
 387     {
 388       Label L, S;
 389       __ ld(t0, thread);
 390       __ bne(xthread, t0, S);
 391       __ get_thread(t0);
 392       __ beq(xthread, t0, L);
 393       __ BIND(S);
 394       __ stop("StubRoutines::call_stub: threads must correspond");
 395       __ BIND(L);
 396     }
 397 #endif
 398 
 399     __ pop_cont_fastpath(xthread);
 400 
 401     // restore callee-save registers
 402     __ fld(f27, f27_save);
 403     __ fld(f26, f26_save);
 404     __ fld(f25, f25_save);
 405     __ fld(f24, f24_save);
 406     __ fld(f23, f23_save);
 407     __ fld(f22, f22_save);
 408     __ fld(f21, f21_save);
 409     __ fld(f20, f20_save);
 410     __ fld(f19, f19_save);
 411     __ fld(f18, f18_save);
 412     __ fld(f9,  f9_save);
 413     __ fld(f8,  f8_save);
 414 
 415     __ ld(x27, x27_save);
 416     __ ld(x26, x26_save);
 417     __ ld(x25, x25_save);
 418     __ ld(x24, x24_save);
 419     __ ld(x23, x23_save);
 420     __ ld(x22, x22_save);
 421     __ ld(x21, x21_save);
 422     __ ld(x20, x20_save);
 423     __ ld(x19, x19_save);
 424     __ ld(x18, x18_save);
 425 
 426     __ ld(x9, x9_save);
 427 
 428     // restore frm
         // frm is only written when the saved value differs from the current one.
 429     Label skip_fsrm;
 430     __ ld(t0, frm_save);
 431     __ frrm(t1);
 432     __ beq(t0, t1, skip_fsrm);
 433     __ fsrm(t0);
 434     __ bind(skip_fsrm);
 435 
         // Reload the eight C argument registers from their save slots.
 436     __ ld(c_rarg0, call_wrapper);
 437     __ ld(c_rarg1, result);
 438     __ ld(c_rarg2, result_type);
 439     __ ld(c_rarg3, method);
 440     __ ld(c_rarg4, entry_point);
 441     __ ld(c_rarg5, parameters);
 442     __ ld(c_rarg6, parameter_size);
 443     __ ld(c_rarg7, thread);
 444 
 445     // leave frame and return to caller
 446     __ leave();
 447     __ ret();
 448 
 449     // handle return types different from T_INT
 450 
         // 64-bit integral result (also used for T_OBJECT): store all of x10.
 451     __ BIND(is_long);
 452     __ sd(x10, Address(j_rarg2, 0));
 453     __ j(exit);
 454 
         // Single-precision result: store j_farg0 as a 32-bit float.
 455     __ BIND(is_float);
 456     __ fsw(j_farg0, Address(j_rarg2, 0), t0);
 457     __ j(exit);
 458 
         // Double-precision result: store j_farg0 as a 64-bit double.
 459     __ BIND(is_double);
 460     __ fsd(j_farg0, Address(j_rarg2, 0), t0);
 461     __ j(exit);
 462 
 463     return start;
 464   }
 465 
 466   // Return point for a Java call if there's an exception thrown in
 467   // Java code.  The exception is caught and transformed into a
 468   // pending exception stored in JavaThread that can be tested from
 469   // within the VM.
 470   //
 471   // Note: Usually the parameters are removed by the callee. In case
 472   // of an exception crossing an activation frame boundary, that is
 473   // not the case if the callee is compiled code => need to setup the
 474   // sp.
 475   //
 476   // x10: exception oop
 477 
 478   address generate_catch_exception() {
         // Emits the catch_exception stub and returns its start address.
         // The stub stores the exception oop from x10 as the pending exception
         // of xthread, records a file and line for it, and then jumps to the
         // call stub return address.
 479     StubId stub_id = StubId::stubgen_catch_exception_id;
 480     StubCodeMark mark(this, stub_id);
 481     address start = __ pc();
 482 
 483     // same as in generate_call_stub():
 484     const Address thread(fp, thread_off * wordSize);
 485 
 486 #ifdef ASSERT
 487     // verify that threads correspond
         // xthread must match both the thread saved in the frame and the
         // value obtained through get_thread; otherwise the stub stops.
 488     {
 489       Label L, S;
 490       __ ld(t0, thread);
 491       __ bne(xthread, t0, S);
 492       __ get_thread(t0);
 493       __ beq(xthread, t0, L);
 494       __ bind(S);
 495       __ stop("StubRoutines::catch_exception: threads must correspond");
 496       __ bind(L);
 497     }
 498 #endif
 499 
 500     // set pending exception
 501     __ verify_oop(x10);
 502 
 503     __ sd(x10, Address(xthread, Thread::pending_exception_offset()));
         // The recorded file and line are __FILE__ and __LINE__ of this
         // generator source, i.e. the location of these statements.
 504     __ mv(t0, (address)__FILE__);
 505     __ sd(t0, Address(xthread, Thread::exception_file_offset()));
 506     __ mv(t0, (int)__LINE__);
 507     __ sw(t0, Address(xthread, Thread::exception_line_offset()));
 508 
 509     // complete return to VM
         // The call stub must already have been generated, since its return
         // address is embedded in the jump below.
 510     assert(StubRoutines::_call_stub_return_address != nullptr,
 511            "_call_stub_return_address must have been generated before");
 512     __ j(RuntimeAddress(StubRoutines::_call_stub_return_address));
 513 
 514     return start;
 515   }
 516 
 517   // Continuation point for runtime calls returning with a pending
 518   // exception.  The pending exception check happened in the runtime
 519   // or native call stub.  The pending exception in Thread is
 520   // converted into a Java-level exception.
 521   //
 522   // Contract with Java-level exception handlers:
 523   // x10: exception
 524   // x13: throwing pc
 525   //
 526   // NOTE: At entry of this stub, exception-pc must be in RA !!
 527 
 528   // NOTE: this is always used as a jump target within generated code
 529   // so it just needs to be generated code with no prolog
 530 
 531   address generate_forward_exception() {
         // Emits the forward_exception stub and returns its start address.
         // The stub asks the VM for the handler matching the return address in
         // ra, moves the pending exception from the thread into x10, clears
         // the pending exception field and jumps to the handler with the
         // throwing pc in x13.
 532     StubId stub_id = StubId::stubgen_forward_exception_id;
 533     StubCodeMark mark(this, stub_id);
 534     address start = __ pc();
 535 
 536     // Upon entry, RA points to the return address returning into
 537     // Java (interpreted or compiled) code; i.e., the return address
 538     // becomes the throwing pc.
 539     //
 540     // Arguments pushed before the runtime call are still on the stack
 541     // but the exception handler will reset the stack pointer ->
 542     // ignore them.  A potential result in registers can be ignored as
 543     // well.
 544 
 545 #ifdef ASSERT
 546     // make sure this code is only executed if there is a pending exception
 547     {
 548       Label L;
 549       __ ld(t0, Address(xthread, Thread::pending_exception_offset()));
 550       __ bnez(t0, L);
 551       __ stop("StubRoutines::forward exception: no pending exception (1)");
 552       __ bind(L);
 553     }
 554 #endif
 555 
 556     // compute exception handler into x9
 557 
 558     // call the VM to find the handler address associated with the
 559     // caller address. pass thread in x10 and caller pc (ret address)
 560     // in x11. n.b. the caller pc is in ra, unlike x86 where it is on
 561     // the stack.
 562     __ mv(c_rarg1, ra);
 563     // ra will be trashed by the VM call so we move it to x9
 564     // (callee-saved) because we also need to pass it to the handler
 565     // returned by this call.
 566     __ mv(x9, ra);
 567     BLOCK_COMMENT("call exception_handler_for_return_address");
 568     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 569                          SharedRuntime::exception_handler_for_return_address),
 570                     xthread, c_rarg1);
 571     // we should not really care that ra is no longer the callee
 572     // address. we saved the value the handler needs in x9 so we can
 573     // just copy it to x13. however, the C2 handler will push its own
 574     // frame and then calls into the VM and the VM code asserts that
 575     // the PC for the frame above the handler belongs to a compiled
 576     // Java method. So, we restore ra here to satisfy that assert.
 577     __ mv(ra, x9);
 578     // setup x10 & x13 & clear pending exception
         // x13 receives the throwing pc kept in x9; x9 is then reused for the
         // handler address that the VM call left in x10, before x10 is
         // overwritten with the exception oop.
 579     __ mv(x13, x9);
 580     __ mv(x9, x10);
 581     __ ld(x10, Address(xthread, Thread::pending_exception_offset()));
 582     __ sd(zr, Address(xthread, Thread::pending_exception_offset()));
 583 
 584 #ifdef ASSERT
 585     // make sure exception is set
 586     {
 587       Label L;
 588       __ bnez(x10, L);
 589       __ stop("StubRoutines::forward exception: no pending exception (2)");
 590       __ bind(L);
 591     }
 592 #endif
 593 
 594     // continue at exception handler
 595     // x10: exception
 596     // x13: throwing pc
 597     // x9: exception handler
 598     __ verify_oop(x10);
 599     __ jr(x9);
 600 
 601     return start;
 602   }
 603 
 604   // Non-destructive plausibility checks for oops
 605   //
 606   // Arguments:
 607   //    x10: oop to verify
 608   //    t0: error message
 609   //
 610   // Stack after saving c_rarg3:
 611   //    [tos + 0]: saved c_rarg3
 612   //    [tos + 1]: saved c_rarg2
 613   //    [tos + 2]: saved ra
 614   //    [tos + 3]: saved t1
 615   //    [tos + 4]: saved x10
 616   //    [tos + 5]: saved t0
 617   address generate_verify_oop() {
         // Emits the verify_oop stub and returns its start address.
         // The stub bumps the verify_oop counter, accepts a null oop in x10,
         // and otherwise hands x10 to the barrier set assembler check. On the
         // error path it saves all registers and calls MacroAssembler::debug64.
 618 
 619     StubId stub_id = StubId::stubgen_verify_oop_id;
 620     StubCodeMark mark(this, stub_id);
 621     address start = __ pc();
 622 
 623     Label exit, error;
 624 
 625     __ push_reg(RegSet::of(c_rarg2, c_rarg3), sp); // save c_rarg2 and c_rarg3
 626 
         // Increment the counter at verify_oop_count_addr() by one, using
         // c_rarg2 for the address and c_rarg3 for the value.
 627     __ la(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 628     __ ld(c_rarg3, Address(c_rarg2));
 629     __ addi(c_rarg3, c_rarg3, 1);
 630     __ sd(c_rarg3, Address(c_rarg2));
 631 
 632     // object is in x10
 633     // make sure object is 'reasonable'
 634     __ beqz(x10, exit); // if obj is null it is OK
 635 
         // The check receives c_rarg2 and c_rarg3 as temporaries and the error
         // label as its failure target (presumably branched to when the oop
         // looks invalid - see BarrierSetAssembler::check_oop).
 636     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
 637     bs_asm->check_oop(_masm, x10, c_rarg2, c_rarg3, error);
 638 
 639     // return if everything seems ok
 640     __ bind(exit);
 641 
 642     __ pop_reg(RegSet::of(c_rarg2, c_rarg3), sp);  // pop c_rarg2 and c_rarg3
 643     __ ret();
 644 
 645     // handle errors
 646     __ bind(error);
 647     __ pop_reg(RegSet::of(c_rarg2, c_rarg3), sp); // pop c_rarg2 and c_rarg3
 648 
         // Push x0..x31 so the register contents can be passed to debug64 as
         // an array on the stack.
 649     __ push_reg(RegSet::range(x0, x31), sp);
 650     // debug(char* msg, int64_t pc, int64_t regs[])
 651     __ mv(c_rarg0, t0);             // pass address of error message
 652     __ mv(c_rarg1, ra);             // pass return address
 653     __ mv(c_rarg2, sp);             // pass address of regs on stack
 654 #ifndef PRODUCT
 655     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 656 #endif
 657     BLOCK_COMMENT("call MacroAssembler::debug");
 658     __ rt_call(CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
         // An ebreak is emitted after the call in case it returns.
 659     __ ebreak();
 660 
 661     return start;
 662   }
 663 
 664   // The inner part of zero_words().
 665   //
 666   // Inputs:
 667   // x28: the HeapWord-aligned base address of an array to zero.
 668   // x29: the count in HeapWords, x29 > 0.
 669   //
 670   // Returns x28 and x29, adjusted for the caller to clear.
 671   // x28: the base address of the tail of words left to clear.
 672   // x29: the number of words in the tail.
 673   //      x29 < MacroAssembler::zero_words_block_size.
 674 
 675   address generate_zero_blocks() {
         // Emits the zero_blocks stub and returns its start address.
         // The stub zeroes memory starting at base (x28) in blocks of
         // MacroAssembler::zero_words_block_size words while cnt (x29) is at
         // least one block, advancing base and reducing cnt as it goes, and
         // then returns. x30 and x31 are used as temporaries.
 676     Label done;
 677 
 678     const Register base = x28, cnt = x29, tmp1 = x30, tmp2 = x31;
 679 
 680     __ align(CodeEntryAlignment);
 681     StubId stub_id = StubId::stubgen_zero_blocks_id;
 682     StubCodeMark mark(this, stub_id);
 683     address start = __ pc();
 684 
 685     if (UseBlockZeroing) {
 686       int zicboz_block_size = VM_Version::zicboz_block_size.value();
 687       // Ensure count >= 2 * zicboz_block_size so that it still deserves
 688       // a cbo.zero after alignment.
 689       Label small;
           // low_limit is a word count: the larger of 2 * zicboz_block_size and
           // BlockZeroingLowLimit, divided by wordSize. Counts below it skip
           // zero_dcache_blocks.
 690       int low_limit = MAX2(2 * zicboz_block_size, (int)BlockZeroingLowLimit) / wordSize;
 691       __ mv(tmp1, low_limit);
 692       __ blt(cnt, tmp1, small);
 693       __ zero_dcache_blocks(base, cnt, tmp1, tmp2);
 694       __ bind(small);
 695     }
 696 
 697     {
 698       // Clear the remaining blocks.
 699       Label loop;
           // tmp1 holds the block size for both the entry test and the loop
           // condition. The generator-time for loop below unrolls one store
           // per word of the block.
 700       __ mv(tmp1, MacroAssembler::zero_words_block_size);
 701       __ blt(cnt, tmp1, done);
 702       __ bind(loop);
 703       for (int i = 0; i < MacroAssembler::zero_words_block_size; i++) {
 704         __ sd(zr, Address(base, i * wordSize));
 705       }
 706       __ addi(base, base, MacroAssembler::zero_words_block_size * wordSize);
 707       __ subi(cnt, cnt, MacroAssembler::zero_words_block_size);
 708       __ bge(cnt, tmp1, loop);
 709       __ bind(done);
 710     }
 711 
 712     __ ret();
 713 
 714     return start;
 715   }
 716 
 717   typedef enum {
         // The values double as a sign: generate_copy_longs() multiplies
         // wordSize by the direction to obtain its signed step.
 718     copy_forwards = 1,
 719     copy_backwards = -1
 720   } copy_direction;
 721 
 722   // Bulk copy of blocks of 8 words.
 723   //
 724   // count is a count of words.
 725   //
 726   // Precondition: count >= 8
 727   //
 728   // Postconditions:
 729   //
 730   // The least significant bit of count contains the remaining count
 731   // of words to copy.  The rest of count is trash.
 732   //
 733   // s and d are adjusted to point to the remaining words to copy
 734   //
 735   address generate_copy_longs(StubId stub_id, Register s, Register d, Register count) {
         // Emits a bulk word-copy stub and returns its start address.
         //
         // stub_id selects the direction: stubgen_copy_byte_f_id copies
         // forwards, stubgen_copy_byte_b_id copies backwards; any other id hits
         // ShouldNotReachHere(). s, d and count are the registers holding the
         // source address, destination address and word count; they must differ
         // from each other and from t0.
         //
         // The emitted code moves eight words per iteration through eight
         // temporaries, then handles a trailing group of four and of two words
         // based on bits 2 and 1 of count.
 736     BasicType type;
 737     copy_direction direction;
         // NOTE(review): type is assigned in the switch but not read anywhere
         // else in this function - confirm whether it is still needed.
 738     switch (stub_id) {
 739     case StubId::stubgen_copy_byte_f_id:
 740       direction = copy_forwards;
 741       type = T_BYTE;
 742       break;
 743     case StubId::stubgen_copy_byte_b_id:
 744       direction = copy_backwards;
 745       type = T_BYTE;
 746       break;
 747     default:
 748       ShouldNotReachHere();
 749     }
         // unit is the signed byte step per word (+wordSize forwards,
         // -wordSize backwards); bias is one word.
 750     int unit = wordSize * direction;
 751     int bias = wordSize;
 752 
 753     const Register tmp_reg0 = x13, tmp_reg1 = x14, tmp_reg2 = x15, tmp_reg3 = x16,
 754       tmp_reg4 = x17, tmp_reg5 = x7, tmp_reg6 = x28, tmp_reg7 = x29;
 755 
         // NOTE(review): stride is declared but not used in this function.
 756     const Register stride = x30;
 757 
 758     assert_different_registers(t0, tmp_reg0, tmp_reg1, tmp_reg2, tmp_reg3,
 759       tmp_reg4, tmp_reg5, tmp_reg6, tmp_reg7);
 760     assert_different_registers(s, d, count, t0);
 761 
 762     Label again, drain;
 763     StubCodeMark mark(this, stub_id);
 764     __ align(CodeEntryAlignment);
 765     address start = __ pc();
 766 
         // For forward copies s and d are moved back by one word so that the
         // offsets 1 * unit .. 8 * unit used below address the next eight words.
         // The bias is undone again before the two-word tail.
 767     if (direction == copy_forwards) {
 768       __ sub(s, s, bias);
 769       __ sub(d, d, bias);
 770     }
 771 
 772 #ifdef ASSERT
 773     // Make sure we are never given < 8 words
 774     {
 775       Label L;
 776 
 777       __ mv(t0, 8);
 778       __ bge(count, t0, L);
           // NOTE(review): the message below misspells generate_copy_longs; it
           // is a runtime string and is left unchanged here.
 779       __ stop("genrate_copy_longs called with < 8 words");
 780       __ bind(L);
 781     }
 782 #endif
 783 
         // Load the first block of eight words into the temporaries.
 784     __ ld(tmp_reg0, Address(s, 1 * unit));
 785     __ ld(tmp_reg1, Address(s, 2 * unit));
 786     __ ld(tmp_reg2, Address(s, 3 * unit));
 787     __ ld(tmp_reg3, Address(s, 4 * unit));
 788     __ ld(tmp_reg4, Address(s, 5 * unit));
 789     __ ld(tmp_reg5, Address(s, 6 * unit));
 790     __ ld(tmp_reg6, Address(s, 7 * unit));
 791     __ ld(tmp_reg7, Address(s, 8 * unit));
 792     __ addi(s, s, 8 * unit);
 793 
         // With fewer than 16 words only the block already loaded is complete,
         // so go straight to the drain. count is only ever reduced by 16 and 8,
         // which keeps its low three bits intact for the tail tests below.
 794     __ subi(count, count, 16);
 795     __ bltz(count, drain);
 796 
         // Main loop: store the eight words loaded previously, load the next
         // eight, advance both pointers, and continue while count stays >= 0.
 797     __ bind(again);
 798 
 799     __ sd(tmp_reg0, Address(d, 1 * unit));
 800     __ sd(tmp_reg1, Address(d, 2 * unit));
 801     __ sd(tmp_reg2, Address(d, 3 * unit));
 802     __ sd(tmp_reg3, Address(d, 4 * unit));
 803     __ sd(tmp_reg4, Address(d, 5 * unit));
 804     __ sd(tmp_reg5, Address(d, 6 * unit));
 805     __ sd(tmp_reg6, Address(d, 7 * unit));
 806     __ sd(tmp_reg7, Address(d, 8 * unit));
 807 
 808     __ ld(tmp_reg0, Address(s, 1 * unit));
 809     __ ld(tmp_reg1, Address(s, 2 * unit));
 810     __ ld(tmp_reg2, Address(s, 3 * unit));
 811     __ ld(tmp_reg3, Address(s, 4 * unit));
 812     __ ld(tmp_reg4, Address(s, 5 * unit));
 813     __ ld(tmp_reg5, Address(s, 6 * unit));
 814     __ ld(tmp_reg6, Address(s, 7 * unit));
 815     __ ld(tmp_reg7, Address(s, 8 * unit));
 816 
 817     __ addi(s, s, 8 * unit);
 818     __ addi(d, d, 8 * unit);
 819 
 820     __ subi(count, count, 8);
 821     __ bgez(count, again);
 822 
 823     // Drain
         // Store the last block of eight words that is still held in the
         // temporaries.
 824     __ bind(drain);
 825 
 826     __ sd(tmp_reg0, Address(d, 1 * unit));
 827     __ sd(tmp_reg1, Address(d, 2 * unit));
 828     __ sd(tmp_reg2, Address(d, 3 * unit));
 829     __ sd(tmp_reg3, Address(d, 4 * unit));
 830     __ sd(tmp_reg4, Address(d, 5 * unit));
 831     __ sd(tmp_reg5, Address(d, 6 * unit));
 832     __ sd(tmp_reg6, Address(d, 7 * unit));
 833     __ sd(tmp_reg7, Address(d, 8 * unit));
 834     __ addi(d, d, 8 * unit);
 835 
 836     {
 837       Label L1, L2;
           // Bit 2 of count set: copy four more words.
 838       __ test_bit(t0, count, 2);
 839       __ beqz(t0, L1);
 840 
 841       __ ld(tmp_reg0, Address(s, 1 * unit));
 842       __ ld(tmp_reg1, Address(s, 2 * unit));
 843       __ ld(tmp_reg2, Address(s, 3 * unit));
 844       __ ld(tmp_reg3, Address(s, 4 * unit));
 845       __ addi(s, s, 4 * unit);
 846 
 847       __ sd(tmp_reg0, Address(d, 1 * unit));
 848       __ sd(tmp_reg1, Address(d, 2 * unit));
 849       __ sd(tmp_reg2, Address(d, 3 * unit));
 850       __ sd(tmp_reg3, Address(d, 4 * unit));
 851       __ addi(d, d, 4 * unit);
 852 
 853       __ bind(L1);
 854 
           // Undo the one-word bias applied at entry for forward copies.
 855       if (direction == copy_forwards) {
 856         __ addi(s, s, bias);
 857         __ addi(d, d, bias);
 858       }
 859 
           // Bit 1 of count set: copy two more words. Backward copies step the
           // pointers first and then copy; forward copies copy first and then
           // step.
 860       __ test_bit(t0, count, 1);
 861       __ beqz(t0, L2);
 862       if (direction == copy_backwards) {
 863         __ addi(s, s, 2 * unit);
 864         __ ld(tmp_reg0, Address(s));
 865         __ ld(tmp_reg1, Address(s, wordSize));
 866         __ addi(d, d, 2 * unit);
 867         __ sd(tmp_reg0, Address(d));
 868         __ sd(tmp_reg1, Address(d, wordSize));
 869       } else {
 870         __ ld(tmp_reg0, Address(s));
 871         __ ld(tmp_reg1, Address(s, wordSize));
 872         __ addi(s, s, 2 * unit);
 873         __ sd(tmp_reg0, Address(d));
 874         __ sd(tmp_reg1, Address(d, wordSize));
 875         __ addi(d, d, 2 * unit);
 876       }
 877       __ bind(L2);
 878     }
 879 
 880     __ ret();
 881 
 882     return start;
 883   }
 884 
 885   typedef void (MacroAssembler::*copy_insn)(Register Rd, const Address &adr, Register temp); // pointer to a MacroAssembler member function taking (Rd, adr, temp)
 886 
 887   void copy_memory_v(Register s, Register d, Register count, int step) {
         // Emit an RVV (vector) copy of `count` elements from s to d.
         // |step| is the element size in bytes; step < 0 selects the backward
         // (high-to-low) variant. Working copies live in x30/x31/x15; the code
         // also writes x14, x16, x17, v0 and, in the backward variant, t0.
 888     bool is_backward = step < 0;
 889     int granularity = g_uabs(step);
 890
 891     const Register src = x30, dst = x31, vl = x14, cnt = x15, tmp1 = x16, tmp2 = x17;
 892     assert_different_registers(s, d, cnt, vl, tmp1, tmp2);
 893     Assembler::SEW sew = Assembler::elembytes_to_sew(granularity);
 894     Label loop_forward, loop_backward, done;
 895
         // Work on copies: the backward path below still needs s and d as base addresses.
 896     __ mv(dst, d);
 897     __ mv(src, s);
 898     __ mv(cnt, count);
 899
 900     __ bind(loop_forward);
         // vl = number of elements handled in this round, out of the cnt still remaining
 901     __ vsetvli(vl, cnt, sew, Assembler::m8);
 902     if (is_backward) {
           // More than one chunk is left: copy the highest chunk first (loop_backward).
           // Once vl == cnt the remainder fits in one chunk and the code below finishes it.
 903       __ bne(vl, cnt, loop_backward);
 904     }
 905
         // Load a whole chunk into v0 before storing it.
 906     __ vlex_v(v0, src, sew);
 907     __ sub(cnt, cnt, vl);
 908     if (sew != Assembler::e8) {
 909       // when sew == e8 (e.g., elem size is 1 byte), slli R, R, 0 is a nop and unnecessary
           // vl: element count -> byte count
 910       __ slli(vl, vl, sew);
 911     }
 912     __ add(src, src, vl);
 913
 914     __ vsex_v(v0, dst, sew);
 915     __ add(dst, dst, vl);
 916     __ bnez(cnt, loop_forward);
 917
 918     if (is_backward) {
 919       __ j(done);
 920
 921       __ bind(loop_backward);
           // t0 = offset of the last vl elements of the remaining range, first in
           // elements and then (after the shift) in bytes
 922       __ sub(t0, cnt, vl);
 923       if (sew != Assembler::e8) {
 924         // when sew == e8 (e.g., elem size is 1 byte), slli R, R, 0 is a nop and unnecessary
 925         __ slli(t0, t0, sew);
 926       }
           // Addresses are formed from the original s and d; src and dst are not
           // advanced on this path.
 927       __ add(tmp1, s, t0);
 928       __ vlex_v(v0, tmp1, sew);
 929       __ add(tmp2, d, t0);
 930       __ vsex_v(v0, tmp2, sew);
 931       __ sub(cnt, cnt, vl);
           // loop_forward recomputes vl and chooses between the two paths again
 932       __ bnez(cnt, loop_forward);
 933       __ bind(done);
 934     }
 935   }
 936 
 937   // All-singing all-dancing memory copy.
 938   //
 939   // Copy count units of memory from s to d.  The size of a unit is
 940   // step, which can be positive or negative depending on the direction
 941   // of copy.
 942   //
 943   void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
 944                    Register s, Register d, Register count, int step) {
 945     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
         // Use the vector loop when RVV is enabled; for reference types only if the
         // active barrier set assembler reports support for it.
 946     if (UseRVV && (!is_reference_type(type) || bs_asm->supports_rvv_arraycopy())) {
 947       return copy_memory_v(s, d, count, step);
 948     }
 949
 950     bool is_backwards = step < 0;
 951     int granularity = g_uabs(step);
 952
         // src/dst are moving cursors and cnt is the number of BYTES still to copy.
         // tmp3..tmp6 carry the data; gct1..gct3 are scratch registers handed to the
         // barrier set's load/store helpers.
 953     const Register src = x30, dst = x31, cnt = x15, tmp3 = x16, tmp4 = x17, tmp5 = x14, tmp6 = x13;
 954     const Register gct1 = x28, gct2 = x29, gct3 = t2;
 955
 956     Label same_aligned;
 957     Label copy_big, copy32_loop, copy8_loop, copy_small, done;
 958
 959     // The size of copy32_loop body increases significantly with ZGC GC barriers.
 960     // Need conditional far branches to reach a point beyond the loop in this case.
 961     bool is_far = UseZGC;
 962
         // Empty copy: nothing to emit work for.
 963     __ beqz(count, done, is_far);
         // cnt = count * granularity
 964     __ slli(cnt, count, exact_log2(granularity));
         // Backward copies start one past the end and pre-decrement before each access;
         // forward copies start at the beginning and post-increment after each access.
 965     if (is_backwards) {
 966       __ add(src, s, cnt);
 967       __ add(dst, d, cnt);
 968     } else {
 969       __ mv(src, s);
 970       __ mv(dst, d);
 971     }
 972
 973     if (is_aligned) {
           // Go straight to the widest loop the byte count allows.
 974       __ subi(t0, cnt, 32);
 975       __ bgez(t0, copy32_loop);
 976       __ subi(t0, cnt, 8);
 977       __ bgez(t0, copy8_loop, is_far);
 978       __ j(copy_small);
 979     } else {
           // Fewer than 16 bytes: copy element by element.
 980       __ mv(t0, 16);
 981       __ blt(cnt, t0, copy_small, is_far);
 982
           // src and dst differ in their low three address bits, so they cannot both
           // be 8-byte aligned at the same time: copy element by element.
 983       __ xorr(t0, src, dst);
 984       __ andi(t0, t0, 0b111);
 985       __ bnez(t0, copy_small, is_far);
 986
           // Copy single elements until src (and, given the check above, dst) is
           // 8-byte aligned.
 987       __ bind(same_aligned);
 988       __ andi(t0, src, 0b111);
 989       __ beqz(t0, copy_big);
 990       if (is_backwards) {
 991         __ addi(src, src, step);
 992         __ addi(dst, dst, step);
 993       }
 994       bs_asm->copy_load_at(_masm, decorators, type, granularity, tmp3, Address(src), gct1);
 995       bs_asm->copy_store_at(_masm, decorators, type, granularity, Address(dst), tmp3, gct1, gct2, gct3);
 996       if (!is_backwards) {
 997         __ addi(src, src, step);
 998         __ addi(dst, dst, step);
 999       }
1000       __ subi(cnt, cnt, granularity);
1001       __ beqz(cnt, done, is_far);
1002       __ j(same_aligned);
1003
1004       __ bind(copy_big);
           // Fewer than 32 bytes left: skip the 32-byte loop.
1005       __ mv(t0, 32);
1006       __ blt(cnt, t0, copy8_loop, is_far);
1007     }
1008
         // Main loop: four 8-byte loads followed by four 8-byte stores per iteration.
1009     __ bind(copy32_loop);
1010     if (is_backwards) {
1011       __ subi(src, src, wordSize * 4);
1012       __ subi(dst, dst, wordSize * 4);
1013     }
1014     // we first load 32 bytes, then write it, so the direction here doesn't matter
1015     bs_asm->copy_load_at(_masm, decorators, type, 8, tmp3, Address(src),     gct1);
1016     bs_asm->copy_load_at(_masm, decorators, type, 8, tmp4, Address(src, 8),  gct1);
1017     bs_asm->copy_load_at(_masm, decorators, type, 8, tmp5, Address(src, 16), gct1);
1018     bs_asm->copy_load_at(_masm, decorators, type, 8, tmp6, Address(src, 24), gct1);
1019
1020     bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst),     tmp3, gct1, gct2, gct3);
1021     bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst, 8),  tmp4, gct1, gct2, gct3);
1022     bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst, 16), tmp5, gct1, gct2, gct3);
1023     bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst, 24), tmp6, gct1, gct2, gct3);
1024
1025     if (!is_backwards) {
1026       __ addi(src, src, wordSize * 4);
1027       __ addi(dst, dst, wordSize * 4);
1028     }
         // t0 = bytes that would remain after one MORE iteration; it is computed before
         // cnt is updated, hence the extra wordSize * 4 in the subtrahend.
1029     __ subi(t0, cnt, 32 + wordSize * 4);
1030     __ subi(cnt, cnt, wordSize * 4);
1031     __ bgez(t0, copy32_loop); // cnt >= 32, do next loop
1032
1033     __ beqz(cnt, done); // if that's all - done
1034
1035     __ subi(t0, cnt, 8); // if not - copy the remainder
1036     __ bltz(t0, copy_small); // cnt < 8, go to copy_small, else fall through to copy8_loop
1037
         // One 8-byte unit per iteration.
1038     __ bind(copy8_loop);
1039     if (is_backwards) {
1040       __ subi(src, src, wordSize);
1041       __ subi(dst, dst, wordSize);
1042     }
1043     bs_asm->copy_load_at(_masm, decorators, type, 8, tmp3, Address(src), gct1);
1044     bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst), tmp3, gct1, gct2, gct3);
1045
1046     if (!is_backwards) {
1047       __ addi(src, src, wordSize);
1048       __ addi(dst, dst, wordSize);
1049     }
         // Same look-ahead test as in copy32_loop, with an 8-byte unit.
1050     __ subi(t0, cnt, 8 + wordSize);
1051     __ subi(cnt, cnt, wordSize);
1052     __ bgez(t0, copy8_loop); // cnt >= 8, do next loop
1053
1054     __ beqz(cnt, done); // if that's all - done
1055
         // Tail (and fallback for short or mutually misaligned copies): one element
         // of `granularity` bytes per iteration until cnt drops to zero.
1056     __ bind(copy_small);
1057     if (is_backwards) {
1058       __ addi(src, src, step);
1059       __ addi(dst, dst, step);
1060     }
1061
1062     bs_asm->copy_load_at(_masm, decorators, type, granularity, tmp3, Address(src), gct1);
1063     bs_asm->copy_store_at(_masm, decorators, type, granularity, Address(dst), tmp3, gct1, gct2, gct3);
1064
1065     if (!is_backwards) {
1066       __ addi(src, src, step);
1067       __ addi(dst, dst, step);
1068     }
1069     __ subi(cnt, cnt, granularity);
1070     __ bgtz(cnt, copy_small);
1071
1072     __ bind(done);
1073   }
1074 
1075   // Scan over array at a for count oops, verifying each one.
1076   // Preserves a and count, clobbers t0, t1 and temp.
1077   void verify_oop_array(size_t size, Register a, Register count, Register temp) {
         // size is the element size in bytes: wordSize selects a full-width load,
         // anything else is loaded as a 32-bit value and decoded as a narrow oop.
         // t1 is the running byte offset, t0 the total byte length; temp holds the
         // element address and then the loaded oop.
1078     Label loop, end;
1079     __ mv(t1, zr);
1080     __ slli(t0, count, exact_log2(size));
1081     __ bind(loop);
         // Unsigned compare; exits immediately when count is zero.
1082     __ bgeu(t1, t0, end);
1083
1084     __ add(temp, a, t1);
1085     if (size == (size_t)wordSize) {
1086       __ ld(temp, Address(temp, 0));
1087       __ verify_oop(temp);
1088     } else {
1089       __ lwu(temp, Address(temp, 0));
1090       __ decode_heap_oop(temp); // calls verify_oop
1091     }
1092     __ add(t1, t1, size);
1093     __ j(loop);
1094     __ bind(end);
1095   }
1096 
1097   // Arguments:
1098   //   stub_id - is used to name the stub and identify all details of
1099   //             how to perform the copy.
1100   //
1101   //   nopush_entry - is assigned to the stub's post push entry point
1102   //                  unless it is null
1103   //
1104   // Inputs:
1105   //   c_rarg0   - source array address
1106   //   c_rarg1   - destination array address
1107   //   c_rarg2   - element count, treated as ssize_t, can be zero
1108   //
1109   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1110   // the hardware handle it.  The two dwords within qwords that span
1111   // cache line boundaries will still be loaded and stored atomically.
1112   //
1113   // Side Effects: nopush_entry is set to the (post push) entry point
1114   //               so it can be used by the corresponding conjoint
1115   //               copy method
1116   //
1117   address generate_disjoint_copy(StubId stub_id, address* nopush_entry) {
         // Per-stub parameters, selected from stub_id by the switch below:
         //   size               - element size in bytes
         //   aligned            - passed to copy_memory as is_aligned and recorded
         //                        as ARRAYCOPY_ALIGNED in the decorators
         //   is_oop             - elements are oops: GC prologue/epilogue are
         //                        emitted with is_oop set and T_OBJECT is used
         //   dest_uninitialized - recorded as IS_DEST_UNINITIALIZED
1118     size_t size;
1119     bool aligned;
1120     bool is_oop;
1121     bool dest_uninitialized;
1122     switch (stub_id) {
1123     case StubId::stubgen_jbyte_disjoint_arraycopy_id:
1124       size = sizeof(jbyte);
1125       aligned = false;
1126       is_oop = false;
1127       dest_uninitialized = false;
1128       break;
1129     case StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id:
1130       size = sizeof(jbyte);
1131       aligned = true;
1132       is_oop = false;
1133       dest_uninitialized = false;
1134       break;
1135     case StubId::stubgen_jshort_disjoint_arraycopy_id:
1136       size = sizeof(jshort);
1137       aligned = false;
1138       is_oop = false;
1139       dest_uninitialized = false;
1140       break;
1141     case StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id:
1142       size = sizeof(jshort);
1143       aligned = true;
1144       is_oop = false;
1145       dest_uninitialized = false;
1146       break;
1147     case StubId::stubgen_jint_disjoint_arraycopy_id:
1148       size = sizeof(jint);
1149       aligned = false;
1150       is_oop = false;
1151       dest_uninitialized = false;
1152       break;
1153     case StubId::stubgen_arrayof_jint_disjoint_arraycopy_id:
1154       size = sizeof(jint);
1155       aligned = true;
1156       is_oop = false;
1157       dest_uninitialized = false;
1158       break;
1159     case StubId::stubgen_jlong_disjoint_arraycopy_id:
1160       // since this is always aligned we can (should!) use the same
1161       // stub as for case arrayof_jlong_disjoint_arraycopy
1162       ShouldNotReachHere();
1163       break;
1164     case StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id:
1165       size = sizeof(jlong);
1166       aligned = true;
1167       is_oop = false;
1168       dest_uninitialized = false;
1169       break;
         // The four oop variants below differ only in dest_uninitialized; the
         // element size and the aligned flag both follow UseCompressedOops.
1170     case StubId::stubgen_oop_disjoint_arraycopy_id:
1171       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1172       aligned = !UseCompressedOops;
1173       is_oop = true;
1174       dest_uninitialized = false;
1175       break;
1176     case StubId::stubgen_arrayof_oop_disjoint_arraycopy_id:
1177       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1178       aligned = !UseCompressedOops;
1179       is_oop = true;
1180       dest_uninitialized = false;
1181       break;
1182     case StubId::stubgen_oop_disjoint_arraycopy_uninit_id:
1183       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1184       aligned = !UseCompressedOops;
1185       is_oop = true;
1186       dest_uninitialized = true;
1187       break;
1188     case StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id:
1189       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1190       aligned = !UseCompressedOops;
1191       is_oop = true;
1192       dest_uninitialized = true;
1193       break;
1194     default:
1195       ShouldNotReachHere();
1196       break;
1197     }
1198
1199     const Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1200     RegSet saved_reg = RegSet::of(s, d, count);
1201     __ align(CodeEntryAlignment);
1202     StubCodeMark mark(this, stub_id);
1203     address start = __ pc();
1204     __ enter();
1205
         // The secondary entry point is the pc right after enter(), i.e. it expects
         // the frame to have been set up by whoever jumps to it.
1206     if (nopush_entry != nullptr) {
1207      *nopush_entry = __ pc();
1208       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1209       BLOCK_COMMENT("Entry:");
1210     }
1211
1212     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1213     if (dest_uninitialized) {
1214       decorators |= IS_DEST_UNINITIALIZED;
1215     }
1216     if (aligned) {
1217       decorators |= ARRAYCOPY_ALIGNED;
1218     }
1219
1220     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1221     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
1222
1223     if (is_oop) {
1224       // save regs before copy_memory
           // (d and count are needed again for oop verification and the epilogue)
1225       __ push_reg(RegSet::of(d, count), sp);
1226     }
1227
1228     {
1229       // UnsafeMemoryAccess page error: continue after unsafe access
           // The mark gets an entry only for non-oop stubs that are either unaligned
           // or copy jlong elements.
1230       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1231       UnsafeMemoryAccessMark umam(this, add_entry, true);
           // All primitive element types are passed as T_BYTE; the element size is
           // carried by the step argument, and a positive step means a forward copy.
1232       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
1233     }
1234
1235     if (is_oop) {
1236       __ pop_reg(RegSet::of(d, count), sp);
1237       if (VerifyOops) {
1238         verify_oop_array(size, d, count, t2);
1239       }
1240     }
1241
1242     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, t0);
1243
1244     __ leave();
1245     __ mv(x10, zr); // return 0
1246     __ ret();
1247     return start;
1248   }
1249 
1250   // Arguments:
1251   //   stub_id - is used to name the stub and identify all details of
1252   //             how to perform the copy.
1253   //
1254   //   nooverlap_target - identifies the (post push) entry for the
1255   //             corresponding disjoint copy routine which can be
1256   //             jumped to if the ranges do not actually overlap
1257   //
1258   //   nopush_entry - is assigned to the stub's post push entry point
1259   //                 unless it is null
1260   //
1261   // Inputs:
1262   //   c_rarg0   - source array address
1263   //   c_rarg1   - destination array address
1264   //   c_rarg2   - element count, treated as ssize_t, can be zero
1265   //
1266   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1267   // the hardware handle it.  The two dwords within qwords that span
1268   // cache line boundaries will still be loaded and stored atomically.
1269   //
1270   // Side Effects:
1271   //   nopush_entry is set to the no-overlap entry point so it can be
1272   //   used by some other conjoint copy method
1273   //
1274   address generate_conjoint_copy(StubId stub_id, address nooverlap_target, address *nopush_entry) {
         // Emits a copy stub that tolerates overlapping ranges: when a forward copy
         // is safe it jumps to nooverlap_target, otherwise it copies backwards.
         // size/aligned/is_oop/dest_uninitialized are selected from stub_id below.
1275     const Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1276     RegSet saved_regs = RegSet::of(s, d, count);
1277     int size;
1278     bool aligned;
1279     bool is_oop;
1280     bool dest_uninitialized;
1281     switch (stub_id) {
1282     case StubId::stubgen_jbyte_arraycopy_id:
1283       size = sizeof(jbyte);
1284       aligned = false;
1285       is_oop = false;
1286       dest_uninitialized = false;
1287       break;
1288     case StubId::stubgen_arrayof_jbyte_arraycopy_id:
1289       size = sizeof(jbyte);
1290       aligned = true;
1291       is_oop = false;
1292       dest_uninitialized = false;
1293       break;
1294     case StubId::stubgen_jshort_arraycopy_id:
1295       size = sizeof(jshort);
1296       aligned = false;
1297       is_oop = false;
1298       dest_uninitialized = false;
1299       break;
1300     case StubId::stubgen_arrayof_jshort_arraycopy_id:
1301       size = sizeof(jshort);
1302       aligned = true;
1303       is_oop = false;
1304       dest_uninitialized = false;
1305       break;
1306     case StubId::stubgen_jint_arraycopy_id:
1307       size = sizeof(jint);
1308       aligned = false;
1309       is_oop = false;
1310       dest_uninitialized = false;
1311       break;
1312     case StubId::stubgen_arrayof_jint_arraycopy_id:
1313       size = sizeof(jint);
1314       aligned = true;
1315       is_oop = false;
1316       dest_uninitialized = false;
1317       break;
1318     case StubId::stubgen_jlong_arraycopy_id:
1319       // since this is always aligned we can (should!) use the same
1320       // stub as for case arrayof_jlong_arraycopy
1321       ShouldNotReachHere();
1322       break;
1323     case StubId::stubgen_arrayof_jlong_arraycopy_id:
1324       size = sizeof(jlong);
1325       aligned = true;
1326       is_oop = false;
1327       dest_uninitialized = false;
1328       break;
         // The four oop variants below differ only in dest_uninitialized; the
         // element size and the aligned flag both follow UseCompressedOops.
1329     case StubId::stubgen_oop_arraycopy_id:
1330       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1331       aligned = !UseCompressedOops;
1332       is_oop = true;
1333       dest_uninitialized = false;
1334       break;
1335     case StubId::stubgen_arrayof_oop_arraycopy_id:
1336       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1337       aligned = !UseCompressedOops;
1338       is_oop = true;
1339       dest_uninitialized = false;
1340       break;
1341     case StubId::stubgen_oop_arraycopy_uninit_id:
1342       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1343       aligned = !UseCompressedOops;
1344       is_oop = true;
1345       dest_uninitialized = true;
1346       break;
1347     case StubId::stubgen_arrayof_oop_arraycopy_uninit_id:
1348       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1349       aligned = !UseCompressedOops;
1350       is_oop = true;
1351       dest_uninitialized = true;
1352       break;
1353     default:
1354       ShouldNotReachHere();
1355     }
1356
1357     StubCodeMark mark(this, stub_id);
1358     address start = __ pc();
1359     __ enter();
1360
         // The secondary entry point is the pc right after enter().
1361     if (nopush_entry != nullptr) {
1362       *nopush_entry = __ pc();
1363       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1364       BLOCK_COMMENT("Entry:");
1365     }
1366
1367     // use fwd copy when (d-s) above_equal (count*size)
         // The compare is unsigned, so d below s (a wrapped, very large difference)
         // also takes the forward path. The jump happens after enter() above, which
         // is why the target has to be the disjoint stub's post-push entry.
1368     __ sub(t0, d, s);
1369     __ slli(t1, count, exact_log2(size));
1370     Label L_continue;
1371     __ bltu(t0, t1, L_continue);
1372     __ j(RuntimeAddress(nooverlap_target));
1373     __ bind(L_continue);
1374
         // Unlike the disjoint stub, ARRAYCOPY_DISJOINT is not set here.
1375     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1376     if (dest_uninitialized) {
1377       decorators |= IS_DEST_UNINITIALIZED;
1378     }
1379     if (aligned) {
1380       decorators |= ARRAYCOPY_ALIGNED;
1381     }
1382
1383     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1384     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
1385
1386     if (is_oop) {
1387       // save regs before copy_memory
           // (d and count are needed again for oop verification and the epilogue)
1388       __ push_reg(RegSet::of(d, count), sp);
1389     }
1390
1391     {
1392       // UnsafeMemoryAccess page error: continue after unsafe access
           // The mark gets an entry only for non-oop stubs that are either unaligned
           // or copy jlong elements.
1393       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1394       UnsafeMemoryAccessMark umam(this, add_entry, true);
           // A negative step makes copy_memory copy from the high end downwards.
1395       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
1396     }
1397
1398     if (is_oop) {
1399       __ pop_reg(RegSet::of(d, count), sp);
1400       if (VerifyOops) {
1401         verify_oop_array(size, d, count, t2);
1402       }
1403     }
1404     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, t0);
1405     __ leave();
1406     __ mv(x10, zr); // return 0
1407     __ ret();
1408     return start;
1409   }
1410 
1411   // Helper for generating a dynamic type check.
1412   // Smashes t0, t1.
1413   void generate_type_check(Register sub_klass,
1414                            Register super_check_offset,
1415                            Register super_klass,
1416                            Register result,
1417                            Register tmp1,
1418                            Register tmp2,
1419                            Label& L_success) {
         // Emits a subtype test of sub_klass against super_klass: control transfers
         // to L_success when the test passes and falls out of this code otherwise.
         // Note: `result` is not referenced anywhere in this body.
1420     assert_different_registers(sub_klass, super_check_offset, super_klass);
1421
1422     BLOCK_COMMENT("type_check:");
1423
1424     Label L_miss;
1425
         // The fast path is given both a success and a miss label but no slow-path
         // label (nullptr); the slow path is given only a success label.
         // NOTE(review): a null label presumably means "fall through" - confirm
         // against the MacroAssembler declarations.
1426     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, nullptr, super_check_offset);
1427     __ check_klass_subtype_slow_path(sub_klass, super_klass, tmp1, tmp2, &L_success, nullptr);
1428
1429     // Fall through on failure!
1430     __ BIND(L_miss);
1431   }
1432 
1433   //
1434   //  Generate checkcasting array copy stub
1435   //
1436   //  Input:
1437   //    c_rarg0   - source array address
1438   //    c_rarg1   - destination array address
1439   //    c_rarg2   - element count, treated as ssize_t, can be zero
1440   //    c_rarg3   - size_t ckoff (super_check_offset)
1441   //    c_rarg4   - oop ckval (super_klass)
1442   //
1443   //  Output:
1444   //    x10 ==  0  -  success
1445   //    x10 == -1^K - failure, where K is partial transfer count
1446   //
1447   address generate_checkcast_copy(StubId stub_id, address* nopush_entry) {
         // Emits the element-by-element oop copy that type-checks every non-null
         // element against ckval before storing it. stub_id only selects whether
         // the destination is treated as uninitialized.
1448     bool dest_uninitialized;
1449     switch (stub_id) {
1450     case StubId::stubgen_checkcast_arraycopy_id:
1451       dest_uninitialized = false;
1452       break;
1453     case StubId::stubgen_checkcast_arraycopy_uninit_id:
1454       dest_uninitialized = true;
1455       break;
1456     default:
1457       ShouldNotReachHere();
1458     }
1459
1460     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1461
1462     // Input registers (after setup_arg_regs)
1463     const Register from        = c_rarg0;   // source array address
1464     const Register to          = c_rarg1;   // destination array address
1465     const Register count       = c_rarg2;   // elements count
1466     const Register ckoff       = c_rarg3;   // super_check_offset
1467     const Register ckval       = c_rarg4;   // super_klass
1468
1469     RegSet wb_pre_saved_regs   = RegSet::range(c_rarg0, c_rarg4);
1470
1471     // Registers used as temps (x7, x9, x18, x19 are pushed below and popped
         // again at L_done_pop)
1472     const Register count_save  = x19;       // orig elements count
1473     const Register start_to    = x18;       // destination array start address
1474     const Register copied_oop  = x7;        // actual oop copied
1475     const Register r9_klass    = x9;        // oop._klass
1476
1477     // Registers used as gc temps (x15, x16, x17 are save-on-call)
1478     const Register gct1 = x15, gct2 = x16, gct3 = x17;
1479
1480     //---------------------------------------------------------------
1481     // Assembler stub will be used for this call to arraycopy
1482     // if the two arrays are subtypes of Object[] but the
1483     // destination array type is not equal to or a supertype
1484     // of the source type.  Each element must be separately
1485     // checked.
1486
1487     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1488                                copied_oop, r9_klass, count_save);
1489
1490     __ align(CodeEntryAlignment);
1491     StubCodeMark mark(this, stub_id);
1492     address start = __ pc();
1493
1494     __ enter(); // required for proper stackwalking of RuntimeStub frame
1495
1496     // Caller of this entry point must set up the argument registers.
1497     if (nopush_entry != nullptr) {
1498       *nopush_entry = __ pc();
1499       BLOCK_COMMENT("Entry:");
1500     }
1501
1502     // Empty array:  Nothing to do
         // (taken before the push below, so L_done sits after the matching pop;
         // x10 then receives count, which is 0 here)
1503     __ beqz(count, L_done);
1504
1505     __ push_reg(RegSet::of(x7, x9, x18, x19), sp);
1506
1507 #ifdef ASSERT
1508     BLOCK_COMMENT("assert consistent ckoff/ckval");
1509     // The ckoff and ckval must be mutually consistent,
1510     // even though caller generates both.
1511     { Label L;
1512       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1513       __ lwu(start_to, Address(ckval, sco_offset));
1514       __ beq(ckoff, start_to, L);
1515       __ stop("super_check_offset inconsistent");
1516       __ bind(L);
1517     }
1518 #endif //ASSERT
1519
1520     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
1521     if (dest_uninitialized) {
1522       decorators |= IS_DEST_UNINITIALIZED;
1523     }
1524
1525     bool is_oop = true;
1526     int element_size = UseCompressedOops ? 4 : 8;
1527
1528     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1529     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
1530
1531     // save the original count
1532     __ mv(count_save, count);
1533
1534     // Copy from low to high addresses
1535     __ mv(start_to, to);              // Save destination array start address
1536     __ j(L_load_element);
1537
1538     // ======== begin loop ========
1539     // (Loop is rotated; its entry is L_load_element.)
1540     // Loop control:
1541     //   for count to 0 do
1542     //     copied_oop = load_heap_oop(from++)
1543     //     ... generate_type_check ...
1544     //     store_heap_oop(to++, copied_oop)
1545     //   end
1546
1547     __ align(OptoLoopAlignment);
1548
1549     __ BIND(L_store_element);
1550     bs->copy_store_at(_masm, decorators, T_OBJECT, element_size,
1551                       Address(to, 0), copied_oop,
1552                       gct1, gct2, gct3);
1553     __ addi(to, to, element_size);
1554     __ subi(count, count, 1);
         // All elements stored: count is 0 and becomes the success result in x10.
1555     __ beqz(count, L_do_card_marks);
1556
1557     // ======== loop entry is here ========
1558     __ BIND(L_load_element);
1559     bs->copy_load_at(_masm, decorators, T_OBJECT, element_size,
1560                      copied_oop, Address(from, 0),
1561                      gct1);
1562     __ addi(from, from, element_size);
         // A null element needs no type check.
1563     __ beqz(copied_oop, L_store_element);
1564
1565     __ load_klass(r9_klass, copied_oop);// query the object klass
1566
1567     BLOCK_COMMENT("type_check:");
1568     generate_type_check(r9_klass, /*sub_klass*/
1569                         ckoff,    /*super_check_offset*/
1570                         ckval,    /*super_klass*/
1571                         x10,      /*result*/
1572                         gct1,     /*tmp1*/
1573                         gct2,     /*tmp2*/
1574                         L_store_element);
1575
1576     // Fall through on failure!
1577
1578     // ======== end loop ========
1579
1580     // It was a real error; we must depend on the caller to finish the job.
1581     // Register count = remaining oops, count_save = total oops.
1582     // Emit GC store barriers for the oops we have copied and report
1583     // their number to the caller.
1584
         // K is kept in t0 so that the branch can test K itself. Testing the
         // complemented value (-1^K) for zero would only succeed for K == -1,
         // which cannot occur, so the "nothing copied" shortcut would be dead.
1585     __ sub(t0, count_save, count);        // t0 = K = partially copied oop count
1586     __ xori(count, t0, -1);               // report (-1^K) to caller
1587     __ beqz(t0, L_done_pop);              // K == 0: nothing was stored, skip the card marks
1588
1589     __ BIND(L_do_card_marks);
1590     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, t0);
1591
1592     __ bind(L_done_pop);
1593     __ pop_reg(RegSet::of(x7, x9, x18, x19), sp);
1594     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1595
1596     __ bind(L_done);
         // count is 0 on success and -1^K after a failed type check.
1597     __ mv(x10, count);
1598     __ leave();
1599     __ ret();
1600
1601     return start;
1602   }
1603 
1604   // Perform range checks on the proposed arraycopy.
1605   // Kills temp and t0, but nothing else.
1606   // Also, clean the sign bits of src_pos and dst_pos.
1607   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1608                               Register src_pos, // source position (c_rarg1)
1609                               Register dst,     // destination array oop (c_rarg2)
1610                               Register dst_pos, // destination position (c_rarg3)
1611                               Register length,
1612                               Register temp,
1613                               Label& L_failed) {
         // Emits the two bounds checks (branching to L_failed when either range
         // ends past its array's length) and then zero-extends src_pos and
         // dst_pos to 64 bits. t0 holds the array length and is overwritten too.
1614     BLOCK_COMMENT("arraycopy_range_checks:");
1615
1616     assert_different_registers(t0, temp);
1617
1618     // if [src_pos + length > arrayOop(src)->length()] then FAIL
         // The sum is formed with a 32-bit add and compared unsigned against the
         // zero-extended array length.
1619     __ lwu(t0, Address(src, arrayOopDesc::length_offset_in_bytes()));
1620     __ addw(temp, length, src_pos);
1621     __ bgtu(temp, t0, L_failed);
1622
1623     // if [dst_pos + length > arrayOop(dst)->length()] then FAIL
1624     __ lwu(t0, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1625     __ addw(temp, length, dst_pos);
1626     __ bgtu(temp, t0, L_failed);
1627
1628     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1629     __ zext(src_pos, src_pos, 32);
1630     __ zext(dst_pos, dst_pos, 32);
1631
1632     BLOCK_COMMENT("arraycopy_range_checks done");
1633   }
1634 
1635   address generate_unsafecopy_common_error_exit() {
         // Emits a three-instruction exit sequence (x10 = 0, leave, ret) and
         // returns the address of its first instruction. No alignment or
         // StubCodeMark is emitted here.
         // NOTE(review): presumably the shared continuation for faulting unsafe
         // copies - confirm at the call site.
1636     address start = __ pc();
1637     __ mv(x10, 0);
1638     __ leave();
1639     __ ret();
1640     return start;
1641   }
1642 
1643   //
1644   //  Generate 'unsafe' set memory stub
1645   //  Though just as safe as the other stubs, it takes an unscaled
1646   //  size_t (# bytes) argument instead of an element count.
1647   //
1648   //  Input:
1649   //    c_rarg0   - destination array address
1650   //    c_rarg1   - byte count (size_t)
1651   //    c_rarg2   - byte value
1652   //
1653   address generate_unsafe_setmemory() {
         // Emits a byte-fill stub: short fills (< 8 bytes) go straight to the
         // byte-store tail; longer fills replicate the byte across 64 bits, align
         // dest to 8 bytes, fill whole words and finish in the same tail.
         // dest, count and value (the argument registers) are modified in place.
1654     __ align(CodeEntryAlignment);
1655     StubId stub_id = StubId::stubgen_unsafe_setmemory_id;
1656     StubCodeMark mark(this, stub_id);
1657     address start = __ pc();
1658
1659     // bump this on entry, not on exit:
1660     // inc_counter_np(SharedRuntime::_unsafe_set_memory_ctr);
1661
1662     Label L_fill_elements;
1663
1664     const Register dest = c_rarg0;
1665     const Register count = c_rarg1;
1666     const Register value = c_rarg2;
1667     const Register cnt_words = x28; // temp register
1668     const Register tmp_reg   = x29; // temp register
1669
1670     // Mark remaining code as such which performs Unsafe accesses.
1671     UnsafeMemoryAccessMark umam(this, true, false);
1672
1673     __ enter(); // required for proper stackwalking of RuntimeStub frame
1674
1675     // if count < 8, jump to L_fill_elements
         // (value is not replicated on this path; the tail uses byte stores only)
1676     __ mv(tmp_reg, 8); // 8 bytes fill by element
1677     __ bltu(count, tmp_reg, L_fill_elements);
1678
1679     // Propagate byte to 64-bit width
1680     // 8 bit -> 16 bit
1681     __ zext(value, value, 8);
1682     __ slli(tmp_reg, value, 8);
1683     __ orr(value, value, tmp_reg);
1684     // 16 bit -> 32 bit
1685     __ slli(tmp_reg, value, 16);
1686     __ orr(value, value, tmp_reg);
1687     // 32 bit -> 64 bit
1688     __ slli(tmp_reg, value, 32);
1689     __ orr(value, value, tmp_reg);
1690
1691     // Align destination address at 8 bytes address boundary.
         // Each step tests one low address bit of dest and, if it is set, stores
         // 1, 2 or 4 bytes to clear it. At most 7 bytes are consumed, and count is
         // at least 8 on this path.
1692     Label L_skip_align1, L_skip_align2, L_skip_align4;
1693     // One byte misalignment happens.
1694     __ test_bit(tmp_reg, dest, 0);
1695     __ beqz(tmp_reg, L_skip_align1);
1696     __ sb(value, Address(dest, 0));
1697     __ addi(dest, dest, 1);
1698     __ subi(count, count, 1);
1699
1700     __ bind(L_skip_align1);
1701     // Two bytes misalignment happens.
1702     __ test_bit(tmp_reg, dest, 1);
1703     __ beqz(tmp_reg, L_skip_align2);
1704     __ sh(value, Address(dest, 0));
1705     __ addi(dest, dest, 2);
1706     __ subi(count, count, 2);
1707
1708     __ bind(L_skip_align2);
1709     // Four bytes misalignment happens.
1710     __ test_bit(tmp_reg, dest, 2);
1711     __ beqz(tmp_reg, L_skip_align4);
1712     __ sw(value, Address(dest, 0));
1713     __ addi(dest, dest, 4);
1714     __ subi(count, count, 4);
1715     __ bind(L_skip_align4);
1716
1717     //  Fill large chunks
         // cnt_words = count / 8; count keeps only the remaining 0..7 bytes.
1718     __ srli(cnt_words, count, 3); // number of words
1719     __ slli(tmp_reg, cnt_words, 3);
1720     __ sub(count, count, tmp_reg);
1721     {
           // NOTE(review): the tail below relies on fill_words leaving dest just
           // past the filled words - confirm against MacroAssembler::fill_words.
1722       __ fill_words(dest, cnt_words, value);
1723     }
1724
1725     // Handle fills of less than 8 bytes
         // The bits of count select a 4-, 2- and 1-byte piece. Only byte stores are
         // used because the short path arrives here without aligning dest.
1726     __ bind(L_fill_elements);
1727     Label L_fill_2, L_fill_1, L_exit;
1728     __ test_bit(tmp_reg, count, 2);
1729     __ beqz(tmp_reg, L_fill_2);
1730     __ sb(value, Address(dest, 0));
1731     __ sb(value, Address(dest, 1));
1732     __ sb(value, Address(dest, 2));
1733     __ sb(value, Address(dest, 3));
1734     __ addi(dest, dest, 4);
1735
1736     __ bind(L_fill_2);
1737     __ test_bit(tmp_reg, count, 1);
1738     __ beqz(tmp_reg, L_fill_1);
1739     __ sb(value, Address(dest, 0));
1740     __ sb(value, Address(dest, 1));
1741     __ addi(dest, dest, 2);
1742
1743     __ bind(L_fill_1);
1744     __ test_bit(tmp_reg, count, 0);
1745     __ beqz(tmp_reg, L_exit);
1746     __ sb(value, Address(dest, 0));
1747
1748     __ bind(L_exit);
1749     __ leave();
1750     __ ret();
1751
1752     return start;
1753   }
1754 
1755   //
1756   //  Generate 'unsafe' array copy stub
1757   //  Though just as safe as the other stubs, it takes an unscaled
1758   //  size_t argument instead of an element count.
1759   //
1760   //  Input:
1761   //    c_rarg0   - source array address
1762   //    c_rarg1   - destination array address
1763   //    c_rarg2   - byte count, treated as ssize_t, can be zero
1764   //
1765   // Examines the alignment of the operands and dispatches
1766   // to a long, int, short, or byte copy loop.
1767   //
1768   address generate_unsafe_copy(address byte_copy_entry,
1769                                address short_copy_entry,
1770                                address int_copy_entry,
1771                                address long_copy_entry) {
1772     assert_cond(byte_copy_entry != nullptr && short_copy_entry != nullptr &&
1773                 int_copy_entry != nullptr && long_copy_entry != nullptr);
1774     Label L_long_aligned, L_int_aligned, L_short_aligned;
1775     const Register s = c_rarg0, d = c_rarg1, count = c_rarg2; // source address, destination address, size in bytes
1776     // Emits a dispatcher: pick the widest element size that s, d and count are all multiples of, then jump to that copy entry.
1777     __ align(CodeEntryAlignment);
1778     StubId stub_id = StubId::stubgen_unsafe_arraycopy_id;
1779     StubCodeMark mark(this, stub_id);
1780     address start = __ pc();
1781     __ enter(); // required for proper stackwalking of RuntimeStub frame
1782 
1783     // bump this on entry, not on exit:
1784     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1785 
1786     __ orr(t0, s, d);
1787     __ orr(t0, t0, count); // t0 = s | d | count: a low bit is clear only if it is clear in all three
1788 
1789     __ andi(t0, t0, BytesPerLong - 1);
1790     __ beqz(t0, L_long_aligned);  // no low bits set below BytesPerLong => copy as longs
1791     __ andi(t0, t0, BytesPerInt - 1);
1792     __ beqz(t0, L_int_aligned);   // no low bits set below BytesPerInt => copy as ints
1793     __ test_bit(t0, t0, 0);
1794     __ beqz(t0, L_short_aligned); // bit 0 clear in all three => copy as shorts
1795     __ j(RuntimeAddress(byte_copy_entry)); // otherwise copy bytes; count is passed on unchanged
1796     // Each aligned case below turns the byte size into an element count before jumping.
1797     __ BIND(L_short_aligned);
1798     __ srli(count, count, LogBytesPerShort);  // size => short_count
1799     __ j(RuntimeAddress(short_copy_entry));
1800     __ BIND(L_int_aligned);
1801     __ srli(count, count, LogBytesPerInt);    // size => int_count
1802     __ j(RuntimeAddress(int_copy_entry));
1803     __ BIND(L_long_aligned);
1804     __ srli(count, count, LogBytesPerLong);   // size => long_count
1805     __ j(RuntimeAddress(long_copy_entry));
1806 
1807     return start; // entry address of the generated stub
1808   }
1809 
1810   //
1811   //  Generate generic array copy stubs
1812   //
1813   //  Input:
1814   //    c_rarg0    -  src oop
1815   //    c_rarg1    -  src_pos (32-bits)
1816   //    c_rarg2    -  dst oop
1817   //    c_rarg3    -  dst_pos (32-bits)
1818   //    c_rarg4    -  element count (32-bits)
1819   //
1820   //  Output:
1821   //    x10 ==  0  -  success
1822   //    x10 == -1^K - failure, where K is partial transfer count
1823   //
1824   address generate_generic_copy(address byte_copy_entry, address short_copy_entry,
1825                                 address int_copy_entry, address oop_copy_entry,
1826                                 address long_copy_entry, address checkcast_copy_entry) {
1827     assert_cond(byte_copy_entry != nullptr && short_copy_entry != nullptr &&
1828                 int_copy_entry != nullptr && oop_copy_entry != nullptr &&
1829                 long_copy_entry != nullptr && checkcast_copy_entry != nullptr);
1830     Label L_failed, L_failed_0, L_objArray;
1831     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
1832     // Emits the generic arraycopy stub: validate the arguments, then jump to the matching copy entry, or return -1 in x10.
1833     // Input registers
1834     const Register src        = c_rarg0;  // source array oop
1835     const Register src_pos    = c_rarg1;  // source position
1836     const Register dst        = c_rarg2;  // destination array oop
1837     const Register dst_pos    = c_rarg3;  // destination position
1838     const Register length     = c_rarg4;  // element count
1839 
1840     // Registers used as temps
1841     const Register dst_klass = c_rarg5;   // dst klass, loaded on the checkcast path
1842 
1843     __ align(CodeEntryAlignment);
1844 
1845     StubId stub_id = StubId::stubgen_generic_arraycopy_id;
1846     StubCodeMark mark(this, stub_id);
1847 
1848     address start = __ pc();
1849 
1850     __ enter(); // required for proper stackwalking of RuntimeStub frame
1851 
1852     // bump this on entry, not on exit:
1853     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
1854 
1855     //-----------------------------------------------------------------------
1856     // Assembler stub will be used for this call to arraycopy
1857     // if the following conditions are met:
1858     //
1859     // (1) src and dst must not be null.
1860     // (2) src_pos must not be negative.
1861     // (3) dst_pos must not be negative.
1862     // (4) length  must not be negative.
1863     // (5) src klass and dst klass should be the same and not null.
1864     // (6) src and dst should be arrays.
1865     // (7) src_pos + length must not exceed length of src.
1866     // (8) dst_pos + length must not exceed length of dst.
1867     //
1868 
1869     // if src is null then return -1
1870     __ beqz(src, L_failed);
1871 
1872     // if [src_pos < 0] then return -1
1873     __ sext(t0, src_pos, 32);
1874     __ bltz(t0, L_failed);
1875 
1876     // if dst is null then return -1
1877     __ beqz(dst, L_failed);
1878 
1879     // if [dst_pos < 0] then return -1
1880     __ sext(t0, dst_pos, 32);
1881     __ bltz(t0, L_failed);
1882 
1883     // registers used as temp
1884     const Register scratch_length    = x28; // elements count to copy
1885     const Register scratch_src_klass = x29; // array klass
1886     const Register lh                = x30; // layout helper
1887 
1888     // if [length < 0] then return -1
1889     __ sext(scratch_length, length, 32); // length (elements count, 32-bits value)
1890     __ bltz(scratch_length, L_failed);
1891 
1892     __ load_narrow_klass(scratch_src_klass, src); // still narrow here; decoded after the debug-only checks
1893 #ifdef ASSERT
1894     {
1895       BLOCK_COMMENT("assert klasses not null {");
1896       Label L1, L2;
1897       __ bnez(scratch_src_klass, L2);   // it is broken if klass is null
1898       __ bind(L1);
1899       __ stop("broken null klass");
1900       __ bind(L2);
1901       __ load_narrow_klass(t0, dst);
1902       __ beqz(t0, L1);     // this would be broken also
1903       BLOCK_COMMENT("} assert klasses not null done");
1904     }
1905 #endif
1906     __ decode_klass_not_null(scratch_src_klass, t0);
1907 
1908     // Load layout helper (32-bits)
1909     //
1910     //  |array_tag|     | header_size | element_type |     |log2_element_size|
1911     // 32        30    24            16              8     2                 0
1912     //
1913     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
1914     //
1915 
1916     const int lh_offset = in_bytes(Klass::layout_helper_offset());
1917 
1918     // Handle objArrays completely differently...
1919     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
1920     __ lw(lh, Address(scratch_src_klass, lh_offset)); // sign-extending load: array layout helpers are negative
1921     __ mv(t0, objArray_lh);
1922     __ beq(lh, t0, L_objArray);
1923 
1924     // if [src->klass() != dst->klass()] then return -1
1925     __ load_klass(t1, dst);
1926     __ bne(t1, scratch_src_klass, L_failed);
1927 
1928     // if src is not an array then return -1
1929     // i.e. (lh >= 0); both array tags set the sign bit of the layout helper
1930     __ bgez(lh, L_failed);
1931 
1932     // At this point, it is known to be a typeArray (array_tag 0x3).
1933 #ifdef ASSERT
1934     {
1935       BLOCK_COMMENT("assert primitive array {");
1936       Label L;
1937       __ mv(t1, (int32_t)(Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift));
1938       __ bge(lh, t1, L);
1939       __ stop("must be a primitive array");
1940       __ bind(L);
1941       BLOCK_COMMENT("} assert primitive array done");
1942     }
1943 #endif
1944 
1945     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
1946                            t1, L_failed);
1947 
1948     // TypeArrayKlass
1949     //
1950     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize)
1951     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize)
1952     //
1953 
1954     const Register t0_offset = t0;    // array offset
1955     const Register x30_elsize = lh;   // element size
1956 
1957     // Get array_header_in_bytes()
1958     int lh_header_size_width = exact_log2(Klass::_lh_header_size_mask + 1);
1959     int lh_header_size_msb = Klass::_lh_header_size_shift + lh_header_size_width;
1960     __ slli(t0_offset, lh, XLEN - lh_header_size_msb);          // left shift to remove 24 ~ 32;
1961     __ srli(t0_offset, t0_offset, XLEN - lh_header_size_width); // array_offset
1962 
1963     __ add(src, src, t0_offset);           // src array offset
1964     __ add(dst, dst, t0_offset);           // dst array offset
1965     BLOCK_COMMENT("choose copy loop based on element size");
1966 
1967     // next registers should be set before the jump to corresponding stub
1968     const Register from     = c_rarg0;  // source array address
1969     const Register to       = c_rarg1;  // destination array address
1970     const Register count    = c_rarg2;  // elements count
1971 
1972     // 'from', 'to', 'count' registers should be set in such order
1973     // since they are the same as 'src', 'src_pos', 'dst'.
1974 
1975     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
1976 
1977     // The possible values of elsize are 0-3, i.e. exact_log2(element
1978     // size in bytes).  We do a simple bitwise binary search.
1979   __ BIND(L_copy_bytes);
1980     __ test_bit(t0, x30_elsize, 1);
1981     __ bnez(t0, L_copy_ints);   // bit 1 set: elsize is 2 or 3
1982     __ test_bit(t0, x30_elsize, 0);
1983     __ bnez(t0, L_copy_shorts); // elsize is 1
1984     __ add(from, src, src_pos); // src_addr
1985     __ add(to, dst, dst_pos); // dst_addr
1986     __ sext(count, scratch_length, 32); // length
1987     __ j(RuntimeAddress(byte_copy_entry));
1988 
1989   __ BIND(L_copy_shorts);
1990     __ shadd(from, src_pos, src, t0, 1); // src_addr
1991     __ shadd(to, dst_pos, dst, t0, 1); // dst_addr
1992     __ sext(count, scratch_length, 32); // length
1993     __ j(RuntimeAddress(short_copy_entry));
1994 
1995   __ BIND(L_copy_ints);
1996     __ test_bit(t0, x30_elsize, 0);
1997     __ bnez(t0, L_copy_longs);  // elsize is 3
1998     __ shadd(from, src_pos, src, t0, 2); // src_addr
1999     __ shadd(to, dst_pos, dst, t0, 2); // dst_addr
2000     __ sext(count, scratch_length, 32); // length
2001     __ j(RuntimeAddress(int_copy_entry));
2002 
2003   __ BIND(L_copy_longs);
2004 #ifdef ASSERT
2005     {
2006       BLOCK_COMMENT("assert long copy {");
2007       Label L;
2008       __ andi(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> x30_elsize
2009       __ sext(lh, lh, 32);
2010       __ mv(t0, LogBytesPerLong);
2011       __ beq(x30_elsize, t0, L);
2012       __ stop("must be long copy, but elsize is wrong");
2013       __ bind(L);
2014       BLOCK_COMMENT("} assert long copy done");
2015     }
2016 #endif
2017     __ shadd(from, src_pos, src, t0, 3); // src_addr
2018     __ shadd(to, dst_pos, dst, t0, 3); // dst_addr
2019     __ sext(count, scratch_length, 32); // length
2020     __ j(RuntimeAddress(long_copy_entry));
2021 
2022     // ObjArrayKlass
2023   __ BIND(L_objArray);
2024     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2025 
2026     Label L_plain_copy, L_checkcast_copy;
2027     // test array classes for subtyping
2028     __ load_klass(t2, dst);
2029     __ bne(scratch_src_klass, t2, L_checkcast_copy); // usual case is exact equality
2030 
2031     // Identically typed arrays can be copied without element-wise checks.
2032     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2033                            t1, L_failed);
2034 
2035     __ shadd(from, src_pos, src, t0, LogBytesPerHeapOop);
2036     __ addi(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2037     __ shadd(to, dst_pos, dst, t0, LogBytesPerHeapOop);
2038     __ addi(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2039     __ sext(count, scratch_length, 32); // length
2040   __ BIND(L_plain_copy);
2041     __ j(RuntimeAddress(oop_copy_entry));
2042 
2043   __ BIND(L_checkcast_copy);
2044     // live at this point:  scratch_src_klass, scratch_length, t2 (dst_klass)
2045     {
2046       // Before looking at dst.length, make sure dst is also an objArray.
2047       __ lw(t0, Address(t2, lh_offset)); // sign-extending load, as for 'lh' above; a zero-extended value could never equal the negative objArray_lh
2048       __ mv(t1, objArray_lh);
2049       __ bne(t0, t1, L_failed);
2050 
2051       // It is safe to examine both src.length and dst.length.
2052       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2053                              t2, L_failed);
2054 
2055       __ load_klass(dst_klass, dst); // reload: t2 was passed to the range checks above (presumably as scratch)
2056 
2057       // Marshal the base address arguments now, freeing registers.
2058       __ shadd(from, src_pos, src, t0, LogBytesPerHeapOop);
2059       __ addi(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2060       __ shadd(to, dst_pos, dst, t0, LogBytesPerHeapOop);
2061       __ addi(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2062       __ sext(count, length, 32); // length (reloaded)
2063       const Register sco_temp = c_rarg3; // this register is free now
2064       assert_different_registers(from, to, count, sco_temp,
2065                                  dst_klass, scratch_src_klass);
2066 
2067       // Generate the type check.
2068       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2069       __ lwu(sco_temp, Address(dst_klass, sco_offset));
2070 
2071       // Smashes t0, t1
2072       generate_type_check(scratch_src_klass, sco_temp, dst_klass, noreg, noreg, noreg, L_plain_copy);
2073 
2074       // Fetch destination element klass from the ObjArrayKlass header.
2075       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2076       __ ld(dst_klass, Address(dst_klass, ek_offset));
2077       __ lwu(sco_temp, Address(dst_klass, sco_offset));
2078 
2079       // the checkcast_copy loop needs two extra arguments:
2080       assert(c_rarg3 == sco_temp, "#3 already in place");
2081       // Set up arguments for checkcast_copy_entry.
2082       __ mv(c_rarg4, dst_klass);  // dst.klass.element_klass
2083       __ j(RuntimeAddress(checkcast_copy_entry));
2084     }
2085 
2086   __ BIND(L_failed); // common failure exit: return -1 in x10
2087     __ mv(x10, -1);
2088     __ leave();   // required for proper stackwalking of RuntimeStub frame
2089     __ ret();
2090 
2091     return start; // entry address of the generated stub
2092   }
2093 
2094   //
2095   // Generate stub for array fill. If "aligned" is true, the
2096   // "to" address is assumed to be heapword aligned.
2097   //
2098   // Arguments for generated stub:
2099   //   to:    c_rarg0
2100   //   value: c_rarg1
2101   //   count: c_rarg2 treated as signed
2102   //
2103   address generate_fill(StubId stub_id) {
2104     BasicType t;
2105     bool aligned;
2106     // Derive the element type and the 'aligned' flag (skips the alignment prologue below) from the stub id.
2107     switch (stub_id) {
2108     case StubId::stubgen_jbyte_fill_id:
2109       t = T_BYTE;
2110       aligned = false;
2111       break;
2112     case StubId::stubgen_jshort_fill_id:
2113       t = T_SHORT;
2114       aligned = false;
2115       break;
2116     case StubId::stubgen_jint_fill_id:
2117       t = T_INT;
2118       aligned = false;
2119       break;
2120     case StubId::stubgen_arrayof_jbyte_fill_id:
2121       t = T_BYTE;
2122       aligned = true;
2123       break;
2124     case StubId::stubgen_arrayof_jshort_fill_id:
2125       t = T_SHORT;
2126       aligned = true;
2127       break;
2128     case StubId::stubgen_arrayof_jint_fill_id:
2129       t = T_INT;
2130       aligned = true;
2131       break;
2132     default:
2133       ShouldNotReachHere();
2134     };
2135 
2136     __ align(CodeEntryAlignment);
2137     StubCodeMark mark(this, stub_id);
2138     address start = __ pc();
2139 
2140     BLOCK_COMMENT("Entry:");
2141 
2142     const Register to        = c_rarg0;  // destination array address
2143     const Register value     = c_rarg1;  // value
2144     const Register count     = c_rarg2;  // elements count
2145 
2146     const Register bz_base   = x28;      // base for block_zero routine (not referenced in this function)
2147     const Register cnt_words = x29;      // temp register
2148     const Register tmp_reg   = t1;
2149 
2150     __ enter();
2151 
2152     Label L_fill_elements;
2153 
2154     int shift = -1; // log2 of the element size in bytes, set per type below
2155     switch (t) {
2156       case T_BYTE:
2157         shift = 0;
2158         // Short arrays (< 8 bytes) fill by element
2159         __ mv(tmp_reg, 8 >> shift);
2160         __ bltu(count, tmp_reg, L_fill_elements);
2161 
2162         // Zero extend value and replicate it
2163         // 8 bit -> 16 bit
2164         __ zext(value, value, 8);
2165         __ slli(tmp_reg, value, 8);
2166         __ orr(value, value, tmp_reg);
2167 
2168         // 16 bit -> 32 bit
2169         __ slli(tmp_reg, value, 16);
2170         __ orr(value, value, tmp_reg);
2171         break;
2172       case T_SHORT:
2173         shift = 1;
2174         // Short arrays (< 8 bytes) fill by element
2175         __ mv(tmp_reg, 8 >> shift);
2176         __ bltu(count, tmp_reg, L_fill_elements);
2177 
2178         // Zero extend value and replicate it
2179         // 16 bit -> 32 bit
2180         __ zext(value, value, 16);
2181         __ slli(tmp_reg, value, 16);
2182         __ orr(value, value, tmp_reg);
2183         break;
2184       case T_INT:
2185         shift = 2;
2186         // Short arrays (< 8 bytes) fill by element
2187         __ mv(tmp_reg, 8 >> shift);
2188         __ bltu(count, tmp_reg, L_fill_elements);
2189         break;
2190       default: ShouldNotReachHere();
2191     }
2192 
2193     // Align destination address at 8 bytes address boundary.
2194     Label L_skip_align1, L_skip_align2, L_skip_align4;
2195     if (!aligned) {
2196       switch (t) {
2197         case T_BYTE:
2198           // One byte misalignment happens only for byte arrays.
2199           __ test_bit(tmp_reg, to, 0);
2200           __ beqz(tmp_reg, L_skip_align1);
2201           __ sb(value, Address(to, 0));
2202           __ addi(to, to, 1);
2203           __ subiw(count, count, 1);
2204           __ bind(L_skip_align1);
2205           // Fallthrough
2206         case T_SHORT:
2207           // Two bytes misalignment happens only for byte and short (char) arrays.
2208           __ test_bit(tmp_reg, to, 1);
2209           __ beqz(tmp_reg, L_skip_align2);
2210           __ sh(value, Address(to, 0));
2211           __ addi(to, to, 2);
2212           __ subiw(count, count, 2 >> shift); // two bytes expressed in elements
2213           __ bind(L_skip_align2);
2214           // Fallthrough
2215         case T_INT:
2216           // Align to 8 bytes, we know we are 4 byte aligned to start.
2217           __ test_bit(tmp_reg, to, 2);
2218           __ beqz(tmp_reg, L_skip_align4);
2219           __ sw(value, Address(to, 0));
2220           __ addi(to, to, 4);
2221           __ subiw(count, count, 4 >> shift); // four bytes expressed in elements
2222           __ bind(L_skip_align4);
2223           break;
2224         default: ShouldNotReachHere();
2225       }
2226     }
2227 
2228     //
2229     //  Fill large chunks
2230     //
2231     __ srliw(cnt_words, count, 3 - shift); // number of words
2232 
2233     // 32 bit -> 64 bit
2234     __ zext(value, value, 32);
2235     __ slli(tmp_reg, value, 32);
2236     __ orr(value, value, tmp_reg);
2237 
2238     __ slli(tmp_reg, cnt_words, 3 - shift); // elements covered by the whole words
2239     __ subw(count, count, tmp_reg);         // count = elements left over after the word fill
2240     {
2241       __ fill_words(to, cnt_words, value);
2242     }
2243 
2244     // Handle fills of less than 8 bytes.
2245     // Address may not be heapword aligned.
2246     Label L_fill_1, L_fill_2, L_exit;
2247     __ bind(L_fill_elements);
2248     switch (t) {
2249       case T_BYTE:
2250         __ test_bit(tmp_reg, count, 2); // bit 2 of count set: store 4 bytes
2251         __ beqz(tmp_reg, L_fill_2);
2252         __ sb(value, Address(to, 0));
2253         __ sb(value, Address(to, 1));
2254         __ sb(value, Address(to, 2));
2255         __ sb(value, Address(to, 3));
2256         __ addi(to, to, 4);
2257 
2258         __ bind(L_fill_2);
2259         __ test_bit(tmp_reg, count, 1); // bit 1 of count set: store 2 bytes
2260         __ beqz(tmp_reg, L_fill_1);
2261         __ sb(value, Address(to, 0));
2262         __ sb(value, Address(to, 1));
2263         __ addi(to, to, 2);
2264 
2265         __ bind(L_fill_1);
2266         __ test_bit(tmp_reg, count, 0); // bit 0 of count set: store the last byte
2267         __ beqz(tmp_reg, L_exit);
2268         __ sb(value, Address(to, 0));
2269         break;
2270       case T_SHORT:
2271         __ test_bit(tmp_reg, count, 1); // bit 1 of count set: store 2 shorts
2272         __ beqz(tmp_reg, L_fill_2);
2273         __ sh(value, Address(to, 0));
2274         __ sh(value, Address(to, 2));
2275         __ addi(to, to, 4);
2276 
2277         __ bind(L_fill_2);
2278         __ test_bit(tmp_reg, count, 0); // bit 0 of count set: store the last short
2279         __ beqz(tmp_reg, L_exit);
2280         __ sh(value, Address(to, 0));
2281         break;
2282       case T_INT:
2283         __ beqz(count, L_exit); // fewer than two ints remain here, so a nonzero count means one int
2284         __ sw(value, Address(to, 0));
2285         break;
2286       default: ShouldNotReachHere();
2287     }
2288     __ bind(L_exit);
2289     __ leave();
2290     __ ret();
2291 
2292     return start; // entry address of the generated stub
2293   }
2294 
2295   void generate_arraycopy_stubs() {
2296     // Generate all arraycopy, fill and setmemory stubs and publish their
2297     // entry points in class StubRoutines.
2298     //
2299     // Some copy stubs publish a normal entry and then a 2nd 'fallback'
2300     // (nopush) entry immediately following their stack push. Compatible
2301     // stubs can use it as a post-push branch target when they identify a
2302     // special case that the fallback stub can handle, e.g. a disjoint copy
2303     // stub may be used as the fallback for its compatible conjoint copy stub.
2304     //
2305     // A nopush entry is always returned in the following local, published
2306     // by assigning it to the appropriate entry field in class StubRoutines,
2307     // and then passed to the generator for the compatible stub. That means
2308     // the entry must be listed when saving to/restoring from the AOT cache,
2309     // so that inter-stub jumps are noted at save and relocated at load.
2310     address nopush_entry = nullptr;
2311 
2312     // generate the common exit first so later stubs can rely on it if
2313     // they want an UnsafeMemoryAccess exit non-local to the stub
2314     StubRoutines::_unsafecopy_common_exit = generate_unsafecopy_common_error_exit();
2315     // register the stub as the default exit with class UnsafeMemoryAccess
2316     UnsafeMemoryAccess::set_common_exit_stub_pc(StubRoutines::_unsafecopy_common_exit);
2317 
2318     // generate and publish riscv-specific bulk copy routines first
2319     // so we can call them from other copy stubs
2320     StubRoutines::riscv::_copy_byte_f = generate_copy_longs(StubId::stubgen_copy_byte_f_id, c_rarg0, c_rarg1, t1);
2321     StubRoutines::riscv::_copy_byte_b = generate_copy_longs(StubId::stubgen_copy_byte_b_id, c_rarg0, c_rarg1, t1);
2322 
2323     StubRoutines::riscv::_zero_blocks = generate_zero_blocks();
2324 
2325     //*** jbyte
2326     // Always need aligned and unaligned versions
2327     StubRoutines::_jbyte_disjoint_arraycopy          = generate_disjoint_copy(StubId::stubgen_jbyte_disjoint_arraycopy_id, &nopush_entry);
2328     // disjoint nopush entry is needed by conjoint copy
2329     StubRoutines::_jbyte_disjoint_arraycopy_nopush  = nopush_entry;
2330     StubRoutines::_jbyte_arraycopy                   = generate_conjoint_copy(StubId::stubgen_jbyte_arraycopy_id, StubRoutines::_jbyte_disjoint_arraycopy_nopush, &nopush_entry);
2331     // conjoint nopush entry is needed by generic/unsafe copy
2332     StubRoutines::_jbyte_arraycopy_nopush = nopush_entry;
2333     StubRoutines::_arrayof_jbyte_disjoint_arraycopy  = generate_disjoint_copy(StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id, &nopush_entry);
2334     // disjoint arrayof nopush entry is needed by conjoint copy
2335     StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush  = nopush_entry;
2336     StubRoutines::_arrayof_jbyte_arraycopy           = generate_conjoint_copy(StubId::stubgen_arrayof_jbyte_arraycopy_id, StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush, nullptr);
2337 
2338     //*** jshort
2339     // Always need aligned and unaligned versions
2340     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_copy(StubId::stubgen_jshort_disjoint_arraycopy_id, &nopush_entry);
2341     // disjoint nopush entry is needed by conjoint copy
2342     StubRoutines::_jshort_disjoint_arraycopy_nopush  = nopush_entry;
2343     StubRoutines::_jshort_arraycopy                  = generate_conjoint_copy(StubId::stubgen_jshort_arraycopy_id, StubRoutines::_jshort_disjoint_arraycopy_nopush, &nopush_entry);
2344     // conjoint nopush entry is used by generic/unsafe copy
2345     StubRoutines::_jshort_arraycopy_nopush = nopush_entry;
2346     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id, &nopush_entry);
2347     // disjoint arrayof nopush entry is needed by conjoint copy
2348     StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush = nopush_entry;
2349     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jshort_arraycopy_id, StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush, nullptr);
2350 
2351     //*** jint
2352     // Aligned versions
2353     StubRoutines::_arrayof_jint_disjoint_arraycopy   = generate_disjoint_copy(StubId::stubgen_arrayof_jint_disjoint_arraycopy_id, &nopush_entry);
2354     // disjoint arrayof nopush entry is needed by conjoint copy
2355     StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush = nopush_entry;
2356     StubRoutines::_arrayof_jint_arraycopy            = generate_conjoint_copy(StubId::stubgen_arrayof_jint_arraycopy_id, StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush, nullptr);
2357     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2358     // entry_jint_arraycopy always points to the unaligned version
2359     StubRoutines::_jint_disjoint_arraycopy           = generate_disjoint_copy(StubId::stubgen_jint_disjoint_arraycopy_id, &nopush_entry);
2360     // disjoint nopush entry is needed by conjoint copy
2361     StubRoutines::_jint_disjoint_arraycopy_nopush  = nopush_entry;
2362     StubRoutines::_jint_arraycopy                  = generate_conjoint_copy(StubId::stubgen_jint_arraycopy_id, StubRoutines::_jint_disjoint_arraycopy_nopush, &nopush_entry);
2363     // conjoint nopush entry is needed by generic/unsafe copy
2364     StubRoutines::_jint_arraycopy_nopush = nopush_entry;
2365 
2366     //*** jlong
2367     // It is always aligned
2368     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id, &nopush_entry);
2369     // disjoint arrayof nopush entry is needed by conjoint copy
2370     StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush = nopush_entry;
2371     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_copy(StubId::stubgen_arrayof_jlong_arraycopy_id, StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush, &nopush_entry);
2372     // conjoint nopush entry is needed by generic/unsafe copy
2373     StubRoutines::_jlong_arraycopy_nopush = nopush_entry;
2374     // disjoint normal/nopush and conjoint normal entries are not
2375     // generated since the arrayof versions are the same
2376     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2377     StubRoutines::_jlong_disjoint_arraycopy_nopush = StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush;
2378     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2379 
2380     //*** oops
2381     StubRoutines::_arrayof_oop_disjoint_arraycopy
2382       = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_id, &nopush_entry);
2383     // disjoint arrayof nopush entry is needed by conjoint copy
2384     StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush = nopush_entry;
2385     StubRoutines::_arrayof_oop_arraycopy
2386       = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush, &nopush_entry);
2387     // conjoint arrayof nopush entry is needed by generic/unsafe copy
2388     StubRoutines::_oop_arraycopy_nopush = nopush_entry;
2389     // Aligned versions without pre-barriers
2390     StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2391       = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id, &nopush_entry);
2392     // disjoint arrayof+uninit nopush entry is needed by conjoint copy
2393     StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush = nopush_entry;
2394 
2395     // note that we don't need a returned nopush entry because the
2396     // generic/unsafe copy does not cater for uninit arrays.
2397     StubRoutines::_arrayof_oop_arraycopy_uninit
2398       = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_uninit_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush, nullptr);
2399 
2400     // for oop copies reuse arrayof entries for non-arrayof cases
2401     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2402     StubRoutines::_oop_disjoint_arraycopy_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush;
2403     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2404     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2405     StubRoutines::_oop_disjoint_arraycopy_uninit_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush;
2406     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2407 
2408     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_id, &nopush_entry);
2409     // checkcast nopush entry is needed by generic copy
2410     StubRoutines::_checkcast_arraycopy_nopush = nopush_entry;
2411     // note that we don't need a returned nopush entry because the
2412     // generic copy does not cater for uninit arrays.
2413     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_uninit_id, nullptr);
2414 
2415 
2416     // unsafe arraycopy may fall back on conjoint stubs
2417     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy(StubRoutines::_jbyte_arraycopy_nopush,
2418                                                               StubRoutines::_jshort_arraycopy_nopush,
2419                                                               StubRoutines::_jint_arraycopy_nopush,
2420                                                               StubRoutines::_jlong_arraycopy_nopush);
2421 
2422     // generic arraycopy may fall back on conjoint stubs
2423     StubRoutines::_generic_arraycopy   = generate_generic_copy(StubRoutines::_jbyte_arraycopy_nopush,
2424                                                                StubRoutines::_jshort_arraycopy_nopush,
2425                                                                StubRoutines::_jint_arraycopy_nopush,
2426                                                                StubRoutines::_oop_arraycopy_nopush,
2427                                                                StubRoutines::_jlong_arraycopy_nopush,
2428                                                                StubRoutines::_checkcast_arraycopy_nopush);
2429     // fill stubs: unaligned and arrayof variants for jbyte, jshort and jint
2430     StubRoutines::_jbyte_fill = generate_fill(StubId::stubgen_jbyte_fill_id);
2431     StubRoutines::_jshort_fill = generate_fill(StubId::stubgen_jshort_fill_id);
2432     StubRoutines::_jint_fill = generate_fill(StubId::stubgen_jint_fill_id);
2433     StubRoutines::_arrayof_jbyte_fill = generate_fill(StubId::stubgen_arrayof_jbyte_fill_id);
2434     StubRoutines::_arrayof_jshort_fill = generate_fill(StubId::stubgen_arrayof_jshort_fill_id);
2435     StubRoutines::_arrayof_jint_fill = generate_fill(StubId::stubgen_arrayof_jint_fill_id);
2436 
2437     StubRoutines::_unsafe_setmemory    = generate_unsafe_setmemory();
2438   }
2439 
2440   void aes_load_keys(const Register &key, VectorRegister *working_vregs, int rounds) {
2441     const int step = 16;
2442     for (int i = 0; i < rounds; i++) {
2443       __ vle32_v(working_vregs[i], key);
2444       // The keys are stored in little-endian array, while we need
2445       // to operate in big-endian.
2446       // So performing an endian-swap here with vrev8.v instruction
2447       __ vrev8_v(working_vregs[i], working_vregs[i]);
2448       __ addi(key, key, step);
2449     }
2450   }
2451 
2452   void aes_encrypt(const VectorRegister &res, VectorRegister *working_vregs, int rounds) {
2453     assert(rounds <= 15, "rounds should be less than or equal to working_vregs size");
2454 
2455     __ vxor_vv(res, res, working_vregs[0]);
2456     for (int i = 1; i < rounds - 1; i++) {
2457       __ vaesem_vv(res, working_vregs[i]);
2458     }
2459     __ vaesef_vv(res, working_vregs[rounds - 1]);
2460   }
2461 
2462   // Arguments:
2463   //
2464   // Inputs:
2465   //   c_rarg0   - source byte array address
2466   //   c_rarg1   - destination byte array address
2467   //   c_rarg2   - sessionKe (key) in little endian int array
2468   //
2469   address generate_aescrypt_encryptBlock() {
2470     assert(UseAESIntrinsics, "need AES instructions (Zvkned extension) support");
2471 
2472     __ align(CodeEntryAlignment);
2473     StubId stub_id = StubId::stubgen_aescrypt_encryptBlock_id;
2474     StubCodeMark mark(this, stub_id);
2475 
2476     Label L_aes128, L_aes192;
2477 
2478     const Register from        = c_rarg0;  // source array address
2479     const Register to          = c_rarg1;  // destination array address
2480     const Register key         = c_rarg2;  // key array address
2481     const Register keylen      = c_rarg3;
2482 
2483     VectorRegister working_vregs[] = {
2484       v4, v5, v6, v7, v8, v9, v10, v11,
2485       v12, v13, v14, v15, v16, v17, v18
2486     };
2487     const VectorRegister res   = v19;
2488 
2489     address start = __ pc();
2490     __ enter();
2491 
2492     __ lwu(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2493 
2494     __ vsetivli(x0, 4, Assembler::e32, Assembler::m1);
2495     __ vle32_v(res, from);
2496 
2497     __ mv(t2, 52); // key length could be only {11, 13, 15} * 4 = {44, 52, 60}
2498     __ bltu(keylen, t2, L_aes128);
2499     __ beq(keylen, t2, L_aes192);
2500     // Else we fallthrough to the biggest case (256-bit key size)
2501 
2502     // Note: the following function performs key += 15*16
2503     aes_load_keys(key, working_vregs, 15);
2504     aes_encrypt(res, working_vregs, 15);
2505     __ vse32_v(res, to);
2506     __ mv(c_rarg0, 0);
2507     __ leave();
2508     __ ret();
2509 
2510   __ bind(L_aes192);
2511     // Note: the following function performs key += 13*16
2512     aes_load_keys(key, working_vregs, 13);
2513     aes_encrypt(res, working_vregs, 13);
2514     __ vse32_v(res, to);
2515     __ mv(c_rarg0, 0);
2516     __ leave();
2517     __ ret();
2518 
2519   __ bind(L_aes128);
2520     // Note: the following function performs key += 11*16
2521     aes_load_keys(key, working_vregs, 11);
2522     aes_encrypt(res, working_vregs, 11);
2523     __ vse32_v(res, to);
2524     __ mv(c_rarg0, 0);
2525     __ leave();
2526     __ ret();
2527 
2528     return start;
2529   }
2530 
2531   void aes_decrypt(const VectorRegister &res, VectorRegister *working_vregs, int rounds) {
2532     assert(rounds <= 15, "rounds should be less than or equal to working_vregs size");
2533 
2534     __ vxor_vv(res, res, working_vregs[rounds - 1]);
2535     for (int i = rounds - 2; i > 0; i--) {
2536       __ vaesdm_vv(res, working_vregs[i]);
2537     }
2538     __ vaesdf_vv(res, working_vregs[0]);
2539   }
2540 
2541   // Arguments:
2542   //
2543   // Inputs:
2544   //   c_rarg0   - source byte array address
2545   //   c_rarg1   - destination byte array address
2546   //   c_rarg2   - sessionKe (key) in little endian int array
2547   //
2548   address generate_aescrypt_decryptBlock() {
2549     assert(UseAESIntrinsics, "need AES instructions (Zvkned extension) support");
2550 
2551     __ align(CodeEntryAlignment);
2552     StubId stub_id = StubId::stubgen_aescrypt_decryptBlock_id;
2553     StubCodeMark mark(this, stub_id);
2554 
2555     Label L_aes128, L_aes192;
2556 
2557     const Register from        = c_rarg0;  // source array address
2558     const Register to          = c_rarg1;  // destination array address
2559     const Register key         = c_rarg2;  // key array address
2560     const Register keylen      = c_rarg3;
2561 
2562     VectorRegister working_vregs[] = {
2563       v4, v5, v6, v7, v8, v9, v10, v11,
2564       v12, v13, v14, v15, v16, v17, v18
2565     };
2566     const VectorRegister res   = v19;
2567 
2568     address start = __ pc();
2569     __ enter(); // required for proper stackwalking of RuntimeStub frame
2570 
2571     __ lwu(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2572 
2573     __ vsetivli(x0, 4, Assembler::e32, Assembler::m1);
2574     __ vle32_v(res, from);
2575 
2576     __ mv(t2, 52); // key length could be only {11, 13, 15} * 4 = {44, 52, 60}
2577     __ bltu(keylen, t2, L_aes128);
2578     __ beq(keylen, t2, L_aes192);
2579     // Else we fallthrough to the biggest case (256-bit key size)
2580 
2581     // Note: the following function performs key += 15*16
2582     aes_load_keys(key, working_vregs, 15);
2583     aes_decrypt(res, working_vregs, 15);
2584     __ vse32_v(res, to);
2585     __ mv(c_rarg0, 0);
2586     __ leave();
2587     __ ret();
2588 
2589   __ bind(L_aes192);
2590     // Note: the following function performs key += 13*16
2591     aes_load_keys(key, working_vregs, 13);
2592     aes_decrypt(res, working_vregs, 13);
2593     __ vse32_v(res, to);
2594     __ mv(c_rarg0, 0);
2595     __ leave();
2596     __ ret();
2597 
2598   __ bind(L_aes128);
2599     // Note: the following function performs key += 11*16
2600     aes_load_keys(key, working_vregs, 11);
2601     aes_decrypt(res, working_vregs, 11);
2602     __ vse32_v(res, to);
2603     __ mv(c_rarg0, 0);
2604     __ leave();
2605     __ ret();
2606 
2607     return start;
2608   }
2609 
2610   void cipherBlockChaining_encryptAESCrypt(int round, Register from, Register to, Register key,
2611                                            Register rvec, Register input_len) {
2612     const Register len = x29;
2613 
2614     VectorRegister working_vregs[] = {
2615       v1, v2, v3, v4, v5, v6, v7, v8,
2616       v9, v10, v11, v12, v13, v14, v15
2617     };
2618 
2619     const unsigned int BLOCK_SIZE = 16;
2620 
2621     __ mv(len, input_len);
2622     // load init rvec
2623     __ vsetivli(x0, 4, Assembler::e32, Assembler::m1);
2624     __ vle32_v(v16, rvec);
2625 
2626     aes_load_keys(key, working_vregs, round);
2627     Label L_enc_loop;
2628     __ bind(L_enc_loop);
2629     // Encrypt from source by block size
2630       __ vle32_v(v17, from);
2631       __ addi(from, from, BLOCK_SIZE);
2632       __ vxor_vv(v16, v16, v17);
2633       aes_encrypt(v16, working_vregs, round);
2634       __ vse32_v(v16, to);
2635       __ addi(to, to, BLOCK_SIZE);
2636       __ subi(len, len, BLOCK_SIZE);
2637       __ bnez(len, L_enc_loop);
2638 
2639     // save current rvec and return
2640     __ vse32_v(v16, rvec);
2641     __ mv(x10, input_len);
2642     __ leave();
2643     __ ret();
2644   }
2645 
2646   // Arguments:
2647   //
2648   // Inputs:
2649   //   c_rarg0   - source byte array address
2650   //   c_rarg1   - destination byte array address
2651   //   c_rarg2   - K (key) in little endian int array
2652   //   c_rarg3   - r vector byte array address
2653   //   c_rarg4   - input length
2654   //
2655   // Output:
2656   //   x10       - input length
2657   //
2658   address generate_cipherBlockChaining_encryptAESCrypt() {
2659     assert(UseAESIntrinsics, "need AES instructions (Zvkned extension) support");
2660     __ align(CodeEntryAlignment);
2661     StubId stub_id = StubId::stubgen_cipherBlockChaining_encryptAESCrypt_id;
2662     StubCodeMark mark(this, stub_id);
2663 
2664     const Register from       = c_rarg0;
2665     const Register to         = c_rarg1;
2666     const Register key        = c_rarg2;
2667     const Register rvec       = c_rarg3;
2668     const Register input_len  = c_rarg4;
2669 
2670     const Register keylen     = x28;
2671 
2672     address start = __ pc();
2673     __ enter();
2674 
2675     Label L_aes128, L_aes192;
2676     // Compute #rounds for AES based on the length of the key array
2677     __ lwu(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2678     __ mv(t0, 52);
2679     __ bltu(keylen, t0, L_aes128);
2680     __ beq(keylen, t0, L_aes192);
2681     // Else we fallthrough to the biggest case (256-bit key size)
2682 
2683     // Note: the following function performs key += 15*16
2684     cipherBlockChaining_encryptAESCrypt(15, from, to, key, rvec, input_len);
2685 
2686     // Note: the following function performs key += 11*16
2687     __ bind(L_aes128);
2688     cipherBlockChaining_encryptAESCrypt(11, from, to, key, rvec, input_len);
2689 
2690     // Note: the following function performs key += 13*16
2691     __ bind(L_aes192);
2692     cipherBlockChaining_encryptAESCrypt(13, from, to, key, rvec, input_len);
2693 
2694     return start;
2695   }
2696 
2697   void cipherBlockChaining_decryptAESCrypt(int round, Register from, Register to, Register key,
2698                                            Register rvec, Register input_len) {
2699     const Register len = x29;
2700 
2701     VectorRegister working_vregs[] = {
2702       v1, v2, v3, v4, v5, v6, v7, v8,
2703       v9, v10, v11, v12, v13, v14, v15
2704     };
2705 
2706     const unsigned int BLOCK_SIZE = 16;
2707 
2708     __ mv(len, input_len);
2709     // load init rvec
2710     __ vsetivli(x0, 4, Assembler::e32, Assembler::m1);
2711     __ vle32_v(v16, rvec);
2712 
2713     aes_load_keys(key, working_vregs, round);
2714     Label L_dec_loop;
2715     // Decrypt from source by block size
2716     __ bind(L_dec_loop);
2717       __ vle32_v(v17, from);
2718       __ addi(from, from, BLOCK_SIZE);
2719       __ vmv_v_v(v18, v17);
2720       aes_decrypt(v17, working_vregs, round);
2721       __ vxor_vv(v17, v17, v16);
2722       __ vse32_v(v17, to);
2723       __ vmv_v_v(v16, v18);
2724       __ addi(to, to, BLOCK_SIZE);
2725       __ subi(len, len, BLOCK_SIZE);
2726       __ bnez(len, L_dec_loop);
2727 
2728     // save current rvec and return
2729     __ vse32_v(v16, rvec);
2730     __ mv(x10, input_len);
2731     __ leave();
2732     __ ret();
2733   }
2734 
2735   // Arguments:
2736   //
2737   // Inputs:
2738   //   c_rarg0   - source byte array address
2739   //   c_rarg1   - destination byte array address
2740   //   c_rarg2   - K (key) in little endian int array
2741   //   c_rarg3   - r vector byte array address
2742   //   c_rarg4   - input length
2743   //
2744   // Output:
2745   //   x10       - input length
2746   //
2747   address generate_cipherBlockChaining_decryptAESCrypt() {
2748     assert(UseAESIntrinsics, "need AES instructions (Zvkned extension) support");
2749     __ align(CodeEntryAlignment);
2750     StubId stub_id = StubId::stubgen_cipherBlockChaining_decryptAESCrypt_id;
2751     StubCodeMark mark(this, stub_id);
2752 
2753     const Register from        = c_rarg0;
2754     const Register to          = c_rarg1;
2755     const Register key         = c_rarg2;
2756     const Register rvec        = c_rarg3;
2757     const Register input_len   = c_rarg4;
2758 
2759     const Register keylen      = x28;
2760 
2761     address start = __ pc();
2762     __ enter();
2763 
2764     Label L_aes128, L_aes192, L_aes128_loop, L_aes192_loop, L_aes256_loop;
2765     // Compute #rounds for AES based on the length of the key array
2766     __ lwu(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2767     __ mv(t0, 52);
2768     __ bltu(keylen, t0, L_aes128);
2769     __ beq(keylen, t0, L_aes192);
2770     // Else we fallthrough to the biggest case (256-bit key size)
2771 
2772     // Note: the following function performs key += 15*16
2773     cipherBlockChaining_decryptAESCrypt(15, from, to, key, rvec, input_len);
2774 
2775     // Note: the following function performs key += 11*16
2776     __ bind(L_aes128);
2777     cipherBlockChaining_decryptAESCrypt(11, from, to, key, rvec, input_len);
2778 
2779     // Note: the following function performs key += 13*16
2780     __ bind(L_aes192);
2781     cipherBlockChaining_decryptAESCrypt(13, from, to, key, rvec, input_len);
2782 
2783     return start;
2784   }
2785 
2786   // Load big-endian 128-bit from memory.
2787   void be_load_counter_128(Register counter_hi, Register counter_lo, Register counter) {
2788     __ ld(counter_lo, Address(counter, 8)); // Load 128-bits from counter
2789     __ ld(counter_hi, Address(counter));
2790     __ rev8(counter_lo, counter_lo);        // Convert big-endian to little-endian
2791     __ rev8(counter_hi, counter_hi);
2792   }
2793 
  // Increment by one a 128-bit counter held as two 64-bit halves, carrying from the low half into the high half.
2795   void add_counter_128(Register counter_hi, Register counter_lo) {
2796     assert_different_registers(counter_hi, counter_lo, t0);
2797     __ addi(counter_lo, counter_lo, 1);
2798     __ seqz(t0, counter_lo);                // Check for result overflow
2799     __ add(counter_hi, counter_hi, t0);     // Add 1 if overflow otherwise 0
2800   }
2801 
2802   // Store big-endian 128-bit to memory.
2803   void be_store_counter_128(Register counter_hi, Register counter_lo, Register counter) {
2804     assert_different_registers(counter_hi, counter_lo, t0, t1);
2805     __ rev8(t0, counter_lo);                // Convert little-endian to big-endian
2806     __ rev8(t1, counter_hi);
2807     __ sd(t0, Address(counter, 8));         // Store 128-bits to counter
2808     __ sd(t1, Address(counter));
2809   }
2810 
2811   void counterMode_AESCrypt(int round, Register in, Register out, Register key, Register counter,
2812                             Register input_len,  Register saved_encrypted_ctr, Register used_ptr) {
2813     // Algorithm:
2814     //
2815     //   aes_load_keys();
2816     //   load_counter_128(counter_hi, counter_lo, counter);
2817     //
2818     //   L_next:
2819     //     if (used >= BLOCK_SIZE) goto L_main_loop;
2820     //
2821     //   L_encrypt_next:
2822     //       *out = *in ^ saved_encrypted_ctr[used]);
2823     //       out++; in++; used++; len--;
2824     //       if (len == 0) goto L_exit;
2825     //       goto L_next;
2826     //
2827     //   L_main_loop:
2828     //     if (len == 0) goto L_exit;
2829     //     saved_encrypted_ctr = aes_encrypt(counter);
2830     //
2831     //     add_counter_128(counter_hi, counter_lo);
2832     //     be_store_counter_128(counter_hi, counter_lo, counter);
2833     //     used = 0;
2834     //
2835     //     if(len < BLOCK_SIZE) goto L_encrypt_next;
2836     //
2837     //     v_in = load_16Byte(in);
2838     //     v_out = load_16Byte(out);
2839     //     v_saved_encrypted_ctr = load_16Byte(saved_encrypted_ctr);
2840     //     v_out = v_in ^ v_saved_encrypted_ctr;
2841     //     out += BLOCK_SIZE;
2842     //     in += BLOCK_SIZE;
2843     //     len -= BLOCK_SIZE;
2844     //     used = BLOCK_SIZE;
2845     //     goto L_main_loop;
2846     //
2847     //
2848     //   L_exit:
2849     //     store(used);
2850     //     result = input_len
2851     //     return result;
2852 
2853     const Register used          = x28;
2854     const Register len           = x29;
2855     const Register counter_hi    = x30;
2856     const Register counter_lo    = x31;
2857     const Register block_size    = t2;
2858 
2859     const unsigned int BLOCK_SIZE = 16;
2860 
2861     VectorRegister working_vregs[] = {
2862       v1, v2, v3, v4, v5, v6, v7, v8,
2863       v9, v10, v11, v12, v13, v14, v15
2864     };
2865 
2866     __ vsetivli(x0, 4, Assembler::e32, Assembler::m1);
2867 
2868     __ lwu(used, Address(used_ptr));
2869     __ mv(len, input_len);
2870     __ mv(block_size, BLOCK_SIZE);
2871 
2872     // load keys to working_vregs according to round
2873     aes_load_keys(key, working_vregs, round);
2874 
2875     // 128-bit big-endian load
2876     be_load_counter_128(counter_hi, counter_lo, counter);
2877 
2878     Label L_next, L_encrypt_next, L_main_loop, L_exit;
2879     // Check the last saved_encrypted_ctr used value, we fall through
2880     // to L_encrypt_next when the used value lower than block_size
2881     __ bind(L_next);
2882     __ bgeu(used, block_size, L_main_loop);
2883 
2884     // There is still data left fewer than block_size after L_main_loop
2885     // or last used, we encrypt them one by one.
2886     __ bind(L_encrypt_next);
2887     __ add(t0, saved_encrypted_ctr, used);
2888     __ lbu(t1, Address(t0));
2889     __ lbu(t0, Address(in));
2890     __ xorr(t1, t1, t0);
2891     __ sb(t1, Address(out));
2892     __ addi(in, in, 1);
2893     __ addi(out, out, 1);
2894     __ addi(used, used, 1);
2895     __ subi(len, len, 1);
2896     __ beqz(len, L_exit);
2897     __ j(L_next);
2898 
2899     // We will calculate the next saved_encrypted_ctr and encrypt the blocks of data
2900     // one by one until there is less than a full block remaining if len not zero
2901     __ bind(L_main_loop);
2902     __ beqz(len, L_exit);
2903     __ vle32_v(v16, counter);
2904 
2905     // encrypt counter according to round
2906     aes_encrypt(v16, working_vregs, round);
2907 
2908     __ vse32_v(v16, saved_encrypted_ctr);
2909 
2910     // 128-bit little-endian increment
2911     add_counter_128(counter_hi, counter_lo);
2912     // 128-bit big-endian store
2913     be_store_counter_128(counter_hi, counter_lo, counter);
2914 
2915     __ mv(used, 0);
2916     // Check if we have a full block_size
2917     __ bltu(len, block_size, L_encrypt_next);
2918 
2919     // We have one full block to encrypt at least
2920     __ vle32_v(v17, in);
2921     __ vxor_vv(v16, v16, v17);
2922     __ vse32_v(v16, out);
2923     __ add(out, out, block_size);
2924     __ add(in, in, block_size);
2925     __ sub(len, len, block_size);
2926     __ mv(used, block_size);
2927     __ j(L_main_loop);
2928 
2929     __ bind(L_exit);
2930     __ sw(used, Address(used_ptr));
2931     __ mv(x10, input_len);
2932     __ leave();
2933     __ ret();
2934   };
2935 
2936   // CTR AES crypt.
2937   // Arguments:
2938   //
2939   // Inputs:
2940   //   c_rarg0   - source byte array address
2941   //   c_rarg1   - destination byte array address
2942   //   c_rarg2   - K (key) in little endian int array
2943   //   c_rarg3   - counter vector byte array address
2944   //   c_rarg4   - input length
2945   //   c_rarg5   - saved encryptedCounter start
2946   //   c_rarg6   - saved used length
2947   //
2948   // Output:
2949   //   x10       - input length
2950   //
2951   address generate_counterMode_AESCrypt() {
2952     assert(UseAESCTRIntrinsics, "need AES instructions (Zvkned extension) and Zbb extension support");
2953 
2954     __ align(CodeEntryAlignment);
2955     StubId stub_id = StubId::stubgen_counterMode_AESCrypt_id;
2956     StubCodeMark mark(this, stub_id);
2957 
2958     const Register in                  = c_rarg0;
2959     const Register out                 = c_rarg1;
2960     const Register key                 = c_rarg2;
2961     const Register counter             = c_rarg3;
2962     const Register input_len           = c_rarg4;
2963     const Register saved_encrypted_ctr = c_rarg5;
2964     const Register used_len_ptr        = c_rarg6;
2965 
2966     const Register keylen              = c_rarg7; // temporary register
2967 
2968     const address start = __ pc();
2969     __ enter();
2970 
2971     Label L_exit;
2972     __ beqz(input_len, L_exit);
2973 
2974     Label L_aes128, L_aes192;
2975     // Compute #rounds for AES based on the length of the key array
2976     __ lwu(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2977     __ mv(t0, 52); // key length could be only {11, 13, 15} * 4 = {44, 52, 60}
2978     __ bltu(keylen, t0, L_aes128);
2979     __ beq(keylen, t0, L_aes192);
2980     // Else we fallthrough to the biggest case (256-bit key size)
2981 
2982     // Note: the following function performs crypt with key += 15*16
2983     counterMode_AESCrypt(15, in, out, key, counter, input_len, saved_encrypted_ctr, used_len_ptr);
2984 
2985     // Note: the following function performs crypt with key += 13*16
2986     __ bind(L_aes192);
2987     counterMode_AESCrypt(13, in, out, key, counter, input_len, saved_encrypted_ctr, used_len_ptr);
2988 
2989     // Note: the following function performs crypt with key += 11*16
2990     __ bind(L_aes128);
2991     counterMode_AESCrypt(11, in, out, key, counter, input_len, saved_encrypted_ctr, used_len_ptr);
2992 
2993     __ bind(L_exit);
2994     __ mv(x10, input_len);
2995     __ leave();
2996     __ ret();
2997 
2998     return start;
2999   }
3000 
3001   void ghash_loop(Register state, Register subkeyH, Register data, Register blocks,
3002                   VectorRegister vtmp1, VectorRegister vtmp2, VectorRegister vtmp3) {
3003     VectorRegister partial_hash = vtmp1;
3004     VectorRegister hash_subkey  = vtmp2;
3005     VectorRegister cipher_text  = vtmp3;
3006 
3007     const unsigned int BLOCK_SIZE = 16;
3008 
3009     __ vsetivli(x0, 2, Assembler::e64, Assembler::m1);
3010     __ vle64_v(hash_subkey, subkeyH);
3011     __ vrev8_v(hash_subkey, hash_subkey);
3012     __ vle64_v(partial_hash, state);
3013     __ vrev8_v(partial_hash, partial_hash);
3014 
3015     __ vsetivli(x0, 4, Assembler::e32, Assembler::m1);
3016     Label L_ghash_loop;
3017     __ bind(L_ghash_loop);
3018       __ vle32_v(cipher_text, data);
3019       __ addi(data, data, BLOCK_SIZE);
3020       __ vghsh_vv(partial_hash, hash_subkey, cipher_text);
3021       __ subi(blocks, blocks, 1);
3022       __ bnez(blocks, L_ghash_loop);
3023 
3024     __ vsetivli(x0, 2, Assembler::e64, Assembler::m1);
3025     __ vrev8_v(partial_hash, partial_hash);
3026     __ vse64_v(partial_hash, state);
3027   }
3028 
3029   /**
3030    *  Arguments:
3031    *
3032    *  Input:
3033    *  c_rarg0   - current state address
3034    *  c_rarg1   - H key address
3035    *  c_rarg2   - data address
3036    *  c_rarg3   - number of blocks
3037    *
3038    *  Output:
3039    *  Updated state at c_rarg0
3040    */
3041   address generate_ghash_processBlocks() {
3042     assert(UseGHASHIntrinsics, "need GHASH instructions (Zvkg extension) and Zvbb support");
3043 
3044     __ align(CodeEntryAlignment);
3045     StubId stub_id = StubId::stubgen_ghash_processBlocks_id;
3046     StubCodeMark mark(this, stub_id);
3047 
3048     address start = __ pc();
3049     __ enter();
3050 
3051     Register state   = c_rarg0;
3052     Register subkeyH = c_rarg1;
3053     Register data    = c_rarg2;
3054     Register blocks  = c_rarg3;
3055 
3056     VectorRegister vtmp1 = v1;
3057     VectorRegister vtmp2 = v2;
3058     VectorRegister vtmp3 = v3;
3059 
3060     ghash_loop(state, subkeyH, data, blocks, vtmp1, vtmp2, vtmp3);
3061 
3062     __ leave();
3063     __ ret();
3064 
3065     return start;
3066   }
3067 
3068   // code for comparing 8 characters of strings with Latin1 and Utf16 encoding
3069   void compare_string_8_x_LU(Register tmpL, Register tmpU,
3070                              Register strL, Register strU, Label& DIFF) {
3071     const Register tmp = x30, tmpLval = x12;
3072 
3073     int base_offset = arrayOopDesc::base_offset_in_bytes(T_BYTE);
3074     assert((base_offset % (UseCompactObjectHeaders ? 4 : 8)) == 0, "Must be");
3075 
3076 #ifdef ASSERT
3077     if (AvoidUnalignedAccesses) {
3078       Label align_ok;
3079       __ andi(t0, strL, 0x7);
3080       __ beqz(t0, align_ok);
3081       __ stop("bad alignment");
3082       __ bind(align_ok);
3083     }
3084 #endif
3085     __ ld(tmpLval, Address(strL));
3086     __ addi(strL, strL, wordSize);
3087 
3088     // compare first 4 characters
3089     __ load_long_misaligned(tmpU, Address(strU), tmp, (base_offset % 8) != 0 ? 4 : 8);
3090     __ addi(strU, strU, wordSize);
3091     __ inflate_lo32(tmpL, tmpLval);
3092     __ xorr(tmp, tmpU, tmpL);
3093     __ bnez(tmp, DIFF);
3094 
3095     // compare second 4 characters
3096     __ load_long_misaligned(tmpU, Address(strU), tmp, (base_offset % 8) != 0 ? 4 : 8);
3097     __ addi(strU, strU, wordSize);
3098     __ inflate_hi32(tmpL, tmpLval);
3099     __ xorr(tmp, tmpU, tmpL);
3100     __ bnez(tmp, DIFF);
3101   }
3102 
3103   // x10  = result
3104   // x11  = str1
  // x12  = cnt1 (not read by this stub; x12 is reused as scratch register tmp4)
3106   // x13  = str2
3107   // x14  = cnt2
3108   // x28  = tmp1
3109   // x29  = tmp2
3110   // x30  = tmp3
3111   address generate_compare_long_string_different_encoding(StubId stub_id) {
3112     bool isLU;
3113     switch (stub_id) {
3114     case StubId::stubgen_compare_long_string_LU_id:
3115       isLU = true;
3116       break;
3117     case StubId::stubgen_compare_long_string_UL_id:
3118       isLU = false;
3119       break;
3120     default:
3121       ShouldNotReachHere();
3122     };
3123     __ align(CodeEntryAlignment);
3124     StubCodeMark mark(this, stub_id);
3125     address entry = __ pc();
3126     Label SMALL_LOOP, TAIL, LOAD_LAST, DONE, CALCULATE_DIFFERENCE;
3127     const Register result = x10, str1 = x11, str2 = x13, cnt2 = x14,
3128                    tmp1 = x28, tmp2 = x29, tmp3 = x30, tmp4 = x12;
3129 
3130     int base_offset = arrayOopDesc::base_offset_in_bytes(T_BYTE);
3131     assert((base_offset % (UseCompactObjectHeaders ? 4 : 8)) == 0, "Must be");
3132 
3133     Register strU = isLU ? str2 : str1,
3134              strL = isLU ? str1 : str2,
3135              tmpU = isLU ? tmp2 : tmp1, // where to keep U for comparison
3136              tmpL = isLU ? tmp1 : tmp2; // where to keep L for comparison
3137 
3138     if (AvoidUnalignedAccesses && (base_offset % 8) != 0) {
3139       // Load 4 bytes from strL to make sure main loop is 8-byte aligned
3140       // cnt2 is >= 68 here, no need to check it for >= 0
3141       __ lwu(tmpL, Address(strL));
3142       __ addi(strL, strL, wordSize / 2);
3143       __ load_long_misaligned(tmpU, Address(strU), tmp4, (base_offset % 8) != 0 ? 4 : 8);
3144       __ addi(strU, strU, wordSize);
3145       __ inflate_lo32(tmp3, tmpL);
3146       __ mv(tmpL, tmp3);
3147       __ xorr(tmp3, tmpU, tmpL);
3148       __ bnez(tmp3, CALCULATE_DIFFERENCE);
3149       __ subi(cnt2, cnt2, wordSize / 2);
3150     }
3151 
3152     // we are now 8-bytes aligned on strL when AvoidUnalignedAccesses is true
3153     __ subi(cnt2, cnt2, wordSize * 2);
3154     __ bltz(cnt2, TAIL);
3155     __ bind(SMALL_LOOP); // smaller loop
3156       __ subi(cnt2, cnt2, wordSize * 2);
3157       compare_string_8_x_LU(tmpL, tmpU, strL, strU, CALCULATE_DIFFERENCE);
3158       compare_string_8_x_LU(tmpL, tmpU, strL, strU, CALCULATE_DIFFERENCE);
3159       __ bgez(cnt2, SMALL_LOOP);
3160       __ addi(t0, cnt2, wordSize * 2);
3161       __ beqz(t0, DONE);
3162     __ bind(TAIL);  // 1..15 characters left
3163       // Aligned access. Load bytes in portions - 4, 2, 1.
3164 
3165       __ addi(t0, cnt2, wordSize);
3166       __ addi(cnt2, cnt2, wordSize * 2); // amount of characters left to process
3167       __ bltz(t0, LOAD_LAST);
3168       // remaining characters are greater than or equals to 8, we can do one compare_string_8_x_LU
3169       compare_string_8_x_LU(tmpL, tmpU, strL, strU, CALCULATE_DIFFERENCE);
3170       __ subi(cnt2, cnt2, wordSize);
3171       __ beqz(cnt2, DONE);  // no character left
3172       __ bind(LOAD_LAST);   // cnt2 = 1..7 characters left
3173 
3174       __ subi(cnt2, cnt2, wordSize); // cnt2 is now an offset in strL which points to last 8 bytes
3175       __ slli(t0, cnt2, 1);     // t0 is now an offset in strU which points to last 16 bytes
3176       __ add(strL, strL, cnt2); // Address of last 8 bytes in Latin1 string
3177       __ add(strU, strU, t0);   // Address of last 16 bytes in UTF-16 string
3178       __ load_int_misaligned(tmpL, Address(strL), t0, false);
3179       __ load_long_misaligned(tmpU, Address(strU), t0, 2);
3180       __ inflate_lo32(tmp3, tmpL);
3181       __ mv(tmpL, tmp3);
3182       __ xorr(tmp3, tmpU, tmpL);
3183       __ bnez(tmp3, CALCULATE_DIFFERENCE);
3184 
3185       __ addi(strL, strL, wordSize / 2); // Address of last 4 bytes in Latin1 string
3186       __ addi(strU, strU, wordSize);   // Address of last 8 bytes in UTF-16 string
3187       __ load_int_misaligned(tmpL, Address(strL), t0, false);
3188       __ load_long_misaligned(tmpU, Address(strU), t0, 2);
3189       __ inflate_lo32(tmp3, tmpL);
3190       __ mv(tmpL, tmp3);
3191       __ xorr(tmp3, tmpU, tmpL);
3192       __ bnez(tmp3, CALCULATE_DIFFERENCE);
3193       __ j(DONE); // no character left
3194 
3195       // Find the first different characters in the longwords and
3196       // compute their difference.
3197     __ bind(CALCULATE_DIFFERENCE);
3198       // count bits of trailing zero chars
3199       __ ctzc_bits(tmp4, tmp3);
3200       __ srl(tmp1, tmp1, tmp4);
3201       __ srl(tmp2, tmp2, tmp4);
3202       __ zext(tmp1, tmp1, 16);
3203       __ zext(tmp2, tmp2, 16);
3204       __ sub(result, tmp1, tmp2);
3205     __ bind(DONE);
3206       __ ret();
3207     return entry;
3208   }
3209 
3210   address generate_method_entry_barrier() {
         // Emits the nmethod entry barrier stub and returns the address of its
         // first instruction. The stub calls
         // BarrierSetNMethod::nmethod_stub_entry_barrier() with one argument and
         // then either returns to its caller (call returned zero) or installs a
         // new sp/fp/ra and jumps to a new pc, all read back from a four-word
         // area reserved on the stack (call returned non-zero).
3211     __ align(CodeEntryAlignment);
3212     StubId stub_id = StubId::stubgen_method_entry_barrier_id;
3213     StubCodeMark mark(this, stub_id);
3214 
3215     Label deoptimize_label;
3216 
3217     address start = __ pc();
3218 
3219     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
3220 
3221     if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
3222       BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
           // Copy the 32-bit value at patching_epoch_addr() into the per-thread
           // word located 4 bytes past the thread's disarmed guard value.
3223       Address thread_epoch_addr(xthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4);
3224       __ la(t1, ExternalAddress(bs_asm->patching_epoch_addr()));
3225       __ lwu(t1, t1);
3226       __ sw(t1, thread_epoch_addr);
3227       // There are two ways this can work:
3228       // - The writer did system icache shootdown after the instruction stream update.
3229       //   Hence do nothing.
3230       // - The writer trusts us to make sure our icache is in sync before entering.
3231       //   Hence use cmodx fence (fence.i, may change).
3232       if (UseCtxFencei) {
3233         __ cmodx_fence();
3234       }
           // NOTE(review): presumably orders the epoch load above before later
           // loads of concurrently patched data -- confirm.
3235       __ membar(__ LoadLoad);
3236     }
3237 
         // Publish a last-Java-frame anchor before calling into the VM.
3238     __ set_last_Java_frame(sp, fp, ra);
3239 
3240     __ enter();
         // t1 = sp + wordSize; this becomes the single argument of the runtime call.
         // NOTE(review): presumably the address of the return-address slot written
         // by enter() -- confirm against enter()'s frame layout.
3241     __ addi(t1, sp, wordSize);
3242 
         // Reserve four words; they are read back at deoptimize_label as
         // {sp, fp, ra, pc}.
3243     __ subi(sp, sp, 4 * wordSize);
3244 
3245     __ push_call_clobbered_registers();
3246 
3247     __ mv(c_rarg0, t1);
3248     __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
3249 
3250     __ reset_last_Java_frame(true);
3251 
         // Keep the call's result (x10) in t0 so it is still available after x10 is
         // restored; the code relies on t0 surviving pop_call_clobbered_registers().
3252     __ mv(t0, x10);
3253 
3254     __ pop_call_clobbered_registers();
3255 
3256     __ bnez(t0, deoptimize_label);
3257 
         // Result was zero: tear down the frame and return to the caller.
3258     __ leave();
3259     __ ret();
3260 
3261     __ BIND(deoptimize_label);
3262 
         // Result was non-zero: load the four reserved words and continue at the
         // loaded pc with the loaded sp/fp/ra.
         // NOTE(review): presumably filled in by the runtime barrier -- confirm.
3263     __ ld(t0, Address(sp, 0));
3264     __ ld(fp, Address(sp, wordSize));
3265     __ ld(ra, Address(sp, wordSize * 2));
3266     __ ld(t1, Address(sp, wordSize * 3));
3267 
3268     __ mv(sp, t0);
3269     __ jr(t1);
3270 
3271     return start;
3272   }
3273 
3274   // x10  = result
3275   // x11  = str1
3276   // x12  = cnt1
3277   // x13  = str2
3278   // x14  = cnt2
3279   // x28  = tmp1
3280   // x29  = tmp2
3281   // x30  = tmp3
       // x7   = tmp4 (saved and restored by this stub)
3282   // x31  = tmp5 (saved and restored by this stub)
       //
       // Generates the long-string compare stub for two strings with the same
       // encoding (LL = both Latin1, UU = both UTF-16) and returns its entry
       // address. On a mismatch, result is set to the difference of the first
       // differing characters; otherwise result is left unchanged.
       // tmp1/tmp2 are compared before being loaded here.
       // NOTE(review): presumably the caller has already loaded the first 8 bytes
       // of str1/str2 into tmp1/tmp2 (see "already loaded 8 bytes" below) -- confirm.
3283   address generate_compare_long_string_same_encoding(StubId stub_id) {
3284     bool isLL;
3285     switch (stub_id) {
3286     case StubId::stubgen_compare_long_string_LL_id:
3287       isLL = true;
3288       break;
3289     case StubId::stubgen_compare_long_string_UU_id:
3290       isLL = false;
3291       break;
3292     default:
3293       ShouldNotReachHere();
3294     };
3295     __ align(CodeEntryAlignment);
3296     StubCodeMark mark(this, stub_id);
3297     address entry = __ pc();
3298     Label SMALL_LOOP, CHECK_LAST, DIFF2, TAIL,
3299           LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF;
3300     const Register result = x10, str1 = x11, cnt1 = x12, str2 = x13, cnt2 = x14,
3301                    tmp1 = x28, tmp2 = x29, tmp3 = x30, tmp4 = x7, tmp5 = x31;
         // tmp4/tmp5 are pushed below and popped again at LENGTH_DIFF.
3302     RegSet spilled_regs = RegSet::of(tmp4, tmp5);
3303 
3304     // cnt1/cnt2 contains amount of characters to compare. cnt1 can be re-used
3305     // update cnt2 counter with already loaded 8 bytes
3306     __ subi(cnt2, cnt2, wordSize / (isLL ? 1 : 2));
3307     // update pointers, because of previous read
3308     __ addi(str1, str1, wordSize);
3309     __ addi(str2, str2, wordSize);
3310     // less than 16 bytes left?
3311     __ subi(cnt2, cnt2, isLL ? 16 : 8);
3312     __ push_reg(spilled_regs, sp);
3313     __ bltz(cnt2, TAIL);
3314     __ bind(SMALL_LOOP);
3315       // compare 16 bytes of strings with same encoding
           // The loop is software-pipelined: the pair loaded earlier (tmp1/tmp2) is
           // compared while the next pair is loaded into tmp5/cnt1, and vice versa.
3316       __ ld(tmp5, Address(str1));
3317       __ addi(str1, str1, 8);
3318       __ xorr(tmp4, tmp1, tmp2);
3319       __ ld(cnt1, Address(str2));
3320       __ addi(str2, str2, 8);
3321       __ bnez(tmp4, DIFF);
3322       __ ld(tmp1, Address(str1));
3323       __ addi(str1, str1, 8);
3324       __ xorr(tmp4, tmp5, cnt1);
3325       __ ld(tmp2, Address(str2));
3326       __ addi(str2, str2, 8);
3327       __ bnez(tmp4, DIFF2);
3328 
3329       __ subi(cnt2, cnt2, isLL ? 16 : 8);
3330       __ bgez(cnt2, SMALL_LOOP);
3331     __ bind(TAIL);
           // Undo the last 16-byte subtraction so cnt2 is the remaining character count.
3332       __ addi(cnt2, cnt2, isLL ? 16 : 8);
           // Nothing left to load: only the pending tmp1/tmp2 pair remains to compare.
3333       __ beqz(cnt2, LAST_CHECK_AND_LENGTH_DIFF);
3334       __ subi(cnt2, cnt2, isLL ? 8 : 4);
3335       __ blez(cnt2, CHECK_LAST);
           // More than 8 bytes remain: compare the pending pair and load one more.
3336       __ xorr(tmp4, tmp1, tmp2);
3337       __ bnez(tmp4, DIFF);
3338       __ ld(tmp1, Address(str1));
3339       __ addi(str1, str1, 8);
3340       __ ld(tmp2, Address(str2));
3341       __ addi(str2, str2, 8);
3342       __ subi(cnt2, cnt2, isLL ? 8 : 4);
3343     __ bind(CHECK_LAST);
           // cnt2 is zero or negative on both paths into this label, so adding it to
           // the pointers steps them back and the final 8-byte loads may overlap
           // data that has already been loaded.
3344       if (!isLL) {
3345         __ add(cnt2, cnt2, cnt2); // now in bytes
3346       }
3347       __ xorr(tmp4, tmp1, tmp2);
3348       __ bnez(tmp4, DIFF);
3349       __ add(str1, str1, cnt2);
3350       __ load_long_misaligned(tmp5, Address(str1), tmp3, isLL ? 1 : 2);
3351       __ add(str2, str2, cnt2);
3352       __ load_long_misaligned(cnt1, Address(str2), tmp3, isLL ? 1 : 2);
3353       __ xorr(tmp4, tmp5, cnt1);
3354       __ beqz(tmp4, LENGTH_DIFF);
3355       // Find the first different characters in the longwords and
3356       // compute their difference.
           // DIFF2 handles a mismatch in the tmp5/cnt1 pair, DIFF one in tmp1/tmp2.
3357     __ bind(DIFF2);
3358       // count bits of trailing zero chars
3359       __ ctzc_bits(tmp3, tmp4, isLL);
3360       __ srl(tmp5, tmp5, tmp3);
3361       __ srl(cnt1, cnt1, tmp3);
3362       if (isLL) {
3363         __ zext(tmp5, tmp5, 8);
3364         __ zext(cnt1, cnt1, 8);
3365       } else {
3366         __ zext(tmp5, tmp5, 16);
3367         __ zext(cnt1, cnt1, 16);
3368       }
3369       __ sub(result, tmp5, cnt1);
3370       __ j(LENGTH_DIFF);
3371     __ bind(DIFF);
3372       // count bits of trailing zero chars
3373       __ ctzc_bits(tmp3, tmp4, isLL);
3374       __ srl(tmp1, tmp1, tmp3);
3375       __ srl(tmp2, tmp2, tmp3);
3376       if (isLL) {
3377         __ zext(tmp1, tmp1, 8);
3378         __ zext(tmp2, tmp2, 8);
3379       } else {
3380         __ zext(tmp1, tmp1, 16);
3381         __ zext(tmp2, tmp2, 16);
3382       }
3383       __ sub(result, tmp1, tmp2);
3384       __ j(LENGTH_DIFF);
3385     __ bind(LAST_CHECK_AND_LENGTH_DIFF);
3386       __ xorr(tmp4, tmp1, tmp2);
3387       __ bnez(tmp4, DIFF);
3388     __ bind(LENGTH_DIFF);
           // Common exit. If no mismatch was found, result still holds its entry value.
           // NOTE(review): the label name suggests the caller pre-loads result with
           // the length difference -- confirm.
3389       __ pop_reg(spilled_regs, sp);
3390       __ ret();
3391     return entry;
3392   }
3393 
3394   void generate_compare_long_strings() {
         // Generates the four long-string compare stubs (LL, UU, LU, UL) in this
         // order and records their entry addresses in StubRoutines::riscv.
3395     StubRoutines::riscv::_compare_long_string_LL = generate_compare_long_string_same_encoding(StubId::stubgen_compare_long_string_LL_id);
3396     StubRoutines::riscv::_compare_long_string_UU = generate_compare_long_string_same_encoding(StubId::stubgen_compare_long_string_UU_id);
3397     StubRoutines::riscv::_compare_long_string_LU = generate_compare_long_string_different_encoding(StubId::stubgen_compare_long_string_LU_id);
3398     StubRoutines::riscv::_compare_long_string_UL = generate_compare_long_string_different_encoding(StubId::stubgen_compare_long_string_UL_id);
3399   }
3400 
3401   // x10 result
3402   // x11 src
3403   // x12 src count
3404   // x13 pattern
3405   // x14 pattern count
       //
       // Generates one linear-scan indexOf stub and returns its entry address.
       // The stub id selects the encodings: ll (Latin1 needle and haystack),
       // ul (Latin1 needle, UTF-16 haystack) or uu (both UTF-16).
       // The stub sets result to -1 when no match is found (NOMATCH).
       // NOTE(review): result is only ever adjusted here, never initialised, so the
       // caller presumably passes the starting index in x10 -- confirm.
3406   address generate_string_indexof_linear(StubId stub_id)
3407   {
3408     bool needle_isL;
3409     bool haystack_isL;
3410     switch (stub_id) {
3411     case StubId::stubgen_string_indexof_linear_ll_id:
3412       needle_isL = true;
3413       haystack_isL = true;
3414       break;
3415     case StubId::stubgen_string_indexof_linear_ul_id:
3416       needle_isL = true;
3417       haystack_isL = false;
3418       break;
3419     case StubId::stubgen_string_indexof_linear_uu_id:
3420       needle_isL = false;
3421       haystack_isL = false;
3422       break;
3423     default:
3424       ShouldNotReachHere();
3425     };
3426 
3427     __ align(CodeEntryAlignment);
3428     StubCodeMark mark(this, stub_id);
3429     address entry = __ pc();
3430 
3431     int needle_chr_size = needle_isL ? 1 : 2;
3432     int haystack_chr_size = haystack_isL ? 1 : 2;
3433     int needle_chr_shift = needle_isL ? 0 : 1;
3434     int haystack_chr_shift = haystack_isL ? 0 : 1;
3435     bool isL = needle_isL && haystack_isL;
3436     // parameters
3437     Register result = x10, haystack = x11, haystack_len = x12, needle = x13, needle_len = x14;
3438     // temporary registers
3439     Register mask1 = x20, match_mask = x21, first = x22, trailing_zeros = x23, mask2 = x24, tmp = x25;
3440     // redefinitions
3441     Register ch1 = x28, ch2 = x29;
         // Every temporary used below is saved here and restored at DONE.
3442     RegSet spilled_regs = RegSet::range(x20, x25) + RegSet::range(x28, x29);
3443 
3444     __ push_reg(spilled_regs, sp);
3445 
3446     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
3447           L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
3448           L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
3449           L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
3450           L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
3451           L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
3452 
         // Load the first 8 bytes of the needle and of the haystack.
3453     __ ld(ch1, Address(needle));
3454     __ ld(ch2, Address(haystack));
3455     // src.length - pattern.length
3456     __ sub(haystack_len, haystack_len, needle_len);
3457 
3458     // first is needle[0]
3459     __ zext(first, ch1, needle_isL ? 8 : 16);
3460 
         // Multiplying by 0x0101.. / 0x0001.. replicates needle[0] into every
         // haystack-character lane of the 64-bit word.
3461     uint64_t mask0101 = UCONST64(0x0101010101010101);
3462     uint64_t mask0001 = UCONST64(0x0001000100010001);
3463     __ mv(mask1, haystack_isL ? mask0101 : mask0001);
3464     __ mul(first, first, mask1);
3465     uint64_t mask7f7f = UCONST64(0x7f7f7f7f7f7f7f7f);
3466     uint64_t mask7fff = UCONST64(0x7fff7fff7fff7fff);
3467     __ mv(mask2, haystack_isL ? mask7f7f : mask7fff);
3468     if (needle_isL != haystack_isL) {
           // Mixed encodings: keep the raw needle word in tmp; it is the source
           // operand of inflate_lo32() below.
3469       __ mv(tmp, ch1);
3470     }
3471     __ subi(haystack_len, haystack_len, wordSize / haystack_chr_size - 1);
3472     __ blez(haystack_len, L_SMALL);
3473 
3474     if (needle_isL != haystack_isL) {
3475       __ inflate_lo32(ch1, tmp, match_mask, trailing_zeros);
3476     }
3477     // xorr, sub, orr, notr, andr
3478     // compare and set match_mask[i] with 0x80/0x8000 (Latin1/UTF16) if ch2[i] == first[i]
3479     // eg:
3480     // first:        aa aa aa aa aa aa aa aa
3481     // ch2:          aa aa li nx jd ka aa aa
3482     // match_mask:   80 80 00 00 00 00 80 80
3483     __ compute_match_mask(ch2, first, match_mask, mask1, mask2);
3484 
3485     // search first char of needle, if success, goto L_HAS_ZERO;
3486     __ bnez(match_mask, L_HAS_ZERO);
3487     __ subi(haystack_len, haystack_len, wordSize / haystack_chr_size);
3488     __ addi(result, result, wordSize / haystack_chr_size);
3489     __ addi(haystack, haystack, wordSize);
3490     __ bltz(haystack_len, L_POST_LOOP);
3491 
         // Main loop: scan the haystack one 8-byte word at a time for needle[0].
3492     __ bind(L_LOOP);
3493     __ ld(ch2, Address(haystack));
3494     __ compute_match_mask(ch2, first, match_mask, mask1, mask2);
3495     __ bnez(match_mask, L_HAS_ZERO);
3496 
3497     __ bind(L_LOOP_PROCEED);
3498     __ subi(haystack_len, haystack_len, wordSize / haystack_chr_size);
3499     __ addi(haystack, haystack, wordSize);
3500     __ addi(result, result, wordSize / haystack_chr_size);
3501     __ bgez(haystack_len, L_LOOP);
3502 
3503     __ bind(L_POST_LOOP);
3504     __ mv(ch2, -wordSize / haystack_chr_size);
3505     __ ble(haystack_len, ch2, NOMATCH); // no extra characters to check
3506     __ ld(ch2, Address(haystack));
         // Turn the negative remaining count into a positive bit count; it is used
         // at L_SMALL_PROCEED to shift the all-ones mask.
3507     __ slli(haystack_len, haystack_len, LogBitsPerByte + haystack_chr_shift);
3508     __ neg(haystack_len, haystack_len);
3509     __ xorr(ch2, first, ch2);
3510     __ sub(match_mask, ch2, mask1);
3511     __ orr(ch2, ch2, mask2);
3512     __ mv(trailing_zeros, -1); // all bits set
3513     __ j(L_SMALL_PROCEED);
3514 
3515     __ align(OptoLoopAlignment);
3516     __ bind(L_SMALL);
3517     __ slli(haystack_len, haystack_len, LogBitsPerByte + haystack_chr_shift);
3518     __ neg(haystack_len, haystack_len);
3519     if (needle_isL != haystack_isL) {
3520       __ inflate_lo32(ch1, tmp, match_mask, trailing_zeros);
3521     }
3522     __ xorr(ch2, first, ch2);
3523     __ sub(match_mask, ch2, mask1);
3524     __ orr(ch2, ch2, mask2);
3525     __ mv(trailing_zeros, -1); // all bits set
3526 
3527     __ bind(L_SMALL_PROCEED);
3528     __ srl(trailing_zeros, trailing_zeros, haystack_len); // mask. zeroes on useless bits.
3529     __ notr(ch2, ch2);
3530     __ andr(match_mask, match_mask, ch2);
3531     __ andr(match_mask, match_mask, trailing_zeros); // clear useless bits and check
3532     __ beqz(match_mask, NOMATCH);
3533 
3534     __ bind(L_SMALL_HAS_ZERO_LOOP);
3535     // count bits of trailing zero chars
3536     __ ctzc_bits(trailing_zeros, match_mask, haystack_isL, ch2, tmp);
3537     __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15);
3538     __ mv(ch2, wordSize / haystack_chr_size);
3539     __ ble(needle_len, ch2, L_SMALL_CMP_LOOP_LAST_CMP2);
3540     __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL);
3541     __ mv(trailing_zeros, wordSize / haystack_chr_size);
3542     __ bne(ch1, ch2, L_SMALL_CMP_LOOP_NOMATCH);
3543 
         // The first word matched; compare the remaining characters one at a time.
         // trailing_zeros is reused as the character index.
3544     __ bind(L_SMALL_CMP_LOOP);
3545     __ shadd(first, trailing_zeros, needle, first, needle_chr_shift);
3546     __ shadd(ch2, trailing_zeros, haystack, ch2, haystack_chr_shift);
3547     needle_isL ? __ lbu(first, Address(first)) : __ lhu(first, Address(first));
3548     haystack_isL ? __ lbu(ch2, Address(ch2)) : __ lhu(ch2, Address(ch2));
3549     __ addi(trailing_zeros, trailing_zeros, 1);
3550     __ bge(trailing_zeros, needle_len, L_SMALL_CMP_LOOP_LAST_CMP);
3551     __ beq(first, ch2, L_SMALL_CMP_LOOP);
3552 
3553     __ bind(L_SMALL_CMP_LOOP_NOMATCH);
3554     __ beqz(match_mask, NOMATCH);
3555     // count bits of trailing zero chars
3556     __ ctzc_bits(trailing_zeros, match_mask, haystack_isL, tmp, ch2);
3557     __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15);
3558     __ addi(result, result, 1);
3559     __ addi(haystack, haystack, haystack_chr_size);
3560     __ j(L_SMALL_HAS_ZERO_LOOP);
3561 
3562     __ align(OptoLoopAlignment);
3563     __ bind(L_SMALL_CMP_LOOP_LAST_CMP);
3564     __ bne(first, ch2, L_SMALL_CMP_LOOP_NOMATCH);
3565     __ j(DONE);
3566 
3567     __ align(OptoLoopAlignment);
3568     __ bind(L_SMALL_CMP_LOOP_LAST_CMP2);
3569     __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL);
3570     __ bne(ch1, ch2, L_SMALL_CMP_LOOP_NOMATCH);
3571     __ j(DONE);
3572 
3573     __ align(OptoLoopAlignment);
3574     __ bind(L_HAS_ZERO);
3575     // count bits of trailing zero chars
3576     __ ctzc_bits(trailing_zeros, match_mask, haystack_isL, tmp, ch2);
3577     __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15);
3578     __ slli(needle_len, needle_len, BitsPerByte * wordSize / 2);
3579     __ orr(haystack_len, haystack_len, needle_len); // stash needle_len in the upper 32 bits of haystack_len
3580     __ subi(result, result, 1); // array index from 0, so result -= 1
3581 
3582     __ bind(L_HAS_ZERO_LOOP);
         // needle_len is free to be used as a temporary from here on; its value is
         // re-extracted from the upper half of haystack_len with srli.
3583     __ mv(needle_len, wordSize / haystack_chr_size);
3584     __ srli(ch2, haystack_len, BitsPerByte * wordSize / 2);
3585     __ bge(needle_len, ch2, L_CMP_LOOP_LAST_CMP2);
3586     // load next 8 bytes from haystack, and increase result index
3587     __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL);
3588     __ addi(result, result, 1);
3589     __ mv(trailing_zeros, wordSize / haystack_chr_size);
3590     __ bne(ch1, ch2, L_CMP_LOOP_NOMATCH);
3591 
3592     // compare one char
3593     __ bind(L_CMP_LOOP);
3594     __ shadd(needle_len, trailing_zeros, needle, needle_len, needle_chr_shift);
3595     needle_isL ? __ lbu(needle_len, Address(needle_len)) : __ lhu(needle_len, Address(needle_len));
3596     __ shadd(ch2, trailing_zeros, haystack, ch2, haystack_chr_shift);
3597     haystack_isL ? __ lbu(ch2, Address(ch2)) : __ lhu(ch2, Address(ch2));
3598     __ addi(trailing_zeros, trailing_zeros, 1); // next char index
3599     __ srli(tmp, haystack_len, BitsPerByte * wordSize / 2);
3600     __ bge(trailing_zeros, tmp, L_CMP_LOOP_LAST_CMP);
3601     __ beq(needle_len, ch2, L_CMP_LOOP);
3602 
3603     __ bind(L_CMP_LOOP_NOMATCH);
3604     __ beqz(match_mask, L_HAS_ZERO_LOOP_NOMATCH);
3605     // count bits of trailing zero chars
3606     __ ctzc_bits(trailing_zeros, match_mask, haystack_isL, needle_len, ch2);
3607     __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15);
3608     __ addi(haystack, haystack, haystack_chr_size);
3609     __ j(L_HAS_ZERO_LOOP);
3610 
3611     __ align(OptoLoopAlignment);
3612     __ bind(L_CMP_LOOP_LAST_CMP);
3613     __ bne(needle_len, ch2, L_CMP_LOOP_NOMATCH);
3614     __ j(DONE);
3615 
3616     __ align(OptoLoopAlignment);
3617     __ bind(L_CMP_LOOP_LAST_CMP2);
3618     __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL);
3619     __ addi(result, result, 1);
3620     __ bne(ch1, ch2, L_CMP_LOOP_NOMATCH);
3621     __ j(DONE);
3622 
3623     __ align(OptoLoopAlignment);
3624     __ bind(L_HAS_ZERO_LOOP_NOMATCH);
3625     // 1) Restore "result" index. Index was wordSize/haystack_chr_size * N until
3626     // L_HAS_ZERO block. Byte octet was analyzed in L_HAS_ZERO_LOOP,
3627     // so, result was increased at max by wordSize/haystack_chr_size - 1, so,
3628     // respective high bit wasn't changed. L_LOOP_PROCEED will increase
3629     // result by analyzed characters value, so, we can just reset lower bits
3630     // in result here. Clear 2 lower bits for UU/UL and 3 bits for LL
3631     // 2) restore needle_len and haystack_len values from "compressed" haystack_len
3632     // 3) advance haystack value to represent next haystack octet. result & 7/3 is
3633     // index of last analyzed substring inside current octet. So, haystack is at
3634     // respective start address. We need to advance it to next octet
3635     __ andi(match_mask, result, wordSize / haystack_chr_size - 1);
3636     __ srli(needle_len, haystack_len, BitsPerByte * wordSize / 2);
3637     __ andi(result, result, haystack_isL ? -8 : -4);
3638     __ slli(tmp, match_mask, haystack_chr_shift);
3639     __ sub(haystack, haystack, tmp);
3640     __ sext(haystack_len, haystack_len, 32);
3641     __ j(L_LOOP_PROCEED);
3642 
3643     __ align(OptoLoopAlignment);
3644     __ bind(NOMATCH);
3645     __ mv(result, -1);
3646 
3647     __ bind(DONE);
3648     __ pop_reg(spilled_regs, sp);
3649     __ ret();
3650     return entry;
3651   }
3652 
3653   void generate_string_indexof_stubs()
3654   {
         // Generates the ll, uu and ul linear indexOf stubs in this order and
         // records their entry addresses in StubRoutines::riscv.
3655     StubRoutines::riscv::_string_indexof_linear_ll = generate_string_indexof_linear(StubId::stubgen_string_indexof_linear_ll_id);
3656     StubRoutines::riscv::_string_indexof_linear_uu = generate_string_indexof_linear(StubId::stubgen_string_indexof_linear_uu_id);
3657     StubRoutines::riscv::_string_indexof_linear_ul = generate_string_indexof_linear(StubId::stubgen_string_indexof_linear_ul_id);
3658   }
3659 
3660 #ifdef COMPILER2
3661   void generate_lookup_secondary_supers_table_stub() {
         // Emits one lookup stub per secondary-supers table slot, back to back, and
         // records each stub's entry address in
         // StubRoutines::_lookup_secondary_supers_table_stubs[slot].
3662     StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_id;
3663     StubCodeMark mark(this, stub_id);
3664 
         // Fixed register assignment passed to lookup_secondary_supers_table_const().
3665     const Register
3666       r_super_klass  = x10,
3667       r_array_base   = x11,
3668       r_array_length = x12,
3669       r_array_index  = x13,
3670       r_sub_klass    = x14,
3671       result         = x15,
3672       r_bitmap       = x16;
3673 
3674     for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
3675       StubRoutines::_lookup_secondary_supers_table_stubs[slot] = __ pc();
3677       __ enter();
           // The slot number is a generation-time constant baked into each stub.
3678       __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass, result,
3679                                              r_array_base, r_array_length, r_array_index,
3680                                              r_bitmap, slot, /*stub_is_near*/true);
3681       __ leave();
3682       __ ret();
3683     }
3684   }
3685 
3686   // Slow path implementation for UseSecondarySupersTable.
       // Emits a stub that consists of lookup_secondary_supers_table_slow_path()
       // followed by a return, and returns the stub's start address.
3687   address generate_lookup_secondary_supers_table_slow_path_stub() {
3688     StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_slow_path_id;
3689     StubCodeMark mark(this, stub_id);
3690 
3691     address start = __ pc();
3692     const Register
3693       r_super_klass  = x10,        // argument
3694       r_array_base   = x11,        // argument
3695       temp1          = x12,        // tmp
3696       r_array_index  = x13,        // argument
3697       result         = x15,        // argument
3698       r_bitmap       = x16;        // argument
3699 
3700 
3701     __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, result, temp1);
3702     __ ret();
3703 
3704     return start;
3705   }
3706 
3707   address generate_mulAdd()
3708   {
         // Emits the mulAdd stub: MacroAssembler::mul_add() wrapped in a frame
         // (enter/leave). Returns the stub's entry address.
3709     __ align(CodeEntryAlignment);
3710     StubId stub_id = StubId::stubgen_mulAdd_id;
3711     StubCodeMark mark(this, stub_id);
3712 
3713     address entry = __ pc();
3714 
         // Operands arrive in x10..x14; x28 is handed to mul_add() as a temporary.
3715     const Register out     = x10;
3716     const Register in      = x11;
3717     const Register offset  = x12;
3718     const Register len     = x13;
3719     const Register k       = x14;
3720     const Register tmp     = x28;
3721 
3722     BLOCK_COMMENT("Entry:");
3723     __ enter();
3724     __ mul_add(out, in, offset, len, k, tmp);
3725     __ leave();
3726     __ ret();
3727 
3728     return entry;
3729   }
3730 
3731   /**
        *  Emits the multiplyToLen stub: MacroAssembler::multiply_to_len() wrapped
        *  in a frame. Returns the stub's entry address.
        *
3732    *  Arguments:
3733    *
3734    *  Input:
3735    *    c_rarg0   - x address
3736    *    c_rarg1   - x length
3737    *    c_rarg2   - y address
3738    *    c_rarg3   - y length
3739    *    c_rarg4   - z address
3740    */
3741   address generate_multiplyToLen()
3742   {
3743     __ align(CodeEntryAlignment);
3744     StubId stub_id = StubId::stubgen_multiplyToLen_id;
3745     StubCodeMark mark(this, stub_id);
3746     address entry = __ pc();
3747 
3748     const Register x     = x10;
3749     const Register xlen  = x11;
3750     const Register y     = x12;
3751     const Register ylen  = x13;
3752     const Register z     = x14;
3753 
         // Eight temporaries handed to multiply_to_len().
3754     const Register tmp0  = x15;
3755     const Register tmp1  = x16;
3756     const Register tmp2  = x17;
3757     const Register tmp3  = x7;
3758     const Register tmp4  = x28;
3759     const Register tmp5  = x29;
3760     const Register tmp6  = x30;
3761     const Register tmp7  = x31;
3762 
3763     BLOCK_COMMENT("Entry:");
3764     __ enter(); // required for proper stackwalking of RuntimeStub frame
3765     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3766     __ leave(); // required for proper stackwalking of RuntimeStub frame
3767     __ ret();
3768 
3769     return entry;
3770   }
3771 
3772   address generate_squareToLen()
3773   {
         // Emits the squareToLen stub. Squaring is implemented by copying x/xlen
         // into y/ylen and running the general multiply_to_len() code.
         // Returns the stub's entry address.
3774     __ align(CodeEntryAlignment);
3775     StubId stub_id = StubId::stubgen_squareToLen_id;
3776     StubCodeMark mark(this, stub_id);
3777     address entry = __ pc();
3778 
3779     const Register x     = x10;
3780     const Register xlen  = x11;
3781     const Register z     = x12;
3782     const Register y     = x14; // == x
3783     const Register ylen  = x15; // == xlen
3784 
3785     const Register tmp0  = x13; // zlen, unused
3786     const Register tmp1  = x16;
3787     const Register tmp2  = x17;
3788     const Register tmp3  = x7;
3789     const Register tmp4  = x28;
3790     const Register tmp5  = x29;
3791     const Register tmp6  = x30;
3792     const Register tmp7  = x31;
3793 
3794     BLOCK_COMMENT("Entry:");
3795     __ enter();
         // Make the second operand an alias of the first: y = x, ylen = xlen.
3796     __ mv(y, x);
3797     __ mv(ylen, xlen);
3798     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3799     __ leave();
3800     __ ret();
3801 
3802     return entry;
3803   }
3804 
3805   // Arguments:
3806   //
3807   // Input:
3808   //   c_rarg0   - newArr address
3809   //   c_rarg1   - oldArr address
3810   //   c_rarg2   - newIdx
3811   //   c_rarg3   - shiftCount
3812   //   c_rarg4   - numIter
3813   //
       // Emits the vectorized BigInteger left-shift worker stub and returns its
       // entry address. For each of numIter 32-bit elements the stub stores
       //   newArr[newIdx + i] = (oldArr[i] << shiftCount) | (oldArr[i + 1] >> (32 - shiftCount))
       // NOTE(review): assumes 0 < shiftCount < 32; with shiftCount == 0 the
       // reverse shift amount is 32, which a 32-bit vector shift would reduce to
       // 0 -- confirm callers never pass 0.
3814   address generate_bigIntegerLeftShift() {
3815     __ align(CodeEntryAlignment);
3816     StubId stub_id = StubId::stubgen_bigIntegerLeftShiftWorker_id;
3817     StubCodeMark mark(this, stub_id);
3818     address entry = __ pc();
3819 
3820     Label loop, exit;
3821 
3822     Register newArr        = c_rarg0;
3823     Register oldArr        = c_rarg1;
3824     Register newIdx        = c_rarg2;
3825     Register shiftCount    = c_rarg3;
3826     Register numIter       = c_rarg4;
3827 
3828     Register shiftRevCount = c_rarg5;
         // t1 is also passed as the scratch operand of the shadd calls in the loop,
         // so oldArrNext is recomputed at the top of every iteration.
3829     Register oldArrNext    = t1;
3830 
3831     __ beqz(numIter, exit);
         // newArr += newIdx * 4 (elements are 32-bit).
3832     __ shadd(newArr, newIdx, newArr, t0, 2);
3833 
         // shiftRevCount = 32 - shiftCount
3834     __ mv(shiftRevCount, 32);
3835     __ sub(shiftRevCount, shiftRevCount, shiftCount);
3836 
3837     __ bind(loop);
3838     __ addi(oldArrNext, oldArr, 4);
         // t0 = number of 32-bit elements handled by this iteration.
3839     __ vsetvli(t0, numIter, Assembler::e32, Assembler::m4);
3840     __ vle32_v(v0, oldArr);
3841     __ vle32_v(v4, oldArrNext);
3842     __ vsll_vx(v0, v0, shiftCount);
3843     __ vsrl_vx(v4, v4, shiftRevCount);
3844     __ vor_vv(v0, v0, v4);
3845     __ vse32_v(v0, newArr);
         // Consume t0 elements and advance both array pointers by t0 * 4 bytes.
3846     __ sub(numIter, numIter, t0);
3847     __ shadd(oldArr, t0, oldArr, t1, 2);
3848     __ shadd(newArr, t0, newArr, t1, 2);
3849     __ bnez(numIter, loop);
3850 
3851     __ bind(exit);
3852     __ ret();
3853 
3854     return entry;
3855   }
3856 
3857   // Arguments:
3858   //
3859   // Input:
3860   //   c_rarg0   - newArr address
3861   //   c_rarg1   - oldArr address
3862   //   c_rarg2   - newIdx
3863   //   c_rarg3   - shiftCount
3864   //   c_rarg4   - numIter
3865   //
       // Emits the vectorized BigInteger right-shift worker stub and returns its
       // entry address. Elements are processed from the highest index downwards;
       // for each of numIter 32-bit elements the stub stores
       //   newArr[newIdx + i] = (oldArr[i + 1] >> shiftCount) | (oldArr[i] << (32 - shiftCount))
       // NOTE(review): assumes 0 < shiftCount < 32, as for the left-shift stub --
       // confirm callers never pass 0.
3866   address generate_bigIntegerRightShift() {
3867     __ align(CodeEntryAlignment);
3868     StubId stub_id = StubId::stubgen_bigIntegerRightShiftWorker_id;
3869     StubCodeMark mark(this, stub_id);
3870     address entry = __ pc();
3871 
3872     Label loop, exit;
3873 
3874     Register newArr        = c_rarg0;
3875     Register oldArr        = c_rarg1;
3876     Register newIdx        = c_rarg2;
3877     Register shiftCount    = c_rarg3;
3878     Register numIter       = c_rarg4;
         // idx aliases numIter: the remaining element count is also the index of
         // the first element of the chunk that is processed next.
3879     Register idx           = numIter;
3880 
3881     Register shiftRevCount = c_rarg5;
3882     Register oldArrNext    = c_rarg6;
         // newArrCur shares t0 with the vsetvli result; t0 is consumed by the sub
         // below before newArrCur is written.
3883     Register newArrCur     = t0;
3884     Register oldArrCur     = t1;
3885 
3886     __ beqz(idx, exit);
         // newArr += newIdx * 4 (elements are 32-bit).
3887     __ shadd(newArr, newIdx, newArr, t0, 2);
3888 
         // shiftRevCount = 32 - shiftCount
3889     __ mv(shiftRevCount, 32);
3890     __ sub(shiftRevCount, shiftRevCount, shiftCount);
3891 
3892     __ bind(loop);
         // t0 = number of 32-bit elements handled by this iteration.
3893     __ vsetvli(t0, idx, Assembler::e32, Assembler::m4);
3894     __ sub(idx, idx, t0);
         // oldArrNext = &oldArr[idx], newArrCur = &newArr[idx],
         // oldArrCur = &oldArr[idx + 1].
3895     __ shadd(oldArrNext, idx, oldArr, t1, 2);
3896     __ shadd(newArrCur, idx, newArr, t1, 2);
3897     __ addi(oldArrCur, oldArrNext, 4);
3898     __ vle32_v(v0, oldArrCur);
3899     __ vle32_v(v4, oldArrNext);
3900     __ vsrl_vx(v0, v0, shiftCount);
3901     __ vsll_vx(v4, v4, shiftRevCount);
3902     __ vor_vv(v0, v0, v4);
3903     __ vse32_v(v0, newArrCur);
3904     __ bnez(idx, loop);
3905 
3906     __ bind(exit);
3907     __ ret();
3908 
3909     return entry;
3910   }
3911 #endif
3912 
3913 #ifdef COMPILER2
3914   class MontgomeryMultiplyGenerator : public MacroAssembler {
3915 
3916     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
3917       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2, Ri, Rj;
3918 
3919     RegSet _toSave;
3920     bool _squaring;
3921 
3922   public:
3923     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
3924       : MacroAssembler(as->code()), _squaring(squaring) {
           // Builds a generator that emits into the code buffer of 'as' and fixes
           // the register assignment used by all emitter methods of this class.
           // Registers are handed out sequentially from the range x10..x26; when
           // squaring, Pb_base aliases Pa_base and no separate register is taken.
3925 
3926       // Register allocation
3927 
3928       RegSetIterator<Register> regs = RegSet::range(x10, x26).begin();
3929       Pa_base = *regs;       // Argument registers
3930       if (squaring) {
3931         Pb_base = Pa_base;
3932       } else {
3933         Pb_base = *++regs;
3934       }
3935       Pn_base = *++regs;
3936       Rlen= *++regs;
3937       inv = *++regs;
3938       Pm_base = *++regs;
3939 
3940                         // Working registers:
3941       Ra =  *++regs;    // The current digit of a, b, n, and m.
3942       Rb =  *++regs;
3943       Rm =  *++regs;
3944       Rn =  *++regs;
3945 
3946       Pa =  *++regs;      // Pointers to the current/next digit of a, b, n, and m.
3947       Pb =  *++regs;
3948       Pm =  *++regs;
3949       Pn =  *++regs;
3950 
3951       tmp0 =  *++regs;    // Three registers which form a
3952       tmp1 =  *++regs;    // triple-precision accumulator.
3953       tmp2 =  *++regs;
3954 
3955       Ri =  x6;         // Inner and outer loop indexes.
3956       Rj =  x7;
3957 
3958       Rhi_ab = x28;     // Product registers: low and high parts
3959       Rlo_ab = x29;     // of a*b and m*n.
3960       Rhi_mn = x30;
3961       Rlo_mn = x31;
3962 
3963       // x18 and up are callee-saved.
           // *regs is tmp2 here, the last register handed out above, so the saved
           // set is x18..tmp2 plus Pm_base.
3964       _toSave = RegSet::range(x18, *regs) + Pm_base;
3965     }
3966 
3967   private:
3968     void save_regs() {
           // Push the register set computed in the constructor (_toSave) on the stack.
3969       push_reg(_toSave, sp);
3970     }
3971 
3972     void restore_regs() {
           // Pop the register set pushed by save_regs().
3973       pop_reg(_toSave, sp);
3974     }
3975 
         // Emits a loop, unrolled by two, around the code produced by the member
         // function 'block' (called with no arguments). The code of 'block' is
         // emitted twice; at run time it executes 'count' times for count >= 0.
         // An odd count enters at the second copy. Clobbers t0 and 'count'.
3976     template <typename T>
3977     void unroll_2(Register count, T block) {
3978       Label loop, end, odd;
3979       beqz(count, end);
3980       test_bit(t0, count, 0);
3981       bnez(t0, odd);
3982       align(16);
3983       bind(loop);
3984       (this->*block)();
3985       bind(odd);
3986       (this->*block)();
3987       subi(count, count, 2);
3988       bgtz(count, loop);
3989       bind(end);
3990     }
3991 
         // Same as the two-argument unroll_2(), but 'block' is called as
         // block(d, s, tmp). 'tmp' is used for the parity test instead of t0 and is
         // then passed on to 'block'. Clobbers 'tmp' and 'count'.
3992     template <typename T>
3993     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
3994       Label loop, end, odd;
3995       beqz(count, end);
3996       test_bit(tmp, count, 0);
3997       bnez(tmp, odd);
3998       align(16);
3999       bind(loop);
4000       (this->*block)(d, s, tmp);
4001       bind(odd);
4002       (this->*block)(d, s, tmp);
4003       subi(count, count, 2);
4004       bgtz(count, loop);
4005       bind(end);
4006     }
4007 
4008     void pre1(RegisterOrConstant i) {
           // Emits the set-up code described by the pseudo-code below: point
           // Pa/Pb/Pm/Pn at their starting digits, load the four digits and clear
           // the pending m*n product. 'i' is a word index, given either in a
           // register or as a constant. Clobbers t0.
4009       block_comment("pre1");
4010       // Pa = Pa_base;
4011       // Pb = Pb_base + i;
4012       // Pm = Pm_base;
4013       // Pn = Pn_base + i;
4014       // Ra = *Pa;
4015       // Rb = *Pb;
4016       // Rm = *Pm;
4017       // Rn = *Pn;
           // t0 = i scaled to a byte offset.
4018       if (i.is_register()) {
4019         slli(t0, i.as_register(), LogBytesPerWord);
4020       } else {
4021         mv(t0, i.as_constant());
4022         slli(t0, t0, LogBytesPerWord);
4023       }
4024 
4025       mv(Pa, Pa_base);
4026       add(Pb, Pb_base, t0);
4027       mv(Pm, Pm_base);
4028       add(Pn, Pn_base, t0);
4029 
4030       ld(Ra, Address(Pa));
4031       ld(Rb, Address(Pb));
4032       ld(Rm, Address(Pm));
4033       ld(Rn, Address(Pn));
4034 
4035       // Zero the m*n result.
4036       mv(Rhi_mn, zr);
4037       mv(Rlo_mn, zr);
4038     }
4039 
4040     // The core multiply-accumulate step of a Montgomery
4041     // multiplication.  The idea is to schedule operations as a
4042     // pipeline so that instructions with long latencies (loads and
4043     // multiplies) have time to complete before their results are
4044     // used.  This most benefits in-order implementations of the
4045     // architecture but out-of-order ones also benefit.
         //
         // Each step starts the a*b product, accumulates the m*n product left
         // pending by the previous step (or zeroed by pre1), then starts the next
         // m*n product and accumulates a*b. The m*n product started here is left
         // pending in Rhi_mn:Rlo_mn.
4046     void step() {
4047       block_comment("step");
4048       // MACC(Ra, Rb, tmp0, tmp1, tmp2);
4049       // Ra = *++Pa;
4050       // Rb = *--Pb;
4051       mulhu(Rhi_ab, Ra, Rb);
4052       mul(Rlo_ab, Ra, Rb);
4053       addi(Pa, Pa, wordSize);
4054       ld(Ra, Address(Pa));
4055       subi(Pb, Pb, wordSize);
4056       ld(Rb, Address(Pb));
4057       acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2); // The pending m*n from the
4058                                             // previous iteration.
4059       // MACC(Rm, Rn, tmp0, tmp1, tmp2);
4060       // Rm = *++Pm;
4061       // Rn = *--Pn;
4062       mulhu(Rhi_mn, Rm, Rn);
4063       mul(Rlo_mn, Rm, Rn);
4064       addi(Pm, Pm, wordSize);
4065       ld(Rm, Address(Pm));
4066       subi(Pn, Pn, wordSize);
4067       ld(Rn, Address(Pn));
4068       acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2);
4069     }
4070 
4071     void post1() {
           // Emits the tail that follows a run of step() calls: accumulate the last
           // a*b product and the pending m*n product, compute and store the new m
           // digit (tmp0 * inv), fold m*n into the accumulator and shift the
           // accumulator down by one word. Clobbers t0.
4072       block_comment("post1");
4073 
4074       // MACC(Ra, Rb, tmp0, tmp1, tmp2);
4075       // Ra = *++Pa;
4076       // Rb = *--Pb;
4077       mulhu(Rhi_ab, Ra, Rb);
4078       mul(Rlo_ab, Ra, Rb);
4079       acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2);  // The pending m*n
4080       acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2);
4081 
4082       // *Pm = Rm = tmp0 * inv;
4083       mul(Rm, tmp0, inv);
4084       sd(Rm, Address(Pm));
4085 
4086       // MACC(Rm, Rn, tmp0, tmp1, tmp2);
4087       // tmp0 = tmp1; tmp1 = tmp2; tmp2 = 0;
4088       mulhu(Rhi_mn, Rm, Rn);
4089 
4090 #ifndef PRODUCT
4091       // assert(m[i] * n[0] + tmp0 == 0, "broken Montgomery multiply");
4092       {
4093         mul(Rlo_mn, Rm, Rn);
4094         add(Rlo_mn, tmp0, Rlo_mn);
4095         Label ok;
4096         beqz(Rlo_mn, ok);
4097         stop("broken Montgomery multiply");
4098         bind(ok);
4099       }
4100 #endif
4101       // We have very carefully set things up so that
4102       // m[i]*n[0] + tmp0 == 0 (mod b), so we don't have to calculate
4103       // the lower half of Rm * Rn because we know the result already:
4104       // it must be -tmp0.  tmp0 + (-tmp0) must generate a carry iff
4105       // tmp0 != 0.  So, rather than do a mul and a cad we just set
4106       // the carry flag iff tmp0 is nonzero.
4107       //
4108       // mul(Rlo_mn, Rm, Rn);
4109       // cad(zr, tmp0, Rlo_mn);
           // t0 = ((tmp0 - 1) <u tmp0), which is 1 exactly when tmp0 != 0.
4110       subi(t0, tmp0, 1);
4111       sltu(t0, t0, tmp0); // Set carry iff tmp0 is nonzero
4112       cadc(tmp0, tmp1, Rhi_mn, t0);
4113       adc(tmp1, tmp2, zr, t0);
4114       mv(tmp2, zr);
4115     }
4116 
4117     void pre2(Register i, Register len) {
           // Sets up the operand pointers for one iteration of the second
           // outer loop (i in [len, 2*len)), loads the first Ra/Rb/Rm/Rn and
           // clears the pending m*n product.
           //   i   - current outer-loop index
           //   len - operand length in 64-bit words
           // Rj is left holding i - len; Ra is used as a scratch register
           // before it receives its first operand.
4118       block_comment("pre2");
4119       // Pa = Pa_base + i-len;
4120       // Pb = Pb_base + len;
4121       // Pm = Pm_base + i-len;
4122       // Pn = Pn_base + len;
4123 
4124       sub(Rj, i, len);
4125       // Rj == i-len
4126 
4127       // Ra as temp register
4128       slli(Ra, Rj, LogBytesPerWord);
4129       add(Pa, Pa_base, Ra);
4130       add(Pm, Pm_base, Ra);
4131       slli(Ra, len, LogBytesPerWord);
4132       add(Pb, Pb_base, Ra);
4133       add(Pn, Pn_base, Ra);
4134 
4135       // Ra = *++Pa;
4136       // Rb = *--Pb;
4137       // Rm = *++Pm;
4138       // Rn = *--Pn;
           // The pre-increment/pre-decrement means the first words read are
           // Pa_base[i-len+1], Pb_base[len-1], Pm_base[i-len+1] and
           // Pn_base[len-1].
4139       addi(Pa, Pa, wordSize);
4140       ld(Ra, Address(Pa));
4141       subi(Pb, Pb, wordSize);
4142       ld(Rb, Address(Pb));
4143       addi(Pm, Pm, wordSize);
4144       ld(Rm, Address(Pm));
4145       subi(Pn, Pn, wordSize);
4146       ld(Rn, Address(Pn));
4147 
           // No m*n product is pending at the start of the inner loop.
4148       mv(Rhi_mn, zr);
4149       mv(Rlo_mn, zr);
4150     }
4151 
4152     void post2(Register i, Register len) {
           // Closes one iteration of the second outer loop: adds the pending
           // m*n product, stores the finished low accumulator word to
           // Pm_base[i-len] and shifts the accumulator down by one word.
           //   i   - current outer-loop index
           //   len - operand length in 64-bit words
           // Clobbers Rj (used for the store address) and t0 (carry).
4153       block_comment("post2");
4154       sub(Rj, i, len);
4155 
4156       cad(tmp0, tmp0, Rlo_mn, t0); // The pending m*n, low part
4157 
4158       // As soon as we know the least significant digit of our result,
4159       // store it.
4160       // Pm_base[i-len] = tmp0;
4161       // Rj as temp register
4162       slli(Rj, Rj, LogBytesPerWord);
4163       add(Rj, Pm_base, Rj);
4164       sd(tmp0, Address(Rj));
4165 
4166       // tmp0 = tmp1; tmp1 = tmp2; tmp2 = 0;
           // t0 still carries the carry produced by the cad above; the store
           // sequence in between does not write t0.
4167       cadc(tmp0, tmp1, Rhi_mn, t0); // The pending m*n, high part
4168       adc(tmp1, tmp2, zr, t0);
4169       mv(tmp2, zr);
4170     }
4171 
4172     // A carry in tmp0 after Montgomery multiplication means that we
4173     // should subtract multiples of n from our result in m.  We'll
4174     // keep doing that until there is no carry.
         //
         // Each pass computes m -= n over len words as m[i] + ~n[i] + carry,
         // starting with carry == 1 (i.e. no borrow), and then sets
         // tmp0 = tmp0 - 1 + final carry, i.e. tmp0 minus the final borrow.
         //
         //   len - operand length in 64-bit words
         //
         // Clobbers Rm, Rn, t0, tmp1 (word counter) and tmp2 (word index).
         //
         // The carry out of each word is collected from both additions:
         // when m[i] is all ones and the carry-in is 1, m[i] + carry wraps
         // to zero, and a comparison made only after adding ~n[i] would
         // report a borrow that did not happen.
4175     void normalize(Register len) {
4176       block_comment("normalize");
4177       // while (tmp0)
4178       //   tmp0 = sub(Pm_base, Pn_base, tmp0, len);
4179       Label loop, post, again;
4180       Register cnt = tmp1, i = tmp2; // Re-use registers; we're done with them now
4181       beqz(tmp0, post); {
4182         bind(again); {
4183           mv(i, zr);
4184           mv(cnt, len);
4185           slli(Rn, i, LogBytesPerWord);
4186           add(Rm, Pm_base, Rn);
4187           ld(Rm, Address(Rm));
4188           add(Rn, Pn_base, Rn);
4189           ld(Rn, Address(Rn));
4190           mv(t0, 1); // set carry flag, i.e. no borrow
4191           align(16);
4192           bind(loop); {
4193             notr(Rn, Rn);
4194             add(Rm, Rm, t0);
                 sltu(t0, Rm, t0);  // carry out of m[i] + carry-in (set only if Rm wrapped to 0)
4195             add(Rm, Rm, Rn);
4196             sltu(Rn, Rm, Rn);  // carry out of adding ~n[i]; Rn is reloaded below
                 add(t0, t0, Rn);   // at most one of the two carries can be set
4197             slli(Rn, i, LogBytesPerWord); // Rn as temp register
4198             add(Rn, Pm_base, Rn);
4199             sd(Rm, Address(Rn));
4200             addi(i, i, 1);
                 // Load the next pair of words for the following iteration.
4201             slli(Rn, i, LogBytesPerWord);
4202             add(Rm, Pm_base, Rn);
4203             ld(Rm, Address(Rm));
4204             add(Rn, Pn_base, Rn);
4205             ld(Rn, Address(Rn));
4206             subi(cnt, cnt, 1);
4207           } bnez(cnt, loop);
               // tmp0 = tmp0 - 1 + carry, i.e. tmp0 minus the final borrow.
4208           subi(tmp0, tmp0, 1);
4209           add(tmp0, tmp0, t0);
4210         } bnez(tmp0, again);
4211       } bind(post);
4212     }
4213 
4214     // Move memory at s to d, reversing words.
4215     //    Increments d to end of copied memory
4216     //    Destroys tmp1, tmp2
4217     //    Preserves len
4218     //    Leaves s pointing to the address which was in d at start
         //
         //    d    - destination cursor
         //    s    - source base address
         //    len  - number of 64-bit words to copy
         //    tmp1 - scratch; used as the loop counter handed to unroll_2
         //    tmp2 - scratch; holds the word being moved in reverse1
         //
         // Both temporaries must have an encoding below x28 (asserted).
4219     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
4220       assert(tmp1->encoding() < x28->encoding(), "register corruption");
4221       assert(tmp2->encoding() < x28->encoding(), "register corruption");
4222 
           // Point s just past the last source word (presumably
           // s += len << LogBytesPerWord -- confirm shadd's operand order);
           // reverse1 then walks s downwards while d walks upwards.
4223       shadd(s, len, s, tmp1, LogBytesPerWord);
4224       mv(tmp1, len);
4225       unroll_2(tmp1,  &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
           // s = d - len * wordSize, i.e. the value d had on entry.
4226       slli(tmp1, len, LogBytesPerWord);
4227       sub(s, d, tmp1);
4228     }
4229     // [63...0] -> [31...0][63...32]
         // Moves one 64-bit word for reverse(): pre-decrements s, loads the
         // word, rotates it by 32 bits (swapping its two 32-bit halves),
         // stores it at d and post-increments d.  tmp holds the word; t0 is
         // passed to ror, presumably as its scratch register.
4230     void reverse1(Register d, Register s, Register tmp) {
4231       subi(s, s, wordSize);
4232       ld(tmp, Address(s));
4233       ror(tmp, tmp, 32, t0);
4234       sd(tmp, Address(d));
4235       addi(d, d, wordSize);
4236     }
4237 
4238     void step_squaring() {
           // Squaring variant of step(): after the regular step, the a*b
           // product still held in Rhi_ab/Rlo_ab is accumulated a second
           // time, so each a*b term is counted twice.
4239       // An extra ACC
4240       step();
4241       acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2);
4242     }
4243 
4244     void last_squaring(Register i) {
           // When bit 0 of i is clear (i is even), accumulates Ra*Rb once
           // more; when i is odd nothing is accumulated.  Clobbers t0 in
           // both cases, and Rhi_ab/Rlo_ab on the even path.
           //   i - current outer-loop index
4245       Label dont;
4246       // if ((i & 1) == 0) {
4247       test_bit(t0, i, 0);
4248       bnez(t0, dont); {
4249         // MACC(Ra, Rb, tmp0, tmp1, tmp2);
4250         // Ra = *++Pa;
4251         // Rb = *--Pb;
             // (Only the multiply-accumulate is emitted; Ra/Rb are not
             // reloaded here.)
4252         mulhu(Rhi_ab, Ra, Rb);
4253         mul(Rlo_ab, Ra, Rb);
4254         acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2);
4255       } bind(dont);
4256     }
4257 
4258     void extra_step_squaring() {
           // The m*n half of step() on its own: accumulates the pending m*n
           // product, forms the next Rm*Rn (left pending) and loads the next
           // Rm/Rn.  No a*b work is done here.
4259       acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2);  // The pending m*n
4260 
4261       // MACC(Rm, Rn, tmp0, tmp1, tmp2);
4262       // Rm = *++Pm;
4263       // Rn = *--Pn;
4264       mulhu(Rhi_mn, Rm, Rn);
4265       mul(Rlo_mn, Rm, Rn);
4266       addi(Pm, Pm, wordSize);
4267       ld(Rm, Address(Pm));
4268       subi(Pn, Pn, wordSize);
4269       ld(Rn, Address(Pn));
4270     }
4271 
4272     void post1_squaring() {
           // Squaring counterpart of post1(): the a*b terms have already been
           // accumulated by the caller, so only the pending m*n product is
           // folded in before computing Rm = tmp0 * inv, storing it at *Pm,
           // adding Rm*Rn and shifting the accumulator down by one word.
4273       acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2);  // The pending m*n
4274 
4275       // *Pm = Rm = tmp0 * inv;
4276       mul(Rm, tmp0, inv);
4277       sd(Rm, Address(Pm));
4278 
4279       // MACC(Rm, Rn, tmp0, tmp1, tmp2);
4280       // tmp0 = tmp1; tmp1 = tmp2; tmp2 = 0;
4281       mulhu(Rhi_mn, Rm, Rn);
4282 
4283 #ifndef PRODUCT
4284       // assert(m[i] * n[0] + tmp0 == 0, "broken Montgomery multiply");
4285       {
4286         mul(Rlo_mn, Rm, Rn);
4287         add(Rlo_mn, tmp0, Rlo_mn);
4288         Label ok;
4289         beqz(Rlo_mn, ok); {
4290           stop("broken Montgomery multiply");
4291         } bind(ok);
4292       }
4293 #endif
4294       // We have very carefully set things up so that
4295       // m[i]*n[0] + tmp0 == 0 (mod b), so we don't have to calculate
4296       // the lower half of Rm * Rn because we know the result already:
4297       // it must be -tmp0.  tmp0 + (-tmp0) must generate a carry iff
4298       // tmp0 != 0.  So, rather than do a mul and a cad we just set
4299       // the carry flag iff tmp0 is nonzero.
4300       //
4301       // mul(Rlo_mn, Rm, Rn);
4302       // cad(zr, tmp0, Rlo_mn);
           // t0 = tmp0 - 1; (tmp0 - 1) <u tmp0 holds exactly when tmp0 != 0.
4303       subi(t0, tmp0, 1);
4304       sltu(t0, t0, tmp0); // Set carry iff tmp0 is nonzero
4305       cadc(tmp0, tmp1, Rhi_mn, t0);
4306       adc(tmp1, tmp2, zr, t0);
4307       mv(tmp2, zr);
4308     }
4309 
4310     // use t0 as carry
         // Adds the two-word value Rhi:Rlo into the three-word accumulator
         // tmp2:tmp1:tmp0 (tmp0 is the least significant word).  The carry
         // between words travels in t0, which is clobbered.  cad/cadc/adc
         // are presumably the MacroAssembler add-with-carry helpers --
         // confirm their exact carry semantics there.
4311     void acc(Register Rhi, Register Rlo,
4312              Register tmp0, Register tmp1, Register tmp2) {
4313       cad(tmp0, tmp0, Rlo, t0);
4314       cadc(tmp1, tmp1, Rhi, t0);
4315       adc(tmp2, tmp2, zr, t0);
4316     }
4317 
4318   public:
4319     /**
4320      * Fast Montgomery multiplication.  The derivation of the
4321      * algorithm is in A Cryptographic Library for the Motorola
4322      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
4323      *
4324      * Arguments:
4325      *
4326      * Inputs for multiplication:
4327      *   c_rarg0   - int array elements a
4328      *   c_rarg1   - int array elements b
4329      *   c_rarg2   - int array elements n (the modulus)
4330      *   c_rarg3   - int length
4331      *   c_rarg4   - int inv
4332      *   c_rarg5   - int array elements m (the result)
4333      *
4334      * Inputs for squaring:
4335      *   c_rarg0   - int array elements a
4336      *   c_rarg1   - int array elements n (the modulus)
4337      *   c_rarg2   - int length
4338      *   c_rarg3   - int inv
4339      *   c_rarg4   - int array elements m (the result)
4340      *
          * Returns the address of the generated entry point.  The "argh"
          * stop sequence is emitted ahead of the aligned entry point and is
          * only reached by the length check.  A zero length returns
          * immediately without building a frame.
          *
4341      */
4342     address generate_multiply() {
4343       Label argh, nothing;
4344       bind(argh);
4345       stop("MontgomeryMultiply total_allocation must be <= 8192");
4346 
4347       align(CodeEntryAlignment);
4348       address entry = pc();
4349 
4350       beqz(Rlen, nothing);
4351 
4352       enter();
4353 
4354       // Make room.
           // Lengths above 512 ints are rejected.  The scratch area is
           // Rlen * 4 * sizeof(jint) bytes below sp, with the new sp rounded
           // down to a multiple of 2 * wordSize.
4355       mv(Ra, 512);
4356       bgt(Rlen, Ra, argh);
4357       slli(Ra, Rlen, exact_log2(4 * sizeof(jint)));
4358       sub(Ra, sp, Ra);
4359       andi(sp, Ra, -2 * wordSize);
4360 
4361       srliw(Rlen, Rlen, 1);  // length in longwords = len/2
4362 
4363       {
4364         // Copy input args, reversing as we go.  We use Ra as a
4365         // temporary variable.
             // Ra is the destination cursor and is advanced by every call,
             // so the copies are laid out back to back; each base register
             // is left pointing at its own scratch copy.
4366         reverse(Ra, Pa_base, Rlen, Ri, Rj);
4367         if (!_squaring)
4368           reverse(Ra, Pb_base, Rlen, Ri, Rj);
4369         reverse(Ra, Pn_base, Rlen, Ri, Rj);
4370       }
4371 
4372       // Push all call-saved registers and also Pm_base which we'll need
4373       // at the end.
4374       save_regs();
4375 
4376 #ifndef PRODUCT
4377       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
4378       {
4379         ld(Rn, Address(Pn_base));
4380         mul(Rlo_mn, Rn, inv);
4381         mv(t0, -1);
4382         Label ok;
4383         beq(Rlo_mn, t0, ok);
4384         stop("broken inverse in Montgomery multiply");
4385         bind(ok);
4386       }
4387 #endif
4388 
           // The result is built in the scratch space that follows the
           // reversed inputs.
4389       mv(Pm_base, Ra);
4390 
           // Clear the three-word accumulator.
4391       mv(tmp0, zr);
4392       mv(tmp1, zr);
4393       mv(tmp2, zr);
4394 
4395       block_comment("for (int i = 0; i < len; i++) {");
4396       mv(Ri, zr); {
4397         Label loop, end;
4398         bge(Ri, Rlen, end);
4399 
4400         bind(loop);
4401         pre1(Ri);
4402 
4403         block_comment("  for (j = i; j; j--) {"); {
4404           mv(Rj, Ri);
4405           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
4406         } block_comment("  } // j");
4407 
4408         post1();
4409         addiw(Ri, Ri, 1);
4410         blt(Ri, Rlen, loop);
4411         bind(end);
4412         block_comment("} // i");
4413       }
4414 
4415       block_comment("for (int i = len; i < 2*len; i++) {");
4416       mv(Ri, Rlen); {
4417         Label loop, end;
4418         slli(t0, Rlen, 1);
4419         bge(Ri, t0, end);
4420 
4421         bind(loop);
4422         pre2(Ri, Rlen);
4423 
4424         block_comment("  for (j = len*2-i-1; j; j--) {"); {
4425           slliw(Rj, Rlen, 1);
4426           subw(Rj, Rj, Ri);
4427           subiw(Rj, Rj, 1);
4428           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
4429         } block_comment("  } // j");
4430 
4431         post2(Ri, Rlen);
4432         addiw(Ri, Ri, 1);
             // t0 is recomputed as 2*len before the loop test rather than
             // being kept live across the loop body.
4433         slli(t0, Rlen, 1);
4434         blt(Ri, t0, loop);
4435         bind(end);
4436       }
4437       block_comment("} // i");
4438 
4439       normalize(Rlen);
4440 
4441       mv(Ra, Pm_base);  // Save Pm_base in Ra
4442       restore_regs();  // Restore caller's Pm_base
4443 
4444       // Copy our result into caller's Pm_base
4445       reverse(Pm_base, Ra, Rlen, Ri, Rj);
4446 
4447       leave();
4448       bind(nothing);
4449       ret();
4450 
4451       return entry;
4452     }
4453 
4454     /**
          * Montgomery squaring.  Follows the same overall structure as
          * generate_multiply(), but each outer iteration accumulates every
          * a*b product twice (step_squaring), handles the remaining term
          * for even i (last_squaring) and then finishes the m*n products
          * (extra_step_squaring).
4455      *
4456      * Arguments:
4457      *
4458      * Inputs:
4459      *   c_rarg0   - int array elements a
4460      *   c_rarg1   - int array elements n (the modulus)
4461      *   c_rarg2   - int length
4462      *   c_rarg3   - int inv
4463      *   c_rarg4   - int array elements m (the result)
4464      *
          * Returns the address of the generated entry point.
          *
          * NOTE(review): unlike generate_multiply(), there is no early exit
          * for a zero length before the frame is built -- confirm that
          * callers never pass length 0.
          *
4465      */
4466     address generate_square() {
4467       Label argh;
4468       bind(argh);
4469       stop("MontgomeryMultiply total_allocation must be <= 8192");
4470 
4471       align(CodeEntryAlignment);
4472       address entry = pc();
4473 
4474       enter();
4475 
4476       // Make room.
           // Lengths above 512 ints are rejected.  The scratch area is
           // Rlen * 4 * sizeof(jint) bytes below sp, with the new sp rounded
           // down to a multiple of 2 * wordSize.
4477       mv(Ra, 512);
4478       bgt(Rlen, Ra, argh);
4479       slli(Ra, Rlen, exact_log2(4 * sizeof(jint)));
4480       sub(Ra, sp, Ra);
4481       andi(sp, Ra, -2 * wordSize);
4482 
4483       srliw(Rlen, Rlen, 1);  // length in longwords = len/2
4484 
4485       {
4486         // Copy input args, reversing as we go.  We use Ra as a
4487         // temporary variable.
4488         reverse(Ra, Pa_base, Rlen, Ri, Rj);
4489         reverse(Ra, Pn_base, Rlen, Ri, Rj);
4490       }
4491 
4492       // Push all call-saved registers and also Pm_base which we'll need
4493       // at the end.
4494       save_regs();
4495 
           // The result is built in the scratch space that follows the
           // reversed inputs.
4496       mv(Pm_base, Ra);
4497 
           // Clear the three-word accumulator.
4498       mv(tmp0, zr);
4499       mv(tmp1, zr);
4500       mv(tmp2, zr);
4501 
4502       block_comment("for (int i = 0; i < len; i++) {");
4503       mv(Ri, zr); {
4504         Label loop, end;
4505         bind(loop);
4506         bge(Ri, Rlen, end);
4507 
4508         pre1(Ri);
4509 
4510         block_comment("for (j = (i+1)/2; j; j--) {"); {
4511           addi(Rj, Ri, 1);
4512           srliw(Rj, Rj, 1);
4513           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
4514         } block_comment("  } // j");
4515 
4516         last_squaring(Ri);
4517 
4518         block_comment("  for (j = i/2; j; j--) {"); {
4519           srliw(Rj, Ri, 1);
4520           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
4521         } block_comment("  } // j");
4522 
4523         post1_squaring();
4524         addi(Ri, Ri, 1);
4525         blt(Ri, Rlen, loop);
4526 
4527         bind(end);
4528         block_comment("} // i");
4529       }
4530 
4531       block_comment("for (int i = len; i < 2*len; i++) {");
4532       mv(Ri, Rlen); {
4533         Label loop, end;
4534         bind(loop);
4535         slli(t0, Rlen, 1);
4536         bge(Ri, t0, end);
4537 
4538         pre2(Ri, Rlen);
4539 
4540         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
4541           slli(Rj, Rlen, 1);
4542           sub(Rj, Rj, Ri);
4543           subi(Rj, Rj, 1);
4544           srliw(Rj, Rj, 1);
4545           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
4546         } block_comment("  } // j");
4547 
4548         last_squaring(Ri);
4549 
4550         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
4551           slli(Rj, Rlen, 1);
4552           sub(Rj, Rj, Ri);
4553           srliw(Rj, Rj, 1);
4554           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
4555         } block_comment("  } // j");
4556 
4557         post2(Ri, Rlen);
4558         addi(Ri, Ri, 1);
4559         slli(t0, Rlen, 1);
4560         blt(Ri, t0, loop);
4561 
4562         bind(end);
4563         block_comment("} // i");
4564       }
4565 
4566       normalize(Rlen);
4567 
4568       mv(Ra, Pm_base);  // Save Pm_base in Ra
4569       restore_regs();  // Restore caller's Pm_base
4570 
4571       // Copy our result into caller's Pm_base
4572       reverse(Pm_base, Ra, Rlen, Ri, Rj);
4573 
4574       leave();
4575       ret();
4576 
4577       return entry;
4578     }
4579   };
4580 
4581 #endif // COMPILER2
4582 
4583   address generate_cont_thaw(Continuation::thaw_kind kind) {
         // Emits the body of a continuation thaw stub for the given kind and
         // returns the address of its first instruction.  No StubCodeMark is
         // created here; the wrappers in this file create it before calling.
         //
         // Outline of the emitted code: for return barriers, reload sp from
         // the thread's cont_entry; call Continuation::prepare_thaw to get
         // the size of the frames to thaw (0 jumps to the StackOverflowError
         // entry); reserve that much stack; call the thaw entry; then
         // continue in the thawed frame, either by returning into it or, for
         // the exception kind, by jumping to its exception handler.
4584     bool return_barrier = Continuation::is_thaw_return_barrier(kind);
4585     bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);
4586 
4587     address start = __ pc();
4588 
4589     if (return_barrier) {
4590       __ ld(sp, Address(xthread, JavaThread::cont_entry_offset()));
4591     }
4592 
4593 #ifndef PRODUCT
4594     {
4595       Label OK;
4596       __ ld(t0, Address(xthread, JavaThread::cont_entry_offset()));
4597       __ beq(sp, t0, OK);
4598       __ stop("incorrect sp");
4599       __ bind(OK);
4600     }
4601 #endif
4602 
4603     if (return_barrier) {
4604       // preserve possible return value from a method returning to the return barrier
4605       __ subi(sp, sp, 2 * wordSize);
4606       __ fsd(f10, Address(sp, 0 * wordSize));
4607       __ sd(x10, Address(sp, 1 * wordSize));
4608     }
4609 
4610     __ mv(c_rarg1, (return_barrier ? 1 : 0));
4611     __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), xthread, c_rarg1);
4612     __ mv(t1, x10); // x10 contains the size of the frames to thaw, 0 if overflow or no more frames
4613 
4614     if (return_barrier) {
4615       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
4616       __ ld(x10, Address(sp, 1 * wordSize));
4617       __ fld(f10, Address(sp, 0 * wordSize));
4618       __ addi(sp, sp, 2 * wordSize);
4619     }
4620 
4621 #ifndef PRODUCT
4622     {
4623       Label OK;
4624       __ ld(t0, Address(xthread, JavaThread::cont_entry_offset()));
4625       __ beq(sp, t0, OK);
4626       __ stop("incorrect sp");
4627       __ bind(OK);
4628     }
4629 #endif
4630 
4631     Label thaw_success;
4632     // t1 contains the size of the frames to thaw, 0 if overflow or no more frames
4633     __ bnez(t1, thaw_success);
4634     __ j(RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry()));
4635     __ bind(thaw_success);
4636 
4637     // make room for the thawed frames
4638     __ sub(t0, sp, t1);
4639     __ andi(sp, t0, -16); // align
4640 
4641     if (return_barrier) {
4642       // save original return value -- again
4643       __ subi(sp, sp, 2 * wordSize);
4644       __ fsd(f10, Address(sp, 0 * wordSize));
4645       __ sd(x10, Address(sp, 1 * wordSize));
4646     }
4647 
4648     // If we want, we can templatize thaw by kind, and have three different entries
4649     __ mv(c_rarg1, kind);
4650 
4651     __ call_VM_leaf(Continuation::thaw_entry(), xthread, c_rarg1);
4652     __ mv(t1, x10); // x10 is the sp of the yielding frame
4653 
4654     if (return_barrier) {
4655       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
4656       __ ld(x10, Address(sp, 1 * wordSize));
4657       __ fld(f10, Address(sp, 0 * wordSize));
4658       __ addi(sp, sp, 2 * wordSize);
4659     } else {
4660       __ mv(x10, zr); // return 0 (success) from doYield
4661     }
4662 
4663     // we're now on the yield frame (which is in an address above us b/c sp has been pushed down)
4664     __ mv(fp, t1);
4665     __ subi(sp, t1, 2 * wordSize); // now pointing to fp spill
4666 
4667     if (return_barrier_exception) {
4668       __ ld(c_rarg1, Address(fp, -1 * wordSize)); // return address
4669       __ verify_oop(x10);
4670       __ mv(x9, x10); // save return value containing the exception oop in callee-saved x9
4671 
4672       __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), xthread, c_rarg1);
4673 
4674       // see OptoRuntime::generate_exception_blob: x10 -- exception oop, x13 -- exception pc
4675 
4676       __ mv(x11, x10); // the exception handler
4677       __ mv(x10, x9); // restore return value containing the exception oop
4678       __ verify_oop(x10);
4679 
4680       __ leave();
4681       __ mv(x13, ra);
4682       __ jr(x11); // the exception handler
4683     } else {
4684       // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
4685       __ leave();
4686       __ ret();
4687     }
4688 
4689     return start;
4690   }
4691 
4692   address generate_cont_thaw() {
         // Generates the stubgen_cont_thaw_id stub using the thaw_top kind.
         // Returns the stub's start address, or nullptr when continuations
         // are not enabled.
4693     if (!Continuations::enabled()) return nullptr;
4694 
4695     StubId stub_id = StubId::stubgen_cont_thaw_id;
4696     StubCodeMark mark(this, stub_id);
4697     address start = __ pc();
4698     generate_cont_thaw(Continuation::thaw_top);
4699     return start;
4700   }
4701 
4702   address generate_cont_returnBarrier() {
         // Generates the stubgen_cont_returnBarrier_id stub using the
         // thaw_return_barrier kind.  Returns the stub's start address, or
         // nullptr when continuations are not enabled.
4703     if (!Continuations::enabled()) return nullptr;
4704 
4705     // TODO: will probably need multiple return barriers depending on return type
4706     StubId stub_id = StubId::stubgen_cont_returnBarrier_id;
4707     StubCodeMark mark(this, stub_id);
4708     address start = __ pc();
4709 
4710     generate_cont_thaw(Continuation::thaw_return_barrier);
4711 
4712     return start;
4713   }
4714 
4715   address generate_cont_returnBarrier_exception() {
         // Generates the stubgen_cont_returnBarrierExc_id stub using the
         // thaw_return_barrier_exception kind.  Returns the stub's start
         // address, or nullptr when continuations are not enabled.
4716     if (!Continuations::enabled()) return nullptr;
4717 
4718     StubId stub_id = StubId::stubgen_cont_returnBarrierExc_id;
4719     StubCodeMark mark(this, stub_id);
4720     address start = __ pc();
4721 
4722     generate_cont_thaw(Continuation::thaw_return_barrier_exception);
4723 
4724     return start;
4725   }
4726 
4727   address generate_cont_preempt_stub() {
         // Generates the stubgen_cont_preempt_id stub.  Returns its start
         // address, or nullptr when continuations are not enabled.
         //
         // The stub resets the last Java frame, reloads sp from the thread's
         // cont_entry and then takes one of two paths depending on the
         // thread's preemption_cancelled byte: if it is zero, the
         // enterSpecial frame is cleaned up and the stub returns; otherwise
         // the byte is cleared and control jumps to the address loaded from
         // ContinuationEntry::thaw_call_pc_address().
4728     if (!Continuations::enabled()) return nullptr;
4729     StubId stub_id = StubId::stubgen_cont_preempt_id;
4730     StubCodeMark mark(this, stub_id);
4731     address start = __ pc();
4732 
4733     __ reset_last_Java_frame(true);
4734 
4735     // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap.
4736     __ ld(sp, Address(xthread, JavaThread::cont_entry_offset()));
4737 
4738     Label preemption_cancelled;
4739     __ lbu(t0, Address(xthread, JavaThread::preemption_cancelled_offset()));
4740     __ bnez(t0, preemption_cancelled);
4741 
4742     // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount.
4743     SharedRuntime::continuation_enter_cleanup(_masm);
4744     __ leave();
4745     __ ret();
4746 
4747     // We acquired the monitor after freezing the frames so call thaw to continue execution.
4748     __ bind(preemption_cancelled);
         // Clear the flag, then set fp to sp + ContinuationEntry::size() +
         // 2 * wordSize before jumping to the thaw call pc.
4749     __ sb(zr, Address(xthread, JavaThread::preemption_cancelled_offset()));
4750     __ la(fp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size() + 2 * wordSize)));
4751     __ la(t1, ExternalAddress(ContinuationEntry::thaw_call_pc_address()));
4752     __ ld(t1, Address(t1));
4753     __ jr(t1);
4754 
4755     return start;
4756   }
4757 
4758 #ifdef COMPILER2
4759 
4760 #undef __
4761 #define __ this->
4762 
4763   class Sha2Generator : public MacroAssembler {
4764     StubCodeGenerator* _cgen;
4765    public:
           // Builds an assembler over masm's code buffer (masm->code()) and
           // remembers cgen, which is later handed to StubCodeMark when a
           // stub is generated.
4766       Sha2Generator(MacroAssembler* masm, StubCodeGenerator* cgen) : MacroAssembler(masm->code()), _cgen(cgen) {}
4767       address generate_sha256_implCompress(StubId stub_id) {
             // SHA-256 entry: generates the stub with 32-bit elements (e32).
             // stub_id selects the single-block or multi-block variant.
4768         return generate_sha2_implCompress(Assembler::e32, stub_id);
4769       }
4770       address generate_sha512_implCompress(StubId stub_id) {
             // SHA-512 entry: generates the stub with 64-bit elements (e64).
             // stub_id selects the single-block or multi-block variant.
4771         return generate_sha2_implCompress(Assembler::e64, stub_id);
4772       }
4773    private:
4774 
4775     void vleXX_v(Assembler::SEW vset_sew, VectorRegister vr, Register sr) {
           // Emits a vector load into vr from the address in sr, using
           // vle32_v when vset_sew is e32 and vle64_v for any other value.
4776       if (vset_sew == Assembler::e32) __ vle32_v(vr, sr);
4777       else                            __ vle64_v(vr, sr);
4778     }
4779 
4780     void vseXX_v(Assembler::SEW vset_sew, VectorRegister vr, Register sr) {
           // Emits a vector store of vr to the address in sr, using
           // vse32_v when vset_sew is e32 and vse64_v for any other value.
4781       if (vset_sew == Assembler::e32) __ vse32_v(vr, sr);
4782       else                            __ vse64_v(vr, sr);
4783     }
4784 
4785     // Overview of the logic in each "quad round".
4786     //
4787     // The code below repeats 16/20 times the logic implementing four rounds
4788     // of the SHA-256/512 core loop as documented by NIST. 16/20 "quad rounds"
4789     // implement the 64/80 single rounds.
4790     //
4791     //    // Load four word (u32/64) constants (K[t+3], K[t+2], K[t+1], K[t+0])
4792     //    // Output:
4793     //    //   vTmp1 = {K[t+3], K[t+2], K[t+1], K[t+0]}
4794     //    vl1reXX.v vTmp1, ofs
4795     //
4796     //    // Increment word constant address by stride (16/32 bytes, 4*4B/8B, 128b/256b)
4797     //    addi ofs, ofs, 16/32
4798     //
4799     //    // Add constants to message schedule words:
4800     //    //  Input
4801     //    //    vTmp1 = {K[t+3], K[t+2], K[t+1], K[t+0]}
4802     //    //    vW0 = {W[t+3], W[t+2], W[t+1], W[t+0]}; // Vt0 = W[3:0];
4803     //    //  Output
4804     //    //    vTmp0 = {W[t+3]+K[t+3], W[t+2]+K[t+2], W[t+1]+K[t+1], W[t+0]+K[t+0]}
4805     //    vadd.vv vTmp0, vTmp1, vW0
4806     //
4807     //    //  2 rounds of working variables updates.
4808     //    //     vState1[t+4] <- vState1[t], vState0[t], vTmp0[t]
4809     //    //  Input:
4810     //    //    vState1 = {c[t],d[t],g[t],h[t]}   " = vState1[t] "
4811     //    //    vState0 = {a[t],b[t],e[t],f[t]}
4812     //    //    vTmp0 = {W[t+3]+K[t+3], W[t+2]+K[t+2], W[t+1]+K[t+1], W[t+0]+K[t+0]}
4813     //    //  Output:
4814     //    //    vState1 = {f[t+2],e[t+2],b[t+2],a[t+2]}  " = vState0[t+2] "
4815     //    //        = {h[t+4],g[t+4],d[t+4],c[t+4]}  " = vState1[t+4] "
4816     //    vsha2cl.vv vState1, vState0, vTmp0
4817     //
4818     //    //  2 rounds of working variables updates.
4819     //    //     vState0[t+4] <- vState0[t], vState0[t+2], vTmp0[t]
4820     //    //  Input
4821     //    //   vState0 = {a[t],b[t],e[t],f[t]}       " = vState0[t] "
4822     //    //       = {h[t+2],g[t+2],d[t+2],c[t+2]}   " = vState1[t+2] "
4823     //    //   vState1 = {f[t+2],e[t+2],b[t+2],a[t+2]}   " = vState0[t+2] "
4824     //    //   vTmp0 = {W[t+3]+K[t+3], W[t+2]+K[t+2], W[t+1]+K[t+1], W[t+0]+K[t+0]}
4825     //    //  Output:
4826     //    //   vState0 = {f[t+4],e[t+4],b[t+4],a[t+4]}   " = vState0[t+4] "
4827     //    vsha2ch.vv vState0, vState1, vTmp0
4828     //
4829     //    // Combine 2QW into 1QW
4830     //    //
4831     //    // To generate the next 4 words, "new_vW0"/"vTmp0" from vW0-vW3, vsha2ms needs
4832     //    //     vW0[0..3], vW1[0], vW2[1..3], vW3[0, 2..3]
4833     //    // and it can only take 3 vectors as inputs. Hence we need to combine
4834     //    // vW1[0] and vW2[1..3] in a single vector.
4835     //    //
4836     //    // vmerge Vt4, Vt1, Vt2, V0
4837     //    // Input
4838     //    //  V0 = mask // first word from vW1, words 1..3 from vW2
4839     //    //  vW2 = {Wt-8, Wt-7, Wt-6, Wt-5}
4840     //    //  vW1 = {Wt-12, Wt-11, Wt-10, Wt-9}
4841     //    // Output
4842     //    //  Vt4 = {Wt-12, Wt-7, Wt-6, Wt-5}
4843     //    vmerge.vvm vTmp0, vW2, vW1, v0
4844     //
4845     //    // Generate next Four Message Schedule Words (hence allowing for 4 more rounds)
4846     //    // Input
4847     //    //  vW0 = {W[t+ 3], W[t+ 2], W[t+ 1], W[t+ 0]}     W[ 3: 0]
4848     //    //  vW3 = {W[t+15], W[t+14], W[t+13], W[t+12]}     W[15:12]
4849     //    //  vTmp0 = {W[t+11], W[t+10], W[t+ 9], W[t+ 4]}     W[11: 9,4]
4850     //    // Output (next four message schedule words)
4851     //    //  vW0 = {W[t+19],  W[t+18],  W[t+17],  W[t+16]}  W[19:16]
4852     //    vsha2ms.vv vW0, vTmp0, vW3
4853     //
4854     // BEFORE
4855     //  vW0 - vW3 hold the message schedule words (initially the block words)
4856     //    vW0 = W[ 3: 0]   "oldest"
4857     //    vW1 = W[ 7: 4]
4858     //    vW2 = W[11: 8]
4859     //    vW3 = W[15:12]   "newest"
4860     //
4861     //  vState0 - vState1 hold the working state variables
4862     //    vState0 = {a[t],b[t],e[t],f[t]}   // initially {H5,H4,H1,H0}
4863     //    vState1 = {c[t],d[t],g[t],h[t]}   // initially {H7,H6,H3,H2}
4864     //
4865     // AFTER
4866     //  vW0 - vW3 hold the message schedule words (initially the block words)
4867     //    vW1 = W[ 7: 4]   "oldest"
4868     //    vW2 = W[11: 8]
4869     //    vW3 = W[15:12]
4870     //    vW0 = W[19:16]   "newest"
4871     //
4872     //  vState0 and vState1 hold the working state variables
4873     //    vState0 = {a[t+4],b[t+4],e[t+4],f[t+4]}
4874     //    vState1 = {c[t+4],d[t+4],g[t+4],h[t+4]}
4875     //
4876     //  The group of vectors vW0,vW1,vW2,vW3 is "rotated" by one in each quad-round,
4877     //  hence the uses of those vectors rotate in each round, and we get back to the
4878     //  initial configuration every 4 quad-rounds. We could avoid those changes at
4879     //  the cost of moving those vectors at the end of each quad-rounds.
         //
         // Parameters:
         //   vset_sew       - element width; selects the 32- or 64-bit constant
         //                    load and the constant stride (16 or 32 bytes)
         //   rot1..rot4     - vW0..vW3 in their current rotation; rot1 is
         //                    overwritten with the new schedule words when
         //                    gen_words is true
         //   scalarconst    - register holding the address of the round
         //                    constants; advanced by the stride when
         //                    step_const is true
         //   vtemp, vtemp2  - vector temporaries (vTmp1 / vTmp0 above)
         //   v_abef, v_cdgh - working state (vState0 / vState1 above)
         //   gen_words      - when false, the vmerge/vsha2ms pair is skipped
         //   step_const     - when false, scalarconst is left unchanged
         //
         // The vmerge is issued without an explicit mask operand; presumably
         // it uses the mask already set up in v0 by the caller -- confirm.
4880     void sha2_quad_round(Assembler::SEW vset_sew, VectorRegister rot1, VectorRegister rot2, VectorRegister rot3, VectorRegister rot4,
4881                          Register scalarconst, VectorRegister vtemp, VectorRegister vtemp2, VectorRegister v_abef, VectorRegister v_cdgh,
4882                          bool gen_words = true, bool step_const = true) {
4883       __ vleXX_v(vset_sew, vtemp, scalarconst);
4884       if (step_const) {
4885         __ addi(scalarconst, scalarconst, vset_sew == Assembler::e32 ? 16 : 32);
4886       }
4887       __ vadd_vv(vtemp2, vtemp, rot1);
4888       __ vsha2cl_vv(v_cdgh, v_abef, vtemp2);
4889       __ vsha2ch_vv(v_abef, v_cdgh, vtemp2);
4890       if (gen_words) {
4891         __ vmerge_vvm(vtemp2, rot3, rot2);
4892         __ vsha2ms_vv(rot1, vtemp2, rot4);
4893       }
4894     }
4895 
4896     // Arguments:
4897     //
4898     // Inputs:
4899     //   c_rarg0   - byte[]  source+offset
4900     //   c_rarg1   - int[]   SHA.state
4901     //   c_rarg2   - int     offset
4902     //   c_rarg3   - int     limit
4903     //
4904     address generate_sha2_implCompress(Assembler::SEW vset_sew, StubId stub_id) {
4905       alignas(64) static const uint32_t round_consts_256[64] = {
4906         0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
4907         0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
4908         0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
4909         0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
4910         0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
4911         0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
4912         0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
4913         0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
4914         0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
4915         0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
4916         0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
4917         0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
4918         0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
4919         0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
4920         0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
4921         0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
4922       };
4923       alignas(64) static const uint64_t round_consts_512[80] = {
4924         0x428a2f98d728ae22l, 0x7137449123ef65cdl, 0xb5c0fbcfec4d3b2fl,
4925         0xe9b5dba58189dbbcl, 0x3956c25bf348b538l, 0x59f111f1b605d019l,
4926         0x923f82a4af194f9bl, 0xab1c5ed5da6d8118l, 0xd807aa98a3030242l,
4927         0x12835b0145706fbel, 0x243185be4ee4b28cl, 0x550c7dc3d5ffb4e2l,
4928         0x72be5d74f27b896fl, 0x80deb1fe3b1696b1l, 0x9bdc06a725c71235l,
4929         0xc19bf174cf692694l, 0xe49b69c19ef14ad2l, 0xefbe4786384f25e3l,
4930         0x0fc19dc68b8cd5b5l, 0x240ca1cc77ac9c65l, 0x2de92c6f592b0275l,
4931         0x4a7484aa6ea6e483l, 0x5cb0a9dcbd41fbd4l, 0x76f988da831153b5l,
4932         0x983e5152ee66dfabl, 0xa831c66d2db43210l, 0xb00327c898fb213fl,
4933         0xbf597fc7beef0ee4l, 0xc6e00bf33da88fc2l, 0xd5a79147930aa725l,
4934         0x06ca6351e003826fl, 0x142929670a0e6e70l, 0x27b70a8546d22ffcl,
4935         0x2e1b21385c26c926l, 0x4d2c6dfc5ac42aedl, 0x53380d139d95b3dfl,
4936         0x650a73548baf63del, 0x766a0abb3c77b2a8l, 0x81c2c92e47edaee6l,
4937         0x92722c851482353bl, 0xa2bfe8a14cf10364l, 0xa81a664bbc423001l,
4938         0xc24b8b70d0f89791l, 0xc76c51a30654be30l, 0xd192e819d6ef5218l,
4939         0xd69906245565a910l, 0xf40e35855771202al, 0x106aa07032bbd1b8l,
4940         0x19a4c116b8d2d0c8l, 0x1e376c085141ab53l, 0x2748774cdf8eeb99l,
4941         0x34b0bcb5e19b48a8l, 0x391c0cb3c5c95a63l, 0x4ed8aa4ae3418acbl,
4942         0x5b9cca4f7763e373l, 0x682e6ff3d6b2b8a3l, 0x748f82ee5defb2fcl,
4943         0x78a5636f43172f60l, 0x84c87814a1f0ab72l, 0x8cc702081a6439ecl,
4944         0x90befffa23631e28l, 0xa4506cebde82bde9l, 0xbef9a3f7b2c67915l,
4945         0xc67178f2e372532bl, 0xca273eceea26619cl, 0xd186b8c721c0c207l,
4946         0xeada7dd6cde0eb1el, 0xf57d4f7fee6ed178l, 0x06f067aa72176fbal,
4947         0x0a637dc5a2c898a6l, 0x113f9804bef90dael, 0x1b710b35131c471bl,
4948         0x28db77f523047d84l, 0x32caab7b40c72493l, 0x3c9ebe0a15c9bebcl,
4949         0x431d67c49c100d4cl, 0x4cc5d4becb3e42b6l, 0x597f299cfc657e2al,
4950         0x5fcb6fab3ad6faecl, 0x6c44198c4a475817l
4951       };
4952       const int const_add = vset_sew == Assembler::e32 ? 16 : 32;
4953 
4954       bool multi_block;
4955       switch (stub_id) {
4956       case StubId::stubgen_sha256_implCompress_id:
4957         assert (vset_sew == Assembler::e32, "wrong macroassembler for stub");
4958         multi_block = false;
4959         break;
4960       case StubId::stubgen_sha256_implCompressMB_id:
4961         assert (vset_sew == Assembler::e32, "wrong macroassembler for stub");
4962         multi_block = true;
4963         break;
4964       case StubId::stubgen_sha512_implCompress_id:
4965         assert (vset_sew == Assembler::e64, "wrong macroassembler for stub");
4966         multi_block = false;
4967         break;
4968       case StubId::stubgen_sha512_implCompressMB_id:
4969         assert (vset_sew == Assembler::e64, "wrong macroassembler for stub");
4970         multi_block = true;
4971         break;
4972       default:
4973         ShouldNotReachHere();
4974       };
4975       __ align(CodeEntryAlignment);
4976       StubCodeMark mark(_cgen, stub_id);
4977       address start = __ pc();
4978 
4979       Register buf   = c_rarg0;
4980       Register state = c_rarg1;
4981       Register ofs   = c_rarg2;
4982       Register limit = c_rarg3;
4983       Register consts =  t2; // caller saved
4984       Register state_c = x28; // caller saved
4985       VectorRegister vindex = v2;
4986       VectorRegister vW0 = v4;
4987       VectorRegister vW1 = v6;
4988       VectorRegister vW2 = v8;
4989       VectorRegister vW3 = v10;
4990       VectorRegister vState0 = v12;
4991       VectorRegister vState1 = v14;
4992       VectorRegister vHash0  = v16;
4993       VectorRegister vHash1  = v18;
4994       VectorRegister vTmp0   = v20;
4995       VectorRegister vTmp1   = v22;
4996 
4997       Label multi_block_loop;
4998 
4999       __ enter();
5000 
5001       address constant_table = vset_sew == Assembler::e32 ? (address)round_consts_256 : (address)round_consts_512;
5002       la(consts, ExternalAddress(constant_table));
5003 
5004       // Register use in this function:
5005       //
5006       // VECTORS
5007       //  vW0 - vW3 (512/1024-bits / 4*128/256 bits / 4*4*32/64 bits), hold the message
5008       //             schedule words (Wt). They start with the message block
5009       //             content (W0 to W15), then further words in the message
5010       //             schedule generated via vsha2ms from previous Wt.
5011       //   Initially:
5012       //     vW0 = W[  3:0] = { W3,  W2,  W1,  W0}
5013       //     vW1 = W[  7:4] = { W7,  W6,  W5,  W4}
5014       //     vW2 = W[ 11:8] = {W11, W10,  W9,  W8}
5015       //     vW3 = W[15:12] = {W15, W14, W13, W12}
5016       //
5017       //  vState0 - vState1 hold the working state variables (a, b, ..., h)
5018       //    vState0 = {f[t],e[t],b[t],a[t]}
5019       //    vState1 = {h[t],g[t],d[t],c[t]}
5020       //   Initially:
5021       //    vState0 = {H5i-1, H4i-1, H1i-1 , H0i-1}
5022       //    vState1 = {H7i-1, H6i-1, H3i-1 , H2i-1}
5023       //
5024       //  v0 = masks for vrgather/vmerge. Single value during the 16 rounds.
5025       //
5026       //  vTmp0 = temporary, Wt+Kt
5027       //  vTmp1 = temporary, Kt
5028       //
5029       //  vHash0/vHash1 = hold the initial values of the hash, byte-swapped.
5030       //
5031       // During most of the function the vector state is configured so that each
5032       // vector is interpreted as containing four 32/64 bits (e32/e64) elements (128/256 bits).
5033 
5034       // vsha2ch/vsha2cl uses EGW of 4*SEW.
5035       // SHA256 SEW = e32, EGW = 128-bits
5036       // SHA512 SEW = e64, EGW = 256-bits
5037       //
5038       // VLEN is required to be at least 128.
5039       // For the case of VLEN=128 and SHA512 we need LMUL=2 to work with 4*e64 (EGW = 256)
5040       //
5041       // m1: LMUL=1/2
5042       // ta: tail agnostic (don't care about those lanes)
5043       // ma: mask agnostic (don't care about those lanes)
5044       // x0 is not written, as we already know the number of vector elements.
5045 
5046       if (vset_sew == Assembler::e64 && MaxVectorSize == 16) { // SHA512 and VLEN = 128
5047         __ vsetivli(x0, 4, vset_sew, Assembler::m2, Assembler::ma, Assembler::ta);
5048       } else {
5049         __ vsetivli(x0, 4, vset_sew, Assembler::m1, Assembler::ma, Assembler::ta);
5050       }
5051 
5052       int64_t indexes = vset_sew == Assembler::e32 ? 0x00041014ul : 0x00082028ul;
5053       __ li(t0, indexes);
5054       __ vmv_v_x(vindex, t0);
5055 
5056       // Step-over a,b, so we are pointing to c.
5057       // const_add is equal to 4x state variable, div by 2 is thus 2, a,b
5058       __ addi(state_c, state, const_add/2);
5059 
5060       // Use index-load to get {f,e,b,a},{h,g,d,c}
5061       __ vluxei8_v(vState0, state, vindex);
5062       __ vluxei8_v(vState1, state_c, vindex);
5063 
5064       __ bind(multi_block_loop);
5065 
5066       // Capture the initial H values in vHash0 and vHash1 to allow for computing
5067       // the resulting H', since H' = H+{a',b',c',...,h'}.
5068       __ vmv_v_v(vHash0, vState0);
5069       __ vmv_v_v(vHash1, vState1);
5070 
5071       // Load the 512/1024-bits of the message block in vW0-vW3 and perform
5072       // an endian swap on each 4/8 bytes element.
5073       //
5074       // If Zvkb is not implemented one can use vrgather
5075       // with an index sequence to byte-swap.
5076       //  sequence = [3 2 1 0   7 6 5 4  11 10 9 8   15 14 13 12]
5077       //   <https://oeis.org/A004444> gives us "N ^ 3" as a nice formula to generate
5078       //  this sequence. 'vid' gives us the N.
5079       __ vleXX_v(vset_sew, vW0, buf);
5080       __ vrev8_v(vW0, vW0);
5081       __ addi(buf, buf, const_add);
5082       __ vleXX_v(vset_sew, vW1, buf);
5083       __ vrev8_v(vW1, vW1);
5084       __ addi(buf, buf, const_add);
5085       __ vleXX_v(vset_sew, vW2, buf);
5086       __ vrev8_v(vW2, vW2);
5087       __ addi(buf, buf, const_add);
5088       __ vleXX_v(vset_sew, vW3, buf);
5089       __ vrev8_v(vW3, vW3);
5090       __ addi(buf, buf, const_add);
5091 
5092       // Set v0 up for the vmerge that replaces the first word (idx==0)
5093       __ vid_v(v0);
5094       __ vmseq_vi(v0, v0, 0x0);  // v0.mask[i] = (i == 0 ? 1 : 0)
5095 
5096       VectorRegister rotation_regs[] = {vW0, vW1, vW2, vW3};
5097       int rot_pos = 0;
5098       // Quad-round #0 (+0, vW0->vW1->vW2->vW3) ... #11 (+3, vW3->vW0->vW1->vW2)
5099       const int qr_end = vset_sew == Assembler::e32 ? 12 : 16;
5100       for (int i = 0; i < qr_end; i++) {
5101         sha2_quad_round(vset_sew,
5102                    rotation_regs[(rot_pos + 0) & 0x3],
5103                    rotation_regs[(rot_pos + 1) & 0x3],
5104                    rotation_regs[(rot_pos + 2) & 0x3],
5105                    rotation_regs[(rot_pos + 3) & 0x3],
5106                    consts,
5107                    vTmp1, vTmp0, vState0, vState1);
5108         ++rot_pos;
5109       }
5110       // Quad-round #12 (+0, vW0->vW1->vW2->vW3) ... #15 (+3, vW3->vW0->vW1->vW2)
5111       // Note that we stop generating new message schedule words (Wt, vW0-13)
5112       // as we already generated all the words we end up consuming (i.e., W[63:60]).
5113       const int qr_c_end = qr_end + 4;
5114       for (int i = qr_end; i < qr_c_end; i++) {
5115         sha2_quad_round(vset_sew,
5116                    rotation_regs[(rot_pos + 0) & 0x3],
5117                    rotation_regs[(rot_pos + 1) & 0x3],
5118                    rotation_regs[(rot_pos + 2) & 0x3],
5119                    rotation_regs[(rot_pos + 3) & 0x3],
5120                    consts,
5121                    vTmp1, vTmp0, vState0, vState1, false, i < (qr_c_end-1));
5122         ++rot_pos;
5123       }
5124 
5125       //--------------------------------------------------------------------------------
5126       // Compute the updated hash value H'
5127       //   H' = H + {h',g',...,b',a'}
5128       //      = {h,g,...,b,a} + {h',g',...,b',a'}
5129       //      = {h+h',g+g',...,b+b',a+a'}
5130 
5131       // H' = H+{a',b',c',...,h'}
5132       __ vadd_vv(vState0, vHash0, vState0);
5133       __ vadd_vv(vState1, vHash1, vState1);
5134 
5135       if (multi_block) {
5136         int total_adds = vset_sew == Assembler::e32 ? 240 : 608;
5137         __ subi(consts, consts, total_adds);
5138         __ addi(ofs, ofs, vset_sew == Assembler::e32 ? 64 : 128);
5139         __ ble(ofs, limit, multi_block_loop);
5140         __ mv(c_rarg0, ofs); // return ofs
5141       }
5142 
5143       // Store H[0..8] = {a,b,c,d,e,f,g,h} from
5144       //  vState0 = {f,e,b,a}
5145       //  vState1 = {h,g,d,c}
5146       __ vsuxei8_v(vState0, state,   vindex);
5147       __ vsuxei8_v(vState1, state_c, vindex);
5148 
5149       __ leave();
5150       __ ret();
5151 
5152       return start;
5153     }
5154   };
5155 
5156 #undef __ // drop whatever `__` expanded to in the code above
5157 #define __ _masm-> // from here on `__ insn(...)` expands to `_masm->insn(...)`
5158 
5159   // Set of L registers that mirror a contiguous memory area of 8 * L bytes.
5160   // Each 64-bit register holds two 32-bit integers: an even index in its low half, the following odd index in its high half.
5161   template <uint L>
5162   class RegCache {
5163   private:
5164     MacroAssembler *_masm; // assembler the `__` shorthand emits through
5165     Register _regs[L];     // the cache registers, in RegSet iteration order
5166 
5167   public:
5168     RegCache(MacroAssembler *masm, RegSet rs): _masm(masm) { // rs must contain exactly L registers
5169       assert(rs.size() == L, "%u registers are used to cache %u 4-byte data", rs.size(), 2 * L);
5170       auto it = rs.begin();
5171       for (auto &r: _regs) { // copy the registers out of the set one by one
5172         r = *it;
5173         ++it;
5174       }
5175     }
5176 
5177     // generate load for the i'th register: _regs[i] = 8 bytes at [base + 8 * i]
5178     void gen_load(uint i, Register base) {
5179       assert(i < L, "invalid i: %u", i);
5180       __ ld(_regs[i], Address(base, 8 * i));
5181     }
5182 
5183     // add i'th 32-bit integer to dest (32-bit add); rtmp is overwritten when i is odd
5184     void add_u32(const Register dest, uint i, const Register rtmp = t0) {
5185       assert(i < 2 * L, "invalid i: %u", i);
5186       // Two integers share one register, hence the i / 2 register index below.
5187       if (is_even(i)) {
5188         // Use the bottom 32 bits. No need to mask off the top 32 bits
5189         // as addw will do the right thing.
5190         __ addw(dest, dest, _regs[i / 2]);
5191       } else {
5192         // Use the top 32 bits by right-shifting them.
5193         __ srli(rtmp, _regs[i / 2], 32);
5194         __ addw(dest, dest, rtmp);
5195       }
5196     }
5197   };
5198 
5199   typedef RegCache<8> BufRegCache; // 8 registers * 8 bytes = 64 bytes, i.e. 16 32-bit words
5200 
5201   // Common tail of the four MD5 step helpers: a += value + x + ac;
5202   // a = Integer.rotateLeft(a, s) + b;
5203   void m5_FF_GG_HH_II_epilogue(BufRegCache& reg_cache, // x is the k'th 32-bit word held in reg_cache
5204                                Register a, Register b, Register c, Register d, // c and d are not used here
5205                                int k, int s, int t, // k: word index, s: rotate amount, t: additive constant (ac)
5206                                Register value) { // result of the step function computed by the caller
5207     // a += ac (t1 is handed to addw as an extra register - presumably scratch for the immediate)
5208     __ addw(a, a, t, t1);
5209 
5210     // a += x; (add_u32 uses its default temporary, t0, for odd k)
5211     reg_cache.add_u32(a, k);
5212     // a += value;
5213     __ addw(a, a, value);
5214 
5215     // a = Integer.rotateLeft(a, s) + b;
5216     __ rolw(a, a, s);
5217     __ addw(a, a, b);
5218   }
5219 
5220   // MD5 round 1 step: a += ((b & c) | ((~b) & d)) + x + ac;
5221   // a = Integer.rotateLeft(a, s) + b;
5222   void md5_FF(BufRegCache& reg_cache, // x is the k'th 32-bit word held in reg_cache
5223               Register a, Register b, Register c, Register d, // a is updated; b, c, d are only read
5224               int k, int s, int t, // k: word index, s: rotate amount, t: additive constant (ac)
5225               Register rtmp1, Register rtmp2) { // both temporaries are overwritten
5226     // rtmp1 = b & c
5227     __ andr(rtmp1, b, c);
5228 
5229     // rtmp2 = (~b) & d
5230     __ andn(rtmp2, d, b);
5231 
5232     // rtmp1 = (b & c) | ((~b) & d)
5233     __ orr(rtmp1, rtmp1, rtmp2);
5234     // a = rotateLeft(a + rtmp1 + x + ac, s) + b
5235     m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, rtmp1);
5236   }
5237 
5238   // MD5 round 2 step: a += ((b & d) | (c & (~d))) + x + ac;
5239   // a = Integer.rotateLeft(a, s) + b;
5240   void md5_GG(BufRegCache& reg_cache, // x is the k'th 32-bit word held in reg_cache
5241               Register a, Register b, Register c, Register d, // a is updated; b, c, d are only read
5242               int k, int s, int t, // k: word index, s: rotate amount, t: additive constant (ac)
5243               Register rtmp1, Register rtmp2) { // both temporaries are overwritten
5244     // rtmp1 = b & d
5245     __ andr(rtmp1, b, d);
5246 
5247     // rtmp2 = c & (~d)
5248     __ andn(rtmp2, c, d);
5249 
5250     // rtmp1 = (b & d) | (c & (~d))
5251     __ orr(rtmp1, rtmp1, rtmp2);
5252     // a = rotateLeft(a + rtmp1 + x + ac, s) + b
5253     m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, rtmp1);
5254   }
5255 
5256   // MD5 round 3 step: a += ((b ^ c) ^ d) + x + ac;
5257   // a = Integer.rotateLeft(a, s) + b;
5258   void md5_HH(BufRegCache& reg_cache, // x is the k'th 32-bit word held in reg_cache
5259               Register a, Register b, Register c, Register d, // a is updated; b, c, d are only read
5260               int k, int s, int t, // k: word index, s: rotate amount, t: additive constant (ac)
5261               Register rtmp1, Register rtmp2) { // both temporaries are overwritten
5262     // rtmp2 = b ^ c, then rtmp1 = (b ^ c) ^ d
5263     __ xorr(rtmp2, b, c);
5264     __ xorr(rtmp1, rtmp2, d);
5265     // a = rotateLeft(a + rtmp1 + x + ac, s) + b
5266     m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, rtmp1);
5267   }
5268 
5269   // MD5 round 4 step: a += (c ^ (b | (~d))) + x + ac;
5270   // a = Integer.rotateLeft(a, s) + b;
5271   void md5_II(BufRegCache& reg_cache, // x is the k'th 32-bit word held in reg_cache
5272               Register a, Register b, Register c, Register d, // a is updated; b, c, d are only read
5273               int k, int s, int t, // k: word index, s: rotate amount, t: additive constant (ac)
5274               Register rtmp1, Register rtmp2) { // both temporaries are overwritten
5275     // rtmp2 = b | (~d), then rtmp1 = c ^ (b | (~d))
5276     __ orn(rtmp2, b, d);
5277     __ xorr(rtmp1, c, rtmp2);
5278     // a = rotateLeft(a + rtmp1 + x + ac, s) + b
5279     m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, rtmp1);
5280   }
5281 
5282   // Generates the MD5 compression stub. stub_id selects the single-block or
5283   // the multi-block variant; the entry address of the generated code is returned.
5284   // Inputs:
5285   //   c_rarg0   - byte[]  source+offset
5286   //   c_rarg1   - int[]   state (four 32-bit words, read and written back as two 64-bit accesses)
5287   //   c_rarg2   - int     offset  (multi_block == True)
5288   //   c_rarg3   - int     limit   (multi_block == True)
5289   // Output:   c_rarg0   - updated offset  (multi_block == True only)
5290   // Registers:
5291   //    x0   zero  (zero)
5292   //    x1     ra  (return address)
5293   //    x2     sp  (stack pointer)
5294   //    x3     gp  (global pointer)
5295   //    x4     tp  (thread pointer)
5296   //    x5     t0  (tmp register)
5297   //    x6     t1  (tmp register)
5298   //    x7     t2  state0
5299   //    x8  fp/s0  (frame pointer)
5300   //    x9     s1
5301   //   x10     a0  rtmp1 / c_rarg0
5302   //   x11     a1  rtmp2 / c_rarg1
5303   //   x12     a2  a     / c_rarg2
5304   //   x13     a3  b     / c_rarg3
5305   //   x14     a4  c
5306   //   x15     a5  d
5307   //   x16     a6  buf
5308   //   x17     a7  state
5309   //   x18     s2  ofs     [saved-reg]  (multi_block == True)
5310   //   x19     s3  limit   [saved-reg]  (multi_block == True)
5311   //   x20     s4  state1  [saved-reg]
5312   //   x21     s5  state2  [saved-reg]
5313   //   x22     s6  state3  [saved-reg]
5314   //   x23     s7
5315   //   x24     s8  buf0    [saved-reg]
5316   //   x25     s9  buf1    [saved-reg]
5317   //   x26    s10  buf2    [saved-reg]
5318   //   x27    s11  buf3    [saved-reg]
5319   //   x28     t3  buf4
5320   //   x29     t4  buf5
5321   //   x30     t5  buf6
5322   //   x31     t6  buf7
5323   address generate_md5_implCompress(StubId stub_id) {
5324     __ align(CodeEntryAlignment);
5325     bool multi_block;
5326     switch (stub_id) { // only the two MD5 stub ids are valid here
5327     case StubId::stubgen_md5_implCompress_id:
5328       multi_block = false;
5329       break;
5330     case StubId::stubgen_md5_implCompressMB_id:
5331       multi_block = true;
5332       break;
5333     default:
5334       ShouldNotReachHere();
5335     };
5336     StubCodeMark mark(this, stub_id);
5337     address start = __ pc();
5338 
5339     // rotation constants (Sxy: round x, position y within each group of four steps)
5340     const int S11 = 7;
5341     const int S12 = 12;
5342     const int S13 = 17;
5343     const int S14 = 22;
5344     const int S21 = 5;
5345     const int S22 = 9;
5346     const int S23 = 14;
5347     const int S24 = 20;
5348     const int S31 = 4;
5349     const int S32 = 11;
5350     const int S33 = 16;
5351     const int S34 = 23;
5352     const int S41 = 6;
5353     const int S42 = 10;
5354     const int S43 = 15;
5355     const int S44 = 21;
5356 
5357     const int64_t mask32 = 0xffffffff; // keeps the low 32 bits when the state is packed for write-back
5358 
5359     Register buf_arg   = c_rarg0; // a0
5360     Register state_arg = c_rarg1; // a1
5361     Register ofs_arg   = c_rarg2; // a2
5362     Register limit_arg = c_rarg3; // a3
5363 
5364     // we'll copy the args to these registers to free up a0-a3
5365     // to use for other values manipulated by instructions
5366     // that can be compressed
5367     Register buf       = x16; // a6
5368     Register state     = x17; // a7
5369     Register ofs       = x18; // s2
5370     Register limit     = x19; // s3
5371 
5372     // using x12->15 to allow compressed instructions
5373     Register a         = x12; // a2
5374     Register b         = x13; // a3
5375     Register c         = x14; // a4
5376     Register d         = x15; // a5
5377 
5378     Register state0    =  x7; // t2
5379     Register state1    = x20; // s4
5380     Register state2    = x21; // s5
5381     Register state3    = x22; // s6
5382 
5383     // using x10->x11 to allow compressed instructions
5384     Register rtmp1     = x10; // a0
5385     Register rtmp2     = x11; // a1
5386     // The 64-byte input block is cached in 8 registers; 4 of them (s8-s11) must be saved first.
5387     RegSet reg_cache_saved_regs = RegSet::of(x24, x25, x26, x27); // s8, s9, s10, s11
5388     RegSet reg_cache_regs;
5389     reg_cache_regs += reg_cache_saved_regs;
5390     reg_cache_regs += RegSet::of(t3, t4, t5, t6);
5391     BufRegCache reg_cache(_masm, reg_cache_regs);
5392     // Registers pushed on entry and popped before returning; ofs/limit only in the multi-block variant.
5393     RegSet saved_regs;
5394     if (multi_block) {
5395       saved_regs += RegSet::of(ofs, limit);
5396     }
5397     saved_regs += RegSet::of(state1, state2, state3);
5398     saved_regs += reg_cache_saved_regs;
5399 
5400     __ push_reg(saved_regs, sp);
5401 
5402     __ mv(buf, buf_arg);
5403     __ mv(state, state_arg);
5404     if (multi_block) {
5405       __ mv(ofs, ofs_arg);
5406       __ mv(limit, limit_arg);
5407     }
5408 
5409     // To minimize the number of memory operations, the four 4-byte state
5410     // values are read in pairs: each ld fetches two of them, and srli
5411     // moves the upper one into a register of its own.
5412     //
5413     // The core MD5 algorithm works on 32-bit words, so the code below
5414     // does not care about the upper 32 bits of state[x]. Based on this
5415     // observation we simply ignore the upper 32 bits of state0/state2
5416     // instead of clearing them explicitly with extra instructions; they
5417     // are masked off only once, when the state is packed for the final
5418     // write-back.
5419     __ ld(state0, Address(state));
5420     __ srli(state1, state0, 32);
5421     __ ld(state2, Address(state, 8));
5422     __ srli(state3, state2, 32);
5423 
5424     Label md5_loop;
5425     __ BIND(md5_loop);
5426     // Each block starts from the running state; a, b, c, d are the working copies.
5427     __ mv(a, state0);
5428     __ mv(b, state1);
5429     __ mv(c, state2);
5430     __ mv(d, state3);
5431 
5432     // Round 1 (FF): each 8-byte load of the block is issued right before its two words are first used
5433     reg_cache.gen_load(0, buf);
5434     md5_FF(reg_cache, a, b, c, d,  0, S11, 0xd76aa478, rtmp1, rtmp2);
5435     md5_FF(reg_cache, d, a, b, c,  1, S12, 0xe8c7b756, rtmp1, rtmp2);
5436     reg_cache.gen_load(1, buf);
5437     md5_FF(reg_cache, c, d, a, b,  2, S13, 0x242070db, rtmp1, rtmp2);
5438     md5_FF(reg_cache, b, c, d, a,  3, S14, 0xc1bdceee, rtmp1, rtmp2);
5439     reg_cache.gen_load(2, buf);
5440     md5_FF(reg_cache, a, b, c, d,  4, S11, 0xf57c0faf, rtmp1, rtmp2);
5441     md5_FF(reg_cache, d, a, b, c,  5, S12, 0x4787c62a, rtmp1, rtmp2);
5442     reg_cache.gen_load(3, buf);
5443     md5_FF(reg_cache, c, d, a, b,  6, S13, 0xa8304613, rtmp1, rtmp2);
5444     md5_FF(reg_cache, b, c, d, a,  7, S14, 0xfd469501, rtmp1, rtmp2);
5445     reg_cache.gen_load(4, buf);
5446     md5_FF(reg_cache, a, b, c, d,  8, S11, 0x698098d8, rtmp1, rtmp2);
5447     md5_FF(reg_cache, d, a, b, c,  9, S12, 0x8b44f7af, rtmp1, rtmp2);
5448     reg_cache.gen_load(5, buf);
5449     md5_FF(reg_cache, c, d, a, b, 10, S13, 0xffff5bb1, rtmp1, rtmp2);
5450     md5_FF(reg_cache, b, c, d, a, 11, S14, 0x895cd7be, rtmp1, rtmp2);
5451     reg_cache.gen_load(6, buf);
5452     md5_FF(reg_cache, a, b, c, d, 12, S11, 0x6b901122, rtmp1, rtmp2);
5453     md5_FF(reg_cache, d, a, b, c, 13, S12, 0xfd987193, rtmp1, rtmp2);
5454     reg_cache.gen_load(7, buf);
5455     md5_FF(reg_cache, c, d, a, b, 14, S13, 0xa679438e, rtmp1, rtmp2);
5456     md5_FF(reg_cache, b, c, d, a, 15, S14, 0x49b40821, rtmp1, rtmp2);
5457 
5458     // Round 2 (GG): the whole block is cached in reg_cache by now, so no further loads are issued
5459     md5_GG(reg_cache, a, b, c, d,  1, S21, 0xf61e2562, rtmp1, rtmp2);
5460     md5_GG(reg_cache, d, a, b, c,  6, S22, 0xc040b340, rtmp1, rtmp2);
5461     md5_GG(reg_cache, c, d, a, b, 11, S23, 0x265e5a51, rtmp1, rtmp2);
5462     md5_GG(reg_cache, b, c, d, a,  0, S24, 0xe9b6c7aa, rtmp1, rtmp2);
5463     md5_GG(reg_cache, a, b, c, d,  5, S21, 0xd62f105d, rtmp1, rtmp2);
5464     md5_GG(reg_cache, d, a, b, c, 10, S22, 0x02441453, rtmp1, rtmp2);
5465     md5_GG(reg_cache, c, d, a, b, 15, S23, 0xd8a1e681, rtmp1, rtmp2);
5466     md5_GG(reg_cache, b, c, d, a,  4, S24, 0xe7d3fbc8, rtmp1, rtmp2);
5467     md5_GG(reg_cache, a, b, c, d,  9, S21, 0x21e1cde6, rtmp1, rtmp2);
5468     md5_GG(reg_cache, d, a, b, c, 14, S22, 0xc33707d6, rtmp1, rtmp2);
5469     md5_GG(reg_cache, c, d, a, b,  3, S23, 0xf4d50d87, rtmp1, rtmp2);
5470     md5_GG(reg_cache, b, c, d, a,  8, S24, 0x455a14ed, rtmp1, rtmp2);
5471     md5_GG(reg_cache, a, b, c, d, 13, S21, 0xa9e3e905, rtmp1, rtmp2);
5472     md5_GG(reg_cache, d, a, b, c,  2, S22, 0xfcefa3f8, rtmp1, rtmp2);
5473     md5_GG(reg_cache, c, d, a, b,  7, S23, 0x676f02d9, rtmp1, rtmp2);
5474     md5_GG(reg_cache, b, c, d, a, 12, S24, 0x8d2a4c8a, rtmp1, rtmp2);
5475 
5476     // Round 3 (HH)
5477     md5_HH(reg_cache, a, b, c, d,  5, S31, 0xfffa3942, rtmp1, rtmp2);
5478     md5_HH(reg_cache, d, a, b, c,  8, S32, 0x8771f681, rtmp1, rtmp2);
5479     md5_HH(reg_cache, c, d, a, b, 11, S33, 0x6d9d6122, rtmp1, rtmp2);
5480     md5_HH(reg_cache, b, c, d, a, 14, S34, 0xfde5380c, rtmp1, rtmp2);
5481     md5_HH(reg_cache, a, b, c, d,  1, S31, 0xa4beea44, rtmp1, rtmp2);
5482     md5_HH(reg_cache, d, a, b, c,  4, S32, 0x4bdecfa9, rtmp1, rtmp2);
5483     md5_HH(reg_cache, c, d, a, b,  7, S33, 0xf6bb4b60, rtmp1, rtmp2);
5484     md5_HH(reg_cache, b, c, d, a, 10, S34, 0xbebfbc70, rtmp1, rtmp2);
5485     md5_HH(reg_cache, a, b, c, d, 13, S31, 0x289b7ec6, rtmp1, rtmp2);
5486     md5_HH(reg_cache, d, a, b, c,  0, S32, 0xeaa127fa, rtmp1, rtmp2);
5487     md5_HH(reg_cache, c, d, a, b,  3, S33, 0xd4ef3085, rtmp1, rtmp2);
5488     md5_HH(reg_cache, b, c, d, a,  6, S34, 0x04881d05, rtmp1, rtmp2);
5489     md5_HH(reg_cache, a, b, c, d,  9, S31, 0xd9d4d039, rtmp1, rtmp2);
5490     md5_HH(reg_cache, d, a, b, c, 12, S32, 0xe6db99e5, rtmp1, rtmp2);
5491     md5_HH(reg_cache, c, d, a, b, 15, S33, 0x1fa27cf8, rtmp1, rtmp2);
5492     md5_HH(reg_cache, b, c, d, a,  2, S34, 0xc4ac5665, rtmp1, rtmp2);
5493 
5494     // Round 4 (II)
5495     md5_II(reg_cache, a, b, c, d,  0, S41, 0xf4292244, rtmp1, rtmp2);
5496     md5_II(reg_cache, d, a, b, c,  7, S42, 0x432aff97, rtmp1, rtmp2);
5497     md5_II(reg_cache, c, d, a, b, 14, S43, 0xab9423a7, rtmp1, rtmp2);
5498     md5_II(reg_cache, b, c, d, a,  5, S44, 0xfc93a039, rtmp1, rtmp2);
5499     md5_II(reg_cache, a, b, c, d, 12, S41, 0x655b59c3, rtmp1, rtmp2);
5500     md5_II(reg_cache, d, a, b, c,  3, S42, 0x8f0ccc92, rtmp1, rtmp2);
5501     md5_II(reg_cache, c, d, a, b, 10, S43, 0xffeff47d, rtmp1, rtmp2);
5502     md5_II(reg_cache, b, c, d, a,  1, S44, 0x85845dd1, rtmp1, rtmp2);
5503     md5_II(reg_cache, a, b, c, d,  8, S41, 0x6fa87e4f, rtmp1, rtmp2);
5504     md5_II(reg_cache, d, a, b, c, 15, S42, 0xfe2ce6e0, rtmp1, rtmp2);
5505     md5_II(reg_cache, c, d, a, b,  6, S43, 0xa3014314, rtmp1, rtmp2);
5506     md5_II(reg_cache, b, c, d, a, 13, S44, 0x4e0811a1, rtmp1, rtmp2);
5507     md5_II(reg_cache, a, b, c, d,  4, S41, 0xf7537e82, rtmp1, rtmp2);
5508     md5_II(reg_cache, d, a, b, c, 11, S42, 0xbd3af235, rtmp1, rtmp2);
5509     md5_II(reg_cache, c, d, a, b,  2, S43, 0x2ad7d2bb, rtmp1, rtmp2);
5510     md5_II(reg_cache, b, c, d, a,  9, S44, 0xeb86d391, rtmp1, rtmp2);
5511     // Fold the working variables back into the running state (32-bit adds).
5512     __ addw(state0, state0, a);
5513     __ addw(state1, state1, b);
5514     __ addw(state2, state2, c);
5515     __ addw(state3, state3, d);
5516 
5517     if (multi_block) {
5518       __ addi(buf, buf, 64); // advance to the next 64-byte block
5519       __ addi(ofs, ofs, 64);
5520       // if (ofs <= limit) goto md5_loop
5521       __ bge(limit, ofs, md5_loop);
5522       __ mv(c_rarg0, ofs); // return ofs
5523     }
5524 
5525     // to minimize the number of memory operations:
5526     // write back the 4 state 4-byte values in pairs, with a single sd
5527     __ mv(t0, mask32);
5528     __ andr(state0, state0, t0); // low half: state0 with its upper 32 bits cleared
5529     __ slli(state1, state1, 32); // high half: state1
5530     __ orr(state0, state0, state1);
5531     __ sd(state0, Address(state));
5532     __ andr(state2, state2, t0); // same packing for state2 (low) and state3 (high)
5533     __ slli(state3, state3, 32);
5534     __ orr(state2, state2, state3);
5535     __ sd(state2, Address(state, 8));
5536 
5537     __ pop_reg(saved_regs, sp);
5538     __ ret();
5539 
5540     return (address) start;
5541   }
5542 
5543   /**
5544    * Perform the quarter round calculations on values contained within four vector registers.
5545    * The four registers are updated in place by the emitted instructions.
5546    * @param aVec the SIMD register containing only the "a" values
5547    * @param bVec the SIMD register containing only the "b" values
5548    * @param cVec the SIMD register containing only the "c" values
5549    * @param dVec the SIMD register containing only the "d" values
5550    * @param tmp_vr temporary vector register, passed to vrole32_vi to hold intermediate values.
5551    */
5552   void chacha20_quarter_round(VectorRegister aVec, VectorRegister bVec,
5553                           VectorRegister cVec, VectorRegister dVec, VectorRegister tmp_vr) {
5554     // a += b, d ^= a, d <<<= 16
5555     __ vadd_vv(aVec, aVec, bVec);
5556     __ vxor_vv(dVec, dVec, aVec);
5557     __ vrole32_vi(dVec, 16, tmp_vr);
5558 
5559     // c += d, b ^= c, b <<<= 12
5560     __ vadd_vv(cVec, cVec, dVec);
5561     __ vxor_vv(bVec, bVec, cVec);
5562     __ vrole32_vi(bVec, 12, tmp_vr);
5563 
5564     // a += b, d ^= a, d <<<= 8
5565     __ vadd_vv(aVec, aVec, bVec);
5566     __ vxor_vv(dVec, dVec, aVec);
5567     __ vrole32_vi(dVec, 8, tmp_vr);
5568 
5569     // c += d, b ^= c, b <<<= 7
5570     __ vadd_vv(cVec, cVec, dVec);
5571     __ vxor_vv(bVec, bVec, cVec);
5572     __ vrole32_vi(bVec, 7, tmp_vr);
5573   }
5574 
5575   /**
5576    * int com.sun.crypto.provider.ChaCha20Cipher.implChaCha20Block(int[] initState, byte[] result)
5577    *
5578    *  Input arguments:
5579    *  c_rarg0   - state, the starting state
5580    *  c_rarg1   - key_stream, the array that will hold the result of the ChaCha20 block function
5581    *  Return value (c_rarg0): length of the output key stream, computed as `length` << 6
5582    *  Implementation Note:
5583    *   Parallelization is achieved by loading individual state elements into vectors for N blocks.
5584    *   N depends on single vector register length.
5585    */
5586   address generate_chacha20Block() {
5587     Label L_Rounds;
5588 
5589     __ align(CodeEntryAlignment);
5590     StubId stub_id = StubId::stubgen_chacha20Block_id;
5591     StubCodeMark mark(this, stub_id);
5592     address start = __ pc();
5593     __ enter();
5594 
5595     const int states_len = 16; // number of 32-bit words in the ChaCha20 state
5596     const int step = 4;        // size in bytes of one state word
5597     const Register state = c_rarg0;
5598     const Register key_stream = c_rarg1; // advanced in place while the result is stored
5599     const Register tmp_addr = t0;
5600     const Register length = t1; // first operand of vsetivli below, scaled by 64 for the return value
5601 
5602     // Organize vector registers in an array that facilitates
5603     // putting repetitive opcodes into loop structures below.
5604     const VectorRegister work_vrs[16] = {
5605       v0, v1, v2,  v3,  v4,  v5,  v6,  v7,
5606       v8, v9, v10, v11, v12, v13, v14, v15
5607     };
5608     const VectorRegister tmp_vr = v16;
5609     const VectorRegister counter_vr = v17;
5610 
5611     {
5612       // Request 16 elements here, as com.sun.crypto.provider.ChaCha20Cipher.KS_MAX_LEN is 1024
5613       // at the Java level (16 blocks of 64 bytes each).
5614       __ vsetivli(length, 16, Assembler::e32, Assembler::m1);
5615     }
5616 
5617     // Load from source state.
5618     // Every element in source state is duplicated to all elements in the corresponding vector.
5619     __ mv(tmp_addr, state);
5620     for (int i = 0; i < states_len; i += 1) {
5621       __ vlse32_v(work_vrs[i], tmp_addr, zr); // the stride register is zr
5622       __ addi(tmp_addr, tmp_addr, step);
5623     }
5624     // Adjust counter for every individual block.
5625     __ vid_v(counter_vr);
5626     __ vadd_vv(work_vrs[12], work_vrs[12], counter_vr);
5627 
5628     // Perform 10 iterations of the 8 quarter round set
5629     {
5630       const Register loop = t2; // share t2 with other non-overlapping usages.
5631       __ mv(loop, 10);
5632       __ BIND(L_Rounds);
5633       // First four quarter rounds: register groups (i, i+4, i+8, i+12).
5634       chacha20_quarter_round(work_vrs[0], work_vrs[4], work_vrs[8],  work_vrs[12], tmp_vr);
5635       chacha20_quarter_round(work_vrs[1], work_vrs[5], work_vrs[9],  work_vrs[13], tmp_vr);
5636       chacha20_quarter_round(work_vrs[2], work_vrs[6], work_vrs[10], work_vrs[14], tmp_vr);
5637       chacha20_quarter_round(work_vrs[3], work_vrs[7], work_vrs[11], work_vrs[15], tmp_vr);
5638       // Last four quarter rounds: the b, c and d operands are rotated by 1, 2 and 3 positions.
5639       chacha20_quarter_round(work_vrs[0], work_vrs[5], work_vrs[10], work_vrs[15], tmp_vr);
5640       chacha20_quarter_round(work_vrs[1], work_vrs[6], work_vrs[11], work_vrs[12], tmp_vr);
5641       chacha20_quarter_round(work_vrs[2], work_vrs[7], work_vrs[8],  work_vrs[13], tmp_vr);
5642       chacha20_quarter_round(work_vrs[3], work_vrs[4], work_vrs[9],  work_vrs[14], tmp_vr);
5643 
5644       __ subi(loop, loop, 1);
5645       __ bnez(loop, L_Rounds);
5646     }
5647 
5648     // Add the original state into the end working state.
5649     // We do this by first duplicating every element in source state array to the corresponding
5650     // vector, then adding it to the post-loop working state.
5651     __ mv(tmp_addr, state);
5652     for (int i = 0; i < states_len; i += 1) {
5653       __ vlse32_v(tmp_vr, tmp_addr, zr);
5654       __ addi(tmp_addr, tmp_addr, step);
5655       __ vadd_vv(work_vrs[i], work_vrs[i], tmp_vr);
5656     }
5657     // Add the counter overlay onto work_vrs[12] at the end.
5658     __ vadd_vv(work_vrs[12], work_vrs[12], counter_vr);
5659 
5660     // Store result to key stream.
5661     {
5662       const Register stride = t2; // share t2 with other non-overlapping usages.
5663       // Every block occupies 64 bytes, so we use 64 as stride of the vector store.
5664       __ mv(stride, 64);
5665       for (int i = 0; i < states_len; i += 1) {
5666         __ vsse32_v(work_vrs[i], key_stream, stride);
5667         __ addi(key_stream, key_stream, step); // move on to the next word position
5668       }
5669     }
5670 
5671     // Return length of output key_stream: `length` times 64 (shift left by 6)
5672     __ slli(c_rarg0, length, 6);
5673 
5674     __ leave();
5675     __ ret();
5676 
5677     return (address) start;
5678   }
5679 
5680 
5681   // ------------------------ SHA-1 intrinsic ------------------------
5682 
5683   // K't = (one constant per group of 20 rounds)
5684   //    5a827999, 0  <= t <= 19
5685   //    6ed9eba1, 20 <= t <= 39
5686   //    8f1bbcdc, 40 <= t <= 59
5687   //    ca62c1d6, 60 <= t <= 79
5688   void sha1_prepare_k(Register cur_k, int round) {
5689     assert(round >= 0 && round < 80, "must be");
5690     // The table is indexed by round / 20.
5691     static const int64_t ks[] = {0x5a827999, 0x6ed9eba1, 0x8f1bbcdc, 0xca62c1d6};
5692     if ((round % 20) == 0) { // a load is emitted only on rounds 0, 20, 40 and 60
5693       __ mv(cur_k, ks[round/20]); // NOTE(review): other rounds emit nothing - presumably cur_k stays live in between
5694     }
5695   }
5696 
5697   // W't =
5698   //    M't,                                      0 <=  t <= 15
5699   //    ROTL'1(W't-3 ^ W't-8 ^ W't-14 ^ W't-16),  16 <= t <= 79
       //
       // Emits the code that produces the message-schedule word W't for `round`
       // in the low 32 bits of cur_w. The upper 32 bits of cur_w are not
       // guaranteed to be zero (e.g. odd rounds < 16 use a plain 64-bit mv).
       //
       //   cur_w - receives W't
       //   ws    - registers holding the 16 most recent W values, two per register.
       //           Entries 0..8 are accessed, so the array must have 9 elements.
       //           On odd rounds >= 17 the entries are rotated by one position at
       //           generation time, i.e. the caller's array is modified.
       //   buf   - base address the message words are loaded from (offsets 0..56);
       //           only used for round < 16
       //   round - number t of the round being generated, 0 <= round < 80
       //
       // For round >= 16 the emitted code also writes t0 and t1.
5700   void sha1_prepare_w(Register cur_w, Register ws[], Register buf, int round) {
5701     assert(round >= 0 && round < 80, "must be");
5702 
5703     if (round < 16) {
5704       // in the first 16 rounds, every register in ws[] contains 2 W values, e.g.
5705       //   in ws[0], high part contains W'0, low part contains W'1,
5706       //   in ws[1], high part contains W'2, low part contains W'3,
5707       //   ...
5708       //   in ws[7], high part contains W'14, low part contains W'15.
5709 
5710       if ((round % 2) == 0) {
             // even round: one 8-byte load fetches W't and W't+1 together.
5711         __ ld(ws[round/2], Address(buf, (round/2) * 8));
5712         // reverse bytes, as SHA-1 is defined in big-endian.
5713         __ revb(ws[round/2], ws[round/2]);
             // after the byte reversal W't sits in the high half; move it down.
5714         __ srli(cur_w, ws[round/2], 32);
5715       } else {
             // odd round: W't is the low half of the register loaded in the
             // previous (even) round; the high half is copied along and ignored.
5716         __ mv(cur_w, ws[round/2]);
5717       }
5718 
5719       return;
5720     }
5721 
         // From here on the 16 live words are addressed by relative index 0..15
         // (oldest first); word k lives in ws[k/2], in the high half when k is
         // even and in the low half when k is odd. `idx` is the relative index
         // of the word being computed.
5722     if ((round % 2) == 0) {
5723       int idx = 16;
5724       // W't = ROTL'1(W't-3 ^ W't-8 ^ W't-14 ^ W't-16),  16 <= t <= 79
           // idx-8, idx-14 and idx-16 are even (high halves, shifted down);
           // idx-3 is odd, so that register is used unshifted and its high half
           // leaks into the upper bits of t0.
           // NOTE(review): this presumably relies on rolw only using the low
           // 32 bits of its source - confirm against MacroAssembler::rolw.
5725       __ srli(t1, ws[(idx-8)/2], 32);
5726       __ xorr(t0, ws[(idx-3)/2], t1);
5727 
5728       __ srli(t1, ws[(idx-14)/2], 32);
5729       __ srli(cur_w, ws[(idx-16)/2], 32);
5730       __ xorr(cur_w, cur_w, t1);
5731 
5732       __ xorr(cur_w, cur_w, t0);
5733       __ rolw(cur_w, cur_w, 1, t0);
5734 
5735       // copy the cur_w value to the high half of ws[8] (its low half becomes 0).
5736       // now, valid w't values are at:
5737       //  w0:       ws[0]'s lower 32 bits
5738       //  w1 ~ w14: ws[1] ~ ws[7]
5739       //  w15:      ws[8]'s higher 32 bits
5740       __ slli(ws[idx/2], cur_w, 32);
5741 
5742       return;
5743     }
5744 
         // odd round >= 17: the oldest live word is the low half of ws[0] and the
         // word computed by the preceding even round is the high half of ws[8].
5745     int idx = 17;
5746     // W't = ROTL'1(W't-3 ^ W't-8 ^ W't-14 ^ W't-16),  16 <= t <= 79
         // idx-3 is even (high half, shifted down); idx-8, idx-14 and idx-16 are
         // odd, so those registers are used unshifted.
5747     __ srli(t1, ws[(idx-3)/2], 32);
5748     __ xorr(t0, t1, ws[(idx-8)/2]);
5749 
5750     __ xorr(cur_w, ws[(idx-16)/2], ws[(idx-14)/2]);
5751 
5752     __ xorr(cur_w, cur_w, t0);
5753     __ rolw(cur_w, cur_w, 1, t0);
5754 
5755     // copy the cur_w value to the low half of ws[8]
         // (the upper bits of cur_w are cleared first so the orr keeps the high half).
5756     __ zext(cur_w, cur_w, 32);
5757     __ orr(ws[idx/2], ws[idx/2], cur_w);
5758 
5759     // shift the w't registers, so they start from ws[0] again.
5760     // now, valid w't values are at:
5761     //  w0 ~ w15: ws[0] ~ ws[7]
         // This only permutes the C++ array; no instructions are emitted for it.
         // The register that was ws[0] becomes the new auxiliary ws[8].
5762     Register ws_0 = ws[0];
5763     for (int i = 0; i < 16/2; i++) {
5764       ws[i] = ws[i+1];
5765     }
5766     ws[8] = ws_0;
5767   }
5768 
5769   // f't(x, y, z) =
5770   //    Ch(x, y, z)     = (x & y) ^ (~x & z)            , 0  <= t <= 19
5771   //    Parity(x, y, z) = x ^ y ^ z                     , 20 <= t <= 39
5772   //    Maj(x, y, z)    = (x & y) ^ (x & z) ^ (y & z)   , 40 <= t <= 59
5773   //    Parity(x, y, z) = x ^ y ^ z                     , 60 <= t <= 79
       //
       // Emits the code computing f't(x, y, z) into dst. The variant is chosen
       // at generation time from `round`, so only one of the three sequences
       // is emitted per call.
       //
       //   dst     - result register; must differ from x, y, z, t0 and t1
       //   x, y, z - inputs, only read
       //   round   - number t of the round being generated, 0 <= round < 80
       //
       // The Ch sequence writes t0, the Maj sequence writes t0 and t1, the Parity
       // sequence uses no scratch register.
5774   void sha1_f(Register dst, Register x, Register y, Register z, int round) {
5775     assert(round >= 0 && round < 80, "must be");
5776     assert_different_registers(dst, x, y, z, t0, t1);
5777 
5778     if (round < 20) {
5779       // (x & y) ^ (~x & z)
5780       __ andr(t0, x, y);
           // andn(dst, z, x) supplies the (~x & z) term.
5781       __ andn(dst, z, x);
5782       __ xorr(dst, dst, t0);
5783     } else if (round >= 40 && round < 60) {
5784       // (x & y) ^ (x & z) ^ (y & z)
5785       __ andr(t0, x, y);
5786       __ andr(t1, x, z);
5787       __ andr(dst, y, z);
5788       __ xorr(dst, dst, t0);
5789       __ xorr(dst, dst, t1);
5790     } else {
           // rounds 20..39 and 60..79
5791       // x ^ y ^ z
5792       __ xorr(dst, x, y);
5793       __ xorr(dst, dst, z);
5794     }
5795   }
5796 
5797   // T = ROTL'5(a) + f't(b, c, d) + e + K't + W't
5798   // e = d
5799   // d = c
5800   // c = ROTL'30(b)
5801   // b = a
5802   // a = T
       //
       // Emits the code for one SHA-1 round on the working variables a..e.
       //
       //   a, b, c, d, e - working variables, updated in place as listed above
       //   cur_k         - K't for this round; only read here
       //   cur_w         - W't for this round on entry; overwritten, as it is
       //                   reused as the accumulator for T
       //   tmp           - scratch register, receives f't(b, c, d)
       //   round         - number t of the round being generated, 0 <= round < 80;
       //                   selects the f't variant in sha1_f
       //
       // t0 is handed to rolw as its extra register argument, and sha1_f may
       // write t0 and t1.
5803   void sha1_process_round(Register a, Register b, Register c, Register d, Register e,
5804                           Register cur_k, Register cur_w, Register tmp, int round) {
5805     assert(round >= 0 && round < 80, "must be");
5806     assert_different_registers(a, b, c, d, e, cur_w, cur_k, tmp, t0);
5807 
5808     // T = ROTL'5(a) + f't(b, c, d) + e + K't + W't
5809 
5810     // cur_w will be recalculated at the beginning of each round,
5811     // so, we can reuse it as a temp register here.
5812     Register tmp2 = cur_w;
5813 
5814     // reuse e as a temporary register, as we will mv new value into it later
5815     Register tmp3 = e;
         // tmp2 = K't + W't
5816     __ add(tmp2, cur_k, tmp2);
         // tmp3 = e + K't + W't
5817     __ add(tmp3, tmp3, tmp2);
         // tmp2 = ROTL'5(a); the K't + W't sum is already folded into tmp3.
5818     __ rolw(tmp2, a, 5, t0);
5819 
         // tmp = f't(b, c, d)
5820     sha1_f(tmp, b, c, d, round);
5821 
         // tmp2 = T
5822     __ add(tmp2, tmp2, tmp);
5823     __ add(tmp2, tmp2, tmp3);
5824 
5825     // e = d
5826     // d = c
5827     // c = ROTL'30(b)
5828     // b = a
5829     // a = T
         // the moves are ordered so that each source is read before it is overwritten.
5830     __ mv(e, d);
5831     __ mv(d, c);
5832 
5833     __ rolw(c, b, 30);
5834     __ mv(b, a);
5835     __ mv(a, tmp2);
5836   }
5837 
5838   // H(i)0 = a + H(i-1)0
5839   // H(i)1 = b + H(i-1)1
5840   // H(i)2 = c + H(i-1)2
5841   // H(i)3 = d + H(i-1)3
5842   // H(i)4 = e + H(i-1)4
       //
       // Emits the code that adds the saved hash words back onto a..e.
       //
       //   a, b, c, d, e - working variables; each receives its sum in place
       //   prev_ab       - saved word for a in the low 32 bits, for b in the high 32 bits
       //   prev_cd       - saved word for c in the low 32 bits, for d in the high 32 bits
       //   prev_e        - saved word for e
       //
       // prev_ab and prev_cd are destroyed (shifted right by 32). The adds are
       // 64-bit, so only the low 32 bits of a..e are meaningful afterwards.
5843   void sha1_calculate_im_hash(Register a, Register b, Register c, Register d, Register e,
5844                               Register prev_ab, Register prev_cd, Register prev_e) {
5845     assert_different_registers(a, b, c, d, e, prev_ab, prev_cd, prev_e);
5846 
         // a += low half; the high half of prev_ab only disturbs the upper bits of a.
5847     __ add(a, a, prev_ab);
         // bring the high half down for b.
5848     __ srli(prev_ab, prev_ab, 32);
5849     __ add(b, b, prev_ab);
5850 
         // same scheme for c and d.
5851     __ add(c, c, prev_cd);
5852     __ srli(prev_cd, prev_cd, 32);
5853     __ add(d, d, prev_cd);
5854 
5855     __ add(e, e, prev_e);
5856   }
5857 
5858   void sha1_preserve_prev_abcde(Register a, Register b, Register c, Register d, Register e,
5859                                 Register prev_ab, Register prev_cd, Register prev_e) {
         // Emits the code that saves the five working variables into three registers,
         // in the layout consumed by sha1_calculate_im_hash:
         //   prev_ab = (b << 32) | low 32 bits of a
         //   prev_cd = (d << 32) | low 32 bits of c
         //   prev_e  = e (copied as is)
         // a..e are only read; t0 is used as scratch.
5860     assert_different_registers(a, b, c, d, e, prev_ab, prev_cd, prev_e, t0);
5861 
         // the low word is narrowed with zext(.., 32) before the orr, so that stale
         // upper bits of a cannot corrupt the b half.
5862     __ slli(t0, b, 32);
5863     __ zext(prev_ab, a, 32);
5864     __ orr(prev_ab, prev_ab, t0);
5865 
         // same packing for c and d.
5866     __ slli(t0, d, 32);
5867     __ zext(prev_cd, c, 32);
5868     __ orr(prev_cd, prev_cd, t0);
5869 
5870     __ mv(prev_e, e);
5871   }
5872 
5873   // Intrinsic for:
5874   //   void sun.security.provider.SHA.implCompress0(byte[] buf, int ofs)
5875   //   void sun.security.provider.DigestBase.implCompressMultiBlock0(byte[] b, int ofs, int limit)
5876   //
5877   // Arguments:
5878   //
5879   // Inputs:
5880   //   c_rarg0: byte[]  src array + offset
5881   //   c_rarg1: int[]   SHA.state
5882   //   - - - - - - below are only for implCompressMultiBlock0 - - - - - -
5883   //   c_rarg2: int     offset
5884   //   c_rarg3: int     limit
5885   //
5886   // Outputs:
5887   //   - - - - - - below are only for implCompressMultiBlock0 - - - - - -
5888   //   c_rarg0: int offset, when (multi_block == true)
5889   //
       // stub_id selects the single-block or the multi-block flavour of the stub.
       // The 80 rounds are unrolled at generation time: the C++ loop below emits
       // the code of every round once, and in the multi-block flavour that code
       // is re-executed per 64-byte block through a backward branch.
       // Returns the entry address of the generated stub.
5890   address generate_sha1_implCompress(StubId stub_id) {
5891       bool multi_block;
5892       switch (stub_id) {
5893       case StubId::stubgen_sha1_implCompress_id:
5894         multi_block = false;
5895         break;
5896       case StubId::stubgen_sha1_implCompressMB_id:
5897         multi_block = true;
5898         break;
5899       default:
5900         ShouldNotReachHere();
5901       };
5902     __ align(CodeEntryAlignment);
5903     StubCodeMark mark(this, stub_id);
5904 
5905     address start = __ pc();
5906     __ enter();
5907 
         // x18 - x27 are used below for ws[3..8], cur_k, cur_w, prev_ab and prev_cd.
5908     RegSet saved_regs = RegSet::range(x18, x27);
5909     if (multi_block) {
5910       // use x9 as src below.
5911       saved_regs += RegSet::of(x9);
5912     }
5913     __ push_reg(saved_regs, sp);
5914 
5915     // c_rarg0 - c_rarg3: x10 - x13
5916     Register buf    = c_rarg0;
5917     Register state  = c_rarg1;
5918     Register offset = c_rarg2;
5919     Register limit  = c_rarg3;
5920     // use src to contain the original start point of the array.
5921     Register src    = x9;
5922 
5923     if (multi_block) {
           // turn limit into an address: limit = buf + (limit - offset).
5924       __ sub(limit, limit, offset);
5925       __ add(limit, limit, buf);
           // src = buf - offset, so that (buf - src) at exit is the updated offset.
5926       __ sub(src, buf, offset);
5927     }
5928 
5929     // [args-reg]:  x14 - x17
5930     // [temp-reg]:  x28 - x31
5931     // [saved-reg]: x18 - x27
5932 
5933     // h0/1/2/3/4
5934     const Register a = x14, b = x15, c = x16, d = x17, e = x28;
5935     // w0, w1, ... w15
5936     // put two adjacent w's in one register:
5937     //    one at high word part, another at low word part
5938     // at different round (even or odd), w't value reside in different items in ws[].
5939     // w0 ~ w15, either reside in
5940     //    ws[0] ~ ws[7], where
5941     //      w0 at higher 32 bits of ws[0],
5942     //      w1 at lower 32 bits of ws[0],
5943     //      ...
5944     //      w14 at higher 32 bits of ws[7],
5945     //      w15 at lower 32 bits of ws[7].
5946     // or, reside in
5947     //    w0:       ws[0]'s lower 32 bits
5948     //    w1 ~ w14: ws[1] ~ ws[7]
5949     //    w15:      ws[8]'s higher 32 bits
5950     Register ws[9] = {x29, x30, x31, x18,
5951                       x19, x20, x21, x22,
5952                       x23}; // auxiliary register for calculating w's value
5953     // current k't's value
5954     const Register cur_k = x24;
5955     // current w't's value
5956     const Register cur_w = x25;
5957     // values of a, b, c, d, e saved before the 80 rounds of the current block,
         // i.e. the intermediate hash they are added back onto afterwards
5958     const Register prev_ab = x26, prev_cd = x27;
5959     const Register prev_e = offset; // reuse offset/c_rarg2
5960 
5961     // load 5 words state into a, b, c, d, e.
5962     //
5963     // To minimize the number of memory operations, we apply following
5964     // optimization: read the states (a/b/c/d) of 4-byte values in pairs,
5965     // with a single ld, and split them into 2 registers.
5966     //
5967     // And, as the core algorithm of SHA-1 works on 32-bits words, so
5968     // in the following code, it does not care about the content of
5969     // higher 32-bits in a/b/c/d/e. Based on this observation,
5970     // we can apply further optimization, which is to just ignore the
5971     // higher 32-bits in a/c/e, rather than set the higher
5972     // 32-bits of a/c/e to zero explicitly with extra instructions.
5973     __ ld(a, Address(state, 0));
5974     __ srli(b, a, 32);
5975     __ ld(c, Address(state, 8));
5976     __ srli(d, c, 32);
5977     __ lw(e, Address(state, 16));
5978 
         // the label is only bound (and only branched to) in the multi-block flavour.
5979     Label L_sha1_loop;
5980     if (multi_block) {
5981       __ BIND(L_sha1_loop);
5982     }
5983 
5984     sha1_preserve_prev_abcde(a, b, c, d, e, prev_ab, prev_cd, prev_e);
5985 
5986     for (int round = 0; round < 80; round++) {
5987       // prepare K't value
5988       sha1_prepare_k(cur_k, round);
5989 
5990       // prepare W't value
           // (from round 17 on, every odd round also rotates the entries of ws[].)
5991       sha1_prepare_w(cur_w, ws, buf, round);
5992 
5993       // one round process
5994       sha1_process_round(a, b, c, d, e, cur_k, cur_w, t2, round);
5995     }
5996 
5997     // compute the intermediate hash value
5998     sha1_calculate_im_hash(a, b, c, d, e, prev_ab, prev_cd, prev_e);
5999 
6000     if (multi_block) {
6001       int64_t block_bytes = 16 * 4;
6002       __ addi(buf, buf, block_bytes);
6003 
           // process the next block as long as buf <= limit.
6004       __ bge(limit, buf, L_sha1_loop, true);
6005     }
6006 
6007     // store back the state.
         // a/b and c/d are re-packed into one doubleword each, mirroring the loads above.
6008     __ zext(a, a, 32);
6009     __ slli(b, b, 32);
6010     __ orr(a, a, b);
6011     __ sd(a, Address(state, 0));
6012     __ zext(c, c, 32);
6013     __ slli(d, d, 32);
6014     __ orr(c, c, d);
6015     __ sd(c, Address(state, 8));
6016     __ sw(e, Address(state, 16));
6017 
6018     // return offset
6019     if (multi_block) {
6020       __ sub(c_rarg0, buf, src);
6021     }
6022 
6023     __ pop_reg(saved_regs, sp);
6024 
6025     __ leave();
6026     __ ret();
6027 
6028     return (address) start;
6029   }
6030 
6031   /**
6032    * vector registers:
6033    *   input VectorRegister's:  inputV1-V3, for m2 they could be v2, v4, v6, for m1 they could be v1, v2, v3
6034    *   index VectorRegister's:  idxV1-V4, for m2 they could be v8, v10, v12, v14, for m1 they could be v4, v5, v6, v7
6035    *   output VectorRegister's: outputV1-V4, for m2 they could be v16, v18, v20, v22, for m1 they could be v8, v9, v10, v11
6036    *
6037    * NOTE: each field will occupy a vector register group
        *
        * Emits one vector round of Base64 encoding: 3-byte groups are loaded from src,
        * converted into four 6-bit table indexes each, translated through codec and
        * stored as 4 bytes per group to dst.
        *
        *   src, dst  - current source/destination addresses; advanced by stepSrc/stepDst
        *   codec     - base address of the 64-entry encoding table
        *   size      - requested vector length (passed to vsetvli as AVL), in 8-bit elements
        *   stepSrc   - number of bytes to add to src
        *   stepDst   - number of bytes to add to dst
        *   lmul      - register group multiplier used for all vector operands
        *
        * inputV1 and inputV2 are shifted in place and therefore clobbered.
6038    */
6039   void base64_vector_encode_round(Register src, Register dst, Register codec,
6040                     Register size, Register stepSrc, Register stepDst,
6041                     VectorRegister inputV1, VectorRegister inputV2, VectorRegister inputV3,
6042                     VectorRegister idxV1, VectorRegister idxV2, VectorRegister idxV3, VectorRegister idxV4,
6043                     VectorRegister outputV1, VectorRegister outputV2, VectorRegister outputV3, VectorRegister outputV4,
6044                     Assembler::LMUL lmul) {
6045     // set vector register type/len
6046     __ vsetvli(x0, size, Assembler::e8, lmul);
6047 
6048     // segmented load src into v registers: mem(src) => vr(3)
6049     __ vlseg3e8_v(inputV1, src);
6050 
6051     // src = src + register_group_len_bytes * 3
6052     __ add(src, src, stepSrc);
6053 
6054     // encoding
6055     //   1. compute index into lookup table: vr(3) => vr(4)
         //      all shifts work on 8-bit elements, so bits shifted out are dropped.
         //      idxV1 = input1[7:2]
6056     __ vsrl_vi(idxV1, inputV1, 2);
6057 
         //      idxV2 = input1[1:0] : input2[7:4]
6058     __ vsrl_vi(idxV2, inputV2, 2);
6059     __ vsll_vi(inputV1, inputV1, 6);
6060     __ vor_vv(idxV2, idxV2, inputV1);
6061     __ vsrl_vi(idxV2, idxV2, 2);
6062 
         //      idxV3 = input2[3:0] : input3[7:6]
6063     __ vsrl_vi(idxV3, inputV3, 4);
6064     __ vsll_vi(inputV2, inputV2, 4);
6065     __ vor_vv(idxV3, inputV2, idxV3);
6066     __ vsrl_vi(idxV3, idxV3, 2);
6067 
         //      idxV4 = input3[5:0]
6068     __ vsll_vi(idxV4, inputV3, 2);
6069     __ vsrl_vi(idxV4, idxV4, 2);
6070 
6071     //   2. indexed load: vr(4) => vr(4)
6072     __ vluxei8_v(outputV1, codec, idxV1);
6073     __ vluxei8_v(outputV2, codec, idxV2);
6074     __ vluxei8_v(outputV3, codec, idxV3);
6075     __ vluxei8_v(outputV4, codec, idxV4);
6076 
6077     // segmented store encoded data in v registers back to dst: vr(4) => mem(dst)
6078     __ vsseg4e8_v(outputV1, dst);
6079 
6080     // dst = dst + register_group_len_bytes * 4
6081     __ add(dst, dst, stepDst);
6082   }
6083 
6084   /**
6085    *  void j.u.Base64.Encoder.encodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL)
6086    *
6087    *  Input arguments:
6088    *  c_rarg0   - src, source array
6089    *  c_rarg1   - sp, src start offset
6090    *  c_rarg2   - sl, src end offset
6091    *  c_rarg3   - dst, dest array
6092    *  c_rarg4   - dp, dst start offset
6093    *  c_rarg5   - isURL, Base64 or URL character set
        *
        *  Generates the stub and returns its entry address. With UseRVV the bulk of
        *  the data is handled by an m2 vector loop followed by at most one m1 vector
        *  round; whatever is left is encoded 3 bytes at a time by the scalar loop.
        *  The argument registers are reused as temporaries once src/dst/length have
        *  been derived from them.
6094    */
6095   address generate_base64_encodeBlock() {
         // encoding tables; the two only differ in the last two characters.
6096     alignas(64) static const char toBase64[64] = {
6097       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
6098       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
6099       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
6100       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
6101       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
6102     };
6103 
6104     alignas(64) static const char toBase64URL[64] = {
6105       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
6106       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
6107       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
6108       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
6109       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
6110     };
6111 
6112     __ align(CodeEntryAlignment);
6113     StubId stub_id = StubId::stubgen_base64_encodeBlock_id;
6114     StubCodeMark mark(this, stub_id);
6115     address start = __ pc();
6116     __ enter();
6117 
6118     Register src    = c_rarg0;
6119     Register soff   = c_rarg1;
6120     Register send   = c_rarg2;
6121     Register dst    = c_rarg3;
6122     Register doff   = c_rarg4;
6123     Register isURL  = c_rarg5;
6124 
6125     Register codec  = c_rarg6;
6126     Register length = c_rarg7; // total length of src data in bytes
6127 
6128     Label ProcessData, Exit;
6129 
6130     // length should be multiple of 3
6131     __ sub(length, send, soff);
6132     // real src/dst to process data
6133     __ add(src, src, soff);
6134     __ add(dst, dst, doff);
6135 
6136     // load the codec base address
         // (toBase64 unless isURL is non-zero, in which case toBase64URL.)
6137     __ la(codec, ExternalAddress((address) toBase64));
6138     __ beqz(isURL, ProcessData);
6139     __ la(codec, ExternalAddress((address) toBase64URL));
6140     __ BIND(ProcessData);
6141 
6142     // vector version
6143     if (UseRVV) {
6144       Label ProcessM2, ProcessM1, ProcessScalar;
6145 
           // soff, send, doff and isURL are no longer needed and become temporaries.
6146       Register size      = soff;
6147       Register stepSrcM1 = send;
6148       Register stepSrcM2 = doff;
6149       Register stepDst   = isURL;
6150 
           // size and stepDst start with the m2 values and are halved before the m1 round.
           // NOTE(review): MaxVectorSize is presumably the vector register length in bytes.
6151       __ mv(size, MaxVectorSize * 2);
6152       __ mv(stepSrcM1, MaxVectorSize * 3);
6153       __ slli(stepSrcM2, stepSrcM1, 1);
6154       __ mv(stepDst, MaxVectorSize * 2 * 4);
6155 
6156       __ blt(length, stepSrcM2, ProcessM1);
6157 
           // m2 loop: runs while at least stepSrcM2 source bytes remain.
6158       __ BIND(ProcessM2);
6159       base64_vector_encode_round(src, dst, codec,
6160                     size, stepSrcM2, stepDst,
6161                     v2, v4, v6,         // inputs
6162                     v8, v10, v12, v14,  // indexes
6163                     v16, v18, v20, v22, // outputs
6164                     Assembler::m2);
6165 
6166       __ sub(length, length, stepSrcM2);
6167       __ bge(length, stepSrcM2, ProcessM2);
6168 
           // m1 round: executed at most once, since fewer than 2 * stepSrcM1 bytes remain.
6169       __ BIND(ProcessM1);
6170       __ blt(length, stepSrcM1, ProcessScalar);
6171 
6172       __ srli(size, size, 1);
6173       __ srli(stepDst, stepDst, 1);
6174       base64_vector_encode_round(src, dst, codec,
6175                     size, stepSrcM1, stepDst,
6176                     v1, v2, v3,         // inputs
6177                     v4, v5, v6, v7,     // indexes
6178                     v8, v9, v10, v11,   // outputs
6179                     Assembler::m1);
6180       __ sub(length, length, stepSrcM1);
6181 
6182       __ BIND(ProcessScalar);
6183     }
6184 
6185     // scalar version
6186     {
6187       Register byte1 = soff, byte0 = send, byte2 = doff;
6188       Register combined24Bits = isURL;
6189 
6190       __ beqz(length, Exit);
6191 
6192       Label ScalarLoop;
6193       __ BIND(ScalarLoop);
6194       {
6195         // plain:   [byte0[7:0] : byte1[7:0] : byte2[7:0]] =>
6196         // encoded: [byte0[7:2] : byte0[1:0]+byte1[7:4] : byte1[3:0]+byte2[7:6] : byte2[5:0]]
6197 
6198         // load 3 bytes src data
6199         __ lbu(byte0, Address(src, 0));
6200         __ lbu(byte1, Address(src, 1));
6201         __ lbu(byte2, Address(src, 2));
6202         __ addi(src, src, 3);
6203 
6204         // construct 24 bits from 3 bytes
6205         __ slliw(byte0, byte0, 16);
6206         __ slliw(byte1, byte1, 8);
6207         __ orr(combined24Bits, byte0, byte1);
6208         __ orr(combined24Bits, combined24Bits, byte2);
6209 
6210         // get codec index and encode(ie. load from codec by index)
             // each 6-bit field is isolated by shifting it to the top of the 32-bit
             // word and back down by 26: bits [23:18], [17:12] and [11:6] in turn.
6211         __ slliw(byte0, combined24Bits, 8);
6212         __ srliw(byte0, byte0, 26);
6213         __ add(byte0, codec, byte0);
6214         __ lbu(byte0, byte0);
6215 
6216         __ slliw(byte1, combined24Bits, 14);
6217         __ srliw(byte1, byte1, 26);
6218         __ add(byte1, codec, byte1);
6219         __ lbu(byte1, byte1);
6220 
6221         __ slliw(byte2, combined24Bits, 20);
6222         __ srliw(byte2, byte2, 26);
6223         __ add(byte2, codec, byte2);
6224         __ lbu(byte2, byte2);
6225 
             // the last field, bits [5:0], only needs a mask.
6226         __ andi(combined24Bits, combined24Bits, 0x3f);
6227         __ add(combined24Bits, codec, combined24Bits);
6228         __ lbu(combined24Bits, combined24Bits);
6229 
6230         // store 4 bytes encoded data
6231         __ sb(byte0, Address(dst, 0));
6232         __ sb(byte1, Address(dst, 1));
6233         __ sb(byte2, Address(dst, 2));
6234         __ sb(combined24Bits, Address(dst, 3));
6235 
6236         __ subi(length, length, 3);
6237         __ addi(dst, dst, 4);
6238         // loop back
             // (terminates on length == 0, which relies on length being a multiple of 3.)
6239         __ bnez(length, ScalarLoop);
6240       }
6241     }
6242 
6243     __ BIND(Exit);
6244 
6245     __ leave();
6246     __ ret();
6247 
6248     return (address) start;
6249   }
6250 
6251   /**
6252    * vector registers:
6253    * input VectorRegister's:  inputV1-V4, for m2 they could be v2, v4, v6, v8, for m1 they could be v1, v2, v3, v4
6254    * index VectorRegister's:  idxV1-V4, for m2 they could be v10, v12, v14, v16, for m1 they could be v5, v6, v7, v8
6255    * output VectorRegister's: outputV1-V3, for m2 they could be v18, v20, v22, for m1 they could be v9, v10, v11
6256    *
6257    * NOTE: each field will occupy a single vector register group
        *
        * Emits one vector round of Base64 decoding: 4-byte groups are loaded from src,
        * every byte is translated through codec, and unless an invalid byte is found
        * in the very first group, 3 decoded bytes per group are stored to dst.
        *
        *   src       - current source address; always advanced by stepSrc
        *   dst       - current destination address; advanced by stepDst, except when
        *               the first group is already invalid
        *   codec     - base address of the 256-entry decoding table
        *   size      - requested vector length (passed to vsetvli as AVL), in 8-bit elements
        *   stepSrc   - number of bytes to add to src
        *   stepDst   - number of bytes to add to dst; overwritten with failedIdx * 3
        *               when an invalid byte is found at an index > 0
        *   failedIdx - output: index of the first group containing an invalid byte,
        *               negative when there is none
        *   lmul      - register group multiplier used for all vector operands
        *
        * v0 is written as the mask register of the validity check.
        * idxV1-V3 are shifted in place and therefore clobbered.
6258    */
6259   void base64_vector_decode_round(Register src, Register dst, Register codec,
6260                     Register size, Register stepSrc, Register stepDst, Register failedIdx,
6261                     VectorRegister inputV1, VectorRegister inputV2, VectorRegister inputV3, VectorRegister inputV4,
6262                     VectorRegister idxV1, VectorRegister idxV2, VectorRegister idxV3, VectorRegister idxV4,
6263                     VectorRegister outputV1, VectorRegister outputV2, VectorRegister outputV3,
6264                     Assembler::LMUL lmul) {
6265     // set vector register type/len
6266     __ vsetvli(x0, size, Assembler::e8, lmul, Assembler::ma, Assembler::ta);
6267 
6268     // segmented load src into v registers: mem(src) => vr(4)
6269     __ vlseg4e8_v(inputV1, src);
6270 
6271     // src = src + register_group_len_bytes * 4
6272     __ add(src, src, stepSrc);
6273 
6274     // decoding
6275     //   1. indexed load: vr(4) => vr(4)
6276     __ vluxei8_v(idxV1, codec, inputV1);
6277     __ vluxei8_v(idxV2, codec, inputV2);
6278     __ vluxei8_v(idxV3, codec, inputV3);
6279     __ vluxei8_v(idxV4, codec, inputV4);
6280 
6281     //   2. check wrong data
         //      the four translated bytes of a group are or-ed together and compared
         //      with -1 (0xff as an 8-bit element), the marker for invalid input.
6282     __ vor_vv(outputV1, idxV1, idxV2);
6283     __ vor_vv(outputV2, idxV3, idxV4);
6284     __ vor_vv(outputV1, outputV1, outputV2);
6285     __ vmseq_vi(v0, outputV1, -1);
6286     __ vfirst_m(failedIdx, v0);
6287     Label NoFailure, FailureAtIdx0;
6288     // valid value can only be -1 when < 0
6289     __ bltz(failedIdx, NoFailure);
6290     // when the first data (at index 0) fails, no need to process data anymore
6291     __ beqz(failedIdx, FailureAtIdx0);
         // failure at index > 0: shrink the vector length to the groups before the
         // failing one, and advance dst by failedIdx * 3 bytes only.
6292     __ vsetvli(x0, failedIdx, Assembler::e8, lmul, Assembler::mu, Assembler::tu);
6293     __ slli(stepDst, failedIdx, 1);
6294     __ add(stepDst, failedIdx, stepDst);
6295     __ BIND(NoFailure);
6296 
6297     //   3. compute the decoded data: vr(4) => vr(3)
         //      output1 = (idx1 << 2) | (idx2 >> 4)
6298     __ vsll_vi(idxV1, idxV1, 2);
6299     __ vsrl_vi(outputV1, idxV2, 4);
6300     __ vor_vv(outputV1, outputV1, idxV1);
6301 
         //      output2 = (idx2 << 4) | (idx3 >> 2)
6302     __ vsll_vi(idxV2, idxV2, 4);
6303     __ vsrl_vi(outputV2, idxV3, 2);
6304     __ vor_vv(outputV2, outputV2, idxV2);
6305 
         //      output3 = (idx3 << 6) | idx4
6306     __ vsll_vi(idxV3, idxV3, 6);
6307     __ vor_vv(outputV3, idxV4, idxV3);
6308 
6309     // segmented store decoded data in v registers back to dst: vr(3) => mem(dst)
6310     __ vsseg3e8_v(outputV1, dst);
6311 
6312     // dst = dst + register_group_len_bytes * 3
         // (or + failedIdx * 3 when stepDst was overwritten on the failure path above)
6313     __ add(dst, dst, stepDst);
6314     __ BIND(FailureAtIdx0);
6315   }
6316 
6317   /**
6318    * int j.u.Base64.Decoder.decodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL, boolean isMIME)
6319    *
6320    *  Input arguments:
6321    *  c_rarg0   - src, source array
6322    *  c_rarg1   - sp, src start offset
6323    *  c_rarg2   - sl, src end offset
6324    *  c_rarg3   - dst, dest array
6325    *  c_rarg4   - dp, dst start offset
6326    *  c_rarg5   - isURL, Base64 or URL character set
6327    *  c_rarg6   - isMIME, Decoding MIME block
        *
        *  Output:
        *  c_rarg0   - number of bytes written to dst (dst advanced minus dst at entry)
        *
        *  Generates the stub and returns its entry address. Decoding stops at the
        *  first 4-byte group that contains a byte outside the selected alphabet.
        *  With UseRVV, non-MIME input is handled by an m2 vector loop followed by at
        *  most one m1 vector round; the rest (and all MIME input) goes through the
        *  scalar loop, 4 bytes at a time.
6328    */
6329   address generate_base64_decodeBlock() {
6330 
         // decoding tables, indexed by the input byte; 255 marks a byte that is not
         // part of the alphabet.
6331     static const uint8_t fromBase64[256] = {
6332         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6333         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6334         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
6335         52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6336         255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
6337         15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u, 255u,
6338         255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
6339         41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
6340         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6341         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6342         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6343         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6344         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6345         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6346         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6347         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6348     };
6349 
         // same as fromBase64, except that '-' maps to 62 and '_' maps to 63
         // while '+' and '/' are invalid.
6350     static const uint8_t fromBase64URL[256] = {
6351         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6352         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6353         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
6354         52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6355         255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
6356         15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,  63u,
6357         255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
6358         41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
6359         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6360         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6361         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6362         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6363         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6364         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6365         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6366         255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6367     };
6368 
6369     __ align(CodeEntryAlignment);
6370     StubId stub_id = StubId::stubgen_base64_decodeBlock_id;
6371     StubCodeMark mark(this, stub_id);
6372     address start = __ pc();
6373     __ enter();
6374 
6375     Register src    = c_rarg0;
6376     Register soff   = c_rarg1;
6377     Register send   = c_rarg2;
6378     Register dst    = c_rarg3;
6379     Register doff   = c_rarg4;
6380     Register isURL  = c_rarg5;
6381     Register isMIME = c_rarg6;
6382 
6383     Register codec     = c_rarg7;
6384     Register dstBackup = t6;
6385     Register length    = t3;     // total length of src data in bytes
6386 
6387     Label ProcessData, Exit;
6388     Label ProcessScalar, ScalarLoop;
6389 
6390     // passed in length (send - soff) is guaranteed to be > 4,
6391     // and in this intrinsic we only process data of length in multiple of 4,
6392     // it's not guaranteed to be multiple of 4 by java level, so do it explicitly
6393     __ sub(length, send, soff);
6394     __ andi(length, length, -4);
6395     // real src/dst to process data
6396     __ add(src, src, soff);
6397     __ add(dst, dst, doff);
6398     // backup of dst, used to calculate the return value at exit
6399     __ mv(dstBackup, dst);
6400 
6401     // load the codec base address
         // (fromBase64 unless isURL is non-zero, in which case fromBase64URL.)
6402     __ la(codec, ExternalAddress((address) fromBase64));
6403     __ beqz(isURL, ProcessData);
6404     __ la(codec, ExternalAddress((address) fromBase64URL));
6405     __ BIND(ProcessData);
6406 
6407     // vector version
6408     if (UseRVV) {
6409       // for MIME case, it has a default length limit of 76 which could be
6410       // different(smaller) from (send - soff), so in MIME case, we go through
6411       // the scalar code path directly.
6412       __ bnez(isMIME, ScalarLoop);
6413 
6414       Label ProcessM1, ProcessM2;
6415 
           // soff, send, doff and isURL are no longer needed and become temporaries.
6416       Register failedIdx = soff;
6417       Register stepSrcM1 = send;
6418       Register stepSrcM2 = doff;
6419       Register stepDst   = isURL;
6420       Register size      = t4;
6421 
           // size and stepDst start with the m2 values and are halved before the m1 round.
           // NOTE(review): MaxVectorSize is presumably the vector register length in bytes.
6422       __ mv(size, MaxVectorSize * 2);
6423       __ mv(stepSrcM1, MaxVectorSize * 4);
6424       __ slli(stepSrcM2, stepSrcM1, 1);
6425       __ mv(stepDst, MaxVectorSize * 2 * 3);
6426 
6427       __ blt(length, stepSrcM2, ProcessM1);
6428 
6429 
6430       // Assembler::m2
6431       __ BIND(ProcessM2);
6432       base64_vector_decode_round(src, dst, codec,
6433                     size, stepSrcM2, stepDst, failedIdx,
6434                     v2, v4, v6, v8,      // inputs
6435                     v10, v12, v14, v16,  // indexes
6436                     v18, v20, v22,       // outputs
6437                     Assembler::m2);
6438       __ sub(length, length, stepSrcM2);
6439 
6440       // error check
6441       // valid value of failedIdx can only be -1 when < 0
           // on failure dst already accounts for the groups decoded before the bad one.
6442       __ bgez(failedIdx, Exit);
6443 
6444       __ bge(length, stepSrcM2, ProcessM2);
6445 
6446 
6447       // Assembler::m1
           // executed at most once, since fewer than 2 * stepSrcM1 bytes remain.
6448       __ BIND(ProcessM1);
6449       __ blt(length, stepSrcM1, ProcessScalar);
6450 
6451       __ srli(size, size, 1);
6452       __ srli(stepDst, stepDst, 1);
6453       base64_vector_decode_round(src, dst, codec,
6454                     size, stepSrcM1, stepDst, failedIdx,
6455                     v1, v2, v3, v4,      // inputs
6456                     v5, v6, v7, v8,      // indexes
6457                     v9, v10, v11,        // outputs
6458                     Assembler::m1);
6459       __ sub(length, length, stepSrcM1);
6460 
6461       // error check
6462       // valid value of failedIdx can only be -1 when < 0
6463       __ bgez(failedIdx, Exit);
6464 
6465       __ BIND(ProcessScalar);
6466       __ beqz(length, Exit);
6467     }
6468 
6469     // scalar version
6470     {
6471       Register byte0 = soff, byte1 = send, byte2 = doff, byte3 = isURL;
6472       Register combined32Bits = t4;
6473 
6474       // encoded:   [byte0[5:0] : byte1[5:0] : byte2[5:0] : byte3[5:0]] =>
6475       // plain:     [byte0[5:0]+byte1[5:4] : byte1[3:0]+byte2[5:2] : byte2[1:0]+byte3[5:0]]
6476       __ BIND(ScalarLoop);
6477 
6478       // load 4 bytes encoded src data
6479       __ lbu(byte0, Address(src, 0));
6480       __ lbu(byte1, Address(src, 1));
6481       __ lbu(byte2, Address(src, 2));
6482       __ lbu(byte3, Address(src, 3));
6483       __ addi(src, src, 4);
6484 
6485       // get codec index and decode (ie. load from codec by index)
           // the table entries are fetched with lb (signed byte load), so the invalid
           // marker 255 arrives as a negative value and survives into the sign of
           // combined32Bits.
6486       __ add(byte0, codec, byte0);
6487       __ add(byte1, codec, byte1);
6488       __ lb(byte0, Address(byte0, 0));
6489       __ lb(byte1, Address(byte1, 0));
6490       __ add(byte2, codec, byte2);
6491       __ add(byte3, codec, byte3);
6492       __ lb(byte2, Address(byte2, 0));
6493       __ lb(byte3, Address(byte3, 0));
           // combined32Bits = (byte0 << 18) | (byte1 << 12) | (byte2 << 6) | byte3
6494       __ slliw(byte0, byte0, 18);
6495       __ slliw(byte1, byte1, 12);
6496       __ orr(byte0, byte0, byte1);
6497       __ orr(byte0, byte0, byte3);
6498       __ slliw(byte2, byte2, 6);
6499       // For performance consideration, `combined32Bits` is constructed for 2 purposes at the same time,
6500       //  1. error check below
6501       //  2. decode below
6502       __ orr(combined32Bits, byte0, byte2);
6503 
6504       // error check
6505       __ bltz(combined32Bits, Exit);
6506 
6507       // store 3 bytes decoded data
           // bits [23:16], [15:8] and [7:0] of combined32Bits, in that order.
6508       __ sraiw(byte0, combined32Bits, 16);
6509       __ sraiw(byte1, combined32Bits, 8);
6510       __ sb(byte0, Address(dst, 0));
6511       __ sb(byte1, Address(dst, 1));
6512       __ sb(combined32Bits, Address(dst, 2));
6513 
6514       __ subi(length, length, 4);
6515       __ addi(dst, dst, 3);
6516       // loop back
6517       __ bnez(length, ScalarLoop);
6518     }
6519 
6520     __ BIND(Exit);
         // return value: number of bytes written to dst.
6521     __ sub(c_rarg0, dst, dstBackup);
6522 
6523     __ leave();
6524     __ ret();
6525 
6526     return (address) start;
6527   }
6528 
6529   void adler32_process_bytes(Register buff, Register s1, Register s2, VectorRegister vtable,
6530     VectorRegister vzero, VectorRegister vbytes, VectorRegister vs1acc, VectorRegister vs2acc,
6531     Register temp0, Register temp1, Register temp2,  Register temp3,
6532     VectorRegister vtemp1, VectorRegister vtemp2, int step, Assembler::LMUL lmul) {
         // Emits one vectorized Adler32 step: loads `step` bytes from `buff`, advances
         // `buff` by `step`, and folds the bytes into the running sums `s1` and `s2`.
         //
         //   buff    - input pointer, incremented by `step`
         //   s1, s2  - running sums, updated in place (no modulo is applied here)
         //   vtable  - per-byte weights multiplied element-wise with the loaded bytes
         //   vzero   - third operand of both widening reduction sums; the caller in this
         //             file fills it with zeros
         //   vbytes, vs1acc, vs2acc, vtemp1 - vector scratch
         //   temp0, temp1 - scalar scratch
         //   temp3   - set to `step` and left holding it on exit
         //   step/lmul - must be one of the pairs checked by the assert below
         //
         // NOTE(review): temp2 and vtemp2 are not referenced anywhere in this body.
6533
6534     assert((lmul == Assembler::m4 && step == 64) ||
6535            (lmul == Assembler::m2 && step == 32) ||
6536            (lmul == Assembler::m1 && step == 16),
6537            "LMUL should be aligned with step: m4 and 64, m2 and 32 or m1 and 16");
6538     // Below is function for calculating Adler32 checksum with 64-, 32- or 16-byte step. LMUL=m4, m2 or m1 is used.
6539     // The results are in v12, v13, ..., v22, v23. Example below is for 64-byte step case.
6540     // We use b1, b2, ..., b64 to denote the 64 bytes loaded in each iteration.
6541     // In non-vectorized code, we update s1 and s2 as:
6542     //   s1 <- s1 + b1
6543     //   s2 <- s2 + s1
6544     //   s1 <- s1 + b2
6545     //   s2 <- s2 + s1
6546     //   ...
6547     //   s1 <- s1 + b64
6548     //   s2 <- s2 + s1
6549     // Putting above assignments together, we have:
6550     //   s1_new = s1 + b1 + b2 + ... + b64
6551     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b64) =
6552     //          = s2 + s1 * 64 + (b1 * 64 + b2 * 63 + ... + b64 * 1) =
6553     //          = s2 + s1 * 64 + (b1, b2, ... b64) dot (64, 63, ... 1)
6554
6555     __ mv(temp3, step);
6556     // Load data
6557     __ vsetvli(temp0, temp3, Assembler::e8, lmul);
6558     __ vle8_v(vbytes, buff);
6559     __ addi(buff, buff, step);
6560
6561     // Upper bound reduction sum for s1_new:
6562     // 0xFF * 64 = 0x3FC0, so:
6563     // 1. Need to do vector-widening reduction sum
6564     // 2. It is safe to perform sign-extension during vmv.x.s with 16-bits elements
6565     __ vwredsumu_vs(vs1acc, vbytes, vzero);
6566     // Multiplication for s2_new
6567     __ vwmulu_vv(vs2acc, vtable, vbytes);
6568
6569     // s2 = s2 + s1 * step (step is a power of two, so multiply by shifting left by log2(step));
         // this uses the old s1, which is only updated further below
6570     __ slli(temp1, s1, exact_log2(step));
6571     __ add(s2, s2, temp1);
6572
6573     // Summing up calculated results for s2_new
6574     if (MaxVectorSize > 16) {
6575       __ vsetvli(temp0, temp3, Assembler::e16, lmul);
6576     } else {
6577       // Half of vector-widening multiplication result is in successor of vs2acc
6578       // group for vlen == 16, in which case we need to double vector register
6579       // group width in order to reduction sum all of them
6580       Assembler::LMUL lmulx2 = (lmul == Assembler::m1) ? Assembler::m2 :
6581                                (lmul == Assembler::m2) ? Assembler::m4 : Assembler::m8;
6582       __ vsetvli(temp0, temp3, Assembler::e16, lmulx2);
6583     }
6584     // Upper bound for reduction sum:
6585     // 0xFF * (64 + 63 + ... + 2 + 1) = 0x817E0 max for whole register group, so:
6586     // 1. Need to do vector-widening reduction sum
6587     // 2. It is safe to perform sign-extension during vmv.x.s with 32-bits elements
6588     __ vwredsumu_vs(vtemp1, vs2acc, vzero);
6589
6590     // Extracting results for:
6591     // s1_new
6592     __ vmv_x_s(temp0, vs1acc);
6593     __ add(s1, s1, temp0);
6594     // s2_new
         // switch to 32-bit elements first: vtemp1 holds the widened (32-bit) dot-product sum
6595     __ vsetvli(temp0, temp3, Assembler::e32, Assembler::m1);
6596     __ vmv_x_s(temp1, vtemp1);
6597     __ add(s2, s2, temp1);
6598   }
6599 
6600   /***
6601    *  int java.util.zip.Adler32.updateBytes(int adler, byte[] b, int off, int len)
6602    *
6603    *  Arguments:
6604    *
6605    *  Inputs:
6606    *   c_rarg0   - int   adler
6607    *   c_rarg1   - byte* buff (b + off)
6608    *   c_rarg2   - int   len
6609    *
6610    *  Output:
6611    *   c_rarg0   - int adler result
6612    */
6613   address generate_updateBytesAdler32() {
         // Emits the Adler32 update stub and returns its entry address.
         // Structure of the generated code:
         //   1. build the weight tables {64..1}, {32..1}, {16..1} and a zero vector;
         //   2. while at least NMAX bytes remain, process an NMAX-byte chunk and reduce
         //      s1/s2 modulo BASE;
         //   3. process the rest in 64-byte, then 16-byte, then single-byte steps;
         //   4. reduce modulo BASE and pack the result as s1 | (s2 << 16).
6614     __ align(CodeEntryAlignment);
6615     StubId stub_id = StubId::stubgen_updateBytesAdler32_id;
6616     StubCodeMark mark(this, stub_id);
6617     address start = __ pc();
6618
6619     Label L_nmax, L_nmax_loop, L_nmax_loop_entry, L_by16, L_by16_loop,
6620       L_by16_loop_unroll, L_by1_loop, L_do_mod, L_combine, L_by1;
6621
6622     // Aliases
         // Note: adler and s1 share c_rarg0, so s2 has to be extracted from adler
         // before s1 is written.
6623     Register adler  = c_rarg0;
6624     Register s1     = c_rarg0;
6625     Register s2     = c_rarg3;
6626     Register buff   = c_rarg1;
6627     Register len    = c_rarg2;
6628     Register nmax  = c_rarg4;
6629     Register base  = c_rarg5;
6630     Register count = c_rarg6;
6631     Register temp0 = t3;
6632     Register temp1 = t4;
6633     Register temp2 = t5;
6634     Register temp3 = t6;
6635
6636     VectorRegister vzero = v31;
6637     VectorRegister vbytes = v8; // group: v8, v9, v10, v11
6638     VectorRegister vs1acc = v12; // group: v12, v13, v14, v15
6639     VectorRegister vs2acc = v16; // group: v16, v17, v18, v19, v20, v21, v22, v23
6640     VectorRegister vtable_64 = v24; // group: v24, v25, v26, v27
6641     VectorRegister vtable_32 = v4; // group: v4, v5
6642     VectorRegister vtable_16 = v30;
6643     VectorRegister vtemp1 = v28;
6644     VectorRegister vtemp2 = v29;
6645
6646     // Max number of bytes we can process before having to take the mod
6647     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
6648     const uint64_t BASE = 0xfff1;
6649     const uint64_t NMAX = 0x15B0;
6650
6651     // Loops steps
6652     int step_64 = 64;
6653     int step_32 = 32;
6654     int step_16 = 16;
6655     int step_1  = 1;
6656
6657     __ enter(); // Required for proper stackwalking of RuntimeStub frame
6658     __ mv(temp1, 64);
6659     __ vsetvli(temp0, temp1, Assembler::e8, Assembler::m4);
6660
6661     // Generating accumulation coefficients for further calculations
6662     // vtable_64:
         // vid gives {0, 1, 2, ...}; reverse-subtracting from 64 gives {64, 63, ...}
6663     __ vid_v(vtemp1);
6664     __ vrsub_vx(vtable_64, vtemp1, temp1);
6665     // vtable_64 group now contains { 0x40, 0x3f, 0x3e, ..., 0x3, 0x2, 0x1 }
6666
6667     // vtable_32:
6668     __ mv(temp1, 32);
6669     __ vsetvli(temp0, temp1, Assembler::e8, Assembler::m2);
6670     __ vid_v(vtemp1);
6671     __ vrsub_vx(vtable_32, vtemp1, temp1);
6672     // vtable_32 group now contains { 0x20, 0x1f, 0x1e, ..., 0x3, 0x2, 0x1 }
6673
6674     __ vsetivli(temp0, 16, Assembler::e8, Assembler::m1);
6675     // vtable_16:
6676     __ mv(temp1, 16);
6677     __ vid_v(vtemp1);
6678     __ vrsub_vx(vtable_16, vtemp1, temp1);
6679     // vtable_16 now contains { 0x10, 0xf, 0xe, ..., 0x3, 0x2, 0x1 }
6680
6681     __ vmv_v_i(vzero, 0);
6682
6683     __ mv(base, BASE);
6684     __ mv(nmax, NMAX);
6685
6686     // s1 is initialized to the lower 16 bits of adler
6687     // s2 is initialized to the upper 16 bits of adler
         // (s2 first: the zext below overwrites adler, which is the same register as s1)
6688     __ srliw(s2, adler, 16); // s2 = ((adler >> 16) & 0xffff)
6689     __ zext(s1, adler, 16); // s1 = (adler & 0xffff)
6690
6691     // The pipelined loop needs at least 16 elements for 1 iteration
6692     // It does check this, but it is more effective to skip to the cleanup loop
6693     __ mv(temp0, step_16);
6694     __ bgeu(len, temp0, L_nmax);
6695     __ beqz(len, L_combine);
6696
6697     // Jumping to L_by1_loop
         // L_by1_loop expects len to hold (bytes remaining - 1)
6698     __ subi(len, len, step_1);
6699     __ j(L_by1_loop);
6700
6701   __ bind(L_nmax);
         // len becomes (bytes remaining - NMAX); negative means no full NMAX chunk is left
6702     __ sub(len, len, nmax);
6703     __ subi(count, nmax, 16);
6704     __ bltz(len, L_by16);
6705
6706   // Align L_nmax loop by 64
         // count = NMAX - 16 - 32 = 5504 = 86 * 64, so the 64-byte loop below runs
         // a whole number of iterations
6707   __ bind(L_nmax_loop_entry);
6708     __ subi(count, count, 32);
6709
6710   __ bind(L_nmax_loop);
6711     adler32_process_bytes(buff, s1, s2, vtable_64, vzero,
6712       vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
6713       vtemp1, vtemp2, step_64, Assembler::m4);
6714     __ subi(count, count, step_64);
6715     __ bgtz(count, L_nmax_loop);
6716
6717     // There are three iterations left to do
         // i.e. the last 48 bytes (three 16-byte units) of the NMAX chunk, handled
         // here as one 32-byte step followed by one 16-byte step
6718     adler32_process_bytes(buff, s1, s2, vtable_32, vzero,
6719       vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
6720       vtemp1, vtemp2, step_32, Assembler::m2);
6721     adler32_process_bytes(buff, s1, s2, vtable_16, vzero,
6722       vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
6723       vtemp1, vtemp2, step_16, Assembler::m1);
6724
6725     // s1 = s1 % BASE
6726     __ remuw(s1, s1, base);
6727     // s2 = s2 % BASE
6728     __ remuw(s2, s2, base);
6729
         // same bookkeeping as at L_nmax: try another full NMAX chunk
6730     __ sub(len, len, nmax);
6731     __ subi(count, nmax, 16);
6732     __ bgez(len, L_nmax_loop_entry);
6733
6734   __ bind(L_by16);
         // len is (remaining - NMAX) and count is (NMAX - 16) here, so the sum is
         // (remaining - 16); negative means fewer than 16 bytes are left
6735     __ add(len, len, count);
6736     __ bltz(len, L_by1);
6737     // Trying to unroll
6738     __ mv(temp3, step_64);
6739     __ blt(len, temp3, L_by16_loop);
6740
6741   __ bind(L_by16_loop_unroll);
6742     adler32_process_bytes(buff, s1, s2, vtable_64, vzero,
6743       vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
6744       vtemp1, vtemp2, step_64, Assembler::m4);
6745     __ subi(len, len, step_64);
6746     // By now the temp3 should still be 64
         // (adler32_process_bytes loads its step, 64 here, into temp3)
6747     __ bge(len, temp3, L_by16_loop_unroll);
6748
6749   __ bind(L_by16_loop);
6750     adler32_process_bytes(buff, s1, s2, vtable_16, vzero,
6751       vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
6752       vtemp1, vtemp2, step_16, Assembler::m1);
6753     __ subi(len, len, step_16);
6754     __ bgez(len, L_by16_loop);
6755
6756   __ bind(L_by1);
         // convert the bias from (remaining - 16) to (remaining - 1)
6757     __ addi(len, len, 15);
6758     __ bltz(len, L_do_mod);
6759
6760   __ bind(L_by1_loop);
6761     __ lbu(temp0, Address(buff, 0));
6762     __ addi(buff, buff, step_1);
6763     __ add(s1, temp0, s1);
6764     __ add(s2, s2, s1);
6765     __ subi(len, len, step_1);
6766     __ bgez(len, L_by1_loop);
6767
6768   __ bind(L_do_mod);
6769     // s1 = s1 % BASE
6770     __ remuw(s1, s1, base);
6771     // s2 = s2 % BASE
6772     __ remuw(s2, s2, base);
6773
6774     // Combine lower bits and higher bits
6775     // adler = s1 | (s2 << 16)
6776   __ bind(L_combine);
6777     __ slli(s2, s2, 16);
6778     __ orr(s1, s1, s2);
6779
6780     __ leave(); // Required for proper stackwalking of RuntimeStub frame
6781     __ ret();
6782
6783     return start;
6784   }
6785 
6786 #endif // COMPILER2
6787 
6788   // x10 = input (float16)
6789   // f10 = result (float)
6790   // t0, t1 = temporary registers
6791   address generate_float16ToFloat() {
         // Emits the float16 -> float conversion stub and returns its entry address.
         // Fast path: fmv.h.x + fcvt.s.h. Slow path (NaN or Inf input): the float bit
         // pattern is assembled by hand so that NaN payloads are preserved.
6792     __ align(CodeEntryAlignment);
6793     StubId stub_id = StubId::stubgen_hf2f_id;
6794     StubCodeMark mark(this, stub_id);
6795     address entry = __ pc();
6796     BLOCK_COMMENT("float16ToFloat:");
6797
6798     FloatRegister dst = f10;
6799     Register src = x10;
6800     Label NaN_SLOW;
6801
6802     assert(VM_Version::supports_float16_float_conversion(), "must");
6803
6804     // On riscv, NaN needs a special process as fcvt does not work in that case.
6805     // On riscv, Inf does not need a special process as fcvt can handle it correctly.
6806     // but we consider to get the slow path to process NaN and Inf at the same time,
6807     // as both of them are rare cases, and if we try to get the slow path to handle
6808     // only NaN case it would sacrifice the performance for normal cases,
6809     // i.e. non-NaN and non-Inf cases.
6810
6811     // check whether it's a NaN or +/- Inf.
         // 0x7c00 selects the float16 exponent field; all ones there means NaN or Inf
6812     __ mv(t0, 0x7c00);
6813     __ andr(t1, src, t0);
6814     // jump to stub processing NaN and Inf cases.
6815     __ beq(t0, t1, NaN_SLOW);
6816
6817     // non-NaN or non-Inf cases, just use built-in instructions.
6818     __ fmv_h_x(dst, src);
6819     __ fcvt_s_h(dst, dst);
6820     __ ret();
6821
6822     __ bind(NaN_SLOW);
6823     // following instructions mainly focus on NaN, as riscv does not handle
6824     // NaN well with fcvt, but the code also works for Inf at the same time.
6825
6826     // construct a NaN in 32 bits from the NaN in 16 bits,
6827     // we need the payloads of non-canonical NaNs to be preserved.
         // 0x7f800000 is the float exponent field with all bits set
6828     __ mv(t1, 0x7f800000);
6829     // sign-bit was already set via sign-extension if necessary.
         // shifting left by 13 moves the float16 fraction bits to the top of the
         // float fraction field
6830     __ slli(t0, src, 13);
6831     __ orr(t1, t0, t1);
6832     __ fmv_w_x(dst, t1);
6833
6834     __ ret();
6835     return entry;
6836   }
6837 
6838   // f10 = input (float)
6839   // x10 = result (float16)
6840   // f11 = temporary float register
6841   // t0, t1 = temporary registers
6842   address generate_floatToFloat16() {
         // Emits the float -> float16 conversion stub and returns its entry address.
         // Non-NaN inputs use fcvt.h.s + fmv.x.h; NaN inputs are handed to
         // MacroAssembler::float_to_float16_NaN.
6843     __ align(CodeEntryAlignment);
6844     StubId stub_id = StubId::stubgen_f2hf_id;
6845     StubCodeMark mark(this, stub_id);
6846     address entry = __ pc();
6847     BLOCK_COMMENT("floatToFloat16:");
6848
6849     Register dst = x10;
6850     FloatRegister src = f10, ftmp = f11;
6851     Label NaN_SLOW;
6852
6853     assert(VM_Version::supports_float16_float_conversion(), "must");
6854
6855     // On riscv, NaN needs a special process as fcvt does not work in that case.
6856
6857     // check whether it's a NaN.
6858     // replace fclass with feq as performance optimization.
         // (a NaN compares unequal to itself, so t0 == 0 identifies a NaN input)
6859     __ feq_s(t0, src, src);
6860     // jump to stub processing NaN cases.
6861     __ beqz(t0, NaN_SLOW);
6862
6863     // non-NaN cases, just use built-in instructions.
6864     __ fcvt_h_s(ftmp, src);
6865     __ fmv_x_h(dst, ftmp);
6866     __ ret();
6867
6868     __ bind(NaN_SLOW);
6869
         // t0 and t1 are passed as scratch registers
6870     __ float_to_float16_NaN(dst, src, t0, t1);
6871
6872     __ ret();
6873     return entry;
6874   }
6875 
6876 #ifdef COMPILER2
6877 
6878 static const int64_t right_2_bits = right_n_bits(2); // used as an andi mask by the Poly1305 code to keep the lowest two bits
6879 static const int64_t right_3_bits = right_n_bits(3); // presumably the matching mask for the lowest three bits -- confirm against right_n_bits
6880 
6881   // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers
6882   // are represented as long[5], with BITS_PER_LIMB = 26.
6883   // Pack five 26-bit limbs into three 64-bit registers.
6884   void poly1305_pack_26(Register dest0, Register dest1, Register dest2, Register src, Register tmp1, Register tmp2) {
         // Emits code that loads five jlong limbs from `src` and packs them, 26 bits
         // apart, into dest2:dest1:dest0.
         //   dest0, dest1 - receive the low and next 64 bits
         //   dest2        - receives the bits above 128; may be noreg, in which case
         //                  debug builds emit a check that those bits are zero
         //   src          - address of the limb array
         //   tmp1, tmp2   - scratch, clobbered
6885     assert_different_registers(dest0, dest1, dest2, src, tmp1, tmp2);
6886
6887     // The goal is to have 128-bit value in dest2:dest1:dest0
6888     __ ld(dest0, Address(src, 0));    // 26 bits in dest0
6889
6890     __ ld(tmp1, Address(src, sizeof(jlong)));
6891     __ slli(tmp1, tmp1, 26);
6892     __ add(dest0, dest0, tmp1);       // 52 bits in dest0
6893
         // limb 2 straddles the dest0/dest1 boundary: its low 12 bits complete dest0,
         // the remaining bits start dest1
6894     __ ld(tmp2, Address(src, 2 * sizeof(jlong)));
6895     __ slli(tmp1, tmp2, 52);
6896     __ add(dest0, dest0, tmp1);       // dest0 is full
6897
6898     __ srli(dest1, tmp2, 12);         // 14-bit in dest1
6899
6900     __ ld(tmp1, Address(src, 3 * sizeof(jlong)));
6901     __ slli(tmp1, tmp1, 14);
6902     __ add(dest1, dest1, tmp1);       // 40-bit in dest1
6903
         // limb 4 straddles the dest1/dest2 boundary: its low 24 bits complete dest1;
         // tmp1 keeps the unshifted limb for the dest2 handling below
6904     __ ld(tmp1, Address(src, 4 * sizeof(jlong)));
6905     __ slli(tmp2, tmp1, 40);
6906     __ add(dest1, dest1, tmp2);       // dest1 is full
6907
6908     if (dest2->is_valid()) {
6909       __ srli(tmp1, tmp1, 24);
6910       __ mv(dest2, tmp1);               // 2 bits in dest2
6911     } else {
           // no third destination: the caller expects a 128-bit value, so in debug
           // builds stop if anything is left above bit 127
6912 #ifdef ASSERT
6913       Label OK;
6914       __ srli(tmp1, tmp1, 24);
6915       __ beq(zr, tmp1, OK);           // 2 bits
6916       __ stop("high bits of Poly1305 integer should be zero");
6917       __ should_not_reach_here();
6918       __ bind(OK);
6919 #endif
6920     }
6921   }
6922 
6923   // As above, but return only a 128-bit integer, packed into two
6924   // 64-bit registers.
6925   void poly1305_pack_26(Register dest0, Register dest1, Register src, Register tmp1, Register tmp2) {
         // Delegates to the three-destination overload with noreg as dest2.
6926     poly1305_pack_26(dest0, dest1, noreg, src, tmp1, tmp2);
6927   }
6928 
6929   // U_2:U_1:U_0: += (U_2 >> 2) * 5
6930   void poly1305_reduce(Register U_2, Register U_1, Register U_0, Register tmp1, Register tmp2) {
         // Emits a partial reduction of U_2:U_1:U_0: everything in U_2 above its lowest
         // two bits is removed and added back to the low end multiplied by 5.
         // The multiplication by 5 is done as two additions: x, then x << 2.
         //   U_2, U_1, U_0 - value, updated in place
         //   tmp1, tmp2    - scratch, clobbered
6931     assert_different_registers(U_2, U_1, U_0, tmp1, tmp2);
6932
6933     // First, U_2:U_1:U_0 += (U_2 >> 2)
6934     __ srli(tmp1, U_2, 2);
6935     __ cad(U_0, U_0, tmp1, tmp2); // Add tmp1 to U_0 with carry output to tmp2
6936     __ andi(U_2, U_2, right_2_bits); // Clear U_2 except for the lowest two bits
6937     __ cad(U_1, U_1, tmp2, tmp2); // Add carry to U_1 with carry output to tmp2
6938     __ add(U_2, U_2, tmp2);
6939
6940     // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2
         // (tmp1 still holds the original U_2 >> 2 from the first step)
6941     __ slli(tmp1, tmp1, 2);
6942     __ cad(U_0, U_0, tmp1, tmp2); // Add tmp1 to U_0 with carry output to tmp2
6943     __ cad(U_1, U_1, tmp2, tmp2); // Add carry to U_1 with carry output to tmp2
6944     __ add(U_2, U_2, tmp2);
6945   }
6946 
6947   // Poly1305, RFC 7539
6948   // void com.sun.crypto.provider.Poly1305.processMultipleBlocks(byte[] input, int offset, int length, long[] aLimbs, long[] rLimbs)
6949 
6950   // Arguments:
6951   //    c_rarg0:   input_start -- where the input is stored
6952   //    c_rarg1:   length
6953   //    c_rarg2:   acc_start -- where the output will be stored
6954   //    c_rarg3:   r_start -- where the randomly generated 128-bit key is stored
6955 
6956   // See https://loup-vaillant.fr/tutorials/poly1305-design for a
6957   // description of the tricks used to simplify and accelerate this
6958   // computation.
6959 
6960   address generate_poly1305_processBlocks() {
         // Emits the Poly1305 block-processing stub and returns its entry address.
         // The generated code packs the key and accumulator limbs into 64-bit
         // registers, processes the input 16 bytes at a time while at least 16 bytes
         // remain, and writes the accumulator back as five 26-bit limbs.
6961     __ align(CodeEntryAlignment);
6962     StubId stub_id = StubId::stubgen_poly1305_processBlocks_id;
6963     StubCodeMark mark(this, stub_id);
6964     address start = __ pc();
6965     __ enter();
         // NOTE(review): `here` is declared but never used in this function.
6966     Label here;
6967
         // Working registers are handed out from x14-x21 and x28-x31 (twelve in total,
         // all of which are consumed below). x18-x21 are pushed here and popped at DONE.
6968     RegSet saved_regs = RegSet::range(x18, x21);
6969     RegSetIterator<Register> regs = (RegSet::range(x14, x31) - RegSet::range(x22, x27)).begin();
6970     __ push_reg(saved_regs, sp);
6971
6972     // Arguments
6973     const Register input_start = c_rarg0, length = c_rarg1, acc_start = c_rarg2, r_start = c_rarg3;
6974
6975     // R_n is the 128-bit randomly-generated key, packed into two
6976     // registers. The caller passes this key to us as long[5], with
6977     // BITS_PER_LIMB = 26.
6978     const Register R_0 = *regs, R_1 = *++regs;
6979     poly1305_pack_26(R_0, R_1, r_start, t1, t2);
6980
6981     // RR_n is (R_n >> 2) * 5
         // computed as x + (x << 2) with x = R_n >> 2
6982     const Register RR_0 = *++regs, RR_1 = *++regs;
6983     __ srli(t1, R_0, 2);
6984     __ shadd(RR_0, t1, t1, t2, 2);
6985     __ srli(t1, R_1, 2);
6986     __ shadd(RR_1, t1, t1, t2, 2);
6987
6988     // U_n is the current checksum
6989     const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs;
6990     poly1305_pack_26(U_0, U_1, U_2, acc_start, t1, t2);
6991
6992     static constexpr int BLOCK_LENGTH = 16;
6993     Label DONE, LOOP;
6994
         // With less than one full block of input, jump straight to DONE; the
         // accumulator in memory is then left untouched.
6995     __ mv(t1, BLOCK_LENGTH);
6996     __ blt(length, t1, DONE); {
6997       __ bind(LOOP);
6998
6999       // S_n is to be the sum of U_n and the next block of data
7000       const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs;
7001       __ ld(S_0, Address(input_start, 0));
7002       __ ld(S_1, Address(input_start, wordSize));
7003
7004       __ cad(S_0, S_0, U_0, t1); // Add U_0 to S_0 with carry output to t1
7005       __ cadc(S_1, S_1, U_1, t1); // Add U_1 with carry to S_1 with carry output to t1
7006       __ add(S_2, U_2, t1);
7007
           // add 1 at bit 128 (S_2 holds the bits above the 128-bit block); presumably
           // this is the padding bit Poly1305 appends to each full block -- confirm
           // against RFC 7539
7008       __ addi(S_2, S_2, 1);
7009
7010       const Register U_0HI = *++regs, U_1HI = *++regs;
7011
7012       // NB: this logic depends on some of the special properties of
7013       // Poly1305 keys. In particular, because we know that the top
7014       // four bits of R_0 and R_1 are zero, we can add together
7015       // partial products without any risk of needing to propagate a
7016       // carry out.
7017       __ wide_mul(U_0, U_0HI, S_0, R_0);
7018       __ wide_madd(U_0, U_0HI, S_1, RR_1, t1, t2);
7019       __ wide_madd(U_0, U_0HI, S_2, RR_0, t1, t2);
7020
7021       __ wide_mul(U_1, U_1HI, S_0, R_1);
7022       __ wide_madd(U_1, U_1HI, S_1, R_0, t1, t2);
7023       __ wide_madd(U_1, U_1HI, S_2, RR_1, t1, t2);
7024
7025       __ andi(U_2, R_0, right_2_bits);
7026       __ mul(U_2, S_2, U_2);
7027
7028       // Partial reduction mod 2**130 - 5
7029       __ cad(U_1, U_1, U_0HI, t1); // Add U_0HI to U_1 with carry output to t1
7030       __ adc(U_2, U_2, U_1HI, t1);
7031       // Sum is now in U_2:U_1:U_0.
7032
7033       // U_2:U_1:U_0: += (U_2 >> 2) * 5
7034       poly1305_reduce(U_2, U_1, U_0, t1, t2);
7035
7036       __ subi(length, length, BLOCK_LENGTH);
7037       __ addi(input_start, input_start, BLOCK_LENGTH);
7038       __ mv(t1, BLOCK_LENGTH);
7039       __ bge(length, t1, LOOP);
7040     }
7041
7042     // Further reduce modulo 2^130 - 5
7043     poly1305_reduce(U_2, U_1, U_0, t1, t2);
7044
7045     // Unpack the sum into five 26-bit limbs and write to memory.
         // Each limb is isolated with a shift-left / shift-right pair: 64 - 26 = 38.
7046     // First 26 bits is the first limb
7047     __ slli(t1, U_0, 38); // Take lowest 26 bits
7048     __ srli(t1, t1, 38);
7049     __ sd(t1, Address(acc_start)); // First 26-bit limb
7050
7051     // 27-52 bits of U_0 is the second limb
7052     __ slli(t1, U_0, 12); // Take next 27-52 bits
7053     __ srli(t1, t1, 38);
7054     __ sd(t1, Address(acc_start, sizeof (jlong))); // Second 26-bit limb
7055
7056     // Getting 53-64 bits of U_0 and 1-14 bits of U_1 in one register
7057     __ srli(t1, U_0, 52);
7058     __ slli(t2, U_1, 50);
7059     __ srli(t2, t2, 38);
7060     __ add(t1, t1, t2);
7061     __ sd(t1, Address(acc_start, 2 * sizeof (jlong))); // Third 26-bit limb
7062
7063     // Storing 15-40 bits of U_1
7064     __ slli(t1, U_1, 24); // Already used up 14 bits
7065     __ srli(t1, t1, 38); // Clear all other bits from t1
7066     __ sd(t1, Address(acc_start, 3 * sizeof (jlong))); // Fourth 26-bit limb
7067
7068     // Storing 41-64 bits of U_1 and first three bits from U_2 in one register
7069     __ srli(t1, U_1, 40);
7070     __ andi(t2, U_2, right_3_bits);
7071     __ slli(t2, t2, 24);
7072     __ add(t1, t1, t2);
7073     __ sd(t1, Address(acc_start, 4 * sizeof (jlong))); // Fifth 26-bit limb
7074
7075     __ bind(DONE);
7076     __ pop_reg(saved_regs, sp);
7077     __ leave(); // Required for proper stackwalking
7078     __ ret();
7079
7080     return start;
7081   }
7082 
7083   address generate_arrays_hashcode_powers_of_31() {
         // Emits a data table (not code) of stride + 1 jint values into the stub area
         // and returns its start address. Entries are stored in descending order:
         // 31^stride, 31^(stride-1), ..., 31^1, 31^0.
7084     assert(UseRVV, "sanity");
7085     const int lmul = 2;
         // number of jint elements in a vector register group of `lmul` registers
7086     const int stride = MaxVectorSize / sizeof(jint) * lmul;
7087     __ align(CodeEntryAlignment);
7088     StubCodeMark mark(this, "StubRoutines", "arrays_hashcode_powers_of_31");
7089     address start = __ pc();
7090     for (int i = stride; i >= 0; i--) {
             // power_of_31 = 31^i, built by i multiplications through java_multiply
             // (presumably Java int wrap-around semantics -- confirm)
7091         jint power_of_31 = 1;
7092         for (int j = i; j > 0; j--) {
7093           power_of_31 = java_multiply(power_of_31, 31);
7094         }
7095         __ emit_int32(power_of_31);
7096     }
7097
7098     return start;
7099   }
7100 
7101 #endif // COMPILER2
7102 
7103   /**
7104    *  Arguments:
7105    *
7106    * Inputs:
7107    *   c_rarg0   - int crc
7108    *   c_rarg1   - byte* buf
7109    *   c_rarg2   - int length
7110    *
7111    * Output:
7112    *   c_rarg0   - int crc result
7113    */
7114   address generate_updateBytesCRC32() {
         // Emits the CRC32 update stub and returns its entry address. The stub is a
         // frame set-up around MacroAssembler::kernel_crc32, which does the actual work.
7115     assert(UseCRC32Intrinsics, "what are we doing here?");
7116
7117     __ align(CodeEntryAlignment);
7118     StubId stub_id = StubId::stubgen_updateBytesCRC32_id;
7119     StubCodeMark mark(this, stub_id);
7120
7121     address start = __ pc();
7122
7123     // input parameters
7124     const Register crc    = c_rarg0;  // crc
7125     const Register buf    = c_rarg1;  // source java byte array address
7126     const Register len    = c_rarg2;  // length
7127
7128     BLOCK_COMMENT("Entry:");
7129     __ enter(); // required for proper stackwalking of RuntimeStub frame
7130
         // the remaining argument registers and t2-t6 are handed over as temporaries
7131     __ kernel_crc32(crc, buf, len,
7132                     c_rarg3, c_rarg4, c_rarg5, c_rarg6, // tmp's for tables
7133                     c_rarg7, t2, t3, t4, t5, t6);       // misc tmps
7134
7135     __ leave(); // required for proper stackwalking of RuntimeStub frame
7136     __ ret();
7137
7138     return start;
7139   }
7140 
7141   // exception handler for upcall stubs
7142   address generate_upcall_stub_exception_handler() {
         // Emits the stub that upcall stubs use when an exception escapes; it calls
         // UpcallLinker::handle_uncaught_exception and is not expected to return.
7143     StubId stub_id = StubId::stubgen_upcall_stub_exception_handler_id;
7144     StubCodeMark mark(this, stub_id);
7145     address start = __ pc();
7146
7147     // Native caller has no idea how to handle exceptions,
7148     // so we just crash here. Up to callee to catch exceptions.
7149     __ verify_oop(x10); // return an exception oop in a0
7150     __ rt_call(CAST_FROM_FN_PTR(address, UpcallLinker::handle_uncaught_exception));
7151     __ should_not_reach_here();
7152
7153     return start;
7154   }
7155 
7156   // load Method* target of MethodHandle
7157   // j_rarg0 = jobject receiver
7158   // xmethod = Method* result
7159   address generate_upcall_stub_load_target() {
         // Emits the stub that resolves the jobject in j_rarg0 and follows
         // MethodHandle.form -> LambdaForm.vmentry -> MemberName.method ->
         // ResolvedMethodName.vmtarget, leaving the result in xmethod.
7160
7161     StubId stub_id = StubId::stubgen_upcall_stub_load_target_id;
7162     StubCodeMark mark(this, stub_id);
7163     address start = __ pc();
7164
7165     __ resolve_global_jobject(j_rarg0, t0, t1);
7166     // Load target method from receiver
7167     __ load_heap_oop(xmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), t0, t1);
7168     __ load_heap_oop(xmethod, Address(xmethod, java_lang_invoke_LambdaForm::vmentry_offset()), t0, t1);
7169     __ load_heap_oop(xmethod, Address(xmethod, java_lang_invoke_MemberName::method_offset()), t0, t1);
         // the last hop is loaded as T_ADDRESS rather than as a heap oop
7170     __ access_load_at(T_ADDRESS, IN_HEAP, xmethod,
7171                       Address(xmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()),
7172                       noreg, noreg);
7173     __ sd(xmethod, Address(xthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized
7174
7175     __ ret();
7176
7177     return start;
7178   }
7179 
7180 #undef __
7181 
7182   // Initialization
7183   void generate_preuniverse_stubs() {
7184     // Intentionally empty: preuniverse stubs are not needed for riscv.
7185   }
7186 
7187   void generate_initial_stubs() {
7188     // Generate initial stubs and initializes the entry points
7189
7190     // entry points that exist in all platforms Note: This is code
7191     // that could be shared among different platforms - however the
7192     // benefit seems to be smaller than the disadvantage of having a
7193     // much more complicated generator structure. See also comment in
7194     // stubRoutines.hpp.
7195
7196     StubRoutines::_forward_exception_entry = generate_forward_exception();
7197
         // create the UnsafeMemoryAccess table only if it does not exist yet
7198     if (UnsafeMemoryAccess::_table == nullptr) {
7199       UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
7200     }
7201
7202     StubRoutines::_call_stub_entry =
7203       generate_call_stub(StubRoutines::_call_stub_return_address);
7204
7205     // is referenced by megamorphic call
7206     StubRoutines::_catch_exception_entry = generate_catch_exception();
7207
7208     if (UseCRC32Intrinsics) {
7209       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
7210     }
7211
         // the two float16 conversion stubs are generated together, and only when
         // both intrinsics are available
7212     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) &&
7213         vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) {
7214       StubRoutines::_hf2f = generate_float16ToFloat();
7215       StubRoutines::_f2hf = generate_floatToFloat16();
7216     }
7217   }
7218 
7219   void generate_continuation_stubs() {
7220     // Continuation stubs:
         // generated unconditionally; each entry point is published in StubRoutines
7221     StubRoutines::_cont_thaw             = generate_cont_thaw();
7222     StubRoutines::_cont_returnBarrier    = generate_cont_returnBarrier();
7223     StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
7224     StubRoutines::_cont_preempt_stub     = generate_cont_preempt_stub();
7225   }
7226 
7227   void generate_final_stubs() {
         // Generates the final group of stubs and then marks the riscv stub routines
         // as completed.
7228     // support for verify_oop (must happen after universe_init)
7229     if (VerifyOops) {
7230       StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
7231     }
7232
7233     // arraycopy stubs used by compilers
7234     generate_arraycopy_stubs();
7235
7236     StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
7237
7238 #ifdef COMPILER2
7239     if (UseSecondarySupersTable) {
7240       StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
           // the table lookup stubs are only generated when the test is not inlined
7241       if (!InlineSecondarySupersTest) {
7242         generate_lookup_secondary_supers_table_stub();
7243       }
7244     }
7245 #endif // COMPILER2
7246
7247     StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
7248     StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();
7249
7250     StubRoutines::riscv::set_completed();
7251   }
7252 
7253   void generate_compiler_stubs() {
         // Generates the intrinsic stubs, each guarded by its Use* flag. Everything in
         // here is compiled only when COMPILER2 is defined; otherwise the function
         // body is empty.
7254 #ifdef COMPILER2
7255     if (UseMulAddIntrinsic) {
7256       StubRoutines::_mulAdd = generate_mulAdd();
7257     }
7258
7259     if (UseMultiplyToLenIntrinsic) {
7260       StubRoutines::_multiplyToLen = generate_multiplyToLen();
7261     }
7262
7263     if (UseSquareToLenIntrinsic) {
7264       StubRoutines::_squareToLen = generate_squareToLen();
7265     }
7266
         // the two Montgomery stubs share one generator class, selected by the
         // `squaring` constructor argument
7267     if (UseMontgomeryMultiplyIntrinsic) {
7268       StubId stub_id = StubId::stubgen_montgomeryMultiply_id;
7269       StubCodeMark mark(this, stub_id);
7270       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
7271       StubRoutines::_montgomeryMultiply = g.generate_multiply();
7272     }
7273
7274     if (UseMontgomerySquareIntrinsic) {
7275       StubId stub_id = StubId::stubgen_montgomerySquare_id;
7276       StubCodeMark mark(this, stub_id);
7277       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
7278       StubRoutines::_montgomerySquare = g.generate_square();
7279     }
7280
7281     if (UseAESIntrinsics) {
7282       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
7283       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
7284       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
7285       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
7286     }
7287
7288     if (UseAESCTRIntrinsics) {
7289       StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
7290     }
7291
7292     if (UseGHASHIntrinsics) {
7293       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
7294     }
7295
7296     if (UsePoly1305Intrinsics) {
7297       StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
7298     }
7299
         // the BigInteger shift workers depend only on UseRVV
7300     if (UseRVV) {
7301       StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift();
7302       StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
7303     }
7304
7305     if (UseVectorizedHashCodeIntrinsic && UseRVV) {
7306       StubRoutines::riscv::_arrays_hashcode_powers_of_31 = generate_arrays_hashcode_powers_of_31();
7307     }
7308
         // each hash family gets two stubs, distinguished by stub id:
         // implCompress and implCompressMB
7309     if (UseSHA256Intrinsics) {
7310       Sha2Generator sha2(_masm, this);
7311       StubRoutines::_sha256_implCompress   = sha2.generate_sha256_implCompress(StubId::stubgen_sha256_implCompress_id);
7312       StubRoutines::_sha256_implCompressMB = sha2.generate_sha256_implCompress(StubId::stubgen_sha256_implCompressMB_id);
7313     }
7314
7315     if (UseSHA512Intrinsics) {
7316       Sha2Generator sha2(_masm, this);
7317       StubRoutines::_sha512_implCompress   = sha2.generate_sha512_implCompress(StubId::stubgen_sha512_implCompress_id);
7318       StubRoutines::_sha512_implCompressMB = sha2.generate_sha512_implCompress(StubId::stubgen_sha512_implCompressMB_id);
7319     }
7320
7321     if (UseMD5Intrinsics) {
7322       StubRoutines::_md5_implCompress   = generate_md5_implCompress(StubId::stubgen_md5_implCompress_id);
7323       StubRoutines::_md5_implCompressMB = generate_md5_implCompress(StubId::stubgen_md5_implCompressMB_id);
7324     }
7325
7326     if (UseChaCha20Intrinsics) {
7327       StubRoutines::_chacha20Block = generate_chacha20Block();
7328     }
7329
7330     if (UseSHA1Intrinsics) {
7331       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(StubId::stubgen_sha1_implCompress_id);
7332       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(StubId::stubgen_sha1_implCompressMB_id);
7333     }
7334
7335     if (UseBASE64Intrinsics) {
7336       StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
7337       StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
7338     }
7339
7340     if (UseAdler32Intrinsics) {
7341       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
7342     }
7343
         // the string compare / indexOf stubs are generated without a flag check here
7344     generate_compare_long_strings();
7345
7346     generate_string_indexof_stubs();
7347
7348 #endif // COMPILER2
7349   }
7350 
7351  public:
7352   StubGenerator(CodeBuffer* code, BlobId blob_id, AOTStubData* stub_data) : StubCodeGenerator(code, blob_id, stub_data) {
         // Forwards code, blob_id and stub_data unchanged to the StubCodeGenerator
         // base, then runs exactly one generate_*_stubs() routine, chosen by
         // blob_id. The constructor body contains nothing but this dispatch, so
         // constructing a StubGenerator is what triggers stub generation for
         // the requested blob.
7353     switch(blob_id) {
7354     case BlobId::stubgen_preuniverse_id:
7355       generate_preuniverse_stubs();
7356       break;
7357     case BlobId::stubgen_initial_id:
7358       generate_initial_stubs();
7359       break;
7360     case BlobId::stubgen_continuation_id:
7361       generate_continuation_stubs();
7362       break;
7363     case BlobId::stubgen_compiler_id:
7364       generate_compiler_stubs();
7365       break;
7366     case BlobId::stubgen_final_id:
7367       generate_final_stubs();
7368       break;
7369     default:
           // Any blob id without a case above is reported through fatal(),
           // with the id rendered by StubInfo::name().
           // NOTE(review): fatal() presumably does not return, which would make
           // the following break purely defensive -- confirm.
7370       fatal("unexpected blob id: %s", StubInfo::name(blob_id));
7371       break;
7372     };
7373   }
7374 }; // end class declaration
7375 
7376 void StubGenerator_generate(CodeBuffer* code, BlobId blob_id, AOTStubData* stub_data) {
       // Free-function entry point for stub generation: builds a local
       // StubGenerator from the three arguments. The StubGenerator constructor
       // dispatches on blob_id and generates the stubs itself, so the object is
       // created only for that effect; nothing is returned and g goes out of
       // scope when this function ends.
7377   StubGenerator g(code, blob_id, stub_data);
7378 }