1 /*
   2  * Copyright (c) 2016, 2025, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2016, 2024 SAP SE. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "asm/macroAssembler.inline.hpp"
  27 #include "registerSaver_s390.hpp"
  28 #include "gc/shared/barrierSet.hpp"
  29 #include "gc/shared/barrierSetAssembler.hpp"
  30 #include "gc/shared/barrierSetNMethod.hpp"
  31 #include "interpreter/interpreter.hpp"
  32 #include "interpreter/interp_masm.hpp"
  33 #include "memory/universe.hpp"
  34 #include "nativeInst_s390.hpp"
  35 #include "oops/instanceOop.hpp"
  36 #include "oops/objArrayKlass.hpp"
  37 #include "oops/oop.inline.hpp"
  38 #include "prims/methodHandles.hpp"
  39 #include "prims/upcallLinker.hpp"
  40 #include "runtime/frame.inline.hpp"
  41 #include "runtime/handles.inline.hpp"
  42 #include "runtime/javaThread.hpp"
  43 #include "runtime/sharedRuntime.hpp"
  44 #include "runtime/stubCodeGenerator.hpp"
  45 #include "runtime/stubRoutines.hpp"
  46 #include "utilities/formatBuffer.hpp"
  47 #include "utilities/macros.hpp"
  48 #include "utilities/powerOfTwo.hpp"
  49 
  50 // Declaration and definition of StubGenerator (no .hpp file).
  51 // For a more detailed description of the stub routine structure
  52 // see the comment in stubRoutines.hpp.
  53 
  54 #ifdef PRODUCT
  55 #define __ _masm->
  56 #else
  57 #define __ (Verbose ? (_masm->block_comment(FILE_AND_LINE),_masm):_masm)->
  58 #endif
  59 
  60 #define BLOCK_COMMENT(str) if (PrintAssembly || PrintStubCode) __ block_comment(str)
  61 #define BIND(label)        bind(label); BLOCK_COMMENT(#label ":")
  62 
  63 
  64   // These static, partially const, variables are for the AES intrinsics.
  65   // They are declared/initialized here to make them available across function bodies.
  66 
  67       static const int AES_parmBlk_align    = 32;                  // octoword alignment.
  68       static const int AES_stackSpace_incr  = AES_parmBlk_align;   // add'l stack space is allocated in such increments.
  69                                                                    // Must be multiple of AES_parmBlk_align.
  70 
  71       static int AES_ctrVal_len  = 0;                              // ctr init value len (in bytes), expected: length of dataBlk (16)
   72       static int AES_ctrVec_len  = 0;                              // # of ctr vector elements. That many blocks can be ciphered with one instruction execution.
  73       static int AES_ctrArea_len = 0;                              // reserved stack space (in bytes) for ctr (= ctrVal_len * ctrVec_len)
  74 
   75       static int AES_parmBlk_addspace = 0;  // Must be multiple of AES_parmBlk_align.
   76                                             // Will be set by stub generator to stub specific value.
   77       static int AES_dataBlk_space    = 0;  // Must be multiple of AES_parmBlk_align.
  78                                             // Will be set by stub generator to stub specific value.
  79       static int AES_dataBlk_offset   = 0;  // offset of the local src and dst dataBlk buffers
  80                                             // Will be set by stub generator to stub specific value.
  81 
  82       // These offsets are relative to the parameter block address (Register parmBlk = Z_R1)
  83       static const int keylen_offset     =  -1;
  84       static const int fCode_offset      =  -2;
  85       static const int ctrVal_len_offset =  -4;
  86       static const int msglen_offset     =  -8;
  87       static const int unextSP_offset    = -16;
  88       static const int rem_msgblk_offset = -20;
  89       static const int argsave_offset    = -2*AES_parmBlk_align;
  90       static const int regsave_offset    = -4*AES_parmBlk_align; // save space for work regs (Z_R10..13)
  91       static const int msglen_red_offset = regsave_offset + AES_parmBlk_align; // reduced len after preLoop;
  92       static const int counter_offset    = msglen_red_offset+8;  // current counter vector position.
  93       static const int localSpill_offset = argsave_offset + 24;  // arg2..arg4 are saved
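      // With AES_parmBlk_align == 32, the offsets above resolve to (relative to parmBlk = Z_R1):
      //   keylen -1, fCode -2, ctrVal_len -4, msglen -8, unextSP -16, rem_msgblk -20,
      //   localSpill -40, argsave -64, counter -88, msglen_red -96, regsave -128 (Z_R10..Z_R13).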
  94 
  95 
   96 // -----------------------------------------------------------------------
  97 // Stub Code definitions
  98 
  99 class StubGenerator: public StubCodeGenerator {
 100  private:
 101 
 102   //----------------------------------------------------------------------
 103   // Call stubs are used to call Java from C.
 104 
 105   //
 106   // Arguments:
 107   //
 108   //   R2        - call wrapper address     : address
 109   //   R3        - result                   : intptr_t*
 110   //   R4        - result type              : BasicType
 111   //   R5        - method                   : method
 112   //   R6        - frame mgr entry point    : address
 113   //   [SP+160]  - parameter block          : intptr_t*
 114   //   [SP+172]  - parameter count in words : int
 115   //   [SP+176]  - thread                   : Thread*
 116   //
 117   address generate_call_stub(address& return_address) {
 118     // Set up a new C frame, copy Java arguments, call frame manager
 119     // or native_entry, and process result.
 120 
 121     StubGenStubId stub_id = StubGenStubId::call_stub_id;
 122     StubCodeMark mark(this, stub_id);
 123     address start = __ pc();
 124 
 125     Register r_arg_call_wrapper_addr   = Z_ARG1;
 126     Register r_arg_result_addr         = Z_ARG2;
 127     Register r_arg_result_type         = Z_ARG3;
 128     Register r_arg_method              = Z_ARG4;
 129     Register r_arg_entry               = Z_ARG5;
 130 
 131     // offsets to fp
 132     #define d_arg_thread 176
 133     #define d_arg_argument_addr 160
 134     #define d_arg_argument_count 168+4
 135 
 136     Register r_entryframe_fp           = Z_tmp_1;
 137     Register r_top_of_arguments_addr   = Z_ARG4;
 138     Register r_new_arg_entry = Z_R14;
 139 
 140     // macros for frame offsets
 141     #define call_wrapper_address_offset \
 142                _z_entry_frame_locals_neg(call_wrapper_address)
 143     #define result_address_offset \
 144               _z_entry_frame_locals_neg(result_address)
 145     #define result_type_offset \
 146               _z_entry_frame_locals_neg(result_type)
 147     #define arguments_tos_address_offset \
 148               _z_entry_frame_locals_neg(arguments_tos_address)
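    // These locals live at negative offsets from r_entryframe_fp (the caller's SP). They are
    // written below and read back after the callee returns, because nothing is known about
    // the size of the frame(s) pushed in between.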
 149 
 150     {
 151       //
 152       // STACK on entry to call_stub:
 153       //
 154       //     F1      [C_FRAME]
 155       //            ...
 156       //
 157 
 158       Register r_argument_addr              = Z_tmp_3;
 159       Register r_argumentcopy_addr          = Z_tmp_4;
 160       Register r_argument_size_in_bytes     = Z_ARG5;
 161       Register r_frame_size                 = Z_R1;
 162 
 163       Label arguments_copied;
 164 
 165       // Save non-volatile registers to ABI of caller frame.
 166       BLOCK_COMMENT("save registers, push frame {");
 167       __ z_stmg(Z_R6, Z_R14, 16, Z_SP);
 168       __ z_std(Z_F8, 96, Z_SP);
 169       __ z_std(Z_F9, 104, Z_SP);
 170       __ z_std(Z_F10, 112, Z_SP);
 171       __ z_std(Z_F11, 120, Z_SP);
 172       __ z_std(Z_F12, 128, Z_SP);
 173       __ z_std(Z_F13, 136, Z_SP);
 174       __ z_std(Z_F14, 144, Z_SP);
 175       __ z_std(Z_F15, 152, Z_SP);
 176 
 177       //
 178       // Push ENTRY_FRAME including arguments:
 179       //
 180       //     F0      [TOP_IJAVA_FRAME_ABI]
 181       //             [outgoing Java arguments]
 182       //             [ENTRY_FRAME_LOCALS]
 183       //     F1      [C_FRAME]
 184       //             ...
 185       //
 186 
 187       // Calculate new frame size and push frame.
 188       #define abi_plus_locals_size \
 189                 (frame::z_top_ijava_frame_abi_size + frame::z_entry_frame_locals_size)
 190       if (abi_plus_locals_size % BytesPerWord == 0) {
 191         // Preload constant part of frame size.
 192         __ load_const_optimized(r_frame_size, -abi_plus_locals_size/BytesPerWord);
 193         // Keep copy of our frame pointer (caller's SP).
 194         __ z_lgr(r_entryframe_fp, Z_SP);
 195         // Add space required by arguments to frame size.
 196         __ z_slgf(r_frame_size, d_arg_argument_count, Z_R0, Z_SP);
 197         // Move Z_ARG5 early, it will be used as a local.
 198         __ z_lgr(r_new_arg_entry, r_arg_entry);
 199         // Convert frame size from words to bytes.
 200         __ z_sllg(r_frame_size, r_frame_size, LogBytesPerWord);
 201         __ push_frame(r_frame_size, r_entryframe_fp,
 202                       false/*don't copy SP*/, true /*frame size sign inverted*/);
 203       } else {
 204         guarantee(false, "frame sizes should be multiples of word size (BytesPerWord)");
 205       }
 206       BLOCK_COMMENT("} save, push");
 207 
 208       // Load argument registers for call.
 209       BLOCK_COMMENT("prepare/copy arguments {");
 210       __ z_lgr(Z_method, r_arg_method);
 211       __ z_lg(Z_thread, d_arg_thread, r_entryframe_fp);
 212 
 213       // Calculate top_of_arguments_addr which will be tos (not prepushed) later.
  214       // Simply use SP + frame::z_top_ijava_frame_abi_size.
 215       __ add2reg(r_top_of_arguments_addr,
 216                  frame::z_top_ijava_frame_abi_size - BytesPerWord, Z_SP);
 217 
 218       // Initialize call_stub locals (step 1).
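      // If the four locals are laid out as consecutive words, the consecutive registers
      // Z_ARG1..Z_ARG4 (wrapper addr, result addr, result type, tos) can be stored with a
      // single STMG; otherwise fall back to individual stores.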
 219       if ((call_wrapper_address_offset + BytesPerWord == result_address_offset) &&
 220           (result_address_offset + BytesPerWord == result_type_offset)          &&
 221           (result_type_offset + BytesPerWord == arguments_tos_address_offset)) {
 222 
 223         __ z_stmg(r_arg_call_wrapper_addr, r_top_of_arguments_addr,
 224                   call_wrapper_address_offset, r_entryframe_fp);
 225       } else {
 226         __ z_stg(r_arg_call_wrapper_addr,
 227                  call_wrapper_address_offset, r_entryframe_fp);
 228         __ z_stg(r_arg_result_addr,
 229                  result_address_offset, r_entryframe_fp);
 230         __ z_stg(r_arg_result_type,
 231                  result_type_offset, r_entryframe_fp);
 232         __ z_stg(r_top_of_arguments_addr,
 233                  arguments_tos_address_offset, r_entryframe_fp);
 234       }
 235 
 236       // Copy Java arguments.
 237 
 238       // Any arguments to copy?
 239       __ load_and_test_int2long(Z_R1, Address(r_entryframe_fp, d_arg_argument_count));
 240       __ z_bre(arguments_copied);
 241 
 242       // Prepare loop and copy arguments in reverse order.
 243       {
 244         // Calculate argument size in bytes.
 245         __ z_sllg(r_argument_size_in_bytes, Z_R1, LogBytesPerWord);
 246 
 247         // Get addr of first incoming Java argument.
 248         __ z_lg(r_argument_addr, d_arg_argument_addr, r_entryframe_fp);
 249 
 250         // Let r_argumentcopy_addr point to last outgoing Java argument.
 251         __ add2reg(r_argumentcopy_addr, BytesPerWord, r_top_of_arguments_addr); // = Z_SP+160 effectively.
 252 
 253         // Let r_argument_addr point to last incoming Java argument.
 254         __ add2reg_with_index(r_argument_addr, -BytesPerWord,
 255                               r_argument_size_in_bytes, r_argument_addr);
 256 
 257         // Now loop while Z_R1 > 0 and copy arguments.
 258         {
 259           Label next_argument;
 260           __ bind(next_argument);
 261           // Mem-mem move.
 262           __ z_mvc(0, BytesPerWord-1, r_argumentcopy_addr, 0, r_argument_addr);
 263           __ add2reg(r_argument_addr,    -BytesPerWord);
 264           __ add2reg(r_argumentcopy_addr, BytesPerWord);
 265           __ z_brct(Z_R1, next_argument);
 266         }
 267       }  // End of argument copy loop.
 268 
 269       __ bind(arguments_copied);
 270     }
 271     BLOCK_COMMENT("} arguments");
 272 
 273     BLOCK_COMMENT("call {");
 274     {
 275       // Call frame manager or native entry.
 276 
 277       //
 278       // Register state on entry to frame manager / native entry:
 279       //
 280       //   Z_ARG1 = r_top_of_arguments_addr  - intptr_t *sender tos (prepushed)
 281       //                                       Lesp = (SP) + copied_arguments_offset - 8
 282       //   Z_method                          - method
 283       //   Z_thread                          - JavaThread*
 284       //
 285 
 286       // Here, the usual SP is the initial_caller_sp.
 287       __ z_lgr(Z_R10, Z_SP);
 288 
 289       // Z_esp points to the slot below the last argument.
 290       __ z_lgr(Z_esp, r_top_of_arguments_addr);
 291 
 292       //
 293       // Stack on entry to frame manager / native entry:
 294       //
 295       //     F0      [TOP_IJAVA_FRAME_ABI]
 296       //             [outgoing Java arguments]
 297       //             [ENTRY_FRAME_LOCALS]
 298       //     F1      [C_FRAME]
 299       //             ...
 300       //
 301 
  302       // Do a light-weight C-call here. r_new_arg_entry holds the address
  303       // of the interpreter entry point (frame manager or native entry);
  304       // the runtime value of the return pc is saved in return_address
  305       // (a call-by-reference argument).
 306       return_address = __ call_stub(r_new_arg_entry);
 307     }
 308     BLOCK_COMMENT("} call");
 309 
 310     {
 311       BLOCK_COMMENT("restore registers {");
 312       // Returned from frame manager or native entry.
 313       // Now pop frame, process result, and return to caller.
 314 
 315       //
 316       // Stack on exit from frame manager / native entry:
 317       //
 318       //     F0      [ABI]
 319       //             ...
 320       //             [ENTRY_FRAME_LOCALS]
 321       //     F1      [C_FRAME]
 322       //             ...
 323       //
 324       // Just pop the topmost frame ...
 325       //
 326 
 327       // Restore frame pointer.
 328       __ z_lg(r_entryframe_fp, _z_abi(callers_sp), Z_SP);
 329       // Pop frame. Done here to minimize stalls.
 330       __ pop_frame();
 331 
 332       // Reload some volatile registers which we've spilled before the call
 333       // to frame manager / native entry.
 334       // Access all locals via frame pointer, because we know nothing about
 335       // the topmost frame's size.
 336       __ z_lg(r_arg_result_addr, result_address_offset, r_entryframe_fp);
 337       __ z_lg(r_arg_result_type, result_type_offset, r_entryframe_fp);
 338 
 339       // Restore non-volatiles.
 340       __ z_lmg(Z_R6, Z_R14, 16, Z_SP);
 341       __ z_ld(Z_F8, 96, Z_SP);
 342       __ z_ld(Z_F9, 104, Z_SP);
 343       __ z_ld(Z_F10, 112, Z_SP);
 344       __ z_ld(Z_F11, 120, Z_SP);
 345       __ z_ld(Z_F12, 128, Z_SP);
 346       __ z_ld(Z_F13, 136, Z_SP);
 347       __ z_ld(Z_F14, 144, Z_SP);
 348       __ z_ld(Z_F15, 152, Z_SP);
 349       BLOCK_COMMENT("} restore");
 350 
 351       //
 352       // Stack on exit from call_stub:
 353       //
 354       //     0       [C_FRAME]
 355       //             ...
 356       //
 357       // No call_stub frames left.
 358       //
 359 
 360       // All non-volatiles have been restored at this point!!
 361 
 362       //------------------------------------------------------------------------
 363       // The following code makes some assumptions on the T_<type> enum values.
 364       // The enum is defined in globalDefinitions.hpp.
 365       // The validity of the assumptions is tested as far as possible.
 366       //   The assigned values should not be shuffled
 367       //   T_BOOLEAN==4    - lowest used enum value
 368       //   T_NARROWOOP==16 - largest used enum value
 369       //------------------------------------------------------------------------
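      // The computed branch below dispatches to
      //   firstHandler + (result_type - T_BOOLEAN) * handlerLen,
      // e.g. a T_LONG result (T_BOOLEAN + 7) lands on the handler at firstHandler + 56.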
 370       BLOCK_COMMENT("process result {");
 371       Label firstHandler;
  372       int   handlerLen = 8;
 373 #ifdef ASSERT
 374       char  assertMsg[] = "check BasicType definition in globalDefinitions.hpp";
 375       __ z_chi(r_arg_result_type, T_BOOLEAN);
 376       __ asm_assert(Assembler::bcondNotLow, assertMsg, 0x0234);
 377       __ z_chi(r_arg_result_type, T_NARROWOOP);
 378       __ asm_assert(Assembler::bcondNotHigh, assertMsg, 0x0235);
 379 #endif
 380       __ add2reg(r_arg_result_type, -T_BOOLEAN);          // Remove offset.
 381       __ z_larl(Z_R1, firstHandler);                      // location of first handler
 382       __ z_sllg(r_arg_result_type, r_arg_result_type, 3); // Each handler is 8 bytes long.
 383       __ z_bc(MacroAssembler::bcondAlways, 0, r_arg_result_type, Z_R1);
 384 
 385       __ align(handlerLen);
 386       __ bind(firstHandler);
 387       // T_BOOLEAN:
 388         guarantee(T_BOOLEAN == 4, "check BasicType definition in globalDefinitions.hpp");
 389         __ z_st(Z_RET, 0, r_arg_result_addr);
 390         __ z_br(Z_R14); // Return to caller.
 391         __ align(handlerLen);
 392       // T_CHAR:
 393         guarantee(T_CHAR == T_BOOLEAN+1, "check BasicType definition in globalDefinitions.hpp");
 394         __ z_st(Z_RET, 0, r_arg_result_addr);
 395         __ z_br(Z_R14); // Return to caller.
 396         __ align(handlerLen);
 397       // T_FLOAT:
 398         guarantee(T_FLOAT == T_CHAR+1, "check BasicType definition in globalDefinitions.hpp");
 399         __ z_ste(Z_FRET, 0, r_arg_result_addr);
 400         __ z_br(Z_R14); // Return to caller.
 401         __ align(handlerLen);
 402       // T_DOUBLE:
 403         guarantee(T_DOUBLE == T_FLOAT+1, "check BasicType definition in globalDefinitions.hpp");
 404         __ z_std(Z_FRET, 0, r_arg_result_addr);
 405         __ z_br(Z_R14); // Return to caller.
 406         __ align(handlerLen);
 407       // T_BYTE:
 408         guarantee(T_BYTE == T_DOUBLE+1, "check BasicType definition in globalDefinitions.hpp");
 409         __ z_st(Z_RET, 0, r_arg_result_addr);
 410         __ z_br(Z_R14); // Return to caller.
 411         __ align(handlerLen);
 412       // T_SHORT:
 413         guarantee(T_SHORT == T_BYTE+1, "check BasicType definition in globalDefinitions.hpp");
 414         __ z_st(Z_RET, 0, r_arg_result_addr);
 415         __ z_br(Z_R14); // Return to caller.
 416         __ align(handlerLen);
 417       // T_INT:
 418         guarantee(T_INT == T_SHORT+1, "check BasicType definition in globalDefinitions.hpp");
 419         __ z_st(Z_RET, 0, r_arg_result_addr);
 420         __ z_br(Z_R14); // Return to caller.
 421         __ align(handlerLen);
 422       // T_LONG:
 423         guarantee(T_LONG == T_INT+1, "check BasicType definition in globalDefinitions.hpp");
 424         __ z_stg(Z_RET, 0, r_arg_result_addr);
 425         __ z_br(Z_R14); // Return to caller.
 426         __ align(handlerLen);
 427       // T_OBJECT:
 428         guarantee(T_OBJECT == T_LONG+1, "check BasicType definition in globalDefinitions.hpp");
 429         __ z_stg(Z_RET, 0, r_arg_result_addr);
 430         __ z_br(Z_R14); // Return to caller.
 431         __ align(handlerLen);
 432       // T_ARRAY:
 433         guarantee(T_ARRAY == T_OBJECT+1, "check BasicType definition in globalDefinitions.hpp");
 434         __ z_stg(Z_RET, 0, r_arg_result_addr);
 435         __ z_br(Z_R14); // Return to caller.
 436         __ align(handlerLen);
 437       // T_VOID:
 438         guarantee(T_VOID == T_ARRAY+1, "check BasicType definition in globalDefinitions.hpp");
 439         __ z_stg(Z_RET, 0, r_arg_result_addr);
 440         __ z_br(Z_R14); // Return to caller.
 441         __ align(handlerLen);
 442       // T_ADDRESS:
 443         guarantee(T_ADDRESS == T_VOID+1, "check BasicType definition in globalDefinitions.hpp");
 444         __ z_stg(Z_RET, 0, r_arg_result_addr);
 445         __ z_br(Z_R14); // Return to caller.
 446         __ align(handlerLen);
 447       // T_NARROWOOP:
 448         guarantee(T_NARROWOOP == T_ADDRESS+1, "check BasicType definition in globalDefinitions.hpp");
 449         __ z_st(Z_RET, 0, r_arg_result_addr);
 450         __ z_br(Z_R14); // Return to caller.
 451         __ align(handlerLen);
 452       BLOCK_COMMENT("} process result");
 453     }
 454     return start;
 455   }
 456 
 457   // Return point for a Java call if there's an exception thrown in
 458   // Java code. The exception is caught and transformed into a
 459   // pending exception stored in JavaThread that can be tested from
 460   // within the VM.
 461   address generate_catch_exception() {
 462     StubGenStubId stub_id = StubGenStubId::catch_exception_id;
 463     StubCodeMark mark(this, stub_id);
 464 
 465     address start = __ pc();
 466 
 467     //
 468     // Registers alive
 469     //
 470     //   Z_thread
 471     //   Z_ARG1 - address of pending exception
 472     //   Z_ARG2 - return address in call stub
 473     //
 474 
 475     const Register exception_file = Z_R0;
 476     const Register exception_line = Z_R1;
 477 
 478     __ load_const_optimized(exception_file, (void*)__FILE__);
 479     __ load_const_optimized(exception_line, (void*)__LINE__);
 480 
 481     __ z_stg(Z_ARG1, thread_(pending_exception));
 482     // Store into `char *'.
 483     __ z_stg(exception_file, thread_(exception_file));
 484     // Store into `int'.
 485     __ z_st(exception_line, thread_(exception_line));
 486 
 487     // Complete return to VM.
 488     assert(StubRoutines::_call_stub_return_address != nullptr, "must have been generated before");
 489 
 490     // Continue in call stub.
 491     __ z_br(Z_ARG2);
 492 
 493     return start;
 494   }
 495 
 496   // Continuation point for runtime calls returning with a pending
 497   // exception. The pending exception check happened in the runtime
 498   // or native call stub. The pending exception in Thread is
 499   // converted into a Java-level exception.
 500   //
 501   // Read:
 502   //   Z_R14: pc the runtime library callee wants to return to.
 503   //   Since the exception occurred in the callee, the return pc
 504   //   from the point of view of Java is the exception pc.
 505   //
 506   // Invalidate:
 507   //   Volatile registers (except below).
 508   //
 509   // Update:
 510   //   Z_ARG1: exception
 511   //   (Z_R14 is unchanged and is live out).
 512   //
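  // Flow: the stub calls SharedRuntime::exception_handler_for_return_address(thread, Z_R14),
  // clears the pending exception, and jumps to the returned handler with
  // Z_ARG1 = exception oop and Z_ARG2 = exception pc (the original Z_R14).
  //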
 513   address generate_forward_exception() {
 514     StubGenStubId stub_id = StubGenStubId::forward_exception_id;
 515     StubCodeMark mark(this, stub_id);
 516     address start = __ pc();
 517 
 518     #define pending_exception_offset in_bytes(Thread::pending_exception_offset())
 519 #ifdef ASSERT
 520     // Get pending exception oop.
 521     __ z_lg(Z_ARG1, pending_exception_offset, Z_thread);
 522 
 523     // Make sure that this code is only executed if there is a pending exception.
 524     {
 525       Label L;
 526       __ z_ltgr(Z_ARG1, Z_ARG1);
 527       __ z_brne(L);
 528       __ stop("StubRoutines::forward exception: no pending exception (1)");
 529       __ bind(L);
 530     }
 531 
 532     __ verify_oop(Z_ARG1, "StubRoutines::forward exception: not an oop");
 533 #endif
 534 
 535     __ z_lgr(Z_ARG2, Z_R14); // Copy exception pc into Z_ARG2.
 536     __ save_return_pc();
 537     __ push_frame_abi160(0);
 538     // Find exception handler.
 539     __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address),
 540                     Z_thread,
 541                     Z_ARG2);
 542     // Copy handler's address.
 543     __ z_lgr(Z_R1, Z_RET);
 544     __ pop_frame();
 545     __ restore_return_pc();
 546 
 547     // Set up the arguments for the exception handler:
 548     // - Z_ARG1: exception oop
 549     // - Z_ARG2: exception pc
 550 
 551     // Load pending exception oop.
 552     __ z_lg(Z_ARG1, pending_exception_offset, Z_thread);
 553 
  554     // The exception pc is the return address in the caller;
  555     // it must be loaded into Z_ARG2.
 556     __ z_lgr(Z_ARG2, Z_R14);
 557 
 558 #ifdef ASSERT
 559     // Make sure exception is set.
 560     { Label L;
 561       __ z_ltgr(Z_ARG1, Z_ARG1);
 562       __ z_brne(L);
 563       __ stop("StubRoutines::forward exception: no pending exception (2)");
 564       __ bind(L);
 565     }
 566 #endif
 567     // Clear the pending exception.
 568     __ clear_mem(Address(Z_thread, pending_exception_offset), sizeof(void *));
 569     // Jump to exception handler
 570     __ z_br(Z_R1 /*handler address*/);
 571 
 572     return start;
 573 
 574     #undef pending_exception_offset
 575   }
 576 
 577 #undef __
 578 #ifdef PRODUCT
 579 #define __ _masm->
 580 #else
 581 #define __ (Verbose ? (_masm->block_comment(FILE_AND_LINE),_masm):_masm)->
 582 #endif
 583 
  584   // Support for uint StubRoutines::zarch::partial_subtype_check(Klass*
  585   // sub, Klass* super);
 586   //
 587   // Arguments:
 588   //   ret  : Z_RET, returned
 589   //   sub  : Z_ARG2, argument, not changed
 590   //   super: Z_ARG3, argument, not changed
 591   //
 592   //   raddr: Z_R14, blown by call
 593   //
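  // The stub returns 0 in Z_RET (and CC "equal") on a match, and 1 (CC "not equal")
  // on a miss, so callers can branch on the condition code without retesting Z_RET.
  //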
 594   address generate_partial_subtype_check() {
 595     StubGenStubId stub_id = StubGenStubId::partial_subtype_check_id;
 596     StubCodeMark mark(this, stub_id);
 597     Label miss;
 598 
 599     address start = __ pc();
 600 
 601     const Register Rsubklass   = Z_ARG2; // subklass
 602     const Register Rsuperklass = Z_ARG3; // superklass
 603 
 604     // No args, but tmp registers that are killed.
 605     const Register Rlength     = Z_ARG4; // cache array length
 606     const Register Rarray_ptr  = Z_ARG5; // Current value from cache array.
 607 
 608     if (UseCompressedOops) {
 609       assert(Universe::heap() != nullptr, "java heap must be initialized to generate partial_subtype_check stub");
 610     }
 611 
 612     // Always take the slow path.
 613     __ check_klass_subtype_slow_path(Rsubklass, Rsuperklass,
 614                                      Rarray_ptr, Rlength, nullptr, &miss);
 615 
 616     // Match falls through here.
 617     __ clear_reg(Z_RET);               // Zero indicates a match. Set EQ flag in CC.
 618     __ z_br(Z_R14);
 619 
 620     __ BIND(miss);
 621     __ load_const_optimized(Z_RET, 1); // One indicates a miss.
  622     __ z_ltgr(Z_RET, Z_RET);           // Set NE flag in CC.
 623     __ z_br(Z_R14);
 624 
 625     return start;
 626   }
 627 
 628   void generate_lookup_secondary_supers_table_stub() {
 629     StubGenStubId stub_id = StubGenStubId::lookup_secondary_supers_table_id;
 630     StubCodeMark mark(this, stub_id);
 631 
 632     const Register
 633         r_super_klass  = Z_ARG1,
 634         r_sub_klass    = Z_ARG2,
 635         r_array_index  = Z_ARG3,
 636         r_array_length = Z_ARG4,
 637         r_array_base   = Z_ARG5,
 638         r_bitmap       = Z_R10,
 639         r_result       = Z_R11;
 640     for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
 641       StubRoutines::_lookup_secondary_supers_table_stubs[slot] = __ pc();
 642       __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass,
 643                                              r_array_base, r_array_length, r_array_index,
 644                                              r_bitmap, r_result, slot);
 645 
 646       __ z_br(Z_R14);
 647     }
 648   }
 649 
 650   // Slow path implementation for UseSecondarySupersTable.
 651   address generate_lookup_secondary_supers_table_slow_path_stub() {
 652     StubGenStubId stub_id = StubGenStubId::lookup_secondary_supers_table_slow_path_id;
 653     StubCodeMark mark(this, stub_id);
 654 
 655     address start = __ pc();
 656 
 657     const Register
 658         r_super_klass  = Z_ARG1,
 659         r_array_base   = Z_ARG5,
 660         r_temp1        = Z_ARG4,
 661         r_array_index  = Z_ARG3,
 662         r_bitmap       = Z_R10,
 663         r_result       = Z_R11;
 664 
 665     __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base,
 666                                                r_array_index, r_bitmap, r_temp1, r_result, /* is_stub */ true);
 667 
 668     __ z_br(Z_R14);
 669 
 670     return start;
 671   }
 672 
 673 #if !defined(PRODUCT)
 674   // Wrapper which calls oopDesc::is_oop_or_null()
 675   // Only called by MacroAssembler::verify_oop
 676   static void verify_oop_helper(const char* message, oopDesc* o) {
 677     if (!oopDesc::is_oop_or_null(o)) {
 678       fatal("%s. oop: " PTR_FORMAT, message, p2i(o));
 679     }
 680     ++ StubRoutines::_verify_oop_count;
 681   }
 682 #endif
 683 
 684   // Return address of code to be called from code generated by
 685   // MacroAssembler::verify_oop.
 686   //
 687   // Don't generate, rather use C++ code.
 688   address generate_verify_oop_subroutine() {
 689     // Don't generate a StubCodeMark, because no code is generated!
 690     // Generating the mark triggers notifying the oprofile jvmti agent
 691     // about the dynamic code generation, but the stub without
 692     // code (code_size == 0) confuses opjitconv
 693     // StubCodeMark mark(this, "StubRoutines", "verify_oop_stub");
 694 
 695     address start = nullptr;
 696 
 697 #if !defined(PRODUCT)
 698     start = CAST_FROM_FN_PTR(address, verify_oop_helper);
 699 #endif
 700 
 701     return start;
 702   }
 703 
 704   // This is to test that the count register contains a positive int value.
 705   // Required because C2 does not respect int to long conversion for stub calls.
 706   void assert_positive_int(Register count) {
 707 #ifdef ASSERT
 708     __ z_srag(Z_R0, count, 31);  // Just leave the sign (must be zero) in Z_R0.
 709     __ asm_assert(Assembler::bcondZero, "missing zero extend", 0xAFFE);
 710 #endif
 711   }
 712 
 713   //  Generate overlap test for array copy stubs.
 714   //  If no actual overlap is detected, control is transferred to the
 715   //  "normal" copy stub (entry address passed in disjoint_copy_target).
 716   //  Otherwise, execution continues with the code generated by the
 717   //  caller of array_overlap_test.
 718   //
 719   //  Input:
 720   //    Z_ARG1    - from
 721   //    Z_ARG2    - to
 722   //    Z_ARG3    - element count
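  //
  //  In pseudo-C, control transfers to disjoint_copy_target unless
  //    (to > from) && (to < from + (count << log2_elem_size)),
  //  i.e. unless the destination starts strictly inside the source range (destructive overlap).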
 723   void array_overlap_test(address disjoint_copy_target, int log2_elem_size) {
 724     __ MacroAssembler::compare_and_branch_optimized(Z_ARG2, Z_ARG1, Assembler::bcondNotHigh,
 725                                                     disjoint_copy_target, /*len64=*/true, /*has_sign=*/false);
 726 
 727     Register index = Z_ARG3;
 728     if (log2_elem_size > 0) {
 729       __ z_sllg(Z_R1, Z_ARG3, log2_elem_size);  // byte count
 730       index = Z_R1;
 731     }
 732     __ add2reg_with_index(Z_R1, 0, index, Z_ARG1);  // First byte after "from" range.
 733 
 734     __ MacroAssembler::compare_and_branch_optimized(Z_R1, Z_ARG2, Assembler::bcondNotHigh,
 735                                                     disjoint_copy_target, /*len64=*/true, /*has_sign=*/false);
 736 
 737     // Destructive overlap: let caller generate code for that.
 738   }
 739 
 740   //  Generate stub for disjoint array copy. If "aligned" is true, the
 741   //  "from" and "to" addresses are assumed to be heapword aligned.
 742   //
 743   //  Arguments for generated stub:
 744   //      from:  Z_ARG1
 745   //      to:    Z_ARG2
 746   //      count: Z_ARG3 treated as signed
 747   void generate_disjoint_copy(bool aligned, int element_size,
 748                               bool branchToEnd,
 749                               bool restoreArgs) {
 750     // This is the zarch specific stub generator for general array copy tasks.
 751     // It has the following prereqs and features:
 752     //
 753     // - No destructive overlap allowed (else unpredictable results).
 754     // - Destructive overlap does not exist if the leftmost byte of the target
 755     //   does not coincide with any of the source bytes (except the leftmost).
 756     //
 757     //   Register usage upon entry:
 758     //      Z_ARG1 == Z_R2 :   address of source array
 759     //      Z_ARG2 == Z_R3 :   address of target array
 760     //      Z_ARG3 == Z_R4 :   length of operands (# of elements on entry)
 761     //
 762     // Register usage within the generator:
 763     // - Z_R0 and Z_R1 are KILLed by the stub routine (target addr/len).
 764     //                 Used as pair register operand in complex moves, scratch registers anyway.
 765     // - Z_R5 is KILLed by the stub routine (source register pair addr/len) (even/odd reg).
 766     //                  Same as R0/R1, but no scratch register.
 767     // - Z_ARG1, Z_ARG2, Z_ARG3 are USEd but preserved by the stub routine,
 768     //                          but they might get temporarily overwritten.
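    //
    // Mode selection below (copy size in bytes = #elements << log2_size):
    //   <= 256 bytes : one executed MVC (doMVC), or unrolled MVCs (doMVCUnrolled) for DW elements,
    //   <= 4096 bytes: MVC loop in 256-byte chunks (doMVCLOOP), residue handled by doMVCgeneral,
    //   larger       : MVCLE (the fall-through case).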
 769 
 770     Register  save_reg    = Z_ARG4;   // (= Z_R5), holds original target operand address for restore.
 771 
 772     {
 773       Register   llen_reg = Z_R1;     // Holds left operand len (odd reg).
 774       Register  laddr_reg = Z_R0;     // Holds left operand addr (even reg), overlaps with data_reg.
 775       Register   rlen_reg = Z_R5;     // Holds right operand len (odd reg), overlaps with save_reg.
 776       Register  raddr_reg = Z_R4;     // Holds right operand addr (even reg), overlaps with len_reg.
 777 
 778       Register   data_reg = Z_R0;     // Holds copied data chunk in alignment process and copy loop.
 779       Register    len_reg = Z_ARG3;   // Holds operand len (#elements at entry, #bytes shortly after).
 780       Register    dst_reg = Z_ARG2;   // Holds left (target)  operand addr.
 781       Register    src_reg = Z_ARG1;   // Holds right (source) operand addr.
 782 
 783       Label     doMVCLOOP, doMVCLOOPcount, doMVCLOOPiterate;
 784       Label     doMVCUnrolled;
 785       NearLabel doMVC,  doMVCgeneral, done;
 786       Label     MVC_template;
 787       address   pcMVCblock_b, pcMVCblock_e;
 788 
 789       bool      usedMVCLE       = true;
 790       bool      usedMVCLOOP     = true;
 791       bool      usedMVCUnrolled = false;
 792       bool      usedMVC         = false;
 793       bool      usedMVCgeneral  = false;
 794 
 795       int       stride;
 796       Register  stride_reg;
 797       Register  ix_reg;
 798 
  799       assert((element_size <= 256) && (256 % element_size == 0), "element size must be <= 256, power of 2");
 800       unsigned int log2_size = exact_log2(element_size);
 801 
 802       switch (element_size) {
 803         case 1:  BLOCK_COMMENT("ARRAYCOPY DISJOINT byte  {"); break;
 804         case 2:  BLOCK_COMMENT("ARRAYCOPY DISJOINT short {"); break;
 805         case 4:  BLOCK_COMMENT("ARRAYCOPY DISJOINT int   {"); break;
 806         case 8:  BLOCK_COMMENT("ARRAYCOPY DISJOINT long  {"); break;
 807         default: BLOCK_COMMENT("ARRAYCOPY DISJOINT       {"); break;
 808       }
 809 
 810       assert_positive_int(len_reg);
 811 
 812       BLOCK_COMMENT("preparation {");
 813 
 814       // No copying if len <= 0.
 815       if (branchToEnd) {
 816         __ compare64_and_branch(len_reg, (intptr_t) 0, Assembler::bcondNotHigh, done);
 817       } else {
 818         if (VM_Version::has_CompareBranch()) {
 819           __ z_cgib(len_reg, 0, Assembler::bcondNotHigh, 0, Z_R14);
 820         } else {
 821           __ z_ltgr(len_reg, len_reg);
 822           __ z_bcr(Assembler::bcondNotPositive, Z_R14);
 823         }
 824       }
 825 
 826       // Prefetch just one cache line. Speculative opt for short arrays.
  827       // Do not use Z_R1 for the prefetch; its value is undefined here.
 828       if (VM_Version::has_Prefetch()) {
 829         __ z_pfd(0x01, 0, Z_R0, src_reg); // Fetch access.
 830         __ z_pfd(0x02, 0, Z_R0, dst_reg); // Store access.
 831       }
 832 
 833       BLOCK_COMMENT("} preparation");
 834 
 835       // Save args only if really needed.
  836       // Keep the len test local to the branch; it is generated only once.
 837 
 838       BLOCK_COMMENT("mode selection {");
 839 
 840       // Special handling for arrays with only a few elements.
 841       // Nothing fancy: just an executed MVC.
 842       if (log2_size > 0) {
 843         __ z_sllg(Z_R1, len_reg, log2_size); // Remember #bytes in Z_R1.
 844       }
 845       if (element_size != 8) {
 846         __ z_cghi(len_reg, 256/element_size);
 847         __ z_brnh(doMVC);
 848         usedMVC = true;
 849       }
 850       if (element_size == 8) { // Long and oop arrays are always aligned.
 851         __ z_cghi(len_reg, 256/element_size);
 852         __ z_brnh(doMVCUnrolled);
 853         usedMVCUnrolled = true;
 854       }
 855 
 856       // Prefetch another cache line. We, for sure, have more than one line to copy.
 857       if (VM_Version::has_Prefetch()) {
 858         __ z_pfd(0x01, 256, Z_R0, src_reg); // Fetch access.
 859         __ z_pfd(0x02, 256, Z_R0, dst_reg); // Store access.
 860       }
 861 
 862       if (restoreArgs) {
 863         // Remember entry value of ARG2 to restore all arguments later from that knowledge.
 864         __ z_lgr(save_reg, dst_reg);
 865       }
 866 
 867       __ z_cghi(len_reg, 4096/element_size);
 868       if (log2_size == 0) {
 869         __ z_lgr(Z_R1, len_reg); // Init Z_R1 with #bytes
 870       }
 871       __ z_brnh(doMVCLOOP);
 872 
 873       // Fall through to MVCLE case.
 874 
 875       BLOCK_COMMENT("} mode selection");
 876 
 877       // MVCLE: for long arrays
 878       //   DW aligned: Best performance for sizes > 4kBytes.
 879       //   unaligned:  Least complex for sizes > 256 bytes.
 880       if (usedMVCLE) {
 881         BLOCK_COMMENT("mode MVCLE {");
 882 
 883         // Setup registers for mvcle.
 884         //__ z_lgr(llen_reg, len_reg);// r1 <- r4  #bytes already in Z_R1, aka llen_reg.
 885         __ z_lgr(laddr_reg, dst_reg); // r0 <- r3
 886         __ z_lgr(raddr_reg, src_reg); // r4 <- r2
 887         __ z_lgr(rlen_reg, llen_reg); // r5 <- r1
 888 
 889         __ MacroAssembler::move_long_ext(laddr_reg, raddr_reg, 0xb0);    // special: bypass cache
 890         // __ MacroAssembler::move_long_ext(laddr_reg, raddr_reg, 0xb8); // special: Hold data in cache.
 891         // __ MacroAssembler::move_long_ext(laddr_reg, raddr_reg, 0);
 892 
 893         if (restoreArgs) {
 894           // MVCLE updates the source (Z_R4,Z_R5) and target (Z_R0,Z_R1) register pairs.
 895           // Dst_reg (Z_ARG2) and src_reg (Z_ARG1) are left untouched. No restore required.
 896           // Len_reg (Z_ARG3) is destroyed and must be restored.
 897           __ z_slgr(laddr_reg, dst_reg);    // copied #bytes
 898           if (log2_size > 0) {
 899             __ z_srag(Z_ARG3, laddr_reg, log2_size); // Convert back to #elements.
 900           } else {
 901             __ z_lgr(Z_ARG3, laddr_reg);
 902           }
 903         }
 904         if (branchToEnd) {
 905           __ z_bru(done);
 906         } else {
 907           __ z_br(Z_R14);
 908         }
 909         BLOCK_COMMENT("} mode MVCLE");
 910       }
 911       // No fallthru possible here.
 912 
 913       //  MVCUnrolled: for short, aligned arrays.
 914 
 915       if (usedMVCUnrolled) {
 916         BLOCK_COMMENT("mode MVC unrolled {");
 917         stride = 8;
 918 
 919         // Generate unrolled MVC instructions.
 920         for (int ii = 32; ii > 1; ii--) {
 921           __ z_mvc(0, ii * stride-1, dst_reg, 0, src_reg); // ii*8 byte copy
 922           if (branchToEnd) {
 923             __ z_bru(done);
 924           } else {
 925             __ z_br(Z_R14);
 926           }
 927         }
 928 
 929         pcMVCblock_b = __ pc();
 930         __ z_mvc(0, 1 * stride-1, dst_reg, 0, src_reg); // 8 byte copy
 931         if (branchToEnd) {
 932           __ z_bru(done);
 933         } else {
 934           __ z_br(Z_R14);
 935         }
 936 
 937         pcMVCblock_e = __ pc();
 938         Label MVC_ListEnd;
 939         __ bind(MVC_ListEnd);
 940 
 941         // This is an absolute fast path:
  942         // - Array len in bytes must not be greater than 256.
  943         // - Array len in bytes must be an integer multiple of DW
  944         //   to save expensive handling of trailing bytes.
 945         // - Argument restore is not done,
 946         //   i.e. previous code must not alter arguments (this code doesn't either).
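        //
        // The computed branch below targets
        //   MVC_ListEnd - (#DW * MVCblocksize),
        // i.e. the n-th MVC/branch block counted from the end of the list, which copies
        // exactly n doublewords and then returns (or branches to done).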
 947 
 948         __ bind(doMVCUnrolled);
 949 
 950         // Avoid mul, prefer shift where possible.
 951         // Combine shift right (for #DW) with shift left (for block size).
 952         // Set CC for zero test below (asm_assert).
 953         // Note: #bytes comes in Z_R1, #DW in len_reg.
 954         unsigned int MVCblocksize    = pcMVCblock_e - pcMVCblock_b;
 955         unsigned int logMVCblocksize = 0xffffffffU; // Pacify compiler ("used uninitialized" warning).
 956 
 957         if (log2_size > 0) { // Len was scaled into Z_R1.
 958           switch (MVCblocksize) {
 959 
 960             case  8: logMVCblocksize = 3;
 961                      __ z_ltgr(Z_R0, Z_R1); // #bytes is index
 962                      break;                 // reasonable size, use shift
 963 
 964             case 16: logMVCblocksize = 4;
 965                      __ z_slag(Z_R0, Z_R1, logMVCblocksize-log2_size);
 966                      break;                 // reasonable size, use shift
 967 
 968             default: logMVCblocksize = 0;
 969                      __ z_ltgr(Z_R0, len_reg); // #DW for mul
 970                      break;                 // all other sizes: use mul
 971           }
 972         } else {
 973           guarantee(log2_size, "doMVCUnrolled: only for DW entities");
 974         }
 975 
 976         // This test (and branch) is redundant. Previous code makes sure that
 977         //  - element count > 0
 978         //  - element size == 8.
 979         // Thus, len reg should never be zero here. We insert an asm_assert() here,
 980         // just to double-check and to be on the safe side.
  981         __ asm_assert(Assembler::bcondNotZero, "zero len cannot occur", 99);
 982 
 983         __ z_larl(Z_R1, MVC_ListEnd);        // Get addr of last instr block.
 984         // Avoid mul, prefer shift where possible.
 985         if (logMVCblocksize == 0) {
 986           __ z_mghi(Z_R0, MVCblocksize);
 987         }
 988         __ z_slgr(Z_R1, Z_R0);
 989         __ z_br(Z_R1);
 990         BLOCK_COMMENT("} mode MVC unrolled");
 991       }
 992       // No fallthru possible here.
 993 
 994       // MVC execute template
 995       // Must always generate. Usage may be switched on below.
 996       // There is no suitable place after here to put the template.
 997       __ bind(MVC_template);
  998       __ z_mvc(0, 0, dst_reg, 0, src_reg);  // Instr template, never exec directly!
 999 
1000 
1001       // MVC Loop: for medium-sized arrays
1002 
1003       // Only for DW aligned arrays (src and dst).
1004       // #bytes to copy must be at least 256!!!
1005       // Non-aligned cases handled separately.
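      //
      // Loop structure: ix_reg starts at 2*stride - #bytes and is advanced by stride (256)
      // per BRXLG iteration; each iteration moves one 256-byte chunk. After the loop,
      // AGHI ix,-2*stride leaves -(residual #bytes) and LCGR turns that into the positive
      // residue (< 256 bytes), which the doMVCgeneral path copies with an executed MVC.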
1006       stride     = 256;
1007       stride_reg = Z_R1;   // Holds #bytes when control arrives here.
1008       ix_reg     = Z_ARG3; // Alias for len_reg.
1009 
1010 
1011       if (usedMVCLOOP) {
1012         BLOCK_COMMENT("mode MVC loop {");
1013         __ bind(doMVCLOOP);
1014 
1015         __ z_lcgr(ix_reg, Z_R1);         // Ix runs from -(n-2)*stride to 1*stride (inclusive).
1016         __ z_llill(stride_reg, stride);
1017         __ add2reg(ix_reg, 2*stride);    // Thus: increment ix by 2*stride.
1018 
1019         __ bind(doMVCLOOPiterate);
1020           __ z_mvc(0, stride-1, dst_reg, 0, src_reg);
1021           __ add2reg(dst_reg, stride);
1022           __ add2reg(src_reg, stride);
1023           __ bind(doMVCLOOPcount);
1024           __ z_brxlg(ix_reg, stride_reg, doMVCLOOPiterate);
1025 
 1026         // Don't use add2reg() here, since we must set the condition code!
1027         __ z_aghi(ix_reg, -2*stride);       // Compensate incr from above: zero diff means "all copied".
1028 
1029         if (restoreArgs) {
1030           __ z_lcgr(Z_R1, ix_reg);          // Prepare ix_reg for copy loop, #bytes expected in Z_R1.
1031           __ z_brnz(doMVCgeneral);          // We're not done yet, ix_reg is not zero.
1032 
1033           // ARG1, ARG2, and ARG3 were altered by the code above, so restore them building on save_reg.
1034           __ z_slgr(dst_reg, save_reg);     // copied #bytes
1035           __ z_slgr(src_reg, dst_reg);      // = ARG1 (now restored)
1036           if (log2_size) {
1037             __ z_srag(Z_ARG3, dst_reg, log2_size); // Convert back to #elements to restore ARG3.
1038           } else {
1039             __ z_lgr(Z_ARG3, dst_reg);
1040           }
1041           __ z_lgr(Z_ARG2, save_reg);       // ARG2 now restored.
1042 
1043           if (branchToEnd) {
1044             __ z_bru(done);
1045           } else {
1046             __ z_br(Z_R14);
1047           }
1048 
 1049         } else {
 1050           if (branchToEnd) {
 1051             __ z_brz(done);                        // CC set by aghi instr.
 1052           } else {
 1053             __ z_bcr(Assembler::bcondZero, Z_R14); // We're all done if zero.
 1054           }
1055 
1056           __ z_lcgr(Z_R1, ix_reg);    // Prepare ix_reg for copy loop, #bytes expected in Z_R1.
1057           // __ z_bru(doMVCgeneral);  // fallthru
1058         }
1059         usedMVCgeneral = true;
1060         BLOCK_COMMENT("} mode MVC loop");
1061       }
1062       // Fallthru to doMVCgeneral
1063 
1064       // MVCgeneral: for short, unaligned arrays, after other copy operations
1065 
1066       // Somewhat expensive due to use of EX instruction, but simple.
1067       if (usedMVCgeneral) {
1068         BLOCK_COMMENT("mode MVC general {");
1069         __ bind(doMVCgeneral);
1070 
1071         __ add2reg(len_reg, -1, Z_R1);             // Get #bytes-1 for EXECUTE.
1072         if (VM_Version::has_ExecuteExtensions()) {
1073           __ z_exrl(len_reg, MVC_template);        // Execute MVC with variable length.
1074         } else {
1075           __ z_larl(Z_R1, MVC_template);           // Get addr of instr template.
1076           __ z_ex(len_reg, 0, Z_R0, Z_R1);         // Execute MVC with variable length.
1077         }                                          // penalty: 9 ticks
1078 
1079         if (restoreArgs) {
1080           // ARG1, ARG2, and ARG3 were altered by code executed before, so restore them building on save_reg
1081           __ z_slgr(dst_reg, save_reg);            // Copied #bytes without the "doMVCgeneral" chunk
1082           __ z_slgr(src_reg, dst_reg);             // = ARG1 (now restored), was not advanced for "doMVCgeneral" chunk
1083           __ add2reg_with_index(dst_reg, 1, len_reg, dst_reg); // Len of executed MVC was not accounted for, yet.
1084           if (log2_size) {
1085             __ z_srag(Z_ARG3, dst_reg, log2_size); // Convert back to #elements to restore ARG3
1086           } else {
1087              __ z_lgr(Z_ARG3, dst_reg);
1088           }
1089           __ z_lgr(Z_ARG2, save_reg);              // ARG2 now restored.
1090         }
1091 
1092         if (usedMVC) {
1093           if (branchToEnd) {
1094             __ z_bru(done);
1095           } else {
1096             __ z_br(Z_R14);
 1097           }
1098         } else {
1099           if (!branchToEnd) __ z_br(Z_R14);
1100         }
1101         BLOCK_COMMENT("} mode MVC general");
1102       }
1103       // Fallthru possible if following block not generated.
1104 
1105       // MVC: for short, unaligned arrays
1106 
 1107       // Somewhat expensive due to use of EX instruction, but simple. Penalty: 9 ticks.
 1108       // Differs from doMVCgeneral in reconstruction of ARG2, ARG3, and ARG4.
1109       if (usedMVC) {
1110         BLOCK_COMMENT("mode MVC {");
1111         __ bind(doMVC);
1112 
1113         // get #bytes-1 for EXECUTE
1114         if (log2_size) {
1115           __ add2reg(Z_R1, -1);                // Length was scaled into Z_R1.
1116         } else {
1117           __ add2reg(Z_R1, -1, len_reg);       // Length was not scaled.
1118         }
1119 
1120         if (VM_Version::has_ExecuteExtensions()) {
1121           __ z_exrl(Z_R1, MVC_template);       // Execute MVC with variable length.
1122         } else {
1123           __ z_lgr(Z_R0, Z_R5);                // Save ARG4, may be unnecessary.
1124           __ z_larl(Z_R5, MVC_template);       // Get addr of instr template.
1125           __ z_ex(Z_R1, 0, Z_R0, Z_R5);        // Execute MVC with variable length.
1126           __ z_lgr(Z_R5, Z_R0);                // Restore ARG4, may be unnecessary.
1127         }
1128 
1129         if (!branchToEnd) {
1130           __ z_br(Z_R14);
1131         }
1132         BLOCK_COMMENT("} mode MVC");
1133       }
1134 
1135       __ bind(done);
1136 
1137       switch (element_size) {
1138         case 1:  BLOCK_COMMENT("} ARRAYCOPY DISJOINT byte "); break;
1139         case 2:  BLOCK_COMMENT("} ARRAYCOPY DISJOINT short"); break;
1140         case 4:  BLOCK_COMMENT("} ARRAYCOPY DISJOINT int  "); break;
1141         case 8:  BLOCK_COMMENT("} ARRAYCOPY DISJOINT long "); break;
1142         default: BLOCK_COMMENT("} ARRAYCOPY DISJOINT      "); break;
1143       }
1144     }
1145   }
1146 
1147   // Generate stub for conjoint array copy. If "aligned" is true, the
1148   // "from" and "to" addresses are assumed to be heapword aligned.
1149   //
1150   // Arguments for generated stub:
1151   //   from:  Z_ARG1
1152   //   to:    Z_ARG2
1153   //   count: Z_ARG3 treated as signed
1154   void generate_conjoint_copy(bool aligned, int element_size, bool branchToEnd) {
1155 
1156     // This is the zarch specific stub generator for general array copy tasks.
1157     // It has the following prereqs and features:
1158     //
1159     // - Destructive overlap exists and is handled by reverse copy.
1160     // - Destructive overlap exists if the leftmost byte of the target
1161     //   does coincide with any of the source bytes (except the leftmost).
1162     // - Z_R0 and Z_R1 are KILLed by the stub routine (data and stride)
1163     // - Z_ARG1 and Z_ARG2 are USEd but preserved by the stub routine.
1164     // - Z_ARG3 is USED but preserved by the stub routine.
1165     // - Z_ARG4 is used as index register and is thus KILLed.
1166     //
1167     {
1168       Register stride_reg = Z_R1;     // Stride & compare value in loop (negative element_size).
1169       Register   data_reg = Z_R0;     // Holds value of currently processed element.
1170       Register     ix_reg = Z_ARG4;   // Holds byte index of currently processed element.
1171       Register    len_reg = Z_ARG3;   // Holds length (in #elements) of arrays.
1172       Register    dst_reg = Z_ARG2;   // Holds left  operand addr.
1173       Register    src_reg = Z_ARG1;   // Holds right operand addr.
1174 
1175       assert(256%element_size == 0, "Element size must be power of 2.");
1176       assert(element_size     <= 8, "Can't handle more than DW units.");
1177 
1178       switch (element_size) {
1179         case 1:  BLOCK_COMMENT("ARRAYCOPY CONJOINT byte  {"); break;
1180         case 2:  BLOCK_COMMENT("ARRAYCOPY CONJOINT short {"); break;
1181         case 4:  BLOCK_COMMENT("ARRAYCOPY CONJOINT int   {"); break;
1182         case 8:  BLOCK_COMMENT("ARRAYCOPY CONJOINT long  {"); break;
1183         default: BLOCK_COMMENT("ARRAYCOPY CONJOINT       {"); break;
1184       }
1185 
1186       assert_positive_int(len_reg);
1187 
1188       if (VM_Version::has_Prefetch()) {
1189         __ z_pfd(0x01, 0, Z_R0, src_reg); // Fetch access.
1190         __ z_pfd(0x02, 0, Z_R0, dst_reg); // Store access.
1191       }
1192 
1193       unsigned int log2_size = exact_log2(element_size);
1194       if (log2_size) {
1195         __ z_sllg(ix_reg, len_reg, log2_size);
1196       } else {
1197         __ z_lgr(ix_reg, len_reg);
1198       }
1199 
1200       // Optimize reverse copy loop.
1201       // Main loop copies DW units which may be unaligned. Unaligned access adds some penalty ticks.
1202       // Unaligned DW access (neither fetch nor store) is DW-atomic, but should be alignment-atomic.
1203       // Preceding the main loop, some bytes are copied to obtain a DW-multiple remaining length.
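      //
      // The byte/halfword/word special cases below first trim the residue (at the high end
      // of the range) so the remaining length is a DW multiple; the BRXHG loop then
      // decrements ix_reg by 8 before each copy and exits once the index would go negative,
      // copying doublewords from the highest address down to offset 0.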
1204 
1205       Label countLoop1;
1206       Label copyLoop1;
1207       Label skipBY;
1208       Label skipHW;
1209       int   stride = -8;
1210 
1211       __ load_const_optimized(stride_reg, stride); // Prepare for DW copy loop.
1212 
1213       if (element_size == 8)    // Nothing to do here.
1214         __ z_bru(countLoop1);
1215       else {                    // Do not generate dead code.
1216         __ z_tmll(ix_reg, 7);   // Check the "odd" bits.
1217         __ z_bre(countLoop1);   // There are none, very good!
1218       }
1219 
1220       if (log2_size == 0) {     // Handle leftover Byte.
1221         __ z_tmll(ix_reg, 1);
1222         __ z_bre(skipBY);
1223         __ z_lb(data_reg,   -1, ix_reg, src_reg);
1224         __ z_stcy(data_reg, -1, ix_reg, dst_reg);
1225         __ add2reg(ix_reg, -1); // Decrement delayed to avoid AGI.
1226         __ bind(skipBY);
1227         // fallthru
1228       }
1229       if (log2_size <= 1) {     // Handle leftover HW.
1230         __ z_tmll(ix_reg, 2);
1231         __ z_bre(skipHW);
1232         __ z_lhy(data_reg,  -2, ix_reg, src_reg);
1233         __ z_sthy(data_reg, -2, ix_reg, dst_reg);
1234         __ add2reg(ix_reg, -2); // Decrement delayed to avoid AGI.
1235         __ bind(skipHW);
1236         __ z_tmll(ix_reg, 4);
1237         __ z_bre(countLoop1);
1238         // fallthru
1239       }
1240       if (log2_size <= 2) {     // There are just 4 bytes (left) that need to be copied.
1241         __ z_ly(data_reg,  -4, ix_reg, src_reg);
1242         __ z_sty(data_reg, -4, ix_reg, dst_reg);
1243         __ add2reg(ix_reg, -4); // Decrement delayed to avoid AGI.
1244         __ z_bru(countLoop1);
1245       }
1246 
1247       // Control can never get to here. Never! Never ever!
1248       __ z_illtrap(0x99);
1249       __ bind(copyLoop1);
1250       __ z_lg(data_reg,  0, ix_reg, src_reg);
1251       __ z_stg(data_reg, 0, ix_reg, dst_reg);
1252       __ bind(countLoop1);
1253       __ z_brxhg(ix_reg, stride_reg, copyLoop1);
1254 
1255       if (!branchToEnd)
1256         __ z_br(Z_R14);
1257 
1258       switch (element_size) {
1259         case 1:  BLOCK_COMMENT("} ARRAYCOPY CONJOINT byte "); break;
1260         case 2:  BLOCK_COMMENT("} ARRAYCOPY CONJOINT short"); break;
1261         case 4:  BLOCK_COMMENT("} ARRAYCOPY CONJOINT int  "); break;
1262         case 8:  BLOCK_COMMENT("} ARRAYCOPY CONJOINT long "); break;
1263         default: BLOCK_COMMENT("} ARRAYCOPY CONJOINT      "); break;
1264       }
1265     }
1266   }
1267 
1268   address generate_disjoint_nonoop_copy(StubGenStubId stub_id) {
1269     bool aligned;
1270     int element_size;
1271     switch (stub_id) {
1272     case jbyte_disjoint_arraycopy_id:
1273       aligned = false;
1274       element_size = 1;
1275       break;
1276     case arrayof_jbyte_disjoint_arraycopy_id:
1277       aligned = true;
1278       element_size = 1;
1279       break;
1280     case jshort_disjoint_arraycopy_id:
1281       aligned = false;
1282       element_size = 2;
1283       break;
1284     case arrayof_jshort_disjoint_arraycopy_id:
1285       aligned = true;
1286       element_size = 2;
1287       break;
1288     case jint_disjoint_arraycopy_id:
1289       aligned = false;
1290       element_size = 4;
1291       break;
1292     case arrayof_jint_disjoint_arraycopy_id:
1293       aligned = true;
1294       element_size = 4;
1295       break;
1296     case jlong_disjoint_arraycopy_id:
1297       aligned = false;
1298       element_size = 8;
1299       break;
1300     case arrayof_jlong_disjoint_arraycopy_id:
1301       aligned = true;
1302       element_size = 8;
1303       break;
1304     default:
1305       ShouldNotReachHere();
1306     }
1307     StubCodeMark mark(this, stub_id);
1308     unsigned int start_off = __ offset();  // Remember stub start address (is rtn value).
1309     generate_disjoint_copy(aligned, element_size, false, false);
1310     return __ addr_at(start_off);
1311   }
1312 
1313   address generate_disjoint_oop_copy(StubGenStubId stub_id) {
1314     bool aligned;
1315     bool dest_uninitialized;
1316     switch (stub_id) {
1317     case oop_disjoint_arraycopy_id:
1318       aligned = false;
1319       dest_uninitialized = false;
1320       break;
1321     case arrayof_oop_disjoint_arraycopy_id:
1322       aligned = true;
1323       dest_uninitialized = false;
1324       break;
1325     case oop_disjoint_arraycopy_uninit_id:
1326       aligned = false;
1327       dest_uninitialized = true;
1328       break;
1329     case arrayof_oop_disjoint_arraycopy_uninit_id:
1330       aligned = true;
1331       dest_uninitialized = true;
1332       break;
1333     default:
1334       ShouldNotReachHere();
1335     }
1336     StubCodeMark mark(this, stub_id);
1337     // This is the zarch specific stub generator for oop array copy.
1338     // Refer to generate_disjoint_copy for a list of prereqs and features.
1339     unsigned int start_off = __ offset();  // Remember stub start address (is rtn value).
1340     unsigned int size      = UseCompressedOops ? 4 : 8;
1341 
1342     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1343     if (dest_uninitialized) {
1344       decorators |= IS_DEST_UNINITIALIZED;
1345     }
1346     if (aligned) {
1347       decorators |= ARRAYCOPY_ALIGNED;
1348     }
1349 
1350     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1351     bs->arraycopy_prologue(_masm, decorators, T_OBJECT, Z_ARG1, Z_ARG2, Z_ARG3);
1352 
1353     generate_disjoint_copy(aligned, size, true, true);
1354 
1355     bs->arraycopy_epilogue(_masm, decorators, T_OBJECT, Z_ARG2, Z_ARG3, true);
1356 
1357     return __ addr_at(start_off);
1358   }
1359 
1360   address generate_conjoint_nonoop_copy(StubGenStubId stub_id) {
1361     bool aligned;
1362     int shift; // i.e. log2(element size)
1363     address nooverlap_target;
1364     switch (stub_id) {
1365     case jbyte_arraycopy_id:
1366       aligned = false;
1367       shift = 0;
1368       nooverlap_target = StubRoutines::jbyte_disjoint_arraycopy();
1369       break;
1370     case arrayof_jbyte_arraycopy_id:
1371       aligned = true;
1372       shift = 0;
1373       nooverlap_target = StubRoutines::arrayof_jbyte_disjoint_arraycopy();
1374       break;
1375     case jshort_arraycopy_id:
1376       aligned = false;
1377       shift = 1;
1378       nooverlap_target = StubRoutines::jshort_disjoint_arraycopy();
1379       break;
1380     case arrayof_jshort_arraycopy_id:
1381       aligned = true;
1382       shift = 1;
1383       nooverlap_target = StubRoutines::arrayof_jshort_disjoint_arraycopy();
1384       break;
1385     case jint_arraycopy_id:
1386       aligned = false;
1387       shift = 2;
1388       nooverlap_target = StubRoutines::jint_disjoint_arraycopy();
1389       break;
1390     case arrayof_jint_arraycopy_id:
1391       aligned = true;
1392       shift = 2;
1393       nooverlap_target = StubRoutines::arrayof_jint_disjoint_arraycopy();
1394       break;
1395     case jlong_arraycopy_id:
1396       aligned = false;
1397       shift = 3;
1398       nooverlap_target = StubRoutines::jlong_disjoint_arraycopy();
1399       break;
1400     case arrayof_jlong_arraycopy_id:
1401       aligned = true;
1402       shift = 3;
1403       nooverlap_target = StubRoutines::arrayof_jlong_disjoint_arraycopy();
1404       break;
1405     default:
1406       ShouldNotReachHere();
1407     }
1408     StubCodeMark mark(this, stub_id);
1409     unsigned int start_off = __ offset();  // Remember stub start address (is rtn value).
1410     array_overlap_test(nooverlap_target, shift); // Branch away to nooverlap_target if disjoint.
1411     generate_conjoint_copy(aligned, 1 << shift, false);
1412     return __ addr_at(start_off);
1413   }
1414 
1415   address generate_conjoint_oop_copy(StubGenStubId stub_id) {
1416     bool aligned;
1417     bool dest_uninitialized;
1418     address nooverlap_target;
1419     switch (stub_id) {
1420     case oop_arraycopy_id:
1421       aligned = false;
1422       dest_uninitialized = false;
1423       nooverlap_target = StubRoutines::oop_disjoint_arraycopy(dest_uninitialized);
1424       break;
1425     case arrayof_oop_arraycopy_id:
1426       aligned = true;
1427       dest_uninitialized = false;
1428       nooverlap_target = StubRoutines::arrayof_oop_disjoint_arraycopy(dest_uninitialized);
1429       break;
1430     case oop_arraycopy_uninit_id:
1431       aligned = false;
1432       dest_uninitialized = true;
1433       nooverlap_target = StubRoutines::oop_disjoint_arraycopy(dest_uninitialized);
1434       break;
1435     case arrayof_oop_arraycopy_uninit_id:
1436       aligned = true;
1437       dest_uninitialized = true;
1438       nooverlap_target = StubRoutines::arrayof_oop_disjoint_arraycopy(dest_uninitialized);
1439       break;
1440     default:
1441       ShouldNotReachHere();
1442     }
1443     StubCodeMark mark(this, stub_id);
1444     // This is the zarch specific stub generator for overlapping oop array copy.
1445     // Refer to generate_conjoint_copy for a list of prereqs and features.
1446     unsigned int start_off = __ offset();  // Remember stub start address (is rtn value).
1447     unsigned int size      = UseCompressedOops ? 4 : 8;
1448     unsigned int shift     = UseCompressedOops ? 2 : 3;
1449 
1450     // Branch to disjoint_copy (if applicable) before pre_barrier to avoid double pre_barrier.
1451     array_overlap_test(nooverlap_target, shift);  // Branch away to nooverlap_target if disjoint.
1452 
1453     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1454     if (dest_uninitialized) {
1455       decorators |= IS_DEST_UNINITIALIZED;
1456     }
1457     if (aligned) {
1458       decorators |= ARRAYCOPY_ALIGNED;
1459     }
1460 
1461     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1462     bs->arraycopy_prologue(_masm, decorators, T_OBJECT, Z_ARG1, Z_ARG2, Z_ARG3);
1463 
1464     generate_conjoint_copy(aligned, size, true);  // Must preserve ARG2, ARG3.
1465 
1466     bs->arraycopy_epilogue(_masm, decorators, T_OBJECT, Z_ARG2, Z_ARG3, true);
1467 
1468     return __ addr_at(start_off);
1469   }
1470 
1471   //
1472   //  Generate 'unsafe' set memory stub
1473   //  Though just as safe as the other stubs, it takes an unscaled
1474   //  size_t (# bytes) argument instead of an element count.
1475   //
1476   //  Input:
1477   //    Z_ARG1   - destination array address
1478   //    Z_ARG2   - byte count (size_t)
1479   //    Z_ARG3   - byte value
1480   //
1481   address generate_unsafe_setmemory(address unsafe_byte_fill) {
1482     __ align(CodeEntryAlignment);
1483     StubCodeMark mark(this, StubGenStubId::unsafe_setmemory_id);
1484     unsigned int start_off = __ offset();
1485 
1486     // bump this on entry, not on exit:
1487     // inc_counter_np(SharedRuntime::_unsafe_set_memory_ctr);
1488 
1489     const Register dest = Z_ARG1;
1490     const Register size = Z_ARG2;
1491     const Register byteVal = Z_ARG3;
1492     NearLabel tail, finished;
1493     // fill_to_memory_atomic(unsigned char*, unsigned long, unsigned char)
1494 
1495     // Mark remaining code as such which performs Unsafe accesses.
1496     UnsafeMemoryAccessMark umam(this, true, false);
1497 
1498     __ z_vlvgb(Z_V0, byteVal, 0);
1499     __ z_vrepb(Z_V0, Z_V0, 0);
1500 
1501     __ z_aghi(size, -32);
1502     __ z_brl(tail);
1503 
1504     {
1505       NearLabel again;
1506       __ bind(again);
1507       __ z_vst(Z_V0, Address(dest, 0));
1508       __ z_vst(Z_V0, Address(dest, 16));
1509       __ z_aghi(dest, 32);
1510       __ z_aghi(size, -32);
1511       __ z_brnl(again);
1512     }
1513 
1514     __ bind(tail);
1515 
1516     {
1517       NearLabel dont;
1518       __ testbit(size, 4);
1519       __ z_brz(dont);
1520       __ z_vst(Z_V0, Address(dest, 0));
1521       __ z_aghi(dest, 16);
1522       __ bind(dont);
1523     }
1524 
1525     {
1526       NearLabel dont;
1527       __ testbit(size, 3);
1528       __ z_brz(dont);
1529       __ z_vsteg(Z_V0, 0, Z_R0, dest, 0);
1530       __ z_aghi(dest, 8);
1531       __ bind(dont);
1532     }
1533 
1534     __ z_tmll(size, 7);
1535     __ z_brc(Assembler::bcondAllZero, finished);
1536 
1537     {
1538       NearLabel dont;
1539       __ testbit(size, 2);
1540       __ z_brz(dont);
1541       __ z_vstef(Z_V0, 0, Z_R0, dest, 0);
1542       __ z_aghi(dest, 4);
1543       __ bind(dont);
1544     }
1545 
1546     {
1547       NearLabel dont;
1548       __ testbit(size, 1);
1549       __ z_brz(dont);
1550       __ z_vsteh(Z_V0, 0, Z_R0, dest, 0);
1551       __ z_aghi(dest, 2);
1552       __ bind(dont);
1553     }
1554 
1555     {
1556       NearLabel dont;
1557       __ testbit(size, 0);
1558       __ z_brz(dont);
1559       __ z_vsteb(Z_V0, 0, Z_R0, dest, 0);
1560       __ bind(dont);
1561     }
1562 
1563     __ bind(finished);
1564     __ z_br(Z_R14);
1565 
1566     return __ addr_at(start_off);
1567   }
1568 
  // This is the common error-exit stub for UnsafeMemoryAccess.
1570   address generate_unsafecopy_common_error_exit() {
1571     unsigned int start_off = __ offset();
1572     __ z_lghi(Z_RET, 0); // return 0
1573     __ z_br(Z_R14);
1574     return __ addr_at(start_off);
1575   }
1576 
1577   void generate_arraycopy_stubs() {
1578 
1579     // Note: the disjoint stubs must be generated first, some of
1580     // the conjoint stubs use them.
1581 
1582     address ucm_common_error_exit       =  generate_unsafecopy_common_error_exit();
1583     UnsafeMemoryAccess::set_common_exit_stub_pc(ucm_common_error_exit);
1584 
    StubRoutines::_jbyte_disjoint_arraycopy      = generate_disjoint_nonoop_copy(StubGenStubId::jbyte_disjoint_arraycopy_id);
    StubRoutines::_jshort_disjoint_arraycopy     = generate_disjoint_nonoop_copy(StubGenStubId::jshort_disjoint_arraycopy_id);
    StubRoutines::_jint_disjoint_arraycopy       = generate_disjoint_nonoop_copy(StubGenStubId::jint_disjoint_arraycopy_id);
    StubRoutines::_jlong_disjoint_arraycopy      = generate_disjoint_nonoop_copy(StubGenStubId::jlong_disjoint_arraycopy_id);
    StubRoutines::_oop_disjoint_arraycopy        = generate_disjoint_oop_copy(StubGenStubId::oop_disjoint_arraycopy_id);
    StubRoutines::_oop_disjoint_arraycopy_uninit = generate_disjoint_oop_copy(StubGenStubId::oop_disjoint_arraycopy_uninit_id);

    StubRoutines::_arrayof_jbyte_disjoint_arraycopy      = generate_disjoint_nonoop_copy(StubGenStubId::arrayof_jbyte_disjoint_arraycopy_id);
    StubRoutines::_arrayof_jshort_disjoint_arraycopy     = generate_disjoint_nonoop_copy(StubGenStubId::arrayof_jshort_disjoint_arraycopy_id);
    StubRoutines::_arrayof_jint_disjoint_arraycopy       = generate_disjoint_nonoop_copy(StubGenStubId::arrayof_jint_disjoint_arraycopy_id);
    StubRoutines::_arrayof_jlong_disjoint_arraycopy      = generate_disjoint_nonoop_copy(StubGenStubId::arrayof_jlong_disjoint_arraycopy_id);
    StubRoutines::_arrayof_oop_disjoint_arraycopy        = generate_disjoint_oop_copy(StubGenStubId::arrayof_oop_disjoint_arraycopy_id);
    StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = generate_disjoint_oop_copy(StubGenStubId::arrayof_oop_disjoint_arraycopy_uninit_id);
1598 
1599     StubRoutines::_jbyte_arraycopy           = generate_conjoint_nonoop_copy(StubGenStubId::jbyte_arraycopy_id);
1600     StubRoutines::_jshort_arraycopy          = generate_conjoint_nonoop_copy(StubGenStubId::jshort_arraycopy_id);
1601     StubRoutines::_jint_arraycopy            = generate_conjoint_nonoop_copy(StubGenStubId::jint_arraycopy_id);
1602     StubRoutines::_jlong_arraycopy           = generate_conjoint_nonoop_copy(StubGenStubId::jlong_arraycopy_id);
1603     StubRoutines::_oop_arraycopy             = generate_conjoint_oop_copy(StubGenStubId::oop_arraycopy_id);
1604     StubRoutines::_oop_arraycopy_uninit      = generate_conjoint_oop_copy(StubGenStubId::oop_arraycopy_uninit_id);
1605 
1606     StubRoutines::_arrayof_jbyte_arraycopy      = generate_conjoint_nonoop_copy(StubGenStubId::arrayof_jbyte_arraycopy_id);
1607     StubRoutines::_arrayof_jshort_arraycopy     = generate_conjoint_nonoop_copy(StubGenStubId::arrayof_jshort_arraycopy_id);
    StubRoutines::_arrayof_jint_arraycopy       = generate_conjoint_nonoop_copy(StubGenStubId::arrayof_jint_arraycopy_id);
1609     StubRoutines::_arrayof_jlong_arraycopy      = generate_conjoint_nonoop_copy(StubGenStubId::arrayof_jlong_arraycopy_id);
1610     StubRoutines::_arrayof_oop_arraycopy        = generate_conjoint_oop_copy(StubGenStubId::arrayof_oop_arraycopy_id);
1611     StubRoutines::_arrayof_oop_arraycopy_uninit = generate_conjoint_oop_copy(StubGenStubId::arrayof_oop_arraycopy_uninit_id);
1612 
1613 #ifdef COMPILER2
1614     StubRoutines::_unsafe_setmemory =
1615              VM_Version::has_VectorFacility() ? generate_unsafe_setmemory(StubRoutines::_jbyte_fill) : nullptr;
1616 
1617 #endif // COMPILER2
1618   }
1619 
1620   // Call interface for AES_encryptBlock, AES_decryptBlock stubs.
1621   //
1622   //   Z_ARG1 - source data block. Ptr to leftmost byte to be processed.
1623   //   Z_ARG2 - destination data block. Ptr to leftmost byte to be stored.
1624   //            For in-place encryption/decryption, ARG1 and ARG2 can point
1625   //            to the same piece of storage.
1626   //   Z_ARG3 - Crypto key address (expanded key). The first n bits of
1627   //            the expanded key constitute the original AES-<n> key (see below).
1628   //
1629   //   Z_RET  - return value. First unprocessed byte offset in src buffer.
1630   //
1631   // Some remarks:
1632   //   The crypto key, as passed from the caller to these encryption stubs,
1633   //   is a so-called expanded key. It is derived from the original key
1634   //   by the Rijndael key schedule, see http://en.wikipedia.org/wiki/Rijndael_key_schedule
1635   //   With the expanded key, the cipher/decipher task is decomposed in
1636   //   multiple, less complex steps, called rounds. Sun SPARC and Intel
1637   //   processors obviously implement support for those less complex steps.
1638   //   z/Architecture provides instructions for full cipher/decipher complexity.
1639   //   Therefore, we need the original, not the expanded key here.
1640   //   Luckily, the first n bits of an AES-<n> expanded key are formed
1641   //   by the original key itself. That takes us out of trouble. :-)
1642   //   The key length (in bytes) relation is as follows:
1643   //     original    expanded   rounds  key bit     keylen
1644   //    key bytes   key bytes            length   in words
1645   //           16         176       11      128         44
1646   //           24         208       13      192         52
1647   //           32         240       15      256         60
1648   //
1649   // The crypto instructions used in the AES* stubs have some specific register requirements.
1650   //   Z_R0   holds the crypto function code. Please refer to the KM/KMC instruction
1651   //          description in the "z/Architecture Principles of Operation" manual for details.
1652   //   Z_R1   holds the parameter block address. The parameter block contains the cryptographic key
1653   //          (KM instruction) and the chaining value (KMC instruction).
1654   //   dst    must designate an even-numbered register, holding the address of the output message.
1655   //   src    must designate an even/odd register pair, holding the address/length of the original message
1656 
  // Helper function which generates code to
  //  - load the function code into register fCode (== Z_R0).
  //  - load the data block length (depends on the cipher function) into register srclen.
  //  - is_decipher selects between the cipher and decipher function codes.
1662   void generate_load_AES_fCode(Register keylen, Register fCode, Register srclen, bool is_decipher) {
1663 
1664     BLOCK_COMMENT("Set fCode {"); {
1665       Label fCode_set;
1666       int   mode = is_decipher ? VM_Version::CipherMode::decipher : VM_Version::CipherMode::cipher;
1667       bool  identical_dataBlk_len =  (VM_Version::Cipher::_AES128_dataBlk == VM_Version::Cipher::_AES192_dataBlk)
1668                                   && (VM_Version::Cipher::_AES128_dataBlk == VM_Version::Cipher::_AES256_dataBlk);
1669       // Expanded key length is 44/52/60 * 4 bytes for AES-128/AES-192/AES-256.
1670       __ z_cghi(keylen, 52); // Check only once at the beginning. keylen and fCode may share the same register.
1671 
1672       __ z_lghi(fCode, VM_Version::Cipher::_AES128 + mode);
1673       if (!identical_dataBlk_len) {
1674         __ z_lghi(srclen, VM_Version::Cipher::_AES128_dataBlk);
1675       }
1676       __ z_brl(fCode_set);  // keyLen <  52: AES128
1677 
1678       __ z_lghi(fCode, VM_Version::Cipher::_AES192 + mode);
1679       if (!identical_dataBlk_len) {
1680         __ z_lghi(srclen, VM_Version::Cipher::_AES192_dataBlk);
1681       }
1682       __ z_bre(fCode_set);  // keyLen == 52: AES192
1683 
1684       __ z_lghi(fCode, VM_Version::Cipher::_AES256 + mode);
1685       if (!identical_dataBlk_len) {
1686         __ z_lghi(srclen, VM_Version::Cipher::_AES256_dataBlk);
1687       }
      // __ z_brh(fCode_set);  // keyLen >  52: AES256           // fallthru
1689 
1690       __ bind(fCode_set);
1691       if (identical_dataBlk_len) {
1692         __ z_lghi(srclen, VM_Version::Cipher::_AES128_dataBlk);
1693       }
1694     }
1695     BLOCK_COMMENT("} Set fCode");
1696   }
1697 
1698   // Push a parameter block for the cipher/decipher instruction on the stack.
1699   // Layout of the additional stack space allocated for AES_cipherBlockChaining:
1700   //
1701   //   |        |
1702   //   +--------+ <-- SP before expansion
1703   //   |        |
1704   //   :        :  alignment loss (part 2), 0..(AES_parmBlk_align-1) bytes
1705   //   |        |
1706   //   +--------+
1707   //   |        |
1708   //   :        :  space for parameter block, size VM_Version::Cipher::_AES*_parmBlk_C
1709   //   |        |
1710   //   +--------+ <-- parmBlk, octoword-aligned, start of parameter block
1711   //   |        |
1712   //   :        :  additional stack space for spills etc., size AES_parmBlk_addspace, DW @ Z_SP not usable!!!
1713   //   |        |
1714   //   +--------+ <-- Z_SP + alignment loss, octoword-aligned
1715   //   |        |
1716   //   :        :  alignment loss (part 1), 0..(AES_parmBlk_align-1) bytes. DW @ Z_SP not usable!!!
1717   //   |        |
1718   //   +--------+ <-- Z_SP after expansion
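  //
  // Worked example (a sketch): generate_push_Block below sets AES_parmBlk_addspace to
  // AES_parmBlk_align and resizes the frame by
  //   resize_len = cv_len + key_len + AES_parmBlk_align + AES_parmBlk_addspace
  //              = parmBlk_len + 2 * AES_parmBlk_align,
  // where cv_len == dataBlk_len and key_len == parmBlk_len - cv_len. The extra
  // AES_parmBlk_align term pays for the worst-case alignment loss shown above.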
1719 
1720   void generate_push_Block(int dataBlk_len, int parmBlk_len, int crypto_fCode,
1721                            Register parmBlk, Register keylen, Register fCode, Register cv, Register key) {
1722 
    AES_parmBlk_addspace = AES_parmBlk_align; // Must be a multiple of AES_parmBlk_align.
                                              // Spill space for regs etc.; don't use the DW @ SP!
1725     const int cv_len     = dataBlk_len;
1726     const int key_len    = parmBlk_len - cv_len;
1727     // This len must be known at JIT compile time. Only then are we able to recalc the SP before resize.
1728     // We buy this knowledge by wasting some (up to AES_parmBlk_align) bytes of stack space.
1729     const int resize_len = cv_len + key_len + AES_parmBlk_align + AES_parmBlk_addspace;
1730 
1731     // Use parmBlk as temp reg here to hold the frame pointer.
1732     __ resize_frame(-resize_len, parmBlk, true);
1733 
1734     // calculate parmBlk address from updated (resized) SP.
1735     __ add2reg(parmBlk, resize_len - (cv_len + key_len), Z_SP);
1736     __ z_nill(parmBlk, (~(AES_parmBlk_align-1)) & 0xffff); // Align parameter block.
1737 
1738     // There is room for stuff in the range [parmBlk-AES_parmBlk_addspace+8, parmBlk).
1739     __ z_stg(keylen,  -8, parmBlk);                        // Spill keylen for later use.
1740 
1741     // calculate (SP before resize) from updated SP.
1742     __ add2reg(keylen, resize_len, Z_SP);                  // keylen holds prev SP for now.
1743     __ z_stg(keylen, -16, parmBlk);                        // Spill prev SP for easy revert.
1744 
1745     __ z_mvc(0,      cv_len-1,  parmBlk, 0, cv);     // Copy cv.
1746     __ z_mvc(cv_len, key_len-1, parmBlk, 0, key);    // Copy key.
1747     __ z_lghi(fCode, crypto_fCode);
1748   }
1749 
1750   // NOTE:
1751   //   Before returning, the stub has to copy the chaining value from
1752   //   the parmBlk, where it was updated by the crypto instruction, back
  //   to the chaining value array whose address was passed in the cv argument.
1754   //   As all the available registers are used and modified by KMC, we need to save
1755   //   the key length across the KMC instruction. We do so by spilling it to the stack,
1756   //   just preceding the parmBlk (at (parmBlk - 8)).
1757   void generate_push_parmBlk(Register keylen, Register fCode, Register parmBlk, Register key, Register cv, bool is_decipher) {
1758     int       mode = is_decipher ? VM_Version::CipherMode::decipher : VM_Version::CipherMode::cipher;
1759     Label     parmBlk_128, parmBlk_192, parmBlk_256, parmBlk_set;
1760 
1761     BLOCK_COMMENT("push parmBlk {");
    // We have just three cipher strengths, which translate into three
    // possible expanded key lengths: 44, 52, and 60 four-byte words.
    // We can therefore compare the actual length against the "middle" length
    // and get: lt -> len=44, eq -> len=52, gt -> len=60.
1766     __ z_cghi(keylen, 52);
1767     if (VM_Version::has_Crypto_AES128()) { __ z_brl(parmBlk_128); }  // keyLen <  52: AES128
1768     if (VM_Version::has_Crypto_AES192()) { __ z_bre(parmBlk_192); }  // keyLen == 52: AES192
1769     if (VM_Version::has_Crypto_AES256()) { __ z_brh(parmBlk_256); }  // keyLen >  52: AES256
1770 
1771     // Security net: requested AES function not available on this CPU.
1772     // NOTE:
1773     //   As of now (March 2015), this safety net is not required. JCE policy files limit the
1774     //   cryptographic strength of the keys used to 128 bit. If we have AES hardware support
1775     //   at all, we have at least AES-128.
1776     __ stop_static("AES key strength not supported by CPU. Use -XX:-UseAES as remedy.", 0);
1777 
1778     if (VM_Version::has_Crypto_AES256()) {
1779       __ bind(parmBlk_256);
1780       generate_push_Block(VM_Version::Cipher::_AES256_dataBlk,
1781                           VM_Version::Cipher::_AES256_parmBlk_C,
1782                           VM_Version::Cipher::_AES256 + mode,
1783                           parmBlk, keylen, fCode, cv, key);
1784       if (VM_Version::has_Crypto_AES128() || VM_Version::has_Crypto_AES192()) {
1785         __ z_bru(parmBlk_set);  // Fallthru otherwise.
1786       }
1787     }
1788 
1789     if (VM_Version::has_Crypto_AES192()) {
1790       __ bind(parmBlk_192);
1791       generate_push_Block(VM_Version::Cipher::_AES192_dataBlk,
1792                           VM_Version::Cipher::_AES192_parmBlk_C,
1793                           VM_Version::Cipher::_AES192 + mode,
1794                           parmBlk, keylen, fCode, cv, key);
1795       if (VM_Version::has_Crypto_AES128()) {
1796         __ z_bru(parmBlk_set);  // Fallthru otherwise.
1797       }
1798     }
1799 
1800     if (VM_Version::has_Crypto_AES128()) {
1801       __ bind(parmBlk_128);
1802       generate_push_Block(VM_Version::Cipher::_AES128_dataBlk,
1803                           VM_Version::Cipher::_AES128_parmBlk_C,
1804                           VM_Version::Cipher::_AES128 + mode,
1805                           parmBlk, keylen, fCode, cv, key);
1806       // Fallthru
1807     }
1808 
1809     __ bind(parmBlk_set);
1810     BLOCK_COMMENT("} push parmBlk");
1811   }
1812 
1813   // Pop a parameter block from the stack. The chaining value portion of the parameter block
1814   // is copied back to the cv array as it is needed for subsequent cipher steps.
  // Both the keylen value and the original SP (before resizing) were pushed to the stack
  // when the parameter block was pushed.
1817   void generate_pop_parmBlk(Register keylen, Register parmBlk, Register key, Register cv) {
1818 
1819     BLOCK_COMMENT("pop parmBlk {");
1820     bool identical_dataBlk_len =  (VM_Version::Cipher::_AES128_dataBlk == VM_Version::Cipher::_AES192_dataBlk) &&
1821                                   (VM_Version::Cipher::_AES128_dataBlk == VM_Version::Cipher::_AES256_dataBlk);
1822     if (identical_dataBlk_len) {
1823       int cv_len = VM_Version::Cipher::_AES128_dataBlk;
1824       __ z_mvc(0, cv_len-1, cv, 0, parmBlk);  // Copy cv.
1825     } else {
1826       int cv_len;
1827       Label parmBlk_128, parmBlk_192, parmBlk_256, parmBlk_set;
1828       __ z_lg(keylen, -8, parmBlk);  // restore keylen
1829       __ z_cghi(keylen, 52);
1830       if (VM_Version::has_Crypto_AES256()) __ z_brh(parmBlk_256);  // keyLen >  52: AES256
1831       if (VM_Version::has_Crypto_AES192()) __ z_bre(parmBlk_192);  // keyLen == 52: AES192
1832       // if (VM_Version::has_Crypto_AES128()) __ z_brl(parmBlk_128);  // keyLen <  52: AES128  // fallthru
1833 
      // Security net: there is none here. If one were needed, we would already have
      // fallen into it when pushing the parameter block.
1836       if (VM_Version::has_Crypto_AES128()) {
1837         __ bind(parmBlk_128);
1838         cv_len = VM_Version::Cipher::_AES128_dataBlk;
1839         __ z_mvc(0, cv_len-1, cv, 0, parmBlk);  // Copy cv.
1840         if (VM_Version::has_Crypto_AES192() || VM_Version::has_Crypto_AES256()) {
1841           __ z_bru(parmBlk_set);
1842         }
1843       }
1844 
1845       if (VM_Version::has_Crypto_AES192()) {
1846         __ bind(parmBlk_192);
1847         cv_len = VM_Version::Cipher::_AES192_dataBlk;
1848         __ z_mvc(0, cv_len-1, cv, 0, parmBlk);  // Copy cv.
1849         if (VM_Version::has_Crypto_AES256()) {
1850           __ z_bru(parmBlk_set);
1851         }
1852       }
1853 
1854       if (VM_Version::has_Crypto_AES256()) {
1855         __ bind(parmBlk_256);
1856         cv_len = VM_Version::Cipher::_AES256_dataBlk;
1857         __ z_mvc(0, cv_len-1, cv, 0, parmBlk);  // Copy cv.
1858         // __ z_bru(parmBlk_set);  // fallthru
1859       }
1860       __ bind(parmBlk_set);
1861     }
1862     __ z_lg(Z_SP, -16, parmBlk); // Revert resize_frame_absolute. Z_SP saved by push_parmBlk.
1863     BLOCK_COMMENT("} pop parmBlk");
1864   }
1865 
1866   // Compute AES encrypt/decrypt function.
1867   void generate_AES_cipherBlock(bool is_decipher) {
1868     // Incoming arguments.
1869     Register       from    = Z_ARG1; // source byte array
1870     Register       to      = Z_ARG2; // destination byte array
1871     Register       key     = Z_ARG3; // expanded key array
1872 
1873     const Register keylen  = Z_R0;   // Temporarily (until fCode is set) holds the expanded key array length.
1874 
1875     // Register definitions as required by KM instruction.
1876     const Register fCode   = Z_R0;   // crypto function code
1877     const Register parmBlk = Z_R1;   // parameter block address (points to crypto key)
1878     const Register src     = Z_ARG1; // Must be even reg (KM requirement).
1879     const Register srclen  = Z_ARG2; // Must be odd reg and pair with src. Overwrites destination address.
1880     const Register dst     = Z_ARG3; // Must be even reg (KM requirement). Overwrites expanded key address.
1881 
1882     // Read key len of expanded key (in 4-byte words).
1883     __ z_lgf(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
1884 
1885     // Copy arguments to registers as required by crypto instruction.
1886     __ z_lgr(parmBlk, key);          // crypto key (in T_INT array).
1887     __ lgr_if_needed(src, from);     // Copy src address. Will not emit, src/from are identical.
1888     __ z_lgr(dst, to);               // Copy dst address, even register required.
1889 
1890     // Construct function code into fCode(Z_R0), data block length into srclen(Z_ARG2).
1891     generate_load_AES_fCode(keylen, fCode, srclen, is_decipher);
1892 
1893     __ km(dst, src);                 // Cipher the message.
1894 
1895     __ z_br(Z_R14);
1896   }
1897 
1898   // Compute AES encrypt function.
1899   address generate_AES_encryptBlock() {
1900     __ align(CodeEntryAlignment);
1901     StubGenStubId stub_id = StubGenStubId::aescrypt_encryptBlock_id;
1902     StubCodeMark mark(this, stub_id);
1903     unsigned int start_off = __ offset();  // Remember stub start address (is rtn value).
1904 
1905     generate_AES_cipherBlock(false);
1906 
1907     return __ addr_at(start_off);
1908   }
1909 
1910   // Compute AES decrypt function.
1911   address generate_AES_decryptBlock() {
1912     __ align(CodeEntryAlignment);
1913     StubGenStubId stub_id = StubGenStubId::aescrypt_decryptBlock_id;
1914     StubCodeMark mark(this, stub_id);
1915     unsigned int start_off = __ offset();  // Remember stub start address (is rtn value).
1916 
1917     generate_AES_cipherBlock(true);
1918 
1919     return __ addr_at(start_off);
1920   }
1921 
1922   // These stubs receive the addresses of the cryptographic key and of the chaining value as two separate
1923   // arguments (registers "key" and "cv", respectively). The KMC instruction, on the other hand, requires
1924   // chaining value and key to be, in this sequence, adjacent in storage. Thus, we need to allocate some
1925   // thread-local working storage. Using heap memory incurs all the hassles of allocating/freeing.
1926   // Stack space, on the contrary, is deallocated automatically when we return from the stub to the caller.
1927   // *** WARNING ***
1928   // Please note that we do not formally allocate stack space, nor do we
1929   // update the stack pointer. Therefore, no function calls are allowed
1930   // and nobody else must use the stack range where the parameter block
1931   // is located.
1932   // We align the parameter block to the next available octoword.
1933   //
1934   // Compute chained AES encrypt function.
1935   void generate_AES_cipherBlockChaining(bool is_decipher) {
1936 
1937     Register       from    = Z_ARG1; // source byte array (clear text)
1938     Register       to      = Z_ARG2; // destination byte array (ciphered)
1939     Register       key     = Z_ARG3; // expanded key array.
1940     Register       cv      = Z_ARG4; // chaining value
1941     const Register msglen  = Z_ARG5; // Total length of the msg to be encrypted. Value must be returned
1942                                      // in Z_RET upon completion of this stub. Is 32-bit integer.
1943 
1944     const Register keylen  = Z_R0;   // Expanded key length, as read from key array. Temp only.
1945     const Register fCode   = Z_R0;   // crypto function code
1946     const Register parmBlk = Z_R1;   // parameter block address (points to crypto key)
1947     const Register src     = Z_ARG1; // is Z_R2
1948     const Register srclen  = Z_ARG2; // Overwrites destination address.
1949     const Register dst     = Z_ARG3; // Overwrites key address.
1950 
1951     // Read key len of expanded key (in 4-byte words).
1952     __ z_lgf(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
1953 
1954     // Construct parm block address in parmBlk (== Z_R1), copy cv and key to parm block.
1955     // Construct function code in fCode (Z_R0).
1956     generate_push_parmBlk(keylen, fCode, parmBlk, key, cv, is_decipher);
1957 
1958     // Prepare other registers for instruction.
1959     __ lgr_if_needed(src, from);     // Copy src address. Will not emit, src/from are identical.
1960     __ z_lgr(dst, to);
    __ z_llgfr(srclen, msglen);      // msglen is passed as a 32-bit int; zero-extend to the 64 bits the instruction expects.
1962 
1963     __ kmc(dst, src);                // Cipher the message.
1964 
1965     generate_pop_parmBlk(keylen, parmBlk, key, cv);
1966 
    __ z_llgfr(Z_RET, msglen);       // Return the (zero-extended) msglen as the number of processed bytes.
1968     __ z_br(Z_R14);
1969   }
1970 
1971   // Compute chained AES encrypt function.
1972   address generate_cipherBlockChaining_AES_encrypt() {
1973     __ align(CodeEntryAlignment);
1974     StubGenStubId stub_id = StubGenStubId::cipherBlockChaining_encryptAESCrypt_id;
1975     StubCodeMark mark(this, stub_id);
1976     unsigned int   start_off = __ offset();  // Remember stub start address (is rtn value).
1977 
1978     generate_AES_cipherBlockChaining(false);
1979 
1980     return __ addr_at(start_off);
1981   }
1982 
1983   // Compute chained AES decrypt function.
1984   address generate_cipherBlockChaining_AES_decrypt() {
1985     __ align(CodeEntryAlignment);
1986     StubGenStubId stub_id = StubGenStubId::cipherBlockChaining_decryptAESCrypt_id;
1987     StubCodeMark mark(this, stub_id);
1988     unsigned int   start_off = __ offset();  // Remember stub start address (is rtn value).
1989 
1990     generate_AES_cipherBlockChaining(true);
1991 
1992     return __ addr_at(start_off);
1993   }
1994 
1995 
1996   // *****************************************************************************
1997 
1998   // AES CounterMode
1999   // Push a parameter block for the cipher/decipher instruction on the stack.
2000   // Layout of the additional stack space allocated for counterMode_AES_cipherBlock
2001   //
2002   //   |        |
2003   //   +--------+ <-- SP before expansion
2004   //   |        |
2005   //   :        :  alignment loss (part 2), 0..(AES_parmBlk_align-1) bytes.
2006   //   |        |
2007   //   +--------+ <-- gap = parmBlk + parmBlk_len + ctrArea_len
2008   //   |        |
2009   //   :        :  byte[] ctr - kmctr expects a counter vector the size of the input vector.
2010   //   :        :         The interface only provides byte[16] iv, the init vector.
2011   //   :        :         The size of this area is a tradeoff between stack space, init effort, and speed.
2012   //   |        |         Each counter is a 128bit int. Vector element [0] is a copy of iv.
2013   //   |        |         Vector element [i] is formed by incrementing element [i-1].
2014   //   +--------+ <-- ctr = parmBlk + parmBlk_len
2015   //   |        |
2016   //   :        :  space for parameter block, size VM_Version::Cipher::_AES*_parmBlk_G
2017   //   |        |
2018   //   +--------+ <-- parmBlk = Z_SP + (alignment loss (part 1+2)) + AES_dataBlk_space + AES_parmBlk_addSpace, octoword-aligned, start of parameter block
2019   //   |        |
2020   //   :        :  additional stack space for spills etc., min. size AES_parmBlk_addspace, all bytes usable.
2021   //   |        |
2022   //   +--------+ <-- Z_SP + alignment loss (part 1+2) + AES_dataBlk_space, octoword-aligned
2023   //   |        |
2024   //   :        :  space for one source data block and one dest data block.
2025   //   |        |
2026   //   +--------+ <-- Z_SP + alignment loss (part 1+2), octoword-aligned
2027   //   |        |
2028   //   :        :  additional alignment loss. Blocks above can't tolerate unusable DW @SP.
2029   //   |        |
2030   //   +--------+ <-- Z_SP + alignment loss (part 1), octoword-aligned
2031   //   |        |
2032   //   :        :  alignment loss (part 1), 0..(AES_parmBlk_align-1) bytes. DW @ Z_SP holds frame ptr.
2033   //   |        |
2034   //   +--------+ <-- Z_SP after expansion
2035   //
2036   //   additional space allocation (per DW):
2037   //    spillSpace = parmBlk - AES_parmBlk_addspace
2038   //    dataBlocks = spillSpace - AES_dataBlk_space
2039   //
2040   //    parmBlk-8  various fields of various lengths
2041   //               parmBlk-1: key_len (only one byte is stored at parmBlk-1)
2042   //               parmBlk-2: fCode (only one byte is stored at parmBlk-2)
2043   //               parmBlk-4: ctrVal_len (as retrieved from iv array), in bytes, as HW
2044   //               parmBlk-8: msglen length (in bytes) of crypto msg, as passed in by caller
2045   //                          return value is calculated from this: rv = msglen - processed.
2046   //    parmBlk-16 old_SP (SP before resize)
2047   //    parmBlk-24 temp values
2048   //                up to and including main loop in generate_counterMode_AES
  //                 - parmBlk-20: remmsg_len, the remaining msg len (aka unprocessed msg bytes)
2050   //                after main loop in generate_counterMode_AES
2051   //                 - parmBlk-24: spill slot for various address values
2052   //
2053   //    parmBlk-40 free spill slot, used for local spills.
2054   //    parmBlk-64 ARG2(dst) ptr spill slot
2055   //    parmBlk-56 ARG3(crypto key) ptr spill slot
2056   //    parmBlk-48 ARG4(icv value) ptr spill slot
2057   //
2058   //    parmBlk-72
2059   //    parmBlk-80
2060   //    parmBlk-88 counter vector current position
2061   //    parmBlk-96 reduced msg len (after preLoop processing)
2062   //
2063   //    parmBlk-104 Z_R13 spill slot (preLoop only)
2064   //    parmBlk-112 Z_R12 spill slot (preLoop only)
2065   //    parmBlk-120 Z_R11 spill slot (preLoop only)
2066   //    parmBlk-128 Z_R10 spill slot (preLoop only)
2067   //
2068   //
2069   // Layout of the parameter block (instruction KMCTR, function KMCTR-AES*
2070   //
2071   //   +--------+ key_len: +16 (AES-128), +24 (AES-192), +32 (AES-256)
2072   //   |        |
2073   //   |        |  cryptographic key
2074   //   |        |
2075   //   +--------+ <-- parmBlk
2076   //
2077   // On exit:
2078   //   Z_SP     points to resized frame
2079   //            Z_SP before resize available from -16(parmBlk)
2080   //   parmBlk  points to crypto instruction parameter block
2081   //            parameter block is filled with crypto key.
2082   //   msglen   unchanged, saved for later at -24(parmBlk)
2083   //   fCode    contains function code for instruction
2084   //   key      unchanged
2085   //
2086   void generate_counterMode_prepare_Stack(Register parmBlk, Register ctr, Register counter, Register scratch) {
2087 
2088     BLOCK_COMMENT("prepare stack counterMode_AESCrypt {");
2089 
2090     // save argument registers.
2091     //   ARG1(from) is Z_RET as well. Not saved or restored.
2092     //   ARG5(msglen) is restored by other means.
2093     __ z_stmg(Z_ARG2, Z_ARG4, argsave_offset,    parmBlk);
2094 
2095     assert(AES_ctrVec_len > 0, "sanity. We need a counter vector");
2096     __ add2reg(counter, AES_parmBlk_align, parmBlk);       // counter array is located behind crypto key. Available range is disp12 only.
2097     __ z_mvc(0, AES_ctrVal_len-1, counter, 0, ctr);        // move first copy of iv
2098     for (int j = 1; j < AES_ctrVec_len; j+=j) {            // j (and amount of moved data) doubles with every iteration
2099       int offset = j * AES_ctrVal_len;
2100       if (offset <= 256) {
2101         __ z_mvc(offset, offset-1, counter, 0, counter);   // move iv
2102       } else {
2103         for (int k = 0; k < offset; k += 256) {
2104           __ z_mvc(offset+k, 255, counter, 0, counter);
2105         }
2106       }
2107     }
2108 
2109     Label noCarry, done;
2110     __ z_lg(scratch, Address(ctr, 8));                     // get low-order DW of initial counter.
2111     __ z_algfi(scratch, AES_ctrVec_len);                   // check if we will overflow during init.
2112     __ z_brc(Assembler::bcondLogNoCarry, noCarry);         // No, 64-bit increment is sufficient.
2113 
2114     for (int j = 1; j < AES_ctrVec_len; j++) {             // start with j = 1; no need to add 0 to the first counter value.
2115       int offset = j * AES_ctrVal_len;
2116       generate_increment128(counter, offset, j, scratch);  // increment iv by index value
2117     }
2118     __ z_bru(done);
2119 
2120     __ bind(noCarry);
2121     for (int j = 1; j < AES_ctrVec_len; j++) {             // start with j = 1; no need to add 0 to the first counter value.
2122       int offset = j * AES_ctrVal_len;
2123       generate_increment64(counter, offset, j);            // increment iv by index value
2124     }
2125 
2126     __ bind(done);
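    // The counter area now holds AES_ctrVec_len consecutive 128-bit big-endian counter
    // values: element [0] equals iv, element [i] equals iv + i.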
2127 
2128     BLOCK_COMMENT("} prepare stack counterMode_AESCrypt");
2129   }
2130 
2131 
2132   void generate_counterMode_increment_ctrVector(Register parmBlk, Register counter, Register scratch, bool v0_only) {
2133 
2134     BLOCK_COMMENT("increment ctrVector counterMode_AESCrypt {");
2135 
2136     __ add2reg(counter, AES_parmBlk_align, parmBlk);       // ptr to counter array needs to be restored
2137 
2138     if (v0_only) {
2139       int offset = 0;
2140       generate_increment128(counter, offset, AES_ctrVec_len, scratch); // increment iv by # vector elements
2141     } else {
2142       int j = 0;
2143       if (VM_Version::has_VectorFacility()) {
2144         bool first_call = true;
2145         for (; j < (AES_ctrVec_len - 3); j+=4) {                       // increment blocks of 4 iv elements
2146           int offset = j * AES_ctrVal_len;
2147           generate_increment128x4(counter, offset, AES_ctrVec_len, first_call);
2148           first_call = false;
2149         }
2150       }
2151       for (; j < AES_ctrVec_len; j++) {
2152         int offset = j * AES_ctrVal_len;
2153         generate_increment128(counter, offset, AES_ctrVec_len, scratch); // increment iv by # vector elements
2154       }
2155     }
2156 
2157     BLOCK_COMMENT("} increment ctrVector counterMode_AESCrypt");
2158   }
2159 
  // IBM s390 (IBM z/Architecture, to be more exact) uses big-endian number representation.
  // The bytes of a number are therefore stored from most significant to least significant.
  // The address of a number in memory is its lowest location, which holds the most significant byte.
2163   void generate_increment64(Register counter, int offset, int increment) {
2164     __ z_algsi(offset + 8, counter, increment);            // increment, no overflow check
2165   }
2166 
2167   void generate_increment128(Register counter, int offset, int increment, Register scratch) {
2168     __ clear_reg(scratch);                                 // prepare to add carry to high-order DW
2169     __ z_algsi(offset + 8, counter, increment);            // increment low order DW
2170     __ z_alcg(scratch, Address(counter, offset));          // add carry to high-order DW
2171     __ z_stg(scratch, Address(counter, offset));           // store back
2172   }
2173 
2174   void generate_increment128(Register counter, int offset, Register increment, Register scratch) {
2175     __ clear_reg(scratch);                                 // prepare to add carry to high-order DW
2176     __ z_alg(increment, Address(counter, offset + 8));     // increment low order DW
2177     __ z_stg(increment, Address(counter, offset + 8));     // store back
2178     __ z_alcg(scratch, Address(counter, offset));          // add carry to high-order DW
2179     __ z_stg(scratch, Address(counter, offset));           // store back
2180   }
2181 
2182   // This is the vector variant of increment128, incrementing 4 ctr vector elements per call.
2183   void generate_increment128x4(Register counter, int offset, int increment, bool init) {
2184     VectorRegister Vincr      = Z_V16;
2185     VectorRegister Vctr0      = Z_V20;
2186     VectorRegister Vctr1      = Z_V21;
2187     VectorRegister Vctr2      = Z_V22;
2188     VectorRegister Vctr3      = Z_V23;
2189 
    // Initialize the increment value only once for a series of increments.
    // The non-initializing generator calls must follow immediately after the
    // initializing one; otherwise there is no guarantee that Vincr is still unchanged.
2193     if (init) {
2194       __ z_vzero(Vincr);                                   // preset VReg with constant increment
2195       __ z_vleih(Vincr, increment, 7);                     // rightmost HW has ix = 7
2196     }
2197 
2198     __ z_vlm(Vctr0, Vctr3, offset, counter);               // get the counter values
2199     __ z_vaq(Vctr0, Vctr0, Vincr);                         // increment them
2200     __ z_vaq(Vctr1, Vctr1, Vincr);
2201     __ z_vaq(Vctr2, Vctr2, Vincr);
2202     __ z_vaq(Vctr3, Vctr3, Vincr);
2203     __ z_vstm(Vctr0, Vctr3, offset, counter);              // store the counter values
2204   }
2205 
2206   unsigned int generate_counterMode_push_Block(int dataBlk_len, int parmBlk_len, int crypto_fCode,
2207                            Register parmBlk, Register msglen, Register fCode, Register key) {
2208 
    // Space for data blocks (src and dst, one each) for partial block processing.
2210     AES_parmBlk_addspace = AES_stackSpace_incr             // spill space (temp data)
2211                          + AES_stackSpace_incr             // for argument save/restore
2212                          + AES_stackSpace_incr*2           // for work reg save/restore
2213                          ;
2214     AES_dataBlk_space    = roundup(2*dataBlk_len, AES_parmBlk_align);
2215     AES_dataBlk_offset   = -(AES_parmBlk_addspace+AES_dataBlk_space);
2216     const int key_len    = parmBlk_len;                    // The length of the unextended key (16, 24, 32)
2217 
2218     assert((AES_ctrVal_len == 0) || (AES_ctrVal_len == dataBlk_len), "varying dataBlk_len is not supported.");
2219     AES_ctrVal_len  = dataBlk_len;                         // ctr init value len (in bytes)
2220     AES_ctrArea_len = AES_ctrVec_len * AES_ctrVal_len;     // space required on stack for ctr vector
2221 
2222     // This len must be known at JIT compile time. Only then are we able to recalc the SP before resize.
2223     // We buy this knowledge by wasting some (up to AES_parmBlk_align) bytes of stack space.
2224     const int resize_len = AES_parmBlk_align               // room for alignment of parmBlk
2225                          + AES_parmBlk_align               // extra room for alignment
2226                          + AES_dataBlk_space               // one src and one dst data blk
2227                          + AES_parmBlk_addspace            // spill space for local data
2228                          + roundup(parmBlk_len, AES_parmBlk_align)  // aligned length of parmBlk
2229                          + AES_ctrArea_len                 // stack space for ctr vector
2230                          ;
    Register scratch     = fCode;  // We can use fCode as a scratch register. Its contents on entry
                                   // are irrelevant, and it is set at the very end of this code block.
2233 
2234     assert(key_len < 256, "excessive crypto key len: %d, limit: 256", key_len);
2235 
2236     BLOCK_COMMENT(err_msg("push_Block (%d bytes) counterMode_AESCrypt%d {", resize_len, parmBlk_len*8));
2237 
2238     // After the frame is resized, the parmBlk is positioned such
2239     // that it is octoword-aligned. This potentially creates some
2240     // alignment waste in addspace and/or in the gap area.
2241     // After resize_frame, scratch contains the frame pointer.
2242     __ resize_frame(-resize_len, scratch, true);
2243 #ifdef ASSERT
2244     __ clear_mem(Address(Z_SP, (intptr_t)8), resize_len - 8);
2245 #endif
2246 
2247     // calculate aligned parmBlk address from updated (resized) SP.
2248     __ add2reg(parmBlk, AES_parmBlk_addspace + AES_dataBlk_space + (2*AES_parmBlk_align-1), Z_SP);
2249     __ z_nill(parmBlk, (~(AES_parmBlk_align-1)) & 0xffff); // Align parameter block.
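    // Net effect of the two instructions above: parmBlk is Z_SP + AES_parmBlk_addspace +
    // AES_dataBlk_space + 2*AES_parmBlk_align - 1, rounded down to a multiple of
    // AES_parmBlk_align. That leaves at least AES_parmBlk_addspace + AES_dataBlk_space +
    // AES_parmBlk_align bytes of room between Z_SP and the parameter block.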
2250 
2251     // There is room to spill stuff in the range [parmBlk-AES_parmBlk_addspace+8, parmBlk).
2252     __ z_mviy(keylen_offset, parmBlk, key_len - 1);        // Spill crypto key length for later use. Decrement by one for direct use with xc template.
2253     __ z_mviy(fCode_offset,  parmBlk, crypto_fCode);       // Crypto function code, will be loaded into Z_R0 later.
2254     __ z_sty(msglen, msglen_offset, parmBlk);              // full plaintext/ciphertext len.
2255     __ z_sty(msglen, msglen_red_offset, parmBlk);          // save for main loop, may get updated in preLoop.
2256     __ z_sra(msglen, exact_log2(dataBlk_len));             // # full cipher blocks that can be formed from input text.
2257     __ z_sty(msglen, rem_msgblk_offset, parmBlk);
2258 
2259     __ add2reg(scratch, resize_len, Z_SP);                 // calculate (SP before resize) from resized SP.
2260     __ z_stg(scratch, unextSP_offset, parmBlk);            // Spill unextended SP for easy revert.
2261     __ z_stmg(Z_R10, Z_R13, regsave_offset, parmBlk);      // make some regs available as work registers
2262 
2263     // Fill parmBlk with all required data
2264     __ z_mvc(0, key_len-1, parmBlk, 0, key);               // Copy key. Need to do it here - key_len is only known here.
2265     BLOCK_COMMENT(err_msg("} push_Block (%d bytes) counterMode_AESCrypt%d", resize_len, parmBlk_len*8));
2266     return resize_len;
2267   }
2268 
2269 
2270   void generate_counterMode_pop_Block(Register parmBlk, Register msglen, Label& eraser) {
2271     // For added safety, clear the stack area where the crypto key was stored.
2272     Register scratch = msglen;
2273     assert_different_registers(scratch, Z_R0);             // can't use Z_R0 for exrl.
2274 
2275     // wipe out key on stack
2276     __ z_llgc(scratch, keylen_offset, parmBlk);            // get saved (key_len-1) value (we saved just one byte!)
2277     __ z_exrl(scratch, eraser);                            // template relies on parmBlk still pointing to key on stack
2278 
2279     // restore argument registers.
2280     //   ARG1(from) is Z_RET as well. Not restored - will hold return value anyway.
2281     //   ARG5(msglen) is restored further down.
2282     __ z_lmg(Z_ARG2, Z_ARG4, argsave_offset,    parmBlk);
2283 
2284     // restore work registers
2285     __ z_lmg(Z_R10, Z_R13, regsave_offset, parmBlk);       // make some regs available as work registers
2286 
2287     __ z_lgf(msglen, msglen_offset,  parmBlk);             // Restore msglen, only low order FW is valid
2288 #ifdef ASSERT
2289     {
2290       Label skip2last, skip2done;
2291       // Z_RET (aka Z_R2) can be used as scratch as well. It will be set from msglen before return.
2292       __ z_lgr(Z_RET, Z_SP);                                 // save extended SP
2293       __ z_lg(Z_SP,    unextSP_offset, parmBlk);             // trim stack back to unextended size
2294       __ z_sgrk(Z_R1, Z_SP, Z_RET);
2295 
2296       __ z_cghi(Z_R1, 256);
2297       __ z_brl(skip2last);
2298       __ z_xc(0, 255, Z_RET, 0, Z_RET);
2299       __ z_aghi(Z_RET, 256);
2300       __ z_aghi(Z_R1, -256);
2301 
2302       __ z_cghi(Z_R1, 256);
2303       __ z_brl(skip2last);
2304       __ z_xc(0, 255, Z_RET, 0, Z_RET);
2305       __ z_aghi(Z_RET, 256);
2306       __ z_aghi(Z_R1, -256);
2307 
2308       __ z_cghi(Z_R1, 256);
2309       __ z_brl(skip2last);
2310       __ z_xc(0, 255, Z_RET, 0, Z_RET);
2311       __ z_aghi(Z_RET, 256);
2312       __ z_aghi(Z_R1, -256);
2313 
2314       __ bind(skip2last);
2315       __ z_lgr(Z_R0, Z_RET);
2316       __ z_aghik(Z_RET, Z_R1, -1);  // decrement for exrl
2317       __ z_brl(skip2done);
2318       __ z_lgr(parmBlk, Z_R0);      // parmBlk == Z_R1, used in eraser template
2319       __ z_exrl(Z_RET, eraser);
2320 
2321       __ bind(skip2done);
2322     }
2323 #else
2324     __ z_lg(Z_SP,    unextSP_offset, parmBlk);             // trim stack back to unextended size
2325 #endif
2326   }
2327 
2328 
2329   int generate_counterMode_push_parmBlk(Register parmBlk, Register msglen, Register fCode, Register key, bool is_decipher) {
2330     int       resize_len = 0;
2331     int       mode = is_decipher ? VM_Version::CipherMode::decipher : VM_Version::CipherMode::cipher;
2332     Label     parmBlk_128, parmBlk_192, parmBlk_256, parmBlk_set;
2333     Register  keylen = fCode;      // Expanded key length, as read from key array, Temp only.
2334                                    // use fCode as scratch; fCode receives its final value later.
2335 
2336     // Read key len of expanded key (in 4-byte words).
2337     __ z_lgf(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2338     __ z_cghi(keylen, 52);
2339     if (VM_Version::has_Crypto_AES_CTR256()) { __ z_brh(parmBlk_256); }  // keyLen >  52: AES256. Assume: most frequent
2340     if (VM_Version::has_Crypto_AES_CTR128()) { __ z_brl(parmBlk_128); }  // keyLen <  52: AES128.
2341     if (VM_Version::has_Crypto_AES_CTR192()) { __ z_bre(parmBlk_192); }  // keyLen == 52: AES192. Assume: least frequent
2342 
2343     // Safety net: requested AES_CTR function for requested keylen not available on this CPU.
2344     __ stop_static("AES key strength not supported by CPU. Use -XX:-UseAESCTRIntrinsics as remedy.", 0);
2345 
2346     if (VM_Version::has_Crypto_AES_CTR128()) {
2347       __ bind(parmBlk_128);
2348       resize_len = generate_counterMode_push_Block(VM_Version::Cipher::_AES128_dataBlk,
2349                           VM_Version::Cipher::_AES128_parmBlk_G,
2350                           VM_Version::Cipher::_AES128 + mode,
2351                           parmBlk, msglen, fCode, key);
2352       if (VM_Version::has_Crypto_AES_CTR256() || VM_Version::has_Crypto_AES_CTR192()) {
2353         __ z_bru(parmBlk_set);  // Fallthru otherwise.
2354       }
2355     }
2356 
2357     if (VM_Version::has_Crypto_AES_CTR192()) {
2358       __ bind(parmBlk_192);
2359       resize_len = generate_counterMode_push_Block(VM_Version::Cipher::_AES192_dataBlk,
2360                           VM_Version::Cipher::_AES192_parmBlk_G,
2361                           VM_Version::Cipher::_AES192 + mode,
2362                           parmBlk, msglen, fCode, key);
2363       if (VM_Version::has_Crypto_AES_CTR256()) {
2364         __ z_bru(parmBlk_set);  // Fallthru otherwise.
2365       }
2366     }
2367 
2368     if (VM_Version::has_Crypto_AES_CTR256()) {
2369       __ bind(parmBlk_256);
2370       resize_len = generate_counterMode_push_Block(VM_Version::Cipher::_AES256_dataBlk,
2371                           VM_Version::Cipher::_AES256_parmBlk_G,
2372                           VM_Version::Cipher::_AES256 + mode,
2373                           parmBlk, msglen, fCode, key);
2374       // Fallthru
2375     }
2376 
2377     __ bind(parmBlk_set);
2378     return resize_len;
2379   }
2380 
2381 
2382   void generate_counterMode_pop_parmBlk(Register parmBlk, Register msglen, Label& eraser) {
2383 
2384     BLOCK_COMMENT("pop parmBlk counterMode_AESCrypt {");
2385 
2386     generate_counterMode_pop_Block(parmBlk, msglen, eraser);
2387 
2388     BLOCK_COMMENT("} pop parmBlk counterMode_AESCrypt");
2389   }
2390 
2391   // Implementation of counter-mode AES encrypt/decrypt function.
2392   //
2393   void generate_counterMode_AES_impl(bool is_decipher) {
2394 
2395     // On entry:
2396     // if there was a previous call to update(), and this previous call did not fully use
2397     // the current encrypted counter, that counter is available at arg6_Offset(Z_SP).
    // The index of the first unused byte in the encrypted counter is available at arg7_Offset(Z_SP).
2399     // The index is in the range [1..AES_ctrVal_len] ([1..16]), where index == 16 indicates a fully
2400     // used previous encrypted counter.
2401     // The unencrypted counter has already been incremented and is ready to be used for the next
2402     // data block, after the unused bytes from the previous call have been consumed.
2403     // The unencrypted counter follows the "increment-after use" principle.
2404 
2405     // On exit:
2406     // The index of the first unused byte of the encrypted counter is written back to arg7_Offset(Z_SP).
2407     // A value of AES_ctrVal_len (16) indicates there is no leftover byte.
2408     // If there is at least one leftover byte (1 <= index < AES_ctrVal_len), the encrypted counter value
2409     // is written back to arg6_Offset(Z_SP). If there is no leftover, nothing is written back.
2410     // The unencrypted counter value is written back after having been incremented.
2411 
2412     Register       from    = Z_ARG1; // byte[], source byte array (clear text)
2413     Register       to      = Z_ARG2; // byte[], destination byte array (ciphered)
2414     Register       key     = Z_ARG3; // byte[], expanded key array.
2415     Register       ctr     = Z_ARG4; // byte[], counter byte array.
2416     const Register msglen  = Z_ARG5; // int, Total length of the msg to be encrypted. Value must be
2417                                      // returned in Z_RET upon completion of this stub.
2418                                      // This is a jint. Negative values are illegal, but technically possible.
2419                                      // Do not rely on the high word. Its contents are undefined.
2420                // encCtr   = Z_ARG6  - encrypted counter (byte array),
2421                //                      address passed on stack at _z_abi(remaining_cargs) + 0 * WordSize
2422                // cvIndex  = Z_ARG7  - # used (consumed) bytes of encrypted counter,
2423                //                      passed on stack at _z_abi(remaining_cargs) + 1 * WordSize
2424                //                      Caution: 4-byte value, right-justified in 8-byte stack word
2425 
2426     const Register fCode   = Z_R0;   // crypto function code
2427     const Register parmBlk = Z_R1;   // parameter block address (points to crypto key)
2428     const Register src     = Z_ARG1; // is Z_R2, forms even/odd pair with srclen
2429     const Register srclen  = Z_ARG2; // Overwrites destination address.
2430     const Register dst     = Z_ARG3; // Overwrites key address.
2431     const Register counter = Z_ARG5; // Overwrites msglen. Must have counter array in an even register.
2432 
2433     Label srcMover, dstMover, fromMover, ctrXOR, dataEraser;  // EXRL (execution) templates.
2434     Label CryptoLoop, CryptoLoop_doit, CryptoLoop_end, CryptoLoop_setupAndDoLast, CryptoLoop_ctrVal_inc;
2435     Label allDone, allDone_noInc, popAndExit, Exit;
2436 
2437     int    arg6_Offset = _z_abi(remaining_cargs) + 0 * HeapWordSize;
2438     int    arg7_Offset = _z_abi(remaining_cargs) + 1 * HeapWordSize; // stack slot holds ptr to int value
2439     int   oldSP_Offset = 0;
2440 
2441     // Is there anything to do at all? Protect against negative len as well.
2442     __ z_ltr(msglen, msglen);
2443     __ z_brnh(Exit);
2444 
2445     // Expand stack, load parm block address into parmBlk (== Z_R1), copy crypto key to parm block.
2446     oldSP_Offset = generate_counterMode_push_parmBlk(parmBlk, msglen, fCode, key, is_decipher);
2447     arg6_Offset += oldSP_Offset;
2448     arg7_Offset += oldSP_Offset;
2449 
2450     // Check if there is a leftover, partially used encrypted counter from last invocation.
2451     // If so, use those leftover counter bytes first before starting the "normal" encryption.
2452 
2453     // We do not have access to the encrypted counter value. It is generated and used only
2454     // internally within the previous kmctr instruction. But, at the end of a call to this stub,
2455     // the last encrypted counter is extracted by ciphering a 0x00 byte stream. The result is
2456     // stored at the arg6 location for use with the subsequent call.
2457     //
2458     // The #used bytes of the encrypted counter (from a previous call) is provided via arg7.
2459     // It is used as an index into the encrypted counter to access the first byte available for ciphering.
2460     // To cipher the input text, we move the number of remaining bytes in the encrypted counter from
2461     // input to output. Then we simply XOR the output bytes with the associated encrypted counter bytes.
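         //
         // Note on the EXRL/template technique used below (and again for the tail handling):
         // z_mvc/z_xc are SS-format instructions whose 8-bit length field encodes (length - 1).
         // The templates at fromMover/ctrXOR are emitted with a length field of 0; EXRL executes
         // a copy of the template with the low byte of the register operand OR'ed into that
         // length field. This is why cvUnused is decremented by 1 right before the two z_exrl
         // invocations and restored afterwards.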
2462 
2463     Register cvIxAddr  = Z_R10;                  // Address of index into encCtr. Preserved for use @CryptoLoop_end.
2464     __ z_lg(cvIxAddr, arg7_Offset, Z_SP);        // arg7: addr of field encCTR_index.
2465 
2466     {
2467       Register cvUnused  = Z_R11;                // # unused bytes of encrypted counter value (= 16 - cvIndex)
2468       Register encCtr    = Z_R12;                // encrypted counter value, points to first unused byte.
2469       Register cvIndex   = Z_R13;                // index of first unused byte of encrypted counter value
2470       Label    preLoop_end;
2471 
2472       // preLoop is necessary only if there is a partially used encrypted counter (encCtr).
2473       // Partially used means cvIndex is in [1, dataBlk_len-1].
2474       // cvIndex == 0:           encCtr is set up but not used at all. Should not occur.
2475       // cvIndex == dataBlk_len: encCtr is exhausted, all bytes used.
2476       // Using unsigned compare protects against cases where (cvIndex < 0).
2477       __ z_clfhsi(0, cvIxAddr, AES_ctrVal_len);  // check #used bytes in encCtr against ctr len.
2478       __ z_brnl(preLoop_end);                    // if encCtr is fully used, skip to normal processing.
2479       __ z_ltgf(cvIndex, 0, Z_R0, cvIxAddr);     // # used bytes in encCTR.
2480       __ z_brz(preLoop_end);                     // if encCtr has no used bytes, skip to normal processing.
2481 
2482       __ z_lg(encCtr, arg6_Offset, Z_SP);        // encrypted counter from last call to update()
2483       __ z_agr(encCtr, cvIndex);                 // now points to first unused byte
2484 
2485       __ add2reg(cvUnused, -AES_ctrVal_len, cvIndex); // calculate #unused bytes in encCtr.
2486       __ z_lcgr(cvUnused, cvUnused);             // previous checks ensure cvUnused in range [1, dataBlk_len-1]
2487 
2488       __ z_lgf(msglen, msglen_offset, parmBlk);  // Restore msglen (jint value)
2489       __ z_cr(cvUnused, msglen);                 // check if msg can consume all unused encCtr bytes
2490       __ z_locr(cvUnused, msglen, Assembler::bcondHigh); // take the shorter length
2491       __ z_aghi(cvUnused, -1);                   // decrement # unused bytes by 1 for exrl instruction
2492                                                  // preceding checks ensure cvUnused in range [1, dataBlk_len-1]
2493       __ z_exrl(cvUnused, fromMover);
2494       __ z_exrl(cvUnused, ctrXOR);
2495 
2496       __ z_aghi(cvUnused, 1);                    // revert decrement from above
2497       __ z_agr(cvIndex, cvUnused);               // update index into encCtr (first unused byte)
2498       __ z_st(cvIndex, 0, cvIxAddr);             // write back arg7, cvIxAddr is still valid
2499 
2500       // update pointers and counters to prepare for main loop
2501       __ z_agr(from, cvUnused);
2502       __ z_agr(to, cvUnused);
2503       __ z_sr(msglen, cvUnused);                 // #bytes not yet processed
2504       __ z_sty(msglen, msglen_red_offset, parmBlk); // save for calculations in main loop
2505       __ z_srak(Z_R0, msglen, exact_log2(AES_ctrVal_len)); // # full cipher blocks that can be formed from input text.
2506       __ z_sty(Z_R0, rem_msgblk_offset, parmBlk);
2507 
2508       // check remaining msglen. If zero, all msg bytes were processed in preLoop.
2509       __ z_ltr(msglen, msglen);
2510       __ z_brnh(popAndExit);
2511 
2512       __ bind(preLoop_end);
2513     }
2514 
2515     // Create counter vector on stack to accommodate up to AES_ctrVec_len blocks.
2516     generate_counterMode_prepare_Stack(parmBlk, ctr, counter, fCode);
2517 
2518     // Prepare other registers for instruction.
2519     __ lgr_if_needed(src, from);     // Copy src address. Will not emit, src/from are identical.
2520     __ z_lgr(dst, to);
2521     __ z_llgc(fCode, fCode_offset, Z_R0, parmBlk);
2522 
2523     __ bind(CryptoLoop);
2524       __ z_lghi(srclen, AES_ctrArea_len);                     // preset len (#bytes) for next iteration: max possible.
2525       __ z_asi(rem_msgblk_offset, parmBlk, -AES_ctrVec_len);  // decrement #remaining blocks (16 bytes each). Range: [+127..-128]
2526       __ z_brl(CryptoLoop_setupAndDoLast);                    // Handling the last iteration (using less than max #blocks) out-of-line
2527 
2528       __ bind(CryptoLoop_doit);
2529       __ kmctr(dst, counter, src);   // Cipher the message.
2530 
2531       __ z_lt(srclen, rem_msgblk_offset, Z_R0, parmBlk);      // check if this was the last iteration
2532       __ z_brz(CryptoLoop_ctrVal_inc);                        // == 0: ctrVector fully used. Need to increment the first
2533                                                               //       vector element to encrypt remaining unprocessed bytes.
2534 //    __ z_brl(CryptoLoop_end);                               //  < 0: this was detected before and handled at CryptoLoop_setupAndDoLast
2535                                                               //  > 0: this is the fallthru case, need another iteration
2536 
2537       generate_counterMode_increment_ctrVector(parmBlk, counter, srclen, false); // srclen unused here (serves as scratch)
2538       __ z_bru(CryptoLoop);
2539 
2540     __ bind(CryptoLoop_end);
2541 
2542     // OK, when we arrive here, we have encrypted all of the "from" byte stream
2543     // except for the last few [0..dataBlk_len) bytes. In addition, we know that
2544     // there are no more unused bytes in the previously generated encrypted counter.
2545     // The (unencrypted) counter, however, is ready to use (it was incremented before).
2546 
2547     // To encrypt the few remaining bytes, we need to form an extra src and dst
2548     // data block of dataBlk_len each. This is because we can only process full
2549     // blocks but we must not read or write beyond the boundaries of the argument
2550     // arrays. Here is what we do:
2551     //  - The ctrVector has at least one unused element. This is ensured by CryptoLoop code.
2552     //  - The (first) unused element is pointed at by the counter register.
2553     //  - The src data block is filled with the remaining "from" bytes, remainder of block undefined.
2554     //  - The single src data block is encrypted into the dst data block.
2555     //  - The dst data block is copied into the "to" array, but only the leftmost few bytes
2556     //    (as many as were left in the source byte stream).
2557     //  - The counter value to be used is pointed at by the counter register.
2558     //  - Fortunately, the crypto instruction (kmctr) has updated all related addresses such that
2559     //    we know where to continue with "from" and "to" and which counter value to use next.
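         //
         // In short (comment-only sketch of the code below, using the names defined next):
         //   1. Copy the srclen residue bytes of the source stream into a stack-resident src
         //      block (the 16 bytes right after tmpDst); the rest of that block is don't-care.
         //   2. kmctr ciphers that single full block into the stack dst block (tmpDst).
         //   3. Copy only the leftmost srclen bytes of tmpDst into the "to" array.
         //   4. Cipher a zeroed block with the same counter value to recover the raw keystream
         //      (the "encrypted counter") and store it at arg6; arg7 receives srclen.
         //   5. Increment the counter once and write it back, so it is ready for the next call.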
2560 
2561     Register encCtr    = Z_R12;  // encrypted counter value, points to stub argument.
2562     Register tmpDst    = Z_R12;  // addr of temp destination (for last partial block encryption)
2563 
2564     __ z_lgf(srclen, msglen_red_offset, parmBlk);          // plaintext/ciphertext len after potential preLoop processing.
2565     __ z_nilf(srclen, AES_ctrVal_len - 1);                 // those rightmost bits indicate the unprocessed #bytes
2566     __ z_stg(srclen, localSpill_offset, parmBlk);          // save for later reuse
2567     __ z_mvhi(0, cvIxAddr, 16);                            // write back arg7 (default 16 in case of allDone).
2568     __ z_braz(allDone_noInc);                              // no unprocessed bytes? Then we are done.
2569                                                            // This also means the last block of data processed was
2570                                                            // a full-sized block (AES_ctrVal_len bytes) which results
2571                                                            // in no leftover encrypted counter bytes.
2572     __ z_st(srclen, 0, cvIxAddr);                          // This will be the index of the first unused byte in the encrypted counter.
2573     __ z_stg(counter, counter_offset, parmBlk);            // save counter location for easy later restore
2574 
2575     // calculate address (on stack) for final dst and src blocks.
2576     __ add2reg(tmpDst, AES_dataBlk_offset, parmBlk);       // tmp dst (on stack) is right before tmp src
2577 
2578     // We have a residue of [1..15] unprocessed bytes, srclen holds the exact number.
2579     // Residue == 0 was checked just above, residue == AES_ctrVal_len would be another
2580     // full-sized block and would have been handled by CryptoLoop.
2581 
2582     __ add2reg(srclen, -1);                                // decrement for exrl
2583     __ z_exrl(srclen, srcMover);                           // copy remaining bytes of src byte stream
2584     __ load_const_optimized(srclen, AES_ctrVal_len);       // kmctr processes only complete blocks
2585     __ add2reg(src, AES_ctrVal_len, tmpDst);               // tmp dst is right before tmp src
2586 
2587     __ kmctr(tmpDst, counter, src);                        // Cipher the remaining bytes.
2588 
2589     __ add2reg(tmpDst, -AES_ctrVal_len, tmpDst);           // restore tmp dst address
2590     __ z_lg(srclen, localSpill_offset, parmBlk);           // residual len, saved above
2591     __ add2reg(srclen, -1);                                // decrement for exrl
2592     __ z_exrl(srclen, dstMover);
2593 
2594     // Write back new encrypted counter
2595     __ add2reg(src, AES_dataBlk_offset, parmBlk);
2596     __ clear_mem(Address(src, RegisterOrConstant((intptr_t)0)), AES_ctrVal_len);
2597     __ load_const_optimized(srclen, AES_ctrVal_len);       // kmctr processes only complete blocks
2598     __ z_lg(encCtr, arg6_Offset, Z_SP);                    // write encrypted counter to arg6
2599     __ z_lg(counter, counter_offset, parmBlk);             // restore counter
2600     __ kmctr(encCtr, counter, src);
2601 
2602     // The last used element of the counter vector contains the latest counter value that was used.
2603     // As described above, the counter value on exit must be the one to be used next.
2604     __ bind(allDone);
2605     __ z_lg(counter, counter_offset, parmBlk);             // restore counter
2606     generate_increment128(counter, 0, 1, Z_R0);
2607 
2608     __ bind(allDone_noInc);
2609     __ z_mvc(0, AES_ctrVal_len, ctr, 0, counter);
2610 
2611     __ bind(popAndExit);
2612     generate_counterMode_pop_parmBlk(parmBlk, msglen, dataEraser);
2613 
2614     __ bind(Exit);
2615     __ z_lgfr(Z_RET, msglen);
2616 
2617     __ z_br(Z_R14);
2618 
2619     //----------------------------
2620     //---<  out-of-line code  >---
2621     //----------------------------
2622     __ bind(CryptoLoop_setupAndDoLast);
2623       __ z_lgf(srclen, rem_msgblk_offset, parmBlk);           // remaining #blocks in memory is < 0
2624       __ z_aghi(srclen, AES_ctrVec_len);                      // recalculate the actually remaining #blocks
2625       __ z_sllg(srclen, srclen, exact_log2(AES_ctrVal_len));  // convert to #bytes. Counter value is same length as data block
2626       __ kmctr(dst, counter, src);                            // Cipher the last integral blocks of the message.
2627       __ z_bru(CryptoLoop_end);                               // There is at least one unused counter vector element.
2628                                                               // no need to increment.
2629 
2630     __ bind(CryptoLoop_ctrVal_inc);
2631       generate_counterMode_increment_ctrVector(parmBlk, counter, srclen, true); // srclen unused here (serves as scratch)
2632       __ z_bru(CryptoLoop_end);
2633 
2634     //-------------------------------------------
2635     //---<  execution templates for preLoop  >---
2636     //-------------------------------------------
2637     __ bind(fromMover);
2638     __ z_mvc(0, 0, to, 0, from);               // Template instruction to move input data to dst.
2639     __ bind(ctrXOR);
2640     __ z_xc(0,  0, to, 0, encCtr);             // Template instruction to XOR input data (now in to) with encrypted counter.
2641 
2642     //-------------------------------
2643     //---<  execution templates  >---
2644     //-------------------------------
2645     __ bind(dataEraser);
2646     __ z_xc(0, 0, parmBlk, 0, parmBlk);  // Template instruction to erase crypto key on stack.
2647     __ bind(dstMover);
2648     __ z_mvc(0, 0, dst, 0, tmpDst);      // Template instruction to move encrypted remainder from stack to dst.
2649     __ bind(srcMover);
2650     __ z_mvc(AES_ctrVal_len, 0, tmpDst, 0, src); // Template instruction to move remainder of source byte stream to stack.
2651   }
2652 
2653 
2654   // Create two intrinsic variants, optimized for short and long plaintexts.
2655   void generate_counterMode_AES(bool is_decipher) {
2656 
2657     const Register msglen  = Z_ARG5;    // int, Total length of the msg to be encrypted. Value must be
2658                                         // returned in Z_RET upon completion of this stub.
2659     const int threshold = 256;          // above this length (in bytes), text is considered long.
2660     const int vec_short = threshold>>6; // that many blocks (16 bytes each) per iteration, max 4 loop iterations
2661     const int vec_long  = threshold>>2; // that many blocks (16 bytes each) per iteration.
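         // For the given threshold of 256 bytes this yields vec_short = 4 and vec_long = 64,
         // i.e. up to 64 resp. 1024 bytes are ciphered per kmctr invocation (and that many
         // bytes of counter vector space are reserved on the stack, see AES_ctrArea_len).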
2662 
2663     Label AESCTR_short, AESCTR_long;
2664 
2665     __ z_chi(msglen, threshold);
2666     __ z_brh(AESCTR_long);
2667 
2668     __ bind(AESCTR_short);
2669 
2670     BLOCK_COMMENT(err_msg("counterMode_AESCrypt (text len <= %d, block size = %d) {", threshold, vec_short*16));
2671 
2672     AES_ctrVec_len = vec_short;
2673     generate_counterMode_AES_impl(false);   // control of generated code will not return
2674 
2675     BLOCK_COMMENT(err_msg("} counterMode_AESCrypt (text len <= %d, block size = %d)", threshold, vec_short*16));
2676 
2677     __ align(32); // Octoword alignment benefits branch targets.
2678 
2679     BLOCK_COMMENT(err_msg("counterMode_AESCrypt (text len > %d, block size = %d) {", threshold, vec_long*16));
2680 
2681     __ bind(AESCTR_long);
2682     AES_ctrVec_len = vec_long;
2683     generate_counterMode_AES_impl(false);   // control of generated code will not return
2684 
2685     BLOCK_COMMENT(err_msg("} counterMode_AESCrypt (text len > %d, block size = %d)", threshold, vec_long*16));
2686   }
2687 
2688 
2689   // Compute AES-CTR crypto function.
2690   // Encrypt or decrypt is selected via parameters. Only one stub is necessary.
2691   address generate_counterMode_AESCrypt() {
2692     __ align(CodeEntryAlignment);
2693     StubGenStubId stub_id = StubGenStubId::counterMode_AESCrypt_id;
2694     StubCodeMark mark(this, stub_id);
2695     unsigned int   start_off = __ offset();  // Remember stub start address (is rtn value).
2696 
2697     generate_counterMode_AES(false);
2698 
2699     return __ addr_at(start_off);
2700   }
2701 
2702 // *****************************************************************************
2703 
2704   // Compute GHASH function.
2705   address generate_ghash_processBlocks() {
2706     __ align(CodeEntryAlignment);
2707     StubGenStubId stub_id = StubGenStubId::ghash_processBlocks_id;
2708     StubCodeMark mark(this, stub_id);
2709     unsigned int start_off = __ offset();   // Remember stub start address (is rtn value).
2710 
2711     const Register state   = Z_ARG1;
2712     const Register subkeyH = Z_ARG2;
2713     const Register data    = Z_ARG3; // 1st of even-odd register pair.
2714     const Register blocks  = Z_ARG4;
2715     const Register len     = blocks; // 2nd of even-odd register pair.
2716 
2717     const int param_block_size = 4 * 8;
2718     const int frame_resize = param_block_size + 8; // Extra space for copy of fp.
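         // Parameter block layout for KIMD-GHASH as filled in below (offsets relative to Z_R1):
         //   bytes  0..15: initial chaining value (the current GHASH state),
         //   bytes 16..31: hash subkey H.
         // The state is both input and output; it is copied back to the caller after the kimd call.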
2719 
2720     // Reserve stack space for parameter block (R1).
2721     __ z_lgr(Z_R1, Z_SP);
2722     __ resize_frame(-frame_resize, Z_R0, true);
2723     __ z_aghi(Z_R1, -param_block_size);
2724 
2725     // Fill parameter block.
2726     __ z_mvc(Address(Z_R1)    , Address(state)  , 16);
2727     __ z_mvc(Address(Z_R1, 16), Address(subkeyH), 16);
2728 
2729     // R4+5: data pointer + length
2730     __ z_llgfr(len, blocks);  // Cast to 64-bit.
2731 
2732     // R0: function code
2733     __ load_const_optimized(Z_R0, (int)VM_Version::MsgDigest::_GHASH);
2734 
2735     // Compute.
2736     __ z_sllg(len, len, 4);  // In bytes.
2737     __ kimd(data);
2738 
2739     // Copy back result and free parameter block.
2740     __ z_mvc(Address(state), Address(Z_R1), 16);
2741     __ z_xc(Address(Z_R1), param_block_size, Address(Z_R1));
2742     __ z_aghi(Z_SP, frame_resize);
2743 
2744     __ z_br(Z_R14);
2745 
2746     return __ addr_at(start_off);
2747   }
2748 
2749 
2750   // Call interface for all SHA* stubs.
2751   //
2752   //   Z_ARG1 - source data block. Ptr to leftmost byte to be processed.
2753   //   Z_ARG2 - current SHA state. Ptr to state area. This area serves as
2754   //            parameter block as required by the crypto instruction.
2755   //   Z_ARG3 - current byte offset in source data block.
2756   //   Z_ARG4 - last byte offset in source data block.
2757   //            (Z_ARG4 - Z_ARG3) gives the #bytes remaining to be processed.
2758   //
2759   //   Z_RET  - return value. First unprocessed byte offset in src buffer.
2760   //
2761   //   A few notes on the call interface:
2762   //    - All stubs, whether they are single-block or multi-block, are assumed to
2763   //      digest an integer multiple of the data block length of data. All data
2764   //      blocks are digested using the intermediate message digest (KIMD) instruction.
2765   //      Special end processing, as done by the KLMD instruction, seems to be
2766   //      emulated by the calling code.
2767   //
2768   //    - Z_ARG1 addresses the first byte of source data. The offset (Z_ARG3) is
2769   //      already accounted for.
2770   //
2771   //    - The current SHA state (the intermediate message digest value) is contained
2772   //      in an area addressed by Z_ARG2. The area size depends on the SHA variant
2773   //      and is accessible via the enum VM_Version::MsgDigest::_SHA<n>_parmBlk_I
2774   //
2775   //    - The single-block stub is expected to digest exactly one data block, starting
2776   //      at the address passed in Z_ARG1.
2777   //
2778   //    - The multi-block stub is expected to digest all data blocks which start in
2779   //      the offset interval [srcOff(Z_ARG3), srcLimit(Z_ARG4)). The exact difference
2780   //      (srcLimit-srcOff), rounded up to the next multiple of the data block length,
2781   //      gives the number of blocks to digest. It must be assumed that the calling code
2782   //      provides for a large enough source data buffer.
2783   //
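       // Worked example (multi-block SHA-1 case, data block length 64 bytes): with srcOff = 0
       // and srcLimit = 100, the length is rounded up to 128, two blocks are digested, and
       // Z_RET = 128. The caller must therefore guarantee that at least 128 valid bytes are
       // available starting at Z_ARG1.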
2784   // Compute SHA-1 function.
2785   address generate_SHA1_stub(StubGenStubId stub_id) {
2786     bool multiBlock;
2787     switch (stub_id) {
2788     case sha1_implCompress_id:
2789       multiBlock = false;
2790       break;
2791     case sha1_implCompressMB_id:
2792       multiBlock = true;
2793       break;
2794     default:
2795       ShouldNotReachHere();
2796     }
2797     __ align(CodeEntryAlignment);
2798     StubCodeMark mark(this, stub_id);
2799     unsigned int start_off = __ offset();   // Remember stub start address (is rtn value).
2800 
2801     const Register srcBuff        = Z_ARG1; // Points to first block to process (offset already added).
2802     const Register SHAState       = Z_ARG2; // Only on entry. Reused soon thereafter for kimd register pairs.
2803     const Register srcOff         = Z_ARG3; // int
2804     const Register srcLimit       = Z_ARG4; // Only passed in multiBlock case. int
2805 
2806     const Register SHAState_local = Z_R1;
2807     const Register SHAState_save  = Z_ARG3;
2808     const Register srcBufLen      = Z_ARG2; // Destroys state address, must be copied before.
2809     Label useKLMD, rtn;
2810 
2811     __ load_const_optimized(Z_R0, (int)VM_Version::MsgDigest::_SHA1);   // function code
2812     __ z_lgr(SHAState_local, SHAState);                                 // SHAState == parameter block
2813 
2814     if (multiBlock) {  // Process everything from offset to limit.
2815 
2816       // The following description is valid if we get a raw (unpimped) source data buffer,
2817       // spanning the range between [srcOff(Z_ARG3), srcLimit(Z_ARG4)). As detailed above,
2818       // the calling convention for these stubs is different. We leave the description in
2819       // to inform the reader what must be happening hidden in the calling code.
2820       //
2821       // The data block to be processed can have arbitrary length, i.e. its length does not
2822       // need to be an integer multiple of SHA<n>_datablk. Therefore, we need to implement
2823       // two different paths. If the length is an integer multiple, we use KIMD, which saves
2824       // us from copying the SHA state back and forth. If it is not, we copy the SHA state
2825       // to the stack, execute a KLMD instruction on it, and copy the result back to the
2826       // caller's SHA state location.
2827 
2828       // Total #srcBuff blocks to process.
2829       if (VM_Version::has_DistinctOpnds()) {
2830         __ z_srk(srcBufLen, srcLimit, srcOff); // exact difference
2831         __ z_ahi(srcBufLen, VM_Version::MsgDigest::_SHA1_dataBlk-1);   // round up
2832         __ z_nill(srcBufLen, (~(VM_Version::MsgDigest::_SHA1_dataBlk-1)) & 0xffff);
2833         __ z_ark(srcLimit, srcOff, srcBufLen); // Srclimit temporarily holds return value.
2834         __ z_llgfr(srcBufLen, srcBufLen);      // Cast to 64-bit.
2835       } else {
2836         __ z_lgfr(srcBufLen, srcLimit);        // Exact difference. srcLimit passed as int.
2837         __ z_sgfr(srcBufLen, srcOff);          // SrcOff passed as int, now properly cast to long.
2838         __ z_aghi(srcBufLen, VM_Version::MsgDigest::_SHA1_dataBlk-1);   // round up
2839         __ z_nill(srcBufLen, (~(VM_Version::MsgDigest::_SHA1_dataBlk-1)) & 0xffff);
2840         __ z_lgr(srcLimit, srcOff);            // SrcLimit temporarily holds return value.
2841         __ z_agr(srcLimit, srcBufLen);
2842       }
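           // In effect: srcBufLen = align_up(srcLimit - srcOff, _SHA1_dataBlk)
           //            srcLimit  = srcOff + srcBufLen    (the value to be returned in Z_RET)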
2843 
2844       // Integral #blocks to digest?
2845       // As a result of the calculations above, srcBufLen MUST be an integer
2846       // multiple of _SHA1_dataBlk, or else we are in big trouble.
2847       // We insert an asm_assert into the KLMD case to guard against that.
2848       __ z_tmll(srcBufLen, VM_Version::MsgDigest::_SHA1_dataBlk-1);
2849       __ z_brc(Assembler::bcondNotAllZero, useKLMD);
2850 
2851       // Process all full blocks.
2852       __ kimd(srcBuff);
2853 
2854       __ z_lgr(Z_RET, srcLimit);  // Offset of first unprocessed byte in buffer.
2855     } else {  // Process one data block only.
2856       __ load_const_optimized(srcBufLen, (int)VM_Version::MsgDigest::_SHA1_dataBlk);   // #srcBuff bytes to process
2857       __ kimd(srcBuff);
2858       __ add2reg(Z_RET, (int)VM_Version::MsgDigest::_SHA1_dataBlk, srcOff);            // Offset of first unprocessed byte in buffer. No 32 to 64 bit extension needed.
2859     }
2860 
2861     __ bind(rtn);
2862     __ z_br(Z_R14);
2863 
2864     if (multiBlock) {
2865       __ bind(useKLMD);
2866 
2867 #if 1
2868       // Security net: this stub is believed to be called for full-sized data blocks only
2869       // NOTE: The following code is believed to be correct, but it is not tested.
2870       __ stop_static("SHA1 stub can digest full data blocks only. Use -XX:-UseSHA as remedy.", 0);
2871 #endif
2872     }
2873 
2874     return __ addr_at(start_off);
2875   }
2876 
2877   // Compute SHA-256 function.
2878   address generate_SHA256_stub(StubGenStubId stub_id) {
2879     bool multiBlock;
2880     switch (stub_id) {
2881     case sha256_implCompress_id:
2882       multiBlock = false;
2883       break;
2884     case sha256_implCompressMB_id:
2885       multiBlock = true;
2886       break;
2887     default:
2888       ShouldNotReachHere();
2889     }
2890     __ align(CodeEntryAlignment);
2891     StubCodeMark mark(this, stub_id);
2892     unsigned int start_off = __ offset();   // Remember stub start address (is rtn value).
2893 
2894     const Register srcBuff        = Z_ARG1;
2895     const Register SHAState       = Z_ARG2; // Only on entry. Reused soon thereafter.
2896     const Register SHAState_local = Z_R1;
2897     const Register SHAState_save  = Z_ARG3;
2898     const Register srcOff         = Z_ARG3;
2899     const Register srcLimit       = Z_ARG4;
2900     const Register srcBufLen      = Z_ARG2; // Destroys state address, must be copied before.
2901     Label useKLMD, rtn;
2902 
2903     __ load_const_optimized(Z_R0, (int)VM_Version::MsgDigest::_SHA256); // function code
2904     __ z_lgr(SHAState_local, SHAState);                                 // SHAState == parameter block
2905 
2906     if (multiBlock) {  // Process everything from offset to limit.
2907       // The following description is valid if we get a raw (unpimped) source data buffer,
2908       // spanning the range between [srcOff(Z_ARG3), srcLimit(Z_ARG4)). As detailed above,
2909       // the calling convention for these stubs is different. We leave the description in
2910       // to inform the reader what must be happening hidden in the calling code.
2911       //
2912       // The data block to be processed can have arbitrary length, i.e. its length does not
2913       // need to be an integer multiple of SHA<n>_datablk. Therefore, we need to implement
2914       // two different paths. If the length is an integer multiple, we use KIMD, which saves
2915       // us from copying the SHA state back and forth. If it is not, we copy the SHA state
2916       // to the stack, execute a KLMD instruction on it, and copy the result back to the
2917       // caller's SHA state location.
2918 
2919       // total #srcBuff blocks to process
2920       if (VM_Version::has_DistinctOpnds()) {
2921         __ z_srk(srcBufLen, srcLimit, srcOff);   // exact difference
2922         __ z_ahi(srcBufLen, VM_Version::MsgDigest::_SHA256_dataBlk-1); // round up
2923         __ z_nill(srcBufLen, (~(VM_Version::MsgDigest::_SHA256_dataBlk-1)) & 0xffff);
2924         __ z_ark(srcLimit, srcOff, srcBufLen);   // Srclimit temporarily holds return value.
2925         __ z_llgfr(srcBufLen, srcBufLen);        // Cast to 64-bit.
2926       } else {
2927         __ z_lgfr(srcBufLen, srcLimit);          // exact difference
2928         __ z_sgfr(srcBufLen, srcOff);
2929         __ z_aghi(srcBufLen, VM_Version::MsgDigest::_SHA256_dataBlk-1); // round up
2930         __ z_nill(srcBufLen, (~(VM_Version::MsgDigest::_SHA256_dataBlk-1)) & 0xffff);
2931         __ z_lgr(srcLimit, srcOff);              // Srclimit temporarily holds return value.
2932         __ z_agr(srcLimit, srcBufLen);
2933       }
2934 
2935       // Integral #blocks to digest?
2936       // As a result of the calculations above, srcBufLen MUST be an integer
2937       // multiple of _SHA256_dataBlk, or else we are in big trouble.
2938       // We insert an asm_assert into the KLMD case to guard against that.
2939       __ z_tmll(srcBufLen, VM_Version::MsgDigest::_SHA256_dataBlk-1);
2940       __ z_brc(Assembler::bcondNotAllZero, useKLMD);
2941 
2942       // Process all full blocks.
2943       __ kimd(srcBuff);
2944 
2945       __ z_lgr(Z_RET, srcLimit);  // Offset of first unprocessed byte in buffer.
2946     } else {  // Process one data block only.
2947       __ load_const_optimized(srcBufLen, (int)VM_Version::MsgDigest::_SHA256_dataBlk); // #srcBuff bytes to process
2948       __ kimd(srcBuff);
2949       __ add2reg(Z_RET, (int)VM_Version::MsgDigest::_SHA256_dataBlk, srcOff);          // Offset of first unprocessed byte in buffer.
2950     }
2951 
2952     __ bind(rtn);
2953     __ z_br(Z_R14);
2954 
2955     if (multiBlock) {
2956       __ bind(useKLMD);
2957 #if 1
2958       // Security net: this stub is believed to be called for full-sized data blocks only.
2959       // NOTE:
2960       //   The following code is believed to be correct, but it is not tested.
2961       __ stop_static("SHA256 stub can digest full data blocks only. Use -XX:-UseSHA as remedy.", 0);
2962 #endif
2963     }
2964 
2965     return __ addr_at(start_off);
2966   }
2967 
2968   // Compute SHA-512 function.
2969   address generate_SHA512_stub(StubGenStubId stub_id) {
2970     bool multiBlock;
2971     switch (stub_id) {
2972     case sha512_implCompress_id:
2973       multiBlock = false;
2974       break;
2975     case sha512_implCompressMB_id:
2976       multiBlock = true;
2977       break;
2978     default:
2979       ShouldNotReachHere();
2980     }
2981     __ align(CodeEntryAlignment);
2982     StubCodeMark mark(this, stub_id);
2983     unsigned int start_off = __ offset();   // Remember stub start address (is rtn value).
2984 
2985     const Register srcBuff        = Z_ARG1;
2986     const Register SHAState       = Z_ARG2; // Only on entry. Reused soon thereafter.
2987     const Register SHAState_local = Z_R1;
2988     const Register SHAState_save  = Z_ARG3;
2989     const Register srcOff         = Z_ARG3;
2990     const Register srcLimit       = Z_ARG4;
2991     const Register srcBufLen      = Z_ARG2; // Destroys state address, must be copied before.
2992     Label useKLMD, rtn;
2993 
2994     __ load_const_optimized(Z_R0, (int)VM_Version::MsgDigest::_SHA512); // function code
2995     __ z_lgr(SHAState_local, SHAState);                                 // SHAState == parameter block
2996 
2997     if (multiBlock) {  // Process everything from offset to limit.
2998       // The following description is valid if we get a raw (unpimped) source data buffer,
2999       // spanning the range between [srcOff(Z_ARG3), srcLimit(Z_ARG4)). As detailed above,
3000       // the calling convention for these stubs is different. We leave the description in
3001       // to inform the reader what must be happening hidden in the calling code.
3002       //
3003       // The data block to be processed can have arbitrary length, i.e. its length does not
3004       // need to be an integer multiple of SHA<n>_datablk. Therefore, we need to implement
3005       // two different paths. If the length is an integer multiple, we use KIMD, which saves
3006       // us from copying the SHA state back and forth. If it is not, we copy the SHA state
3007       // to the stack, execute a KLMD instruction on it, and copy the result back to the
3008       // caller's SHA state location.
3009 
3010       // total #srcBuff blocks to process
3011       if (VM_Version::has_DistinctOpnds()) {
3012         __ z_srk(srcBufLen, srcLimit, srcOff);   // exact difference
3013         __ z_ahi(srcBufLen, VM_Version::MsgDigest::_SHA512_dataBlk-1); // round up
3014         __ z_nill(srcBufLen, (~(VM_Version::MsgDigest::_SHA512_dataBlk-1)) & 0xffff);
3015         __ z_ark(srcLimit, srcOff, srcBufLen);   // Srclimit temporarily holds return value.
3016         __ z_llgfr(srcBufLen, srcBufLen);        // Cast to 64-bit.
3017       } else {
3018         __ z_lgfr(srcBufLen, srcLimit);          // exact difference
3019         __ z_sgfr(srcBufLen, srcOff);
3020         __ z_aghi(srcBufLen, VM_Version::MsgDigest::_SHA512_dataBlk-1); // round up
3021         __ z_nill(srcBufLen, (~(VM_Version::MsgDigest::_SHA512_dataBlk-1)) & 0xffff);
3022         __ z_lgr(srcLimit, srcOff);              // Srclimit temporarily holds return value.
3023         __ z_agr(srcLimit, srcBufLen);
3024       }
3025 
3026       // Integral #blocks to digest?
3027       // As a result of the calculations above, srcBufLen MUST be an integer
3028       // multiple of _SHA512_dataBlk, or else we are in big trouble.
3029       // We insert an asm_assert into the KLMD case to guard against that.
3030       __ z_tmll(srcBufLen, VM_Version::MsgDigest::_SHA512_dataBlk-1);
3031       __ z_brc(Assembler::bcondNotAllZero, useKLMD);
3032 
3033       // Process all full blocks.
3034       __ kimd(srcBuff);
3035 
3036       __ z_lgr(Z_RET, srcLimit);  // Offset of first unprocessed byte in buffer.
3037     } else {  // Process one data block only.
3038       __ load_const_optimized(srcBufLen, (int)VM_Version::MsgDigest::_SHA512_dataBlk); // #srcBuff bytes to process
3039       __ kimd(srcBuff);
3040       __ add2reg(Z_RET, (int)VM_Version::MsgDigest::_SHA512_dataBlk, srcOff);          // Offset of first unprocessed byte in buffer.
3041     }
3042 
3043     __ bind(rtn);
3044     __ z_br(Z_R14);
3045 
3046     if (multiBlock) {
3047       __ bind(useKLMD);
3048 #if 1
3049       // Security net: this stub is believed to be called for full-sized data blocks only
3050       // NOTE:
3051       //   The following code is believed to be correct, but it is not tested.
3052       __ stop_static("SHA512 stub can digest full data blocks only. Use -XX:-UseSHA as remedy.", 0);
3053 #endif
3054     }
3055 
3056     return __ addr_at(start_off);
3057   }
3058 
3059 
3060   /**
3061    *  Arguments:
3062    *
3063    * Inputs:
3064    *   Z_ARG1    - int   crc
3065    *   Z_ARG2    - byte* buf
3066    *   Z_ARG3    - int   length (of buffer)
3067    *
3068    * Result:
3069    *   Z_RET     - int   crc result
3070    **/
3071   // Compute CRC function (generic, for all polynomials).
3072   void generate_CRC_updateBytes(Register table, bool invertCRC) {
3073 
3074     // arguments to kernel_crc32:
3075     Register       crc     = Z_ARG1;  // Current checksum, preset by caller or result from previous call, int.
3076     Register       data    = Z_ARG2;  // source byte array
3077     Register       dataLen = Z_ARG3;  // #bytes to process, int
3078 //    Register       table   = Z_ARG4;  // crc table address. Preloaded and passed in by caller.
3079     const Register t0      = Z_R10;   // work reg for kernel* emitters
3080     const Register t1      = Z_R11;   // work reg for kernel* emitters
3081     const Register t2      = Z_R12;   // work reg for kernel* emitters
3082     const Register t3      = Z_R13;   // work reg for kernel* emitters
3083 
3084 
3085     assert_different_registers(crc, data, dataLen, table);
3086 
3087     // These values are passed as ints, not as the longs the C calling convention would require.
3088     // Crc is used as int.
3089     __ z_llgfr(dataLen, dataLen);
3090 
3091     __ resize_frame(-(6*8), Z_R0, true); // Resize frame to provide add'l space for spilling the work registers.
3092     __ z_stmg(Z_R10, Z_R13, 1*8, Z_SP);  // Spill regs Z_R10..Z_R13 to make them available as work registers.
3093     __ kernel_crc32_1word(crc, data, dataLen, table, t0, t1, t2, t3, invertCRC);
3094     __ z_lmg(Z_R10, Z_R13, 1*8, Z_SP);   // Restore regs Z_R10..Z_R13 from stack.
3095     __ resize_frame(+(6*8), Z_R0, true); // Remove the additional frame space again.
3096 
3097     __ z_llgfr(Z_RET, crc);  // Updated crc is function result. No copying required, just zero upper 32 bits.
3098     __ z_br(Z_R14);          // Result already in Z_RET == Z_ARG1.
3099   }
3100 
3101 
3102   // Compute CRC32 function.
3103   address generate_CRC32_updateBytes() {
3104     __ align(CodeEntryAlignment);
3105     StubGenStubId stub_id =  StubGenStubId::updateBytesCRC32_id;
3106     StubCodeMark mark(this, stub_id);
3107     unsigned int   start_off = __ offset();  // Remember stub start address (is rtn value).
3108 
3109     assert(UseCRC32Intrinsics, "should not generate this stub (%s) with CRC32 intrinsics disabled", StubRoutines::get_stub_name(stub_id));
3110 
3111     BLOCK_COMMENT("CRC32_updateBytes {");
3112     Register       table   = Z_ARG4;  // crc32 table address.
3113     StubRoutines::zarch::generate_load_crc_table_addr(_masm, table);
3114 
3115     generate_CRC_updateBytes(table, true);
3116     BLOCK_COMMENT("} CRC32_updateBytes");
3117 
3118     return __ addr_at(start_off);
3119   }
3120 
3121 
3122   // Compute CRC32C function.
3123   address generate_CRC32C_updateBytes() {
3124     __ align(CodeEntryAlignment);
3125     StubGenStubId stub_id =  StubGenStubId::updateBytesCRC32C_id;
3126     StubCodeMark mark(this, stub_id);
3127     unsigned int   start_off = __ offset();  // Remember stub start address (is rtn value).
3128 
3129     assert(UseCRC32CIntrinsics, "should not generate this stub (%s) with CRC32C intrinsics disabled", StubRoutines::get_stub_name(stub_id));
3130 
3131     BLOCK_COMMENT("CRC32C_updateBytes {");
3132     Register       table   = Z_ARG4;  // crc32c table address.
3133     StubRoutines::zarch::generate_load_crc32c_table_addr(_masm, table);
3134 
3135     generate_CRC_updateBytes(table, false);
3136     BLOCK_COMMENT("} CRC32C_updateBytes");
3137 
3138     return __ addr_at(start_off);
3139   }
3140 
3141 
3142   // Arguments:
3143   //   Z_ARG1    - x address
3144   //   Z_ARG2    - x length
3145   //   Z_ARG3    - y address
3146   //   Z_ARG4    - y length
3147   //   Z_ARG5    - z address
3148   address generate_multiplyToLen() {
3149     __ align(CodeEntryAlignment);
3150     StubGenStubId stub_id =  StubGenStubId::multiplyToLen_id;
3151     StubCodeMark mark(this, stub_id);
3152 
3153     address start = __ pc();
3154 
3155     const Register x    = Z_ARG1;
3156     const Register xlen = Z_ARG2;
3157     const Register y    = Z_ARG3;
3158     const Register ylen = Z_ARG4;
3159     const Register z    = Z_ARG5;
3160 
3161     // The following registers will be saved on the stack in multiply_to_len().
3162     const Register tmp1 = Z_tmp_1;
3163     const Register tmp2 = Z_tmp_2;
3164     const Register tmp3 = Z_tmp_3;
3165     const Register tmp4 = Z_tmp_4;
3166     const Register tmp5 = Z_R9;
3167 
3168     BLOCK_COMMENT("Entry:");
3169 
3170     __ z_llgfr(xlen, xlen);
3171     __ z_llgfr(ylen, ylen);
3172 
3173     __ multiply_to_len(x, xlen, y, ylen, z, tmp1, tmp2, tmp3, tmp4, tmp5);
3174 
3175     __ z_br(Z_R14);  // Return to caller.
3176 
3177     return start;
3178   }
3179 
3180   address generate_method_entry_barrier() {
3181     __ align(CodeEntryAlignment);
3182     StubGenStubId stub_id =  StubGenStubId::method_entry_barrier_id;
3183     StubCodeMark mark(this, stub_id);
3184 
3185     address start = __ pc();
3186 
3187     int nbytes_volatile = (8 + 5) * BytesPerWord;
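         // Rough flow: save the volatile state, call BarrierSetNMethod::nmethod_stub_entry_barrier()
         // with the address of the stored return pc as its argument. A zero result means the nmethod
         // may be entered, so we simply return. A non-zero result means the nmethod must not be
         // entered: the frame built in the prologue is popped as well (so the wrong-method stub can
         // deduce the caller) and control is transferred to the handle_wrong_method stub.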
3188 
3189     // VM-Call Prologue
3190     __ save_return_pc();
3191     __ push_frame_abi160(nbytes_volatile);
3192     __ save_volatile_regs(Z_SP, frame::z_abi_160_size, true, false);
3193 
3194     // Prep arg for VM call
3195     // Create ptr to stored return_pc in caller frame.
3196     __ z_la(Z_ARG1, _z_abi(return_pc) + frame::z_abi_160_size + nbytes_volatile, Z_R0, Z_SP);
3197 
3198     // VM-Call: BarrierSetNMethod::nmethod_stub_entry_barrier(address* return_address_ptr)
3199     __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSetNMethod::nmethod_stub_entry_barrier));
3200     __ z_ltr(Z_R0_scratch, Z_RET);
3201 
3202     // VM-Call Epilogue
3203     __ restore_volatile_regs(Z_SP, frame::z_abi_160_size, true, false);
3204     __ pop_frame();
3205     __ restore_return_pc();
3206 
3207     // Check return val of VM-Call
3208     __ z_bcr(Assembler::bcondZero, Z_R14);
3209 
3210     // Pop frame built in prologue.
3211     // Required so wrong_method_stub can deduce caller.
3212     __ pop_frame();
3213     __ restore_return_pc();
3214 
3215     // VM-Call indicates deoptimization required
3216     __ load_const_optimized(Z_R1_scratch, SharedRuntime::get_handle_wrong_method_stub());
3217     __ z_br(Z_R1_scratch);
3218 
3219     return start;
3220   }
3221 
3222   address generate_cont_thaw(bool return_barrier, bool exception) {
3223     if (!Continuations::enabled()) return nullptr;
3224     Unimplemented();
3225     return nullptr;
3226   }
3227 
3228   address generate_cont_thaw() {
3229     if (!Continuations::enabled()) return nullptr;
3230     Unimplemented();
3231     return nullptr;
3232   }
3233 
3234   address generate_cont_returnBarrier() {
3235     if (!Continuations::enabled()) return nullptr;
3236     Unimplemented();
3237     return nullptr;
3238   }
3239 
3240   address generate_cont_returnBarrier_exception() {
3241     if (!Continuations::enabled()) return nullptr;
3242     Unimplemented();
3243     return nullptr;
3244   }
3245 
3246   // exception handler for upcall stubs
3247   address generate_upcall_stub_exception_handler() {
3248     StubGenStubId stub_id =  StubGenStubId::upcall_stub_exception_handler_id;
3249     StubCodeMark mark(this, stub_id);
3250     address start = __ pc();
3251 
3252     // Native caller has no idea how to handle exceptions,
3253     // so we just crash here. Up to callee to catch exceptions.
3254     __ verify_oop(Z_ARG1);
3255     __ load_const_optimized(Z_R1_scratch, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception));
3256     __ call_c(Z_R1_scratch);
3257     __ should_not_reach_here();
3258 
3259     return start;
3260   }
3261 
3262   // load Method* target of MethodHandle
3263   // Z_ARG1 = jobject receiver
3264   // Z_method = Method* result
3265   address generate_upcall_stub_load_target() {
3266     StubGenStubId stub_id =  StubGenStubId::upcall_stub_load_target_id;
3267     StubCodeMark mark(this, stub_id);
3268     address start = __ pc();
3269 
3270     __ resolve_global_jobject(Z_ARG1, Z_tmp_1, Z_tmp_2);
3271     // Load target method from receiver
3272     __ load_heap_oop(Z_method, Address(Z_ARG1, java_lang_invoke_MethodHandle::form_offset()),
3273                     noreg, noreg, IS_NOT_NULL);
3274     __ load_heap_oop(Z_method, Address(Z_method, java_lang_invoke_LambdaForm::vmentry_offset()),
3275                     noreg, noreg, IS_NOT_NULL);
3276     __ load_heap_oop(Z_method, Address(Z_method, java_lang_invoke_MemberName::method_offset()),
3277                     noreg, noreg, IS_NOT_NULL);
3278     __ z_lg(Z_method, Address(Z_method, java_lang_invoke_ResolvedMethodName::vmtarget_offset()));
3279     __ z_stg(Z_method, Address(Z_thread, JavaThread::callee_target_offset())); // just in case callee is deoptimized
3280 
3281     __ z_br(Z_R14);
3282 
3283     return start;
3284   }
3285 
3286   void generate_initial_stubs() {
3287     // Generates all stubs and initializes the entry points.
3288 
3289     // Entry points that exist in all platforms.
3290     // Note: This is code that could be shared among different
3291     // platforms - however the benefit seems to be smaller than the
3292     // disadvantage of having a much more complicated generator
3293     // structure. See also comment in stubRoutines.hpp.
3294     StubRoutines::_forward_exception_entry                 = generate_forward_exception();
3295 
3296     StubRoutines::_call_stub_entry                         = generate_call_stub(StubRoutines::_call_stub_return_address);
3297     StubRoutines::_catch_exception_entry                   = generate_catch_exception();
3298 
3299     //----------------------------------------------------------------------
3300     // Entry points that are platform specific.
3301 
3302     if (UnsafeMemoryAccess::_table == nullptr) {
3303       UnsafeMemoryAccess::create_table(4); // 4 for setMemory
3304     }
3305 
3306     if (UseCRC32Intrinsics) {
3307       StubRoutines::_crc_table_adr     = (address)StubRoutines::zarch::_crc_table;
3308       StubRoutines::_updateBytesCRC32  = generate_CRC32_updateBytes();
3309     }
3310 
3311     if (UseCRC32CIntrinsics) {
3312       StubRoutines::_crc32c_table_addr = (address)StubRoutines::zarch::_crc32c_table;
3313       StubRoutines::_updateBytesCRC32C = generate_CRC32C_updateBytes();
3314     }
3315 
3316     // Compact string intrinsics: Translate table for the string inflate intrinsic. Used by the trot instruction.
3317     StubRoutines::zarch::_trot_table_addr = (address)StubRoutines::zarch::_trot_table;
3318   }
3319 
3320   void generate_continuation_stubs() {
3321     if (!Continuations::enabled()) return;
3322 
3323     // Continuation stubs:
3324     StubRoutines::_cont_thaw          = generate_cont_thaw();
3325     StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
3326     StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
3327   }
3328 
3329   void generate_final_stubs() {
3330     // Generates all stubs and initializes the entry points.
3331 
3332     // Support for verify_oop (must happen after universe_init).
3333     StubRoutines::_verify_oop_subroutine_entry             = generate_verify_oop_subroutine();
3334 
3335     // Arraycopy stubs used by compilers.
3336     generate_arraycopy_stubs();
3337 
3338     // nmethod entry barriers for concurrent class unloading
3339     StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
3340 
3341 #ifdef COMPILER2
3342     if (UseSecondarySupersTable) {
3343       StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
3344       if (!InlineSecondarySupersTest) {
3345         generate_lookup_secondary_supers_table_stub();
3346       }
3347     }
3348 #endif // COMPILER2
3349 
3350     StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
3351     StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();
3352   }
3353 
3354   void generate_compiler_stubs() {
3355 
3356     StubRoutines::zarch::_partial_subtype_check            = generate_partial_subtype_check();
3357 
3358 #if COMPILER2_OR_JVMCI
3359     // Generate AES intrinsics code.
3360     if (UseAESIntrinsics) {
3361       if (VM_Version::has_Crypto_AES()) {
3362         StubRoutines::_aescrypt_encryptBlock = generate_AES_encryptBlock();
3363         StubRoutines::_aescrypt_decryptBlock = generate_AES_decryptBlock();
3364         StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_AES_encrypt();
3365         StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_AES_decrypt();
3366       } else {
3367         // In PRODUCT builds, the function pointers will keep their initial (null) value.
3368         // LibraryCallKit::try_to_inline() will return false then, preventing the intrinsic from being called.
3369         assert(VM_Version::has_Crypto_AES(), "Inconsistent settings. Check vm_version_s390.cpp");
3370       }
3371     }
3372 
3373     if (UseAESCTRIntrinsics) {
3374       if (VM_Version::has_Crypto_AES_CTR()) {
3375         StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
3376       } else {
3377         // In PRODUCT builds, the function pointers will keep their initial (null) value.
3378         // LibraryCallKit::try_to_inline() will return false then, preventing the intrinsic from being called.
3379         assert(VM_Version::has_Crypto_AES_CTR(), "Inconsistent settings. Check vm_version_s390.cpp");
3380       }
3381     }
3382 
3383     // Generate GHASH intrinsics code
3384     if (UseGHASHIntrinsics) {
3385       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
3386     }
3387 
3388     // Generate SHA1/SHA256/SHA512 intrinsics code.
3389     if (UseSHA1Intrinsics) {
3390       StubRoutines::_sha1_implCompress     = generate_SHA1_stub(StubGenStubId::sha1_implCompress_id);
3391       StubRoutines::_sha1_implCompressMB   = generate_SHA1_stub(StubGenStubId::sha1_implCompressMB_id);
3392     }
3393     if (UseSHA256Intrinsics) {
3394       StubRoutines::_sha256_implCompress   = generate_SHA256_stub(StubGenStubId::sha256_implCompress_id);
3395       StubRoutines::_sha256_implCompressMB = generate_SHA256_stub(StubGenStubId::sha256_implCompressMB_id);
3396     }
3397     if (UseSHA512Intrinsics) {
3398       StubRoutines::_sha512_implCompress   = generate_SHA512_stub(StubGenStubId::sha512_implCompress_id);
3399       StubRoutines::_sha512_implCompressMB = generate_SHA512_stub(StubGenStubId::sha512_implCompressMB_id);
3400     }
3401 
3402 #ifdef COMPILER2
3403     if (UseMultiplyToLenIntrinsic) {
3404       StubRoutines::_multiplyToLen = generate_multiplyToLen();
3405     }
3406     if (UseMontgomeryMultiplyIntrinsic) {
3407       StubRoutines::_montgomeryMultiply
3408         = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_multiply);
3409     }
3410     if (UseMontgomerySquareIntrinsic) {
3411       StubRoutines::_montgomerySquare
3412         = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square);
3413     }
3414 #endif
3415 #endif // COMPILER2_OR_JVMCI
3416   }
3417 
3418  public:
3419   StubGenerator(CodeBuffer* code, StubGenBlobId blob_id) : StubCodeGenerator(code, blob_id) {
3420     switch(blob_id) {
3421     case initial_id:
3422       generate_initial_stubs();
3423       break;
3424      case continuation_id:
3425       generate_continuation_stubs();
3426       break;
3427     case compiler_id:
3428       generate_compiler_stubs();
3429       break;
3430     case final_id:
3431       generate_final_stubs();
3432       break;
3433     default:
3434       fatal("unexpected blob id: %d", blob_id);
3435       break;
3436     };
3437   }
3438 
3439  private:
3440   int _stub_count;
3441   void stub_prolog(StubCodeDesc* cdesc) {
3442 #ifdef ASSERT
3443     // Put extra information in the stub code, to make it more readable.
3444     // Write the high part of the address.
3445     // [RGV] Check if there is a dependency on the size of this prolog.
3446     __ emit_data((intptr_t)cdesc >> 32);
3447     __ emit_data((intptr_t)cdesc);
3448     __ emit_data(++_stub_count);
3449 #endif
3450     align(true);
3451   }
3452 
3453   void align(bool at_header = false) {
3454     // z/Architecture cache line size is 256 bytes.
3455     // There is no obvious benefit in aligning stub
3456     // code to cache lines. Use CodeEntryAlignment instead.
3457     const unsigned int icache_line_size      = CodeEntryAlignment;
3458     const unsigned int icache_half_line_size = MIN2<unsigned int>(32, CodeEntryAlignment);
3459 
3460     if (at_header) {
3461       while ((intptr_t)(__ pc()) % icache_line_size != 0) {
3462         __ z_illtrap();
3463       }
3464     } else {
3465       while ((intptr_t)(__ pc()) % icache_half_line_size != 0) {
3466         __ z_nop();
3467       }
3468     }
3469   }
3470 
3471 };
3472 
3473 void StubGenerator_generate(CodeBuffer* code, StubGenBlobId blob_id) {
3474   StubGenerator g(code, blob_id);
3475 }