1 /*
   2  * Copyright (c) 2016, 2023, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2016, 2023 SAP SE. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.inline.hpp"
  28 #include "registerSaver_s390.hpp"
  29 #include "gc/shared/barrierSet.hpp"
  30 #include "gc/shared/barrierSetAssembler.hpp"
  31 #include "gc/shared/barrierSetNMethod.hpp"
  32 #include "interpreter/interpreter.hpp"
  33 #include "interpreter/interp_masm.hpp"
  34 #include "memory/universe.hpp"
  35 #include "nativeInst_s390.hpp"
  36 #include "oops/instanceOop.hpp"
  37 #include "oops/objArrayKlass.hpp"
  38 #include "oops/oop.inline.hpp"
  39 #include "prims/methodHandles.hpp"
  40 #include "prims/upcallLinker.hpp"
  41 #include "runtime/frame.inline.hpp"
  42 #include "runtime/handles.inline.hpp"
  43 #include "runtime/javaThread.hpp"
  44 #include "runtime/sharedRuntime.hpp"
  45 #include "runtime/stubCodeGenerator.hpp"
  46 #include "runtime/stubRoutines.hpp"
  47 #include "utilities/formatBuffer.hpp"
  48 #include "utilities/macros.hpp"
  49 #include "utilities/powerOfTwo.hpp"
  50 
  51 // Declaration and definition of StubGenerator (no .hpp file).
  52 // For a more detailed description of the stub routine structure
  53 // see the comment in stubRoutines.hpp.
  54 
  55 #ifdef PRODUCT
  56 #define __ _masm->
  57 #else
  58 #define __ (Verbose ? (_masm->block_comment(FILE_AND_LINE),_masm):_masm)->
  59 #endif
  60 
  61 #define BLOCK_COMMENT(str) if (PrintAssembly || PrintStubCode) __ block_comment(str)
  62 #define BIND(label)        bind(label); BLOCK_COMMENT(#label ":")
  63 
  64 
  65   // These static, partially const, variables are for the AES intrinsics.
  66   // They are declared/initialized here to make them available across function bodies.
  67 
  68       static const int AES_parmBlk_align    = 32;                  // octoword alignment.
  69       static const int AES_stackSpace_incr  = AES_parmBlk_align;   // add'l stack space is allocated in such increments.
  70                                                                    // Must be multiple of AES_parmBlk_align.
  71 
  72       static int AES_ctrVal_len  = 0;                              // ctr init value len (in bytes), expected: length of dataBlk (16)
      static int AES_ctrVec_len  = 0;                              // # of ctr vector elements. That many blocks can be ciphered with one instruction execution.
  74       static int AES_ctrArea_len = 0;                              // reserved stack space (in bytes) for ctr (= ctrVal_len * ctrVec_len)
  75 
      static int AES_parmBlk_addspace = 0;  // Must be multiple of AES_parmBlk_align.
                                            // Will be set by stub generator to a stub-specific value.
      static int AES_dataBlk_space    = 0;  // Must be multiple of AES_parmBlk_align.
                                            // Will be set by stub generator to a stub-specific value.
      static int AES_dataBlk_offset   = 0;  // Offset of the local src and dst dataBlk buffers.
                                            // Will be set by stub generator to a stub-specific value.
  82 
  83       // These offsets are relative to the parameter block address (Register parmBlk = Z_R1)
  84       static const int keylen_offset     =  -1;
  85       static const int fCode_offset      =  -2;
  86       static const int ctrVal_len_offset =  -4;
  87       static const int msglen_offset     =  -8;
  88       static const int unextSP_offset    = -16;
  89       static const int rem_msgblk_offset = -20;
  90       static const int argsave_offset    = -2*AES_parmBlk_align;
  91       static const int regsave_offset    = -4*AES_parmBlk_align; // save space for work regs (Z_R10..13)
  92       static const int msglen_red_offset = regsave_offset + AES_parmBlk_align; // reduced len after preLoop;
  93       static const int counter_offset    = msglen_red_offset+8;  // current counter vector position.
  94       static const int localSpill_offset = argsave_offset + 24;  // arg2..arg4 are saved
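      // Rough sketch of the area below the parameter block address (parmBlk == Z_R1),
      // as implied by the offsets above (exact extents are stub specific):
      //   parmBlk -   1, -  2        : key length, function code (single bytes)
      //   parmBlk -   4, -  8        : ctr value length, message length
      //   parmBlk -  16, - 20        : unextended SP, remaining message blocks
      //   parmBlk - 2*parmBlk_align  : argument save area (arg2..arg4), local spill slots above it
      //   parmBlk - 4*parmBlk_align  : register save area (Z_R10..Z_R13), reduced message length
      //                                and current counter position right above it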
  95 
  96 
// -----------------------------------------------------------------------
// Stub Code definitions
  99 
 100 class StubGenerator: public StubCodeGenerator {
 101  private:
 102 
 103   //----------------------------------------------------------------------
 104   // Call stubs are used to call Java from C.
 105 
 106   //
 107   // Arguments:
 108   //
 109   //   R2        - call wrapper address     : address
 110   //   R3        - result                   : intptr_t*
 111   //   R4        - result type              : BasicType
 112   //   R5        - method                   : method
 113   //   R6        - frame mgr entry point    : address
 114   //   [SP+160]  - parameter block          : intptr_t*
 115   //   [SP+172]  - parameter count in words : int
 116   //   [SP+176]  - thread                   : Thread*
 117   //
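  // For orientation: the VM reaches this stub through the CallStub function pointer
  // (stubRoutines.hpp); JavaCalls::call_helper invokes it roughly as
  //
  //   StubRoutines::call_stub()((address)&link, result_val_address, result_type,
  //                             method(), entry_point, args->parameters(),
  //                             args->size_of_parameters(), thread);
  //
  // The first five arguments arrive in R2..R6, the remaining ones on the stack as listed above.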
 118   address generate_call_stub(address& return_address) {
 119     // Set up a new C frame, copy Java arguments, call frame manager
 120     // or native_entry, and process result.
 121 
 122     StubCodeMark mark(this, "StubRoutines", "call_stub");
 123     address start = __ pc();
 124 
 125     Register r_arg_call_wrapper_addr   = Z_ARG1;
 126     Register r_arg_result_addr         = Z_ARG2;
 127     Register r_arg_result_type         = Z_ARG3;
 128     Register r_arg_method              = Z_ARG4;
 129     Register r_arg_entry               = Z_ARG5;
 130 
    // Offsets of the stack-passed arguments, relative to the entry frame pointer (caller's SP).
    #define d_arg_thread 176
    #define d_arg_argument_addr 160
    #define d_arg_argument_count (168+4)
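    // These offsets correspond to the stack-passed arguments listed in the header comment:
    // parameter block at SP+160, parameter count at SP+172 (= 168+4), thread at SP+176.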
 135 
 136     Register r_entryframe_fp           = Z_tmp_1;
 137     Register r_top_of_arguments_addr   = Z_ARG4;
 138     Register r_new_arg_entry = Z_R14;
 139 
 140     // macros for frame offsets
 141     #define call_wrapper_address_offset \
 142                _z_entry_frame_locals_neg(call_wrapper_address)
 143     #define result_address_offset \
 144               _z_entry_frame_locals_neg(result_address)
 145     #define result_type_offset \
 146               _z_entry_frame_locals_neg(result_type)
 147     #define arguments_tos_address_offset \
 148               _z_entry_frame_locals_neg(arguments_tos_address)
 149 
 150     {
 151       //
 152       // STACK on entry to call_stub:
 153       //
 154       //     F1      [C_FRAME]
 155       //            ...
 156       //
 157 
 158       Register r_argument_addr              = Z_tmp_3;
 159       Register r_argumentcopy_addr          = Z_tmp_4;
 160       Register r_argument_size_in_bytes     = Z_ARG5;
 161       Register r_frame_size                 = Z_R1;
 162 
 163       Label arguments_copied;
 164 
 165       // Save non-volatile registers to ABI of caller frame.
 166       BLOCK_COMMENT("save registers, push frame {");
 167       __ z_stmg(Z_R6, Z_R14, 16, Z_SP);
 168       __ z_std(Z_F8, 96, Z_SP);
 169       __ z_std(Z_F9, 104, Z_SP);
 170       __ z_std(Z_F10, 112, Z_SP);
 171       __ z_std(Z_F11, 120, Z_SP);
 172       __ z_std(Z_F12, 128, Z_SP);
 173       __ z_std(Z_F13, 136, Z_SP);
 174       __ z_std(Z_F14, 144, Z_SP);
 175       __ z_std(Z_F15, 152, Z_SP);
 176 
 177       //
 178       // Push ENTRY_FRAME including arguments:
 179       //
 180       //     F0      [TOP_IJAVA_FRAME_ABI]
 181       //             [outgoing Java arguments]
 182       //             [ENTRY_FRAME_LOCALS]
 183       //     F1      [C_FRAME]
 184       //             ...
 185       //
 186 
 187       // Calculate new frame size and push frame.
 188       #define abi_plus_locals_size \
 189                 (frame::z_top_ijava_frame_abi_size + frame::z_entry_frame_locals_size)
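      // The frame size is kept as a negative word count and converted to a (negative)
      // byte count only at the end:
      //   r_frame_size = -(abi_plus_locals_size / BytesPerWord) - argument_count   // in words
      //   r_frame_size = r_frame_size << LogBytesPerWord                           // in bytes
      // push_frame() is then told that the size arrives sign-inverted.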
 190       if (abi_plus_locals_size % BytesPerWord == 0) {
 191         // Preload constant part of frame size.
 192         __ load_const_optimized(r_frame_size, -abi_plus_locals_size/BytesPerWord);
 193         // Keep copy of our frame pointer (caller's SP).
 194         __ z_lgr(r_entryframe_fp, Z_SP);
 195         // Add space required by arguments to frame size.
 196         __ z_slgf(r_frame_size, d_arg_argument_count, Z_R0, Z_SP);
 197         // Move Z_ARG5 early, it will be used as a local.
 198         __ z_lgr(r_new_arg_entry, r_arg_entry);
 199         // Convert frame size from words to bytes.
 200         __ z_sllg(r_frame_size, r_frame_size, LogBytesPerWord);
 201         __ push_frame(r_frame_size, r_entryframe_fp,
 202                       false/*don't copy SP*/, true /*frame size sign inverted*/);
 203       } else {
 204         guarantee(false, "frame sizes should be multiples of word size (BytesPerWord)");
 205       }
 206       BLOCK_COMMENT("} save, push");
 207 
 208       // Load argument registers for call.
 209       BLOCK_COMMENT("prepare/copy arguments {");
 210       __ z_lgr(Z_method, r_arg_method);
 211       __ z_lg(Z_thread, d_arg_thread, r_entryframe_fp);
 212 
 213       // Calculate top_of_arguments_addr which will be tos (not prepushed) later.
      // Simply use SP + frame::z_top_ijava_frame_abi_size - BytesPerWord.
 215       __ add2reg(r_top_of_arguments_addr,
 216                  frame::z_top_ijava_frame_abi_size - BytesPerWord, Z_SP);
 217 
 218       // Initialize call_stub locals (step 1).
 219       if ((call_wrapper_address_offset + BytesPerWord == result_address_offset) &&
 220           (result_address_offset + BytesPerWord == result_type_offset)          &&
 221           (result_type_offset + BytesPerWord == arguments_tos_address_offset)) {
 222 
 223         __ z_stmg(r_arg_call_wrapper_addr, r_top_of_arguments_addr,
 224                   call_wrapper_address_offset, r_entryframe_fp);
 225       } else {
 226         __ z_stg(r_arg_call_wrapper_addr,
 227                  call_wrapper_address_offset, r_entryframe_fp);
 228         __ z_stg(r_arg_result_addr,
 229                  result_address_offset, r_entryframe_fp);
 230         __ z_stg(r_arg_result_type,
 231                  result_type_offset, r_entryframe_fp);
 232         __ z_stg(r_top_of_arguments_addr,
 233                  arguments_tos_address_offset, r_entryframe_fp);
 234       }
 235 
 236       // Copy Java arguments.
 237 
 238       // Any arguments to copy?
 239       __ load_and_test_int2long(Z_R1, Address(r_entryframe_fp, d_arg_argument_count));
 240       __ z_bre(arguments_copied);
 241 
 242       // Prepare loop and copy arguments in reverse order.
 243       {
 244         // Calculate argument size in bytes.
 245         __ z_sllg(r_argument_size_in_bytes, Z_R1, LogBytesPerWord);
 246 
 247         // Get addr of first incoming Java argument.
 248         __ z_lg(r_argument_addr, d_arg_argument_addr, r_entryframe_fp);
 249 
 250         // Let r_argumentcopy_addr point to last outgoing Java argument.
 251         __ add2reg(r_argumentcopy_addr, BytesPerWord, r_top_of_arguments_addr); // = Z_SP+160 effectively.
 252 
 253         // Let r_argument_addr point to last incoming Java argument.
 254         __ add2reg_with_index(r_argument_addr, -BytesPerWord,
 255                               r_argument_size_in_bytes, r_argument_addr);
 256 
 257         // Now loop while Z_R1 > 0 and copy arguments.
 258         {
 259           Label next_argument;
 260           __ bind(next_argument);
 261           // Mem-mem move.
 262           __ z_mvc(0, BytesPerWord-1, r_argumentcopy_addr, 0, r_argument_addr);
 263           __ add2reg(r_argument_addr,    -BytesPerWord);
 264           __ add2reg(r_argumentcopy_addr, BytesPerWord);
 265           __ z_brct(Z_R1, next_argument);
 266         }
 267       }  // End of argument copy loop.
 268 
 269       __ bind(arguments_copied);
 270     }
 271     BLOCK_COMMENT("} arguments");
 272 
 273     BLOCK_COMMENT("call {");
 274     {
 275       // Call frame manager or native entry.
 276 
 277       //
 278       // Register state on entry to frame manager / native entry:
 279       //
 280       //   Z_ARG1 = r_top_of_arguments_addr  - intptr_t *sender tos (prepushed)
 281       //                                       Lesp = (SP) + copied_arguments_offset - 8
 282       //   Z_method                          - method
 283       //   Z_thread                          - JavaThread*
 284       //
 285 
 286       // Here, the usual SP is the initial_caller_sp.
 287       __ z_lgr(Z_R10, Z_SP);
 288 
 289       // Z_esp points to the slot below the last argument.
 290       __ z_lgr(Z_esp, r_top_of_arguments_addr);
 291 
 292       //
 293       // Stack on entry to frame manager / native entry:
 294       //
 295       //     F0      [TOP_IJAVA_FRAME_ABI]
 296       //             [outgoing Java arguments]
 297       //             [ENTRY_FRAME_LOCALS]
 298       //     F1      [C_FRAME]
 299       //             ...
 300       //
 301 
 302       // Do a light-weight C-call here, r_new_arg_entry holds the address
 303       // of the interpreter entry point (frame manager or native entry)
 304       // and save runtime-value of return_pc in return_address
 305       // (call by reference argument).
 306       return_address = __ call_stub(r_new_arg_entry);
 307     }
 308     BLOCK_COMMENT("} call");
 309 
 310     {
 311       BLOCK_COMMENT("restore registers {");
 312       // Returned from frame manager or native entry.
 313       // Now pop frame, process result, and return to caller.
 314 
 315       //
 316       // Stack on exit from frame manager / native entry:
 317       //
 318       //     F0      [ABI]
 319       //             ...
 320       //             [ENTRY_FRAME_LOCALS]
 321       //     F1      [C_FRAME]
 322       //             ...
 323       //
 324       // Just pop the topmost frame ...
 325       //
 326 
 327       // Restore frame pointer.
 328       __ z_lg(r_entryframe_fp, _z_abi(callers_sp), Z_SP);
 329       // Pop frame. Done here to minimize stalls.
 330       __ pop_frame();
 331 
 332       // Reload some volatile registers which we've spilled before the call
 333       // to frame manager / native entry.
 334       // Access all locals via frame pointer, because we know nothing about
 335       // the topmost frame's size.
 336       __ z_lg(r_arg_result_addr, result_address_offset, r_entryframe_fp);
 337       __ z_lg(r_arg_result_type, result_type_offset, r_entryframe_fp);
 338 
 339       // Restore non-volatiles.
 340       __ z_lmg(Z_R6, Z_R14, 16, Z_SP);
 341       __ z_ld(Z_F8, 96, Z_SP);
 342       __ z_ld(Z_F9, 104, Z_SP);
 343       __ z_ld(Z_F10, 112, Z_SP);
 344       __ z_ld(Z_F11, 120, Z_SP);
 345       __ z_ld(Z_F12, 128, Z_SP);
 346       __ z_ld(Z_F13, 136, Z_SP);
 347       __ z_ld(Z_F14, 144, Z_SP);
 348       __ z_ld(Z_F15, 152, Z_SP);
 349       BLOCK_COMMENT("} restore");
 350 
 351       //
 352       // Stack on exit from call_stub:
 353       //
 354       //     0       [C_FRAME]
 355       //             ...
 356       //
 357       // No call_stub frames left.
 358       //
 359 
 360       // All non-volatiles have been restored at this point!!
 361 
 362       //------------------------------------------------------------------------
 363       // The following code makes some assumptions on the T_<type> enum values.
 364       // The enum is defined in globalDefinitions.hpp.
 365       // The validity of the assumptions is tested as far as possible.
 366       //   The assigned values should not be shuffled
 367       //   T_BOOLEAN==4    - lowest used enum value
 368       //   T_NARROWOOP==16 - largest used enum value
 369       //------------------------------------------------------------------------
 370       BLOCK_COMMENT("process result {");
 371       Label firstHandler;
 372       int   handlerLen= 8;
 373 #ifdef ASSERT
 374       char  assertMsg[] = "check BasicType definition in globalDefinitions.hpp";
 375       __ z_chi(r_arg_result_type, T_BOOLEAN);
 376       __ asm_assert(Assembler::bcondNotLow, assertMsg, 0x0234);
 377       __ z_chi(r_arg_result_type, T_NARROWOOP);
 378       __ asm_assert(Assembler::bcondNotHigh, assertMsg, 0x0235);
 379 #endif
 380       __ add2reg(r_arg_result_type, -T_BOOLEAN);          // Remove offset.
 381       __ z_larl(Z_R1, firstHandler);                      // location of first handler
 382       __ z_sllg(r_arg_result_type, r_arg_result_type, 3); // Each handler is 8 bytes long.
 383       __ z_bc(MacroAssembler::bcondAlways, 0, r_arg_result_type, Z_R1);
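      // Computed branch: every handler below occupies exactly handlerLen (8) bytes,
      // so the branch target is firstHandler + (result_type - T_BOOLEAN) * 8.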
 384 
 385       __ align(handlerLen);
 386       __ bind(firstHandler);
 387       // T_BOOLEAN:
 388         guarantee(T_BOOLEAN == 4, "check BasicType definition in globalDefinitions.hpp");
 389         __ z_st(Z_RET, 0, r_arg_result_addr);
 390         __ z_br(Z_R14); // Return to caller.
 391         __ align(handlerLen);
 392       // T_CHAR:
 393         guarantee(T_CHAR == T_BOOLEAN+1, "check BasicType definition in globalDefinitions.hpp");
 394         __ z_st(Z_RET, 0, r_arg_result_addr);
 395         __ z_br(Z_R14); // Return to caller.
 396         __ align(handlerLen);
 397       // T_FLOAT:
 398         guarantee(T_FLOAT == T_CHAR+1, "check BasicType definition in globalDefinitions.hpp");
 399         __ z_ste(Z_FRET, 0, r_arg_result_addr);
 400         __ z_br(Z_R14); // Return to caller.
 401         __ align(handlerLen);
 402       // T_DOUBLE:
 403         guarantee(T_DOUBLE == T_FLOAT+1, "check BasicType definition in globalDefinitions.hpp");
 404         __ z_std(Z_FRET, 0, r_arg_result_addr);
 405         __ z_br(Z_R14); // Return to caller.
 406         __ align(handlerLen);
 407       // T_BYTE:
 408         guarantee(T_BYTE == T_DOUBLE+1, "check BasicType definition in globalDefinitions.hpp");
 409         __ z_st(Z_RET, 0, r_arg_result_addr);
 410         __ z_br(Z_R14); // Return to caller.
 411         __ align(handlerLen);
 412       // T_SHORT:
 413         guarantee(T_SHORT == T_BYTE+1, "check BasicType definition in globalDefinitions.hpp");
 414         __ z_st(Z_RET, 0, r_arg_result_addr);
 415         __ z_br(Z_R14); // Return to caller.
 416         __ align(handlerLen);
 417       // T_INT:
 418         guarantee(T_INT == T_SHORT+1, "check BasicType definition in globalDefinitions.hpp");
 419         __ z_st(Z_RET, 0, r_arg_result_addr);
 420         __ z_br(Z_R14); // Return to caller.
 421         __ align(handlerLen);
 422       // T_LONG:
 423         guarantee(T_LONG == T_INT+1, "check BasicType definition in globalDefinitions.hpp");
 424         __ z_stg(Z_RET, 0, r_arg_result_addr);
 425         __ z_br(Z_R14); // Return to caller.
 426         __ align(handlerLen);
 427       // T_OBJECT:
 428         guarantee(T_OBJECT == T_LONG+1, "check BasicType definition in globalDefinitions.hpp");
 429         __ z_stg(Z_RET, 0, r_arg_result_addr);
 430         __ z_br(Z_R14); // Return to caller.
 431         __ align(handlerLen);
 432       // T_ARRAY:
 433         guarantee(T_ARRAY == T_OBJECT+1, "check BasicType definition in globalDefinitions.hpp");
 434         __ z_stg(Z_RET, 0, r_arg_result_addr);
 435         __ z_br(Z_R14); // Return to caller.
 436         __ align(handlerLen);
 437       // T_VOID:
 438         guarantee(T_VOID == T_ARRAY+1, "check BasicType definition in globalDefinitions.hpp");
 439         __ z_stg(Z_RET, 0, r_arg_result_addr);
 440         __ z_br(Z_R14); // Return to caller.
 441         __ align(handlerLen);
 442       // T_ADDRESS:
 443         guarantee(T_ADDRESS == T_VOID+1, "check BasicType definition in globalDefinitions.hpp");
 444         __ z_stg(Z_RET, 0, r_arg_result_addr);
 445         __ z_br(Z_R14); // Return to caller.
 446         __ align(handlerLen);
 447       // T_NARROWOOP:
 448         guarantee(T_NARROWOOP == T_ADDRESS+1, "check BasicType definition in globalDefinitions.hpp");
 449         __ z_st(Z_RET, 0, r_arg_result_addr);
 450         __ z_br(Z_R14); // Return to caller.
 451         __ align(handlerLen);
 452       BLOCK_COMMENT("} process result");
 453     }
 454     return start;
 455   }
 456 
 457   // Return point for a Java call if there's an exception thrown in
 458   // Java code. The exception is caught and transformed into a
 459   // pending exception stored in JavaThread that can be tested from
 460   // within the VM.
 461   address generate_catch_exception() {
 462     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 463 
 464     address start = __ pc();
 465 
 466     //
 467     // Registers alive
 468     //
 469     //   Z_thread
 470     //   Z_ARG1 - address of pending exception
 471     //   Z_ARG2 - return address in call stub
 472     //
 473 
 474     const Register exception_file = Z_R0;
 475     const Register exception_line = Z_R1;
 476 
 477     __ load_const_optimized(exception_file, (void*)__FILE__);
 478     __ load_const_optimized(exception_line, (void*)__LINE__);
 479 
 480     __ z_stg(Z_ARG1, thread_(pending_exception));
 481     // Store into `char *'.
 482     __ z_stg(exception_file, thread_(exception_file));
 483     // Store into `int'.
 484     __ z_st(exception_line, thread_(exception_line));
 485 
 486     // Complete return to VM.
 487     assert(StubRoutines::_call_stub_return_address != nullptr, "must have been generated before");
 488 
 489     // Continue in call stub.
 490     __ z_br(Z_ARG2);
 491 
 492     return start;
 493   }
 494 
 495   // Continuation point for runtime calls returning with a pending
 496   // exception. The pending exception check happened in the runtime
 497   // or native call stub. The pending exception in Thread is
 498   // converted into a Java-level exception.
 499   //
 500   // Read:
 501   //   Z_R14: pc the runtime library callee wants to return to.
 502   //   Since the exception occurred in the callee, the return pc
 503   //   from the point of view of Java is the exception pc.
 504   //
 505   // Invalidate:
 506   //   Volatile registers (except below).
 507   //
 508   // Update:
 509   //   Z_ARG1: exception
 510   //   (Z_R14 is unchanged and is live out).
 511   //
 512   address generate_forward_exception() {
 513     StubCodeMark mark(this, "StubRoutines", "forward_exception");
 514     address start = __ pc();
 515 
 516     #define pending_exception_offset in_bytes(Thread::pending_exception_offset())
 517 #ifdef ASSERT
 518     // Get pending exception oop.
 519     __ z_lg(Z_ARG1, pending_exception_offset, Z_thread);
 520 
 521     // Make sure that this code is only executed if there is a pending exception.
 522     {
 523       Label L;
 524       __ z_ltgr(Z_ARG1, Z_ARG1);
 525       __ z_brne(L);
 526       __ stop("StubRoutines::forward exception: no pending exception (1)");
 527       __ bind(L);
 528     }
 529 
 530     __ verify_oop(Z_ARG1, "StubRoutines::forward exception: not an oop");
 531 #endif
 532 
 533     __ z_lgr(Z_ARG2, Z_R14); // Copy exception pc into Z_ARG2.
 534     __ save_return_pc();
 535     __ push_frame_abi160(0);
 536     // Find exception handler.
 537     __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address),
 538                     Z_thread,
 539                     Z_ARG2);
 540     // Copy handler's address.
 541     __ z_lgr(Z_R1, Z_RET);
 542     __ pop_frame();
 543     __ restore_return_pc();
 544 
 545     // Set up the arguments for the exception handler:
 546     // - Z_ARG1: exception oop
 547     // - Z_ARG2: exception pc
 548 
 549     // Load pending exception oop.
 550     __ z_lg(Z_ARG1, pending_exception_offset, Z_thread);
 551 
    // The exception pc is the return address in the caller,
    // so it must be loaded into Z_ARG2.
 554     __ z_lgr(Z_ARG2, Z_R14);
 555 
 556 #ifdef ASSERT
 557     // Make sure exception is set.
 558     { Label L;
 559       __ z_ltgr(Z_ARG1, Z_ARG1);
 560       __ z_brne(L);
 561       __ stop("StubRoutines::forward exception: no pending exception (2)");
 562       __ bind(L);
 563     }
 564 #endif
 565     // Clear the pending exception.
 566     __ clear_mem(Address(Z_thread, pending_exception_offset), sizeof(void *));
 567     // Jump to exception handler
 568     __ z_br(Z_R1 /*handler address*/);
 569 
 570     return start;
 571 
 572     #undef pending_exception_offset
 573   }
 574 
 575   // Continuation point for throwing of implicit exceptions that are
 576   // not handled in the current activation. Fabricates an exception
 577   // oop and initiates normal exception dispatching in this
 578   // frame. Only callee-saved registers are preserved (through the
 579   // normal RegisterMap handling). If the compiler
 580   // needs all registers to be preserved between the fault point and
 581   // the exception handler then it must assume responsibility for that
 582   // in AbstractCompiler::continuation_for_implicit_null_exception or
 583   // continuation_for_implicit_division_by_zero_exception. All other
 584   // implicit exceptions (e.g., NullPointerException or
 585   // AbstractMethodError on entry) are either at call sites or
 586   // otherwise assume that stack unwinding will be initiated, so
 587   // caller saved registers were assumed volatile in the compiler.
 588 
 589   // Note that we generate only this stub into a RuntimeStub, because
 590   // it needs to be properly traversed and ignored during GC, so we
 591   // change the meaning of the "__" macro within this method.
 592 
 593   // Note: the routine set_pc_not_at_call_for_caller in
 594   // SharedRuntime.cpp requires that this code be generated into a
 595   // RuntimeStub.
 596 #undef __
 597 #define __ masm->
 598 
 599   address generate_throw_exception(const char* name, address runtime_entry,
 600                                    bool restore_saved_exception_pc,
 601                                    Register arg1 = noreg, Register arg2 = noreg) {
 602     assert_different_registers(arg1, Z_R0_scratch);  // would be destroyed by push_frame()
 603     assert_different_registers(arg2, Z_R0_scratch);  // would be destroyed by push_frame()
 604 
 605     int insts_size = 256;
 606     int locs_size  = 0;
 607     CodeBuffer      code(name, insts_size, locs_size);
 608     MacroAssembler* masm = new MacroAssembler(&code);
 609     int framesize_in_bytes;
 610     address start = __ pc();
 611 
 612     __ save_return_pc();
 613     framesize_in_bytes = __ push_frame_abi160(0);
 614 
 615     address frame_complete_pc = __ pc();
 616     if (restore_saved_exception_pc) {
 617       __ unimplemented("StubGenerator::throw_exception", 74);
 618     }
 619 
 620     // Note that we always have a runtime stub frame on the top of stack at this point.
 621     __ get_PC(Z_R1);
 622     __ set_last_Java_frame(/*sp*/Z_SP, /*pc*/Z_R1);
 623 
 624     // Do the call.
 625     BLOCK_COMMENT("call runtime_entry");
 626     __ call_VM_leaf(runtime_entry, Z_thread, arg1, arg2);
 627 
 628     __ reset_last_Java_frame();
 629 
 630 #ifdef ASSERT
 631     // Make sure that this code is only executed if there is a pending exception.
 632     { Label L;
 633       __ z_lg(Z_R0,
 634                 in_bytes(Thread::pending_exception_offset()),
 635                 Z_thread);
 636       __ z_ltgr(Z_R0, Z_R0);
 637       __ z_brne(L);
 638       __ stop("StubRoutines::throw_exception: no pending exception");
 639       __ bind(L);
 640     }
 641 #endif
 642 
 643     __ pop_frame();
 644     __ restore_return_pc();
 645 
 646     __ load_const_optimized(Z_R1, StubRoutines::forward_exception_entry());
 647     __ z_br(Z_R1);
 648 
 649     RuntimeStub* stub =
 650       RuntimeStub::new_runtime_stub(name, &code,
 651                                     frame_complete_pc - start,
 652                                     framesize_in_bytes/wordSize,
 653                                     nullptr /*oop_maps*/, false);
 654 
 655     return stub->entry_point();
 656   }
 657 
 658 #undef __
 659 #ifdef PRODUCT
 660 #define __ _masm->
 661 #else
 662 #define __ (Verbose ? (_masm->block_comment(FILE_AND_LINE),_masm):_masm)->
 663 #endif
 664 
  // Support for uint StubRoutines::zarch::partial_subtype_check(Klass* sub, Klass* super);
 667   //
 668   // Arguments:
 669   //   ret  : Z_RET, returned
 670   //   sub  : Z_ARG2, argument, not changed
 671   //   super: Z_ARG3, argument, not changed
 672   //
 673   //   raddr: Z_R14, blown by call
 674   //
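  // The return value doubles as a condition code: a match returns 0 with CC set to EQ,
  // a miss returns 1 with CC set to NE (see the clear_reg/ltgr sequences below), so
  // callers may branch on the CC without testing Z_RET again.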
 675   address generate_partial_subtype_check() {
 676     StubCodeMark mark(this, "StubRoutines", "partial_subtype_check");
 677     Label miss;
 678 
 679     address start = __ pc();
 680 
 681     const Register Rsubklass   = Z_ARG2; // subklass
 682     const Register Rsuperklass = Z_ARG3; // superklass
 683 
 684     // No args, but tmp registers that are killed.
 685     const Register Rlength     = Z_ARG4; // cache array length
 686     const Register Rarray_ptr  = Z_ARG5; // Current value from cache array.
 687 
 688     if (UseCompressedOops) {
 689       assert(Universe::heap() != nullptr, "java heap must be initialized to generate partial_subtype_check stub");
 690     }
 691 
 692     // Always take the slow path.
 693     __ check_klass_subtype_slow_path(Rsubklass, Rsuperklass,
 694                                      Rarray_ptr, Rlength, nullptr, &miss);
 695 
 696     // Match falls through here.
 697     __ clear_reg(Z_RET);               // Zero indicates a match. Set EQ flag in CC.
 698     __ z_br(Z_R14);
 699 
 700     __ BIND(miss);
 701     __ load_const_optimized(Z_RET, 1); // One indicates a miss.
    __ z_ltgr(Z_RET, Z_RET);           // Set NE flag in CC.
 703     __ z_br(Z_R14);
 704 
 705     return start;
 706   }
 707 
 708 #if !defined(PRODUCT)
 709   // Wrapper which calls oopDesc::is_oop_or_null()
 710   // Only called by MacroAssembler::verify_oop
 711   static void verify_oop_helper(const char* message, oopDesc* o) {
 712     if (!oopDesc::is_oop_or_null(o)) {
 713       fatal("%s. oop: " PTR_FORMAT, message, p2i(o));
 714     }
 715     ++ StubRoutines::_verify_oop_count;
 716   }
 717 #endif
 718 
 719   // Return address of code to be called from code generated by
 720   // MacroAssembler::verify_oop.
 721   //
 722   // Don't generate, rather use C++ code.
 723   address generate_verify_oop_subroutine() {
 724     // Don't generate a StubCodeMark, because no code is generated!
 725     // Generating the mark triggers notifying the oprofile jvmti agent
 726     // about the dynamic code generation, but the stub without
 727     // code (code_size == 0) confuses opjitconv
 728     // StubCodeMark mark(this, "StubRoutines", "verify_oop_stub");
 729 
 730     address start = 0;
 731 
 732 #if !defined(PRODUCT)
 733     start = CAST_FROM_FN_PTR(address, verify_oop_helper);
 734 #endif
 735 
 736     return start;
 737   }
 738 
 739   // This is to test that the count register contains a positive int value.
 740   // Required because C2 does not respect int to long conversion for stub calls.
 741   void assert_positive_int(Register count) {
 742 #ifdef ASSERT
 743     __ z_srag(Z_R0, count, 31);  // Just leave the sign (must be zero) in Z_R0.
 744     __ asm_assert(Assembler::bcondZero, "missing zero extend", 0xAFFE);
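    // A properly zero-extended, non-negative int has bits 63..31 all zero, so the
    // arithmetic shift leaves 0 in Z_R0; any stray high-order bit trips the assert.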
 745 #endif
 746   }
 747 
 748   //  Generate overlap test for array copy stubs.
 749   //  If no actual overlap is detected, control is transferred to the
 750   //  "normal" copy stub (entry address passed in disjoint_copy_target).
 751   //  Otherwise, execution continues with the code generated by the
 752   //  caller of array_overlap_test.
 753   //
 754   //  Input:
 755   //    Z_ARG1    - from
 756   //    Z_ARG2    - to
 757   //    Z_ARG3    - element count
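  //
  //  The test branches to the disjoint stub if (to <= from) or if
  //  (from + count*elem_size <= to). Only a truly destructive overlap,
  //  i.e. 'to' lying inside the source range above 'from', falls through
  //  to the caller's code.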
 758   void array_overlap_test(address disjoint_copy_target, int log2_elem_size) {
 759     __ MacroAssembler::compare_and_branch_optimized(Z_ARG2, Z_ARG1, Assembler::bcondNotHigh,
 760                                                     disjoint_copy_target, /*len64=*/true, /*has_sign=*/false);
 761 
 762     Register index = Z_ARG3;
 763     if (log2_elem_size > 0) {
 764       __ z_sllg(Z_R1, Z_ARG3, log2_elem_size);  // byte count
 765       index = Z_R1;
 766     }
 767     __ add2reg_with_index(Z_R1, 0, index, Z_ARG1);  // First byte after "from" range.
 768 
 769     __ MacroAssembler::compare_and_branch_optimized(Z_R1, Z_ARG2, Assembler::bcondNotHigh,
 770                                                     disjoint_copy_target, /*len64=*/true, /*has_sign=*/false);
 771 
 772     // Destructive overlap: let caller generate code for that.
 773   }
 774 
 775   //  Generate stub for disjoint array copy. If "aligned" is true, the
 776   //  "from" and "to" addresses are assumed to be heapword aligned.
 777   //
 778   //  Arguments for generated stub:
 779   //      from:  Z_ARG1
 780   //      to:    Z_ARG2
 781   //      count: Z_ARG3 treated as signed
 782   void generate_disjoint_copy(bool aligned, int element_size,
 783                               bool branchToEnd,
 784                               bool restoreArgs) {
 785     // This is the zarch specific stub generator for general array copy tasks.
 786     // It has the following prereqs and features:
 787     //
 788     // - No destructive overlap allowed (else unpredictable results).
 789     // - Destructive overlap does not exist if the leftmost byte of the target
 790     //   does not coincide with any of the source bytes (except the leftmost).
 791     //
 792     //   Register usage upon entry:
 793     //      Z_ARG1 == Z_R2 :   address of source array
 794     //      Z_ARG2 == Z_R3 :   address of target array
 795     //      Z_ARG3 == Z_R4 :   length of operands (# of elements on entry)
 796     //
 797     // Register usage within the generator:
 798     // - Z_R0 and Z_R1 are KILLed by the stub routine (target addr/len).
 799     //                 Used as pair register operand in complex moves, scratch registers anyway.
 800     // - Z_R5 is KILLed by the stub routine (source register pair addr/len) (even/odd reg).
 801     //                  Same as R0/R1, but no scratch register.
 802     // - Z_ARG1, Z_ARG2, Z_ARG3 are USEd but preserved by the stub routine,
 803     //                          but they might get temporarily overwritten.
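    //
    // Copy mode selection (see "mode selection" below), based on the total byte count:
    //   <= 256 bytes : one executed MVC (doMVC), or an unrolled MVC for 8-byte elements
    //   <= 4096 bytes: MVC loop in 256-byte strides (doMVCLOOP)
    //   larger       : MVCLE (the fall-through case)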
 804 
 805     Register  save_reg    = Z_ARG4;   // (= Z_R5), holds original target operand address for restore.
 806 
 807     {
 808       Register   llen_reg = Z_R1;     // Holds left operand len (odd reg).
 809       Register  laddr_reg = Z_R0;     // Holds left operand addr (even reg), overlaps with data_reg.
 810       Register   rlen_reg = Z_R5;     // Holds right operand len (odd reg), overlaps with save_reg.
 811       Register  raddr_reg = Z_R4;     // Holds right operand addr (even reg), overlaps with len_reg.
 812 
 813       Register   data_reg = Z_R0;     // Holds copied data chunk in alignment process and copy loop.
 814       Register    len_reg = Z_ARG3;   // Holds operand len (#elements at entry, #bytes shortly after).
 815       Register    dst_reg = Z_ARG2;   // Holds left (target)  operand addr.
 816       Register    src_reg = Z_ARG1;   // Holds right (source) operand addr.
 817 
 818       Label     doMVCLOOP, doMVCLOOPcount, doMVCLOOPiterate;
 819       Label     doMVCUnrolled;
 820       NearLabel doMVC,  doMVCgeneral, done;
 821       Label     MVC_template;
 822       address   pcMVCblock_b, pcMVCblock_e;
 823 
 824       bool      usedMVCLE       = true;
 825       bool      usedMVCLOOP     = true;
 826       bool      usedMVCUnrolled = false;
 827       bool      usedMVC         = false;
 828       bool      usedMVCgeneral  = false;
 829 
 830       int       stride;
 831       Register  stride_reg;
 832       Register  ix_reg;
 833 
 834       assert((element_size<=256) && (256%element_size == 0), "element size must be <= 256, power of 2");
 835       unsigned int log2_size = exact_log2(element_size);
 836 
 837       switch (element_size) {
 838         case 1:  BLOCK_COMMENT("ARRAYCOPY DISJOINT byte  {"); break;
 839         case 2:  BLOCK_COMMENT("ARRAYCOPY DISJOINT short {"); break;
 840         case 4:  BLOCK_COMMENT("ARRAYCOPY DISJOINT int   {"); break;
 841         case 8:  BLOCK_COMMENT("ARRAYCOPY DISJOINT long  {"); break;
 842         default: BLOCK_COMMENT("ARRAYCOPY DISJOINT       {"); break;
 843       }
 844 
 845       assert_positive_int(len_reg);
 846 
 847       BLOCK_COMMENT("preparation {");
 848 
 849       // No copying if len <= 0.
 850       if (branchToEnd) {
 851         __ compare64_and_branch(len_reg, (intptr_t) 0, Assembler::bcondNotHigh, done);
 852       } else {
 853         if (VM_Version::has_CompareBranch()) {
 854           __ z_cgib(len_reg, 0, Assembler::bcondNotHigh, 0, Z_R14);
 855         } else {
 856           __ z_ltgr(len_reg, len_reg);
 857           __ z_bcr(Assembler::bcondNotPositive, Z_R14);
 858         }
 859       }
 860 
 861       // Prefetch just one cache line. Speculative opt for short arrays.
 862       // Do not use Z_R1 in prefetch. Is undefined here.
 863       if (VM_Version::has_Prefetch()) {
 864         __ z_pfd(0x01, 0, Z_R0, src_reg); // Fetch access.
 865         __ z_pfd(0x02, 0, Z_R0, dst_reg); // Store access.
 866       }
 867 
 868       BLOCK_COMMENT("} preparation");
 869 
 870       // Save args only if really needed.
 871       // Keep len test local to branch. Is generated only once.
 872 
 873       BLOCK_COMMENT("mode selection {");
 874 
 875       // Special handling for arrays with only a few elements.
 876       // Nothing fancy: just an executed MVC.
 877       if (log2_size > 0) {
 878         __ z_sllg(Z_R1, len_reg, log2_size); // Remember #bytes in Z_R1.
 879       }
 880       if (element_size != 8) {
 881         __ z_cghi(len_reg, 256/element_size);
 882         __ z_brnh(doMVC);
 883         usedMVC = true;
 884       }
 885       if (element_size == 8) { // Long and oop arrays are always aligned.
 886         __ z_cghi(len_reg, 256/element_size);
 887         __ z_brnh(doMVCUnrolled);
 888         usedMVCUnrolled = true;
 889       }
 890 
 891       // Prefetch another cache line. We, for sure, have more than one line to copy.
 892       if (VM_Version::has_Prefetch()) {
 893         __ z_pfd(0x01, 256, Z_R0, src_reg); // Fetch access.
 894         __ z_pfd(0x02, 256, Z_R0, dst_reg); // Store access.
 895       }
 896 
 897       if (restoreArgs) {
 898         // Remember entry value of ARG2 to restore all arguments later from that knowledge.
 899         __ z_lgr(save_reg, dst_reg);
 900       }
 901 
 902       __ z_cghi(len_reg, 4096/element_size);
 903       if (log2_size == 0) {
 904         __ z_lgr(Z_R1, len_reg); // Init Z_R1 with #bytes
 905       }
 906       __ z_brnh(doMVCLOOP);
 907 
 908       // Fall through to MVCLE case.
 909 
 910       BLOCK_COMMENT("} mode selection");
 911 
 912       // MVCLE: for long arrays
 913       //   DW aligned: Best performance for sizes > 4kBytes.
 914       //   unaligned:  Least complex for sizes > 256 bytes.
 915       if (usedMVCLE) {
 916         BLOCK_COMMENT("mode MVCLE {");
 917 
 918         // Setup registers for mvcle.
 919         //__ z_lgr(llen_reg, len_reg);// r1 <- r4  #bytes already in Z_R1, aka llen_reg.
 920         __ z_lgr(laddr_reg, dst_reg); // r0 <- r3
 921         __ z_lgr(raddr_reg, src_reg); // r4 <- r2
 922         __ z_lgr(rlen_reg, llen_reg); // r5 <- r1
 923 
 924         __ MacroAssembler::move_long_ext(laddr_reg, raddr_reg, 0xb0);    // special: bypass cache
 925         // __ MacroAssembler::move_long_ext(laddr_reg, raddr_reg, 0xb8); // special: Hold data in cache.
 926         // __ MacroAssembler::move_long_ext(laddr_reg, raddr_reg, 0);
 927 
 928         if (restoreArgs) {
 929           // MVCLE updates the source (Z_R4,Z_R5) and target (Z_R0,Z_R1) register pairs.
 930           // Dst_reg (Z_ARG2) and src_reg (Z_ARG1) are left untouched. No restore required.
 931           // Len_reg (Z_ARG3) is destroyed and must be restored.
 932           __ z_slgr(laddr_reg, dst_reg);    // copied #bytes
 933           if (log2_size > 0) {
 934             __ z_srag(Z_ARG3, laddr_reg, log2_size); // Convert back to #elements.
 935           } else {
 936             __ z_lgr(Z_ARG3, laddr_reg);
 937           }
 938         }
 939         if (branchToEnd) {
 940           __ z_bru(done);
 941         } else {
 942           __ z_br(Z_R14);
 943         }
 944         BLOCK_COMMENT("} mode MVCLE");
 945       }
 946       // No fallthru possible here.
 947 
 948       //  MVCUnrolled: for short, aligned arrays.
 949 
 950       if (usedMVCUnrolled) {
 951         BLOCK_COMMENT("mode MVC unrolled {");
 952         stride = 8;
 953 
 954         // Generate unrolled MVC instructions.
 955         for (int ii = 32; ii > 1; ii--) {
 956           __ z_mvc(0, ii * stride-1, dst_reg, 0, src_reg); // ii*8 byte copy
 957           if (branchToEnd) {
 958             __ z_bru(done);
 959           } else {
 960             __ z_br(Z_R14);
 961           }
 962         }
 963 
 964         pcMVCblock_b = __ pc();
 965         __ z_mvc(0, 1 * stride-1, dst_reg, 0, src_reg); // 8 byte copy
 966         if (branchToEnd) {
 967           __ z_bru(done);
 968         } else {
 969           __ z_br(Z_R14);
 970         }
 971 
 972         pcMVCblock_e = __ pc();
 973         Label MVC_ListEnd;
 974         __ bind(MVC_ListEnd);
 975 
 976         // This is an absolute fast path:
 977         // - Array len in bytes must be not greater than 256.
 978         // - Array len in bytes must be an integer mult of DW
 979         //   to save expensive handling of trailing bytes.
 980         // - Argument restore is not done,
 981         //   i.e. previous code must not alter arguments (this code doesn't either).
 982 
 983         __ bind(doMVCUnrolled);
 984 
 985         // Avoid mul, prefer shift where possible.
 986         // Combine shift right (for #DW) with shift left (for block size).
 987         // Set CC for zero test below (asm_assert).
 988         // Note: #bytes comes in Z_R1, #DW in len_reg.
 989         unsigned int MVCblocksize    = pcMVCblock_e - pcMVCblock_b;
 990         unsigned int logMVCblocksize = 0xffffffffU; // Pacify compiler ("used uninitialized" warning).
 991 
 992         if (log2_size > 0) { // Len was scaled into Z_R1.
 993           switch (MVCblocksize) {
 994 
 995             case  8: logMVCblocksize = 3;
 996                      __ z_ltgr(Z_R0, Z_R1); // #bytes is index
 997                      break;                 // reasonable size, use shift
 998 
 999             case 16: logMVCblocksize = 4;
1000                      __ z_slag(Z_R0, Z_R1, logMVCblocksize-log2_size);
1001                      break;                 // reasonable size, use shift
1002 
1003             default: logMVCblocksize = 0;
1004                      __ z_ltgr(Z_R0, len_reg); // #DW for mul
1005                      break;                 // all other sizes: use mul
1006           }
1007         } else {
1008           guarantee(log2_size, "doMVCUnrolled: only for DW entities");
1009         }
1010 
1011         // This test (and branch) is redundant. Previous code makes sure that
1012         //  - element count > 0
1013         //  - element size == 8.
1014         // Thus, len reg should never be zero here. We insert an asm_assert() here,
1015         // just to double-check and to be on the safe side.
        __ asm_assert(Assembler::bcondNotZero, "zero len cannot occur", 99);
1017 
1018         __ z_larl(Z_R1, MVC_ListEnd);        // Get addr of last instr block.
1019         // Avoid mul, prefer shift where possible.
1020         if (logMVCblocksize == 0) {
1021           __ z_mghi(Z_R0, MVCblocksize);
1022         }
1023         __ z_slgr(Z_R1, Z_R0);
1024         __ z_br(Z_R1);
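        // Z_R0 held #DW * MVCblocksize (computed via the shift or MGHI above), so the
        // branch target MVC_ListEnd - Z_R0 is exactly the unrolled block that copies
        // #DW doublewords and then branches to 'done' (or returns via Z_R14).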
1025         BLOCK_COMMENT("} mode MVC unrolled");
1026       }
1027       // No fallthru possible here.
1028 
1029       // MVC execute template
1030       // Must always generate. Usage may be switched on below.
1031       // There is no suitable place after here to put the template.
1032       __ bind(MVC_template);
1033       __ z_mvc(0,0,dst_reg,0,src_reg);      // Instr template, never exec directly!
1034 
1035 
1036       // MVC Loop: for medium-sized arrays
1037 
1038       // Only for DW aligned arrays (src and dst).
1039       // #bytes to copy must be at least 256!!!
1040       // Non-aligned cases handled separately.
1041       stride     = 256;
1042       stride_reg = Z_R1;   // Holds #bytes when control arrives here.
1043       ix_reg     = Z_ARG3; // Alias for len_reg.
1044 
1045 
1046       if (usedMVCLOOP) {
1047         BLOCK_COMMENT("mode MVC loop {");
1048         __ bind(doMVCLOOP);
1049 
1050         __ z_lcgr(ix_reg, Z_R1);         // Ix runs from -(n-2)*stride to 1*stride (inclusive).
1051         __ z_llill(stride_reg, stride);
1052         __ add2reg(ix_reg, 2*stride);    // Thus: increment ix by 2*stride.
1053 
1054         __ bind(doMVCLOOPiterate);
1055           __ z_mvc(0, stride-1, dst_reg, 0, src_reg);
1056           __ add2reg(dst_reg, stride);
1057           __ add2reg(src_reg, stride);
1058           __ bind(doMVCLOOPcount);
1059           __ z_brxlg(ix_reg, stride_reg, doMVCLOOPiterate);
1060 
        // Don't use add2reg() here, since we must set the condition code!
1062         __ z_aghi(ix_reg, -2*stride);       // Compensate incr from above: zero diff means "all copied".
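        // ix_reg now holds (bytes copied - total bytes): zero iff the length was an
        // exact multiple of the 256-byte stride; otherwise it is minus the residue
        // (< 256 bytes), which LCGR below turns into the positive #bytes expected by
        // doMVCgeneral in Z_R1.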
1063 
1064         if (restoreArgs) {
1065           __ z_lcgr(Z_R1, ix_reg);          // Prepare ix_reg for copy loop, #bytes expected in Z_R1.
1066           __ z_brnz(doMVCgeneral);          // We're not done yet, ix_reg is not zero.
1067 
1068           // ARG1, ARG2, and ARG3 were altered by the code above, so restore them building on save_reg.
1069           __ z_slgr(dst_reg, save_reg);     // copied #bytes
1070           __ z_slgr(src_reg, dst_reg);      // = ARG1 (now restored)
1071           if (log2_size) {
1072             __ z_srag(Z_ARG3, dst_reg, log2_size); // Convert back to #elements to restore ARG3.
1073           } else {
1074             __ z_lgr(Z_ARG3, dst_reg);
1075           }
1076           __ z_lgr(Z_ARG2, save_reg);       // ARG2 now restored.
1077 
1078           if (branchToEnd) {
1079             __ z_bru(done);
1080           } else {
1081             __ z_br(Z_R14);
1082           }
1083 
        } else {
          if (branchToEnd) {
            __ z_brz(done);                        // CC set by aghi instr.
          } else {
            __ z_bcr(Assembler::bcondZero, Z_R14); // We're all done if zero.
          }
1090 
1091           __ z_lcgr(Z_R1, ix_reg);    // Prepare ix_reg for copy loop, #bytes expected in Z_R1.
1092           // __ z_bru(doMVCgeneral);  // fallthru
1093         }
1094         usedMVCgeneral = true;
1095         BLOCK_COMMENT("} mode MVC loop");
1096       }
1097       // Fallthru to doMVCgeneral
1098 
1099       // MVCgeneral: for short, unaligned arrays, after other copy operations
1100 
1101       // Somewhat expensive due to use of EX instruction, but simple.
1102       if (usedMVCgeneral) {
1103         BLOCK_COMMENT("mode MVC general {");
1104         __ bind(doMVCgeneral);
1105 
1106         __ add2reg(len_reg, -1, Z_R1);             // Get #bytes-1 for EXECUTE.
1107         if (VM_Version::has_ExecuteExtensions()) {
1108           __ z_exrl(len_reg, MVC_template);        // Execute MVC with variable length.
1109         } else {
1110           __ z_larl(Z_R1, MVC_template);           // Get addr of instr template.
1111           __ z_ex(len_reg, 0, Z_R0, Z_R1);         // Execute MVC with variable length.
1112         }                                          // penalty: 9 ticks
1113 
1114         if (restoreArgs) {
1115           // ARG1, ARG2, and ARG3 were altered by code executed before, so restore them building on save_reg
1116           __ z_slgr(dst_reg, save_reg);            // Copied #bytes without the "doMVCgeneral" chunk
1117           __ z_slgr(src_reg, dst_reg);             // = ARG1 (now restored), was not advanced for "doMVCgeneral" chunk
1118           __ add2reg_with_index(dst_reg, 1, len_reg, dst_reg); // Len of executed MVC was not accounted for, yet.
1119           if (log2_size) {
1120             __ z_srag(Z_ARG3, dst_reg, log2_size); // Convert back to #elements to restore ARG3
1121           } else {
1122              __ z_lgr(Z_ARG3, dst_reg);
1123           }
1124           __ z_lgr(Z_ARG2, save_reg);              // ARG2 now restored.
1125         }
1126 
        if (usedMVC) {
          if (branchToEnd) {
            __ z_bru(done);
          } else {
            __ z_br(Z_R14);
          }
        } else {
          if (!branchToEnd) __ z_br(Z_R14);
        }
1136         BLOCK_COMMENT("} mode MVC general");
1137       }
1138       // Fallthru possible if following block not generated.
1139 
1140       // MVC: for short, unaligned arrays
1141 
1142       // Somewhat expensive due to use of EX instruction, but simple. penalty: 9 ticks.
1143       // Differs from doMVCgeneral in reconstruction of ARG2, ARG3, and ARG4.
1144       if (usedMVC) {
1145         BLOCK_COMMENT("mode MVC {");
1146         __ bind(doMVC);
1147 
1148         // get #bytes-1 for EXECUTE
1149         if (log2_size) {
1150           __ add2reg(Z_R1, -1);                // Length was scaled into Z_R1.
1151         } else {
1152           __ add2reg(Z_R1, -1, len_reg);       // Length was not scaled.
1153         }
1154 
1155         if (VM_Version::has_ExecuteExtensions()) {
1156           __ z_exrl(Z_R1, MVC_template);       // Execute MVC with variable length.
1157         } else {
1158           __ z_lgr(Z_R0, Z_R5);                // Save ARG4, may be unnecessary.
1159           __ z_larl(Z_R5, MVC_template);       // Get addr of instr template.
1160           __ z_ex(Z_R1, 0, Z_R0, Z_R5);        // Execute MVC with variable length.
1161           __ z_lgr(Z_R5, Z_R0);                // Restore ARG4, may be unnecessary.
1162         }
1163 
1164         if (!branchToEnd) {
1165           __ z_br(Z_R14);
1166         }
1167         BLOCK_COMMENT("} mode MVC");
1168       }
1169 
1170       __ bind(done);
1171 
1172       switch (element_size) {
1173         case 1:  BLOCK_COMMENT("} ARRAYCOPY DISJOINT byte "); break;
1174         case 2:  BLOCK_COMMENT("} ARRAYCOPY DISJOINT short"); break;
1175         case 4:  BLOCK_COMMENT("} ARRAYCOPY DISJOINT int  "); break;
1176         case 8:  BLOCK_COMMENT("} ARRAYCOPY DISJOINT long "); break;
1177         default: BLOCK_COMMENT("} ARRAYCOPY DISJOINT      "); break;
1178       }
1179     }
1180   }
1181 
1182   // Generate stub for conjoint array copy. If "aligned" is true, the
1183   // "from" and "to" addresses are assumed to be heapword aligned.
1184   //
1185   // Arguments for generated stub:
1186   //   from:  Z_ARG1
1187   //   to:    Z_ARG2
1188   //   count: Z_ARG3 treated as signed
1189   void generate_conjoint_copy(bool aligned, int element_size, bool branchToEnd) {
1190 
1191     // This is the zarch specific stub generator for general array copy tasks.
1192     // It has the following prereqs and features:
1193     //
1194     // - Destructive overlap exists and is handled by reverse copy.
1195     // - Destructive overlap exists if the leftmost byte of the target
1196     //   does coincide with any of the source bytes (except the leftmost).
1197     // - Z_R0 and Z_R1 are KILLed by the stub routine (data and stride)
1198     // - Z_ARG1 and Z_ARG2 are USEd but preserved by the stub routine.
1199     // - Z_ARG3 is USED but preserved by the stub routine.
1200     // - Z_ARG4 is used as index register and is thus KILLed.
1201     //
1202     {
1203       Register stride_reg = Z_R1;     // Stride & compare value in loop (negative element_size).
1204       Register   data_reg = Z_R0;     // Holds value of currently processed element.
1205       Register     ix_reg = Z_ARG4;   // Holds byte index of currently processed element.
1206       Register    len_reg = Z_ARG3;   // Holds length (in #elements) of arrays.
1207       Register    dst_reg = Z_ARG2;   // Holds left  operand addr.
1208       Register    src_reg = Z_ARG1;   // Holds right operand addr.
1209 
1210       assert(256%element_size == 0, "Element size must be power of 2.");
1211       assert(element_size     <= 8, "Can't handle more than DW units.");
1212 
1213       switch (element_size) {
1214         case 1:  BLOCK_COMMENT("ARRAYCOPY CONJOINT byte  {"); break;
1215         case 2:  BLOCK_COMMENT("ARRAYCOPY CONJOINT short {"); break;
1216         case 4:  BLOCK_COMMENT("ARRAYCOPY CONJOINT int   {"); break;
1217         case 8:  BLOCK_COMMENT("ARRAYCOPY CONJOINT long  {"); break;
1218         default: BLOCK_COMMENT("ARRAYCOPY CONJOINT       {"); break;
1219       }
1220 
1221       assert_positive_int(len_reg);
1222 
1223       if (VM_Version::has_Prefetch()) {
1224         __ z_pfd(0x01, 0, Z_R0, src_reg); // Fetch access.
1225         __ z_pfd(0x02, 0, Z_R0, dst_reg); // Store access.
1226       }
1227 
1228       unsigned int log2_size = exact_log2(element_size);
1229       if (log2_size) {
1230         __ z_sllg(ix_reg, len_reg, log2_size);
1231       } else {
1232         __ z_lgr(ix_reg, len_reg);
1233       }
1234 
1235       // Optimize reverse copy loop.
1236       // Main loop copies DW units which may be unaligned. Unaligned access adds some penalty ticks.
1237       // Unaligned DW access (neither fetch nor store) is DW-atomic, but should be alignment-atomic.
1238       // Preceding the main loop, some bytes are copied to obtain a DW-multiple remaining length.
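      // Worked example (byte array, 13 bytes): the leftover-byte step copies one byte
      // (ix 13 -> 12), no halfword copy is needed, the word step copies four bytes
      // (ix 12 -> 8), and the DW loop below moves the remaining 8 bytes in one step.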
1239 
1240       Label countLoop1;
1241       Label copyLoop1;
1242       Label skipBY;
1243       Label skipHW;
1244       int   stride = -8;
1245 
1246       __ load_const_optimized(stride_reg, stride); // Prepare for DW copy loop.
1247 
1248       if (element_size == 8)    // Nothing to do here.
1249         __ z_bru(countLoop1);
1250       else {                    // Do not generate dead code.
1251         __ z_tmll(ix_reg, 7);   // Check the "odd" bits.
1252         __ z_bre(countLoop1);   // There are none, very good!
1253       }
1254 
1255       if (log2_size == 0) {     // Handle leftover Byte.
1256         __ z_tmll(ix_reg, 1);
1257         __ z_bre(skipBY);
1258         __ z_lb(data_reg,   -1, ix_reg, src_reg);
1259         __ z_stcy(data_reg, -1, ix_reg, dst_reg);
1260         __ add2reg(ix_reg, -1); // Decrement delayed to avoid AGI.
1261         __ bind(skipBY);
1262         // fallthru
1263       }
1264       if (log2_size <= 1) {     // Handle leftover HW.
1265         __ z_tmll(ix_reg, 2);
1266         __ z_bre(skipHW);
1267         __ z_lhy(data_reg,  -2, ix_reg, src_reg);
1268         __ z_sthy(data_reg, -2, ix_reg, dst_reg);
1269         __ add2reg(ix_reg, -2); // Decrement delayed to avoid AGI.
1270         __ bind(skipHW);
1271         __ z_tmll(ix_reg, 4);
1272         __ z_bre(countLoop1);
1273         // fallthru
1274       }
1275       if (log2_size <= 2) {     // There are just 4 bytes (left) that need to be copied.
1276         __ z_ly(data_reg,  -4, ix_reg, src_reg);
1277         __ z_sty(data_reg, -4, ix_reg, dst_reg);
1278         __ add2reg(ix_reg, -4); // Decrement delayed to avoid AGI.
1279         __ z_bru(countLoop1);
1280       }
1281 
1282       // Control can never get to here. Never! Never ever!
1283       __ z_illtrap(0x99);
1284       __ bind(copyLoop1);
1285       __ z_lg(data_reg,  0, ix_reg, src_reg);
1286       __ z_stg(data_reg, 0, ix_reg, dst_reg);
1287       __ bind(countLoop1);
1288       __ z_brxhg(ix_reg, stride_reg, copyLoop1);
1289 
1290       if (!branchToEnd)
1291         __ z_br(Z_R14);
1292 
1293       switch (element_size) {
1294         case 1:  BLOCK_COMMENT("} ARRAYCOPY CONJOINT byte "); break;
1295         case 2:  BLOCK_COMMENT("} ARRAYCOPY CONJOINT short"); break;
1296         case 4:  BLOCK_COMMENT("} ARRAYCOPY CONJOINT int  "); break;
1297         case 8:  BLOCK_COMMENT("} ARRAYCOPY CONJOINT long "); break;
1298         default: BLOCK_COMMENT("} ARRAYCOPY CONJOINT      "); break;
1299       }
1300     }
1301   }
1302 
1303   // Generate stub for disjoint byte copy. If "aligned" is true, the
1304   // "from" and "to" addresses are assumed to be heapword aligned.
1305   address generate_disjoint_byte_copy(bool aligned, const char * name) {
1306     StubCodeMark mark(this, "StubRoutines", name);
1307 
1308     // This is the zarch specific stub generator for byte array copy.
1309     // Refer to generate_disjoint_copy for a list of prereqs and features:
1310     unsigned int start_off = __ offset();  // Remember stub start address (is rtn value).
1311     generate_disjoint_copy(aligned, 1, false, false);
1312     return __ addr_at(start_off);
1313   }
1314 
1315 
1316   address generate_disjoint_short_copy(bool aligned, const char * name) {
1317     StubCodeMark mark(this, "StubRoutines", name);
1318     // This is the zarch specific stub generator for short array copy.
1319     // Refer to generate_disjoint_copy for a list of prereqs and features:
1320     unsigned int start_off = __ offset();  // Remember stub start address (is rtn value).
1321     generate_disjoint_copy(aligned, 2, false, false);
1322     return __ addr_at(start_off);
1323   }
1324 
1325 
1326   address generate_disjoint_int_copy(bool aligned, const char * name) {
1327     StubCodeMark mark(this, "StubRoutines", name);
1328     // This is the zarch specific stub generator for int array copy.
1329     // Refer to generate_disjoint_copy for a list of prereqs and features:
1330     unsigned int start_off = __ offset();  // Remember stub start address (is rtn value).
1331     generate_disjoint_copy(aligned, 4, false, false);
1332     return __ addr_at(start_off);
1333   }
1334 
1335 
1336   address generate_disjoint_long_copy(bool aligned, const char * name) {
1337     StubCodeMark mark(this, "StubRoutines", name);
1338     // This is the zarch specific stub generator for long array copy.
1339     // Refer to generate_disjoint_copy for a list of prereqs and features:
1340     unsigned int start_off = __ offset();  // Remember stub start address (is rtn value).
1341     generate_disjoint_copy(aligned, 8, false, false);
1342     return __ addr_at(start_off);
1343   }
1344 
1345 
1346   address generate_disjoint_oop_copy(bool aligned, const char * name, bool dest_uninitialized) {
1347     StubCodeMark mark(this, "StubRoutines", name);
1348     // This is the zarch specific stub generator for oop array copy.
1349     // Refer to generate_disjoint_copy for a list of prereqs and features.
1350     unsigned int start_off = __ offset();  // Remember stub start address (is rtn value).
1351     unsigned int size      = UseCompressedOops ? 4 : 8;
1352 
1353     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1354     if (dest_uninitialized) {
1355       decorators |= IS_DEST_UNINITIALIZED;
1356     }
1357     if (aligned) {
1358       decorators |= ARRAYCOPY_ALIGNED;
1359     }
1360 
1361     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1362     bs->arraycopy_prologue(_masm, decorators, T_OBJECT, Z_ARG1, Z_ARG2, Z_ARG3);
1363 
1364     generate_disjoint_copy(aligned, size, true, true);
1365 
1366     bs->arraycopy_epilogue(_masm, decorators, T_OBJECT, Z_ARG2, Z_ARG3, true);
1367 
1368     return __ addr_at(start_off);
1369   }
1370 
1371 
1372   address generate_conjoint_byte_copy(bool aligned, const char * name) {
1373     StubCodeMark mark(this, "StubRoutines", name);
1374     // This is the zarch specific stub generator for overlapping byte array copy.
1375     // Refer to generate_conjoint_copy for a list of prereqs and features:
1376     unsigned int   start_off = __ offset();  // Remember stub start address (is rtn value).
1377     address nooverlap_target = aligned ? StubRoutines::arrayof_jbyte_disjoint_arraycopy()
1378                                        : StubRoutines::jbyte_disjoint_arraycopy();
1379 
1380     array_overlap_test(nooverlap_target, 0); // Branch away to nooverlap_target if disjoint.
1381     generate_conjoint_copy(aligned, 1, false);
1382 
1383     return __ addr_at(start_off);
1384   }
1385 
1386 
1387   address generate_conjoint_short_copy(bool aligned, const char * name) {
1388     StubCodeMark mark(this, "StubRoutines", name);
1389     // This is the zarch specific stub generator for overlapping short array copy.
1390     // Refer to generate_conjoint_copy for a list of prereqs and features:
1391     unsigned int   start_off = __ offset();  // Remember stub start address (is rtn value).
1392     address nooverlap_target = aligned ? StubRoutines::arrayof_jshort_disjoint_arraycopy()
1393                                        : StubRoutines::jshort_disjoint_arraycopy();
1394 
1395     array_overlap_test(nooverlap_target, 1); // Branch away to nooverlap_target if disjoint.
1396     generate_conjoint_copy(aligned, 2, false);
1397 
1398     return __ addr_at(start_off);
1399   }
1400 
1401   address generate_conjoint_int_copy(bool aligned, const char * name) {
1402     StubCodeMark mark(this, "StubRoutines", name);
1403     // This is the zarch specific stub generator for overlapping int array copy.
1404     // Refer to generate_conjoint_copy for a list of prereqs and features:
1405 
1406     unsigned int   start_off = __ offset();  // Remember stub start address (is rtn value).
1407     address nooverlap_target = aligned ? StubRoutines::arrayof_jint_disjoint_arraycopy()
1408                                        : StubRoutines::jint_disjoint_arraycopy();
1409 
1410     array_overlap_test(nooverlap_target, 2); // Branch away to nooverlap_target if disjoint.
1411     generate_conjoint_copy(aligned, 4, false);
1412 
1413     return __ addr_at(start_off);
1414   }
1415 
1416   address generate_conjoint_long_copy(bool aligned, const char * name) {
1417     StubCodeMark mark(this, "StubRoutines", name);
1418     // This is the zarch specific stub generator for overlapping long array copy.
1419     // Refer to generate_conjoint_copy for a list of prereqs and features:
1420 
1421     unsigned int start_off   = __ offset();  // Remember stub start address (is rtn value).
1422     address nooverlap_target = aligned ? StubRoutines::arrayof_jlong_disjoint_arraycopy()
1423                                        : StubRoutines::jlong_disjoint_arraycopy();
1424 
1425     array_overlap_test(nooverlap_target, 3); // Branch away to nooverlap_target if disjoint.
1426     generate_conjoint_copy(aligned, 8, false);
1427 
1428     return __ addr_at(start_off);
1429   }
1430 
1431   address generate_conjoint_oop_copy(bool aligned, const char * name, bool dest_uninitialized) {
1432     StubCodeMark mark(this, "StubRoutines", name);
1433     // This is the zarch specific stub generator for overlapping oop array copy.
1434     // Refer to generate_conjoint_copy for a list of prereqs and features.
1435     unsigned int start_off = __ offset();  // Remember stub start address (is rtn value).
1436     unsigned int size      = UseCompressedOops ? 4 : 8;
1437     unsigned int shift     = UseCompressedOops ? 2 : 3;
1438 
1439     address nooverlap_target = aligned ? StubRoutines::arrayof_oop_disjoint_arraycopy(dest_uninitialized)
1440                                        : StubRoutines::oop_disjoint_arraycopy(dest_uninitialized);
1441 
1442     // Branch to disjoint_copy (if applicable) before pre_barrier to avoid double pre_barrier.
1443     array_overlap_test(nooverlap_target, shift);  // Branch away to nooverlap_target if disjoint.
1444 
1445     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1446     if (dest_uninitialized) {
1447       decorators |= IS_DEST_UNINITIALIZED;
1448     }
1449     if (aligned) {
1450       decorators |= ARRAYCOPY_ALIGNED;
1451     }
1452 
1453     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1454     bs->arraycopy_prologue(_masm, decorators, T_OBJECT, Z_ARG1, Z_ARG2, Z_ARG3);
1455 
1456     generate_conjoint_copy(aligned, size, true);  // Must preserve ARG2, ARG3.
1457 
1458     bs->arraycopy_epilogue(_masm, decorators, T_OBJECT, Z_ARG2, Z_ARG3, true);
1459 
1460     return __ addr_at(start_off);
1461   }
1462 
1463 
1464   void generate_arraycopy_stubs() {
1465 
1466     // Note: the disjoint stubs must be generated first, as some of
1467     // the conjoint stubs use them.
1468     StubRoutines::_jbyte_disjoint_arraycopy      = generate_disjoint_byte_copy (false, "jbyte_disjoint_arraycopy");
1469     StubRoutines::_jshort_disjoint_arraycopy     = generate_disjoint_short_copy(false, "jshort_disjoint_arraycopy");
1470     StubRoutines::_jint_disjoint_arraycopy       = generate_disjoint_int_copy  (false, "jint_disjoint_arraycopy");
1471     StubRoutines::_jlong_disjoint_arraycopy      = generate_disjoint_long_copy (false, "jlong_disjoint_arraycopy");
1472     StubRoutines::_oop_disjoint_arraycopy        = generate_disjoint_oop_copy  (false, "oop_disjoint_arraycopy", false);
1473     StubRoutines::_oop_disjoint_arraycopy_uninit = generate_disjoint_oop_copy  (false, "oop_disjoint_arraycopy_uninit", true);
1474 
1475     StubRoutines::_arrayof_jbyte_disjoint_arraycopy      = generate_disjoint_byte_copy (true, "arrayof_jbyte_disjoint_arraycopy");
1476     StubRoutines::_arrayof_jshort_disjoint_arraycopy     = generate_disjoint_short_copy(true, "arrayof_jshort_disjoint_arraycopy");
1477     StubRoutines::_arrayof_jint_disjoint_arraycopy       = generate_disjoint_int_copy  (true, "arrayof_jint_disjoint_arraycopy");
1478     StubRoutines::_arrayof_jlong_disjoint_arraycopy      = generate_disjoint_long_copy (true, "arrayof_jlong_disjoint_arraycopy");
1479     StubRoutines::_arrayof_oop_disjoint_arraycopy        = generate_disjoint_oop_copy  (true, "arrayof_oop_disjoint_arraycopy", false);
1480     StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = generate_disjoint_oop_copy  (true, "arrayof_oop_disjoint_arraycopy_uninit", true);
1481 
1482     StubRoutines::_jbyte_arraycopy           = generate_conjoint_byte_copy (false, "jbyte_arraycopy");
1483     StubRoutines::_jshort_arraycopy          = generate_conjoint_short_copy(false, "jshort_arraycopy");
1484     StubRoutines::_jint_arraycopy            = generate_conjoint_int_copy  (false, "jint_arraycopy");
1485     StubRoutines::_jlong_arraycopy           = generate_conjoint_long_copy (false, "jlong_arraycopy");
1486     StubRoutines::_oop_arraycopy             = generate_conjoint_oop_copy  (false, "oop_arraycopy", false);
1487     StubRoutines::_oop_arraycopy_uninit      = generate_conjoint_oop_copy  (false, "oop_arraycopy_uninit", true);
1488 
1489     StubRoutines::_arrayof_jbyte_arraycopy      = generate_conjoint_byte_copy (true, "arrayof_jbyte_arraycopy");
1490     StubRoutines::_arrayof_jshort_arraycopy     = generate_conjoint_short_copy(true, "arrayof_jshort_arraycopy");
1491     StubRoutines::_arrayof_jint_arraycopy       = generate_conjoint_int_copy  (true, "arrayof_jint_arraycopy");
1492     StubRoutines::_arrayof_jlong_arraycopy      = generate_conjoint_long_copy (true, "arrayof_jlong_arraycopy");
1493     StubRoutines::_arrayof_oop_arraycopy        = generate_conjoint_oop_copy  (true, "arrayof_oop_arraycopy", false);
1494     StubRoutines::_arrayof_oop_arraycopy_uninit = generate_conjoint_oop_copy  (true, "arrayof_oop_arraycopy_uninit", true);
1495   }
1496 
1497   // Call interface for AES_encryptBlock, AES_decryptBlock stubs.
1498   //
1499   //   Z_ARG1 - source data block. Ptr to leftmost byte to be processed.
1500   //   Z_ARG2 - destination data block. Ptr to leftmost byte to be stored.
1501   //            For in-place encryption/decryption, ARG1 and ARG2 can point
1502   //            to the same piece of storage.
1503   //   Z_ARG3 - Crypto key address (expanded key). The first n bits of
1504   //            the expanded key constitute the original AES-<n> key (see below).
1505   //
1506   //   Z_RET  - return value. First unprocessed byte offset in src buffer.
1507   //
1508   // Some remarks:
1509   //   The crypto key, as passed from the caller to these encryption stubs,
1510   //   is a so-called expanded key. It is derived from the original key
1511   //   by the Rijndael key schedule, see http://en.wikipedia.org/wiki/Rijndael_key_schedule
1512   //   With the expanded key, the cipher/decipher task is decomposed in
1513   //   multiple, less complex steps, called rounds. Sun SPARC and Intel
1514   //   processors obviously implement support for those less complex steps.
1515   //   z/Architecture provides instructions for full cipher/decipher complexity.
1516   //   Therefore, we need the original, not the expanded key here.
1517   //   Luckily, the first n bits of an AES-<n> expanded key are formed
1518   //   by the original key itself. That takes us out of trouble. :-)
1519   //   The key length (in bytes) relation is as follows:
1520   //     original    expanded   rounds  key bit     keylen
1521   //    key bytes   key bytes            length   in words
1522   //           16         176       11      128         44
1523   //           24         208       13      192         52
1524   //           32         240       15      256         60
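     //
     //   For reference (standard Rijndael key schedule, not specific to this code):
     //   with Nk = original key length in 4-byte words, the number of round keys
     //   (the "rounds" column above) is Nk + 7, so the expanded key length is
     //       4 * (Nk + 7) words,  e.g. 4 * (4 + 7) = 44 words = 176 bytes for AES-128.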
1525   //
1526   // The crypto instructions used in the AES* stubs have some specific register requirements.
1527   //   Z_R0   holds the crypto function code. Please refer to the KM/KMC instruction
1528   //          description in the "z/Architecture Principles of Operation" manual for details.
1529   //   Z_R1   holds the parameter block address. The parameter block contains the cryptographic key
1530   //          (KM instruction) and the chaining value (KMC instruction).
1531   //   dst    must designate an even-numbered register, holding the address of the output message.
1532   //   src    must designate an even/odd register pair, holding the address/length of the original message
1533 
1534   // Helper function which generates code to
1535   //  - load the function code in register fCode (== Z_R0).
1536   //  - load the data block length (depends on cipher function) into register srclen.
1537   //  - is_decipher switches between cipher and decipher function codes.
1539   void generate_load_AES_fCode(Register keylen, Register fCode, Register srclen, bool is_decipher) {
1540 
1541     BLOCK_COMMENT("Set fCode {"); {
1542       Label fCode_set;
1543       int   mode = is_decipher ? VM_Version::CipherMode::decipher : VM_Version::CipherMode::cipher;
1544       bool  identical_dataBlk_len =  (VM_Version::Cipher::_AES128_dataBlk == VM_Version::Cipher::_AES192_dataBlk)
1545                                   && (VM_Version::Cipher::_AES128_dataBlk == VM_Version::Cipher::_AES256_dataBlk);
1546       // Expanded key length is 44/52/60 * 4 bytes for AES-128/AES-192/AES-256.
1547       __ z_cghi(keylen, 52); // Check only once at the beginning. keylen and fCode may share the same register.
1548 
1549       __ z_lghi(fCode, VM_Version::Cipher::_AES128 + mode);
1550       if (!identical_dataBlk_len) {
1551         __ z_lghi(srclen, VM_Version::Cipher::_AES128_dataBlk);
1552       }
1553       __ z_brl(fCode_set);  // keyLen <  52: AES128
1554 
1555       __ z_lghi(fCode, VM_Version::Cipher::_AES192 + mode);
1556       if (!identical_dataBlk_len) {
1557         __ z_lghi(srclen, VM_Version::Cipher::_AES192_dataBlk);
1558       }
1559       __ z_bre(fCode_set);  // keyLen == 52: AES192
1560 
1561       __ z_lghi(fCode, VM_Version::Cipher::_AES256 + mode);
1562       if (!identical_dataBlk_len) {
1563         __ z_lghi(srclen, VM_Version::Cipher::_AES256_dataBlk);
1564       }
1565       // __ z_brh(fCode_set);  // keyLen >  52: AES256           // fallthru
1566 
1567       __ bind(fCode_set);
1568       if (identical_dataBlk_len) {
1569         __ z_lghi(srclen, VM_Version::Cipher::_AES128_dataBlk);
1570       }
1571     }
1572     BLOCK_COMMENT("} Set fCode");
1573   }
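
     // Illustrative only: the selection that generate_load_AES_fCode emits, in plain C
     // (keylen is the expanded key length in 4-byte words: 44, 52, or 60):
     //
     //   if      (keylen <  52) { fCode = _AES128 + mode;  srclen = _AES128_dataBlk; }
     //   else if (keylen == 52) { fCode = _AES192 + mode;  srclen = _AES192_dataBlk; }
     //   else                   { fCode = _AES256 + mode;  srclen = _AES256_dataBlk; }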
1574 
1575   // Push a parameter block for the cipher/decipher instruction on the stack.
1576   // Layout of the additional stack space allocated for AES_cipherBlockChaining:
1577   //
1578   //   |        |
1579   //   +--------+ <-- SP before expansion
1580   //   |        |
1581   //   :        :  alignment loss (part 2), 0..(AES_parmBlk_align-1) bytes
1582   //   |        |
1583   //   +--------+
1584   //   |        |
1585   //   :        :  space for parameter block, size VM_Version::Cipher::_AES*_parmBlk_C
1586   //   |        |
1587   //   +--------+ <-- parmBlk, octoword-aligned, start of parameter block
1588   //   |        |
1589   //   :        :  additional stack space for spills etc., size AES_parmBlk_addspace, DW @ Z_SP not usable!!!
1590   //   |        |
1591   //   +--------+ <-- Z_SP + alignment loss, octoword-aligned
1592   //   |        |
1593   //   :        :  alignment loss (part 1), 0..(AES_parmBlk_align-1) bytes. DW @ Z_SP not usable!!!
1594   //   |        |
1595   //   +--------+ <-- Z_SP after expansion
1596 
1597   void generate_push_Block(int dataBlk_len, int parmBlk_len, int crypto_fCode,
1598                            Register parmBlk, Register keylen, Register fCode, Register cv, Register key) {
1599 
1600     AES_parmBlk_addspace = AES_parmBlk_align; // Must be multiple of AES_parmBlk_align.
1601                                               // spill space for regs etc., don't use DW @SP!
1602     const int cv_len     = dataBlk_len;
1603     const int key_len    = parmBlk_len - cv_len;
1604     // This len must be known at JIT compile time. Only then are we able to recalc the SP before resize.
1605     // We buy this knowledge by wasting some (up to AES_parmBlk_align) bytes of stack space.
1606     const int resize_len = cv_len + key_len + AES_parmBlk_align + AES_parmBlk_addspace;
1607 
1608     // Use parmBlk as temp reg here to hold the frame pointer.
1609     __ resize_frame(-resize_len, parmBlk, true);
1610 
1611     // calculate parmBlk address from updated (resized) SP.
1612     __ add2reg(parmBlk, resize_len - (cv_len + key_len), Z_SP);
1613     __ z_nill(parmBlk, (~(AES_parmBlk_align-1)) & 0xffff); // Align parameter block.
1614 
1615     // There is room for stuff in the range [parmBlk-AES_parmBlk_addspace+8, parmBlk).
1616     __ z_stg(keylen,  -8, parmBlk);                        // Spill keylen for later use.
1617 
1618     // calculate (SP before resize) from updated SP.
1619     __ add2reg(keylen, resize_len, Z_SP);                  // keylen holds prev SP for now.
1620     __ z_stg(keylen, -16, parmBlk);                        // Spill prev SP for easy revert.
1621 
1622     __ z_mvc(0,      cv_len-1,  parmBlk, 0, cv);     // Copy cv.
1623     __ z_mvc(cv_len, key_len-1, parmBlk, 0, key);    // Copy key.
1624     __ z_lghi(fCode, crypto_fCode);
1625   }
1626 
1627   // NOTE:
1628   //   Before returning, the stub has to copy the chaining value from
1629   //   the parmBlk, where it was updated by the crypto instruction, back
1630   //   to the chaining value array the address of which was passed in the cv argument.
1631   //   As all the available registers are used and modified by KMC, we need to save
1632   //   the key length across the KMC instruction. We do so by spilling it to the stack,
1633   //   just preceding the parmBlk (at (parmBlk - 8)).
1634   void generate_push_parmBlk(Register keylen, Register fCode, Register parmBlk, Register key, Register cv, bool is_decipher) {
1635     int       mode = is_decipher ? VM_Version::CipherMode::decipher : VM_Version::CipherMode::cipher;
1636     Label     parmBlk_128, parmBlk_192, parmBlk_256, parmBlk_set;
1637 
1638     BLOCK_COMMENT("push parmBlk {");
1639     // We have just three cipher strengths, which translates into three
1640     // possible expanded key lengths: 44, 52, and 60 four-byte words.
1641     // We therefore can compare the actual length against the "middle" length
1642     // and get: lt -> len=44, eq -> len=52, gt -> len=60.
1643     __ z_cghi(keylen, 52);
1644     if (VM_Version::has_Crypto_AES128()) { __ z_brl(parmBlk_128); }  // keyLen <  52: AES128
1645     if (VM_Version::has_Crypto_AES192()) { __ z_bre(parmBlk_192); }  // keyLen == 52: AES192
1646     if (VM_Version::has_Crypto_AES256()) { __ z_brh(parmBlk_256); }  // keyLen >  52: AES256
1647 
1648     // Security net: requested AES function not available on this CPU.
1649     // NOTE:
1650     //   As of now (March 2015), this safety net is not required. JCE policy files limit the
1651     //   cryptographic strength of the keys used to 128 bit. If we have AES hardware support
1652     //   at all, we have at least AES-128.
1653     __ stop_static("AES key strength not supported by CPU. Use -XX:-UseAES as remedy.", 0);
1654 
1655     if (VM_Version::has_Crypto_AES256()) {
1656       __ bind(parmBlk_256);
1657       generate_push_Block(VM_Version::Cipher::_AES256_dataBlk,
1658                           VM_Version::Cipher::_AES256_parmBlk_C,
1659                           VM_Version::Cipher::_AES256 + mode,
1660                           parmBlk, keylen, fCode, cv, key);
1661       if (VM_Version::has_Crypto_AES128() || VM_Version::has_Crypto_AES192()) {
1662         __ z_bru(parmBlk_set);  // Fallthru otherwise.
1663       }
1664     }
1665 
1666     if (VM_Version::has_Crypto_AES192()) {
1667       __ bind(parmBlk_192);
1668       generate_push_Block(VM_Version::Cipher::_AES192_dataBlk,
1669                           VM_Version::Cipher::_AES192_parmBlk_C,
1670                           VM_Version::Cipher::_AES192 + mode,
1671                           parmBlk, keylen, fCode, cv, key);
1672       if (VM_Version::has_Crypto_AES128()) {
1673         __ z_bru(parmBlk_set);  // Fallthru otherwise.
1674       }
1675     }
1676 
1677     if (VM_Version::has_Crypto_AES128()) {
1678       __ bind(parmBlk_128);
1679       generate_push_Block(VM_Version::Cipher::_AES128_dataBlk,
1680                           VM_Version::Cipher::_AES128_parmBlk_C,
1681                           VM_Version::Cipher::_AES128 + mode,
1682                           parmBlk, keylen, fCode, cv, key);
1683       // Fallthru
1684     }
1685 
1686     __ bind(parmBlk_set);
1687     BLOCK_COMMENT("} push parmBlk");
1688   }
1689 
1690   // Pop a parameter block from the stack. The chaining value portion of the parameter block
1691   // is copied back to the cv array as it is needed for subsequent cipher steps.
1692   // Both the keylen value and the original SP (before resizing) were pushed to the stack
1693   // when pushing the parameter block.
1694   void generate_pop_parmBlk(Register keylen, Register parmBlk, Register key, Register cv) {
1695 
1696     BLOCK_COMMENT("pop parmBlk {");
1697     bool identical_dataBlk_len =  (VM_Version::Cipher::_AES128_dataBlk == VM_Version::Cipher::_AES192_dataBlk) &&
1698                                   (VM_Version::Cipher::_AES128_dataBlk == VM_Version::Cipher::_AES256_dataBlk);
1699     if (identical_dataBlk_len) {
1700       int cv_len = VM_Version::Cipher::_AES128_dataBlk;
1701       __ z_mvc(0, cv_len-1, cv, 0, parmBlk);  // Copy cv.
1702     } else {
1703       int cv_len;
1704       Label parmBlk_128, parmBlk_192, parmBlk_256, parmBlk_set;
1705       __ z_lg(keylen, -8, parmBlk);  // restore keylen
1706       __ z_cghi(keylen, 52);
1707       if (VM_Version::has_Crypto_AES256()) __ z_brh(parmBlk_256);  // keyLen >  52: AES256
1708       if (VM_Version::has_Crypto_AES192()) __ z_bre(parmBlk_192);  // keyLen == 52: AES192
1709       // if (VM_Version::has_Crypto_AES128()) __ z_brl(parmBlk_128);  // keyLen <  52: AES128  // fallthru
1710 
1711       // Security net: none is needed here. If one were required, we would already have
1712       // fallen into it when pushing the parameter block.
1713       if (VM_Version::has_Crypto_AES128()) {
1714         __ bind(parmBlk_128);
1715         cv_len = VM_Version::Cipher::_AES128_dataBlk;
1716         __ z_mvc(0, cv_len-1, cv, 0, parmBlk);  // Copy cv.
1717         if (VM_Version::has_Crypto_AES192() || VM_Version::has_Crypto_AES256()) {
1718           __ z_bru(parmBlk_set);
1719         }
1720       }
1721 
1722       if (VM_Version::has_Crypto_AES192()) {
1723         __ bind(parmBlk_192);
1724         cv_len = VM_Version::Cipher::_AES192_dataBlk;
1725         __ z_mvc(0, cv_len-1, cv, 0, parmBlk);  // Copy cv.
1726         if (VM_Version::has_Crypto_AES256()) {
1727           __ z_bru(parmBlk_set);
1728         }
1729       }
1730 
1731       if (VM_Version::has_Crypto_AES256()) {
1732         __ bind(parmBlk_256);
1733         cv_len = VM_Version::Cipher::_AES256_dataBlk;
1734         __ z_mvc(0, cv_len-1, cv, 0, parmBlk);  // Copy cv.
1735         // __ z_bru(parmBlk_set);  // fallthru
1736       }
1737       __ bind(parmBlk_set);
1738     }
1739     __ z_lg(Z_SP, -16, parmBlk); // Revert resize_frame_absolute. Z_SP saved by push_parmBlk.
1740     BLOCK_COMMENT("} pop parmBlk");
1741   }
1742 
1743   // Compute AES encrypt/decrypt function.
1744   void generate_AES_cipherBlock(bool is_decipher) {
1745     // Incoming arguments.
1746     Register       from    = Z_ARG1; // source byte array
1747     Register       to      = Z_ARG2; // destination byte array
1748     Register       key     = Z_ARG3; // expanded key array
1749 
1750     const Register keylen  = Z_R0;   // Temporarily (until fCode is set) holds the expanded key array length.
1751 
1752     // Register definitions as required by KM instruction.
1753     const Register fCode   = Z_R0;   // crypto function code
1754     const Register parmBlk = Z_R1;   // parameter block address (points to crypto key)
1755     const Register src     = Z_ARG1; // Must be even reg (KM requirement).
1756     const Register srclen  = Z_ARG2; // Must be odd reg and pair with src. Overwrites destination address.
1757     const Register dst     = Z_ARG3; // Must be even reg (KM requirement). Overwrites expanded key address.
1758 
1759     // Read key len of expanded key (in 4-byte words).
1760     __ z_lgf(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
1761 
1762     // Copy arguments to registers as required by crypto instruction.
1763     __ z_lgr(parmBlk, key);          // crypto key (in T_INT array).
1764     __ lgr_if_needed(src, from);     // Copy src address. Will not emit, src/from are identical.
1765     __ z_lgr(dst, to);               // Copy dst address, even register required.
1766 
1767     // Construct function code into fCode(Z_R0), data block length into srclen(Z_ARG2).
1768     generate_load_AES_fCode(keylen, fCode, srclen, is_decipher);
1769 
1770     __ km(dst, src);                 // Cipher the message.
1771 
1772     __ z_br(Z_R14);
1773   }
1774 
1775   // Compute AES encrypt function.
1776   address generate_AES_encryptBlock(const char* name) {
1777     __ align(CodeEntryAlignment);
1778     StubCodeMark mark(this, "StubRoutines", name);
1779     unsigned int start_off = __ offset();  // Remember stub start address (is rtn value).
1780 
1781     generate_AES_cipherBlock(false);
1782 
1783     return __ addr_at(start_off);
1784   }
1785 
1786   // Compute AES decrypt function.
1787   address generate_AES_decryptBlock(const char* name) {
1788     __ align(CodeEntryAlignment);
1789     StubCodeMark mark(this, "StubRoutines", name);
1790     unsigned int start_off = __ offset();  // Remember stub start address (is rtn value).
1791 
1792     generate_AES_cipherBlock(true);
1793 
1794     return __ addr_at(start_off);
1795   }
1796 
1797   // These stubs receive the addresses of the cryptographic key and of the chaining value as two separate
1798   // arguments (registers "key" and "cv", respectively). The KMC instruction, on the other hand, requires
1799   // chaining value and key to be, in this sequence, adjacent in storage. Thus, we need to allocate some
1800   // thread-local working storage. Using heap memory incurs all the hassles of allocating/freeing.
1801   // Stack space, on the contrary, is deallocated automatically when we return from the stub to the caller.
1802   // *** WARNING ***
1803   // Please note that we do not formally allocate stack space, nor do we
1804   // update the stack pointer. Therefore, no function calls are allowed
1805   // and nobody else must use the stack range where the parameter block
1806   // is located.
1807   // We align the parameter block to the next available octoword.
1808   //
1809   // Compute chained AES encrypt function.
1810   void generate_AES_cipherBlockChaining(bool is_decipher) {
1811 
1812     Register       from    = Z_ARG1; // source byte array (clear text)
1813     Register       to      = Z_ARG2; // destination byte array (ciphered)
1814     Register       key     = Z_ARG3; // expanded key array.
1815     Register       cv      = Z_ARG4; // chaining value
1816     const Register msglen  = Z_ARG5; // Total length of the msg to be encrypted. Value must be returned
1817                                      // in Z_RET upon completion of this stub. Is 32-bit integer.
1818 
1819     const Register keylen  = Z_R0;   // Expanded key length, as read from key array. Temp only.
1820     const Register fCode   = Z_R0;   // crypto function code
1821     const Register parmBlk = Z_R1;   // parameter block address (points to crypto key)
1822     const Register src     = Z_ARG1; // is Z_R2
1823     const Register srclen  = Z_ARG2; // Overwrites destination address.
1824     const Register dst     = Z_ARG3; // Overwrites key address.
1825 
1826     // Read key len of expanded key (in 4-byte words).
1827     __ z_lgf(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
1828 
1829     // Construct parm block address in parmBlk (== Z_R1), copy cv and key to parm block.
1830     // Construct function code in fCode (Z_R0).
1831     generate_push_parmBlk(keylen, fCode, parmBlk, key, cv, is_decipher);
1832 
1833     // Prepare other registers for instruction.
1834     __ lgr_if_needed(src, from);     // Copy src address. Will not emit, src/from are identical.
1835     __ z_lgr(dst, to);
1836     __ z_llgfr(srclen, msglen);      // msglen is a 32-bit int; zero-extend to the 64-bit length required by KMC.
1837 
1838     __ kmc(dst, src);                // Cipher the message.
1839 
1840     generate_pop_parmBlk(keylen, parmBlk, key, cv);
1841 
1842     __ z_llgfr(Z_RET, msglen);       // Return the processed length: msglen, zero-extended to 64 bits.
1843     __ z_br(Z_R14);
1844   }
1845 
1846   // Compute chained AES encrypt function.
1847   address generate_cipherBlockChaining_AES_encrypt(const char* name) {
1848     __ align(CodeEntryAlignment);
1849     StubCodeMark mark(this, "StubRoutines", name);
1850     unsigned int   start_off = __ offset();  // Remember stub start address (is rtn value).
1851 
1852     generate_AES_cipherBlockChaining(false);
1853 
1854     return __ addr_at(start_off);
1855   }
1856 
1857   // Compute chained AES decrypt function.
1858   address generate_cipherBlockChaining_AES_decrypt(const char* name) {
1859     __ align(CodeEntryAlignment);
1860     StubCodeMark mark(this, "StubRoutines", name);
1861     unsigned int   start_off = __ offset();  // Remember stub start address (is rtn value).
1862 
1863     generate_AES_cipherBlockChaining(true);
1864 
1865     return __ addr_at(start_off);
1866   }
1867 
1868 
1869   // *****************************************************************************
1870 
1871   // AES CounterMode
1872   // Push a parameter block for the cipher/decipher instruction on the stack.
1873   // Layout of the additional stack space allocated for counterMode_AES_cipherBlock
1874   //
1875   //   |        |
1876   //   +--------+ <-- SP before expansion
1877   //   |        |
1878   //   :        :  alignment loss (part 2), 0..(AES_parmBlk_align-1) bytes.
1879   //   |        |
1880   //   +--------+ <-- gap = parmBlk + parmBlk_len + ctrArea_len
1881   //   |        |
1882   //   :        :  byte[] ctr - kmctr expects a counter vector the size of the input vector.
1883   //   :        :         The interface only provides byte[16] iv, the init vector.
1884   //   :        :         The size of this area is a tradeoff between stack space, init effort, and speed.
1885   //   |        |         Each counter is a 128-bit integer. Vector element [0] is a copy of iv.
1886   //   |        |         Vector element [i] is formed by incrementing element [i-1].
1887   //   +--------+ <-- ctr = parmBlk + parmBlk_len
1888   //   |        |
1889   //   :        :  space for parameter block, size VM_Version::Cipher::_AES*_parmBlk_G
1890   //   |        |
1891   //   +--------+ <-- parmBlk = Z_SP + (alignment loss (part 1+2)) + AES_dataBlk_space + AES_parmBlk_addSpace, octoword-aligned, start of parameter block
1892   //   |        |
1893   //   :        :  additional stack space for spills etc., min. size AES_parmBlk_addspace, all bytes usable.
1894   //   |        |
1895   //   +--------+ <-- Z_SP + alignment loss (part 1+2) + AES_dataBlk_space, octoword-aligned
1896   //   |        |
1897   //   :        :  space for one source data block and one dest data block.
1898   //   |        |
1899   //   +--------+ <-- Z_SP + alignment loss (part 1+2), octoword-aligned
1900   //   |        |
1901   //   :        :  additional alignment loss. Blocks above can't tolerate unusable DW @SP.
1902   //   |        |
1903   //   +--------+ <-- Z_SP + alignment loss (part 1), octoword-aligned
1904   //   |        |
1905   //   :        :  alignment loss (part 1), 0..(AES_parmBlk_align-1) bytes. DW @ Z_SP holds frame ptr.
1906   //   |        |
1907   //   +--------+ <-- Z_SP after expansion
1908   //
1909   //   additional space allocation (per DW):
1910   //    spillSpace = parmBlk - AES_parmBlk_addspace
1911   //    dataBlocks = spillSpace - AES_dataBlk_space
1912   //
1913   //    parmBlk-8  various fields of various lengths
1914   //               parmBlk-1: key_len (only one byte is stored at parmBlk-1)
1915   //               parmBlk-2: fCode (only one byte is stored at parmBlk-2)
1916   //               parmBlk-4: ctrVal_len (as retrieved from iv array), in bytes, as HW
1917   //               parmBlk-8: msglen length (in bytes) of crypto msg, as passed in by caller
1918   //                          return value is calculated from this: rv = msglen - processed.
1919   //    parmBlk-16 old_SP (SP before resize)
1920   //    parmBlk-24 temp values
1921   //                up to and including main loop in generate_counterMode_AES
1922   //                 - parmBlk-20: remmsg_len remaining msg len (aka unprocessed msg bytes)
1923   //                after main loop in generate_counterMode_AES
1924   //                 - parmBlk-24: spill slot for various address values
1925   //
1926   //    parmBlk-40 free spill slot, used for local spills.
1927   //    parmBlk-64 ARG2(dst) ptr spill slot
1928   //    parmBlk-56 ARG3(crypto key) ptr spill slot
1929   //    parmBlk-48 ARG4(icv value) ptr spill slot
1930   //
1931   //    parmBlk-72
1932   //    parmBlk-80
1933   //    parmBlk-88 counter vector current position
1934   //    parmBlk-96 reduced msg len (after preLoop processing)
1935   //
1936   //    parmBlk-104 Z_R13 spill slot (preLoop only)
1937   //    parmBlk-112 Z_R12 spill slot (preLoop only)
1938   //    parmBlk-120 Z_R11 spill slot (preLoop only)
1939   //    parmBlk-128 Z_R10 spill slot (preLoop only)
1940   //
1941   //
1942   // Layout of the parameter block (instruction KMCTR, function KMCTR-AES*
1943   //
1944   //   +--------+ key_len: +16 (AES-128), +24 (AES-192), +32 (AES-256)
1945   //   |        |
1946   //   |        |  cryptographic key
1947   //   |        |
1948   //   +--------+ <-- parmBlk
1949   //
1950   // On exit:
1951   //   Z_SP     points to resized frame
1952   //            Z_SP before resize available from -16(parmBlk)
1953   //   parmBlk  points to crypto instruction parameter block
1954   //            parameter block is filled with crypto key.
1955   //   msglen   unchanged, saved for later at -24(parmBlk)
1956   //   fCode    contains function code for instruction
1957   //   key      unchanged
1958   //
1959   void generate_counterMode_prepare_Stack(Register parmBlk, Register ctr, Register counter, Register scratch) {
1960 
1961     BLOCK_COMMENT("prepare stack counterMode_AESCrypt {");
1962 
1963     // save argument registers.
1964     //   ARG1(from) is Z_RET as well. Not saved or restored.
1965     //   ARG5(msglen) is restored by other means.
1966     __ z_stmg(Z_ARG2, Z_ARG4, argsave_offset,    parmBlk);
1967 
1968     assert(AES_ctrVec_len > 0, "sanity. We need a counter vector");
1969     __ add2reg(counter, AES_parmBlk_align, parmBlk);       // counter array is located behind crypto key. Available range is disp12 only.
1970     __ z_mvc(0, AES_ctrVal_len-1, counter, 0, ctr);        // move first copy of iv
1971     for (int j = 1; j < AES_ctrVec_len; j+=j) {            // j (and amount of moved data) doubles with every iteration
1972       int offset = j * AES_ctrVal_len;
1973       if (offset <= 256) {
1974         __ z_mvc(offset, offset-1, counter, 0, counter);   // move iv
1975       } else {
1976         for (int k = 0; k < offset; k += 256) {
1977           __ z_mvc(offset+k, 255, counter, 0, counter);
1978         }
1979       }
1980     }
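     // The loop above replicates the 16-byte iv by doubling. Conceptually (plain C,
     // illustrative only, ignoring the chunking into <= 256-byte MVCs):
     //
     //   memcpy(counter, iv, AES_ctrVal_len);                                            // element [0]
     //   for (int filled = AES_ctrVal_len; filled < AES_ctrVec_len * AES_ctrVal_len; filled *= 2) {
     //     memcpy(counter + filled, counter, filled);                                    // double the filled area
     //   }
     //
     // (The > 256-byte chunks above copy from offset 0 each time; since the area already
     // holds identical 16-byte copies and 256 is a multiple of 16, the result is the same.
     // The doubling fills the area exactly if AES_ctrVec_len is a power of two.)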
1981 
1982     Label noCarry, done;
1983     __ z_lg(scratch, Address(ctr, 8));                     // get low-order DW of initial counter.
1984     __ z_algfi(scratch, AES_ctrVec_len);                   // check if we will overflow during init.
1985     __ z_brc(Assembler::bcondLogNoCarry, noCarry);         // No, 64-bit increment is sufficient.
1986 
1987     for (int j = 1; j < AES_ctrVec_len; j++) {             // start with j = 1; no need to add 0 to the first counter value.
1988       int offset = j * AES_ctrVal_len;
1989       generate_increment128(counter, offset, j, scratch);  // increment iv by index value
1990     }
1991     __ z_bru(done);
1992 
1993     __ bind(noCarry);
1994     for (int j = 1; j < AES_ctrVec_len; j++) {             // start with j = 1; no need to add 0 to the first counter value.
1995       int offset = j * AES_ctrVal_len;
1996       generate_increment64(counter, offset, j);            // increment iv by index value
1997     }
1998 
1999     __ bind(done);
2000 
2001     BLOCK_COMMENT("} prepare stack counterMode_AESCrypt");
2002   }
2003 
2004 
2005   void generate_counterMode_increment_ctrVector(Register parmBlk, Register counter, Register scratch, bool v0_only) {
2006 
2007     BLOCK_COMMENT("increment ctrVector counterMode_AESCrypt {");
2008 
2009     __ add2reg(counter, AES_parmBlk_align, parmBlk);       // ptr to counter array needs to be restored
2010 
2011     if (v0_only) {
2012       int offset = 0;
2013       generate_increment128(counter, offset, AES_ctrVec_len, scratch); // increment iv by # vector elements
2014     } else {
2015       int j = 0;
2016       if (VM_Version::has_VectorFacility()) {
2017         bool first_call = true;
2018         for (; j < (AES_ctrVec_len - 3); j+=4) {                       // increment blocks of 4 iv elements
2019           int offset = j * AES_ctrVal_len;
2020           generate_increment128x4(counter, offset, AES_ctrVec_len, first_call);
2021           first_call = false;
2022         }
2023       }
2024       for (; j < AES_ctrVec_len; j++) {
2025         int offset = j * AES_ctrVal_len;
2026         generate_increment128(counter, offset, AES_ctrVec_len, scratch); // increment iv by # vector elements
2027       }
2028     }
2029 
2030     BLOCK_COMMENT("} increment ctrVector counterMode_AESCrypt");
2031   }
2032 
2033   // IBM s390 (IBM z/Architecture, to be more exact) uses big-endian number representation:
2034   // bytes are ordered from most significant to least significant. The address of a number
2035   // in memory therefore points to its most significant byte.
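     //
     // Illustrative only: the 128-bit increment emitted by generate_increment128 below,
     // in plain C for a big-endian counter (hi DW at counter+offset, lo DW at counter+offset+8;
     // load8/store8 stand for 8-byte big-endian loads/stores):
     //
     //   uint64_t lo = load8(counter + offset + 8);
     //   uint64_t hi = load8(counter + offset);
     //   lo += increment;
     //   if (lo < increment) hi += 1;          // carry out of the low doubleword
     //   store8(counter + offset + 8, lo);
     //   store8(counter + offset,     hi);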
2036   void generate_increment64(Register counter, int offset, int increment) {
2037     __ z_algsi(offset + 8, counter, increment);            // increment, no overflow check
2038   }
2039 
2040   void generate_increment128(Register counter, int offset, int increment, Register scratch) {
2041     __ clear_reg(scratch);                                 // prepare to add carry to high-order DW
2042     __ z_algsi(offset + 8, counter, increment);            // increment low order DW
2043     __ z_alcg(scratch, Address(counter, offset));          // add carry to high-order DW
2044     __ z_stg(scratch, Address(counter, offset));           // store back
2045   }
2046 
2047   void generate_increment128(Register counter, int offset, Register increment, Register scratch) {
2048     __ clear_reg(scratch);                                 // prepare to add carry to high-order DW
2049     __ z_alg(increment, Address(counter, offset + 8));     // increment low order DW
2050     __ z_stg(increment, Address(counter, offset + 8));     // store back
2051     __ z_alcg(scratch, Address(counter, offset));          // add carry to high-order DW
2052     __ z_stg(scratch, Address(counter, offset));           // store back
2053   }
2054 
2055   // This is the vector variant of increment128, incrementing 4 ctr vector elements per call.
2056   void generate_increment128x4(Register counter, int offset, int increment, bool init) {
2057     VectorRegister Vincr      = Z_V16;
2058     VectorRegister Vctr0      = Z_V20;
2059     VectorRegister Vctr1      = Z_V21;
2060     VectorRegister Vctr2      = Z_V22;
2061     VectorRegister Vctr3      = Z_V23;
2062 
2063     // Initialize the increment value only once for a series of increments.
2064     // Callers must ensure that the non-initializing generator calls follow
2065     // immediately; otherwise there is no guarantee that Vincr is still unchanged.
2066     if (init) {
2067       __ z_vzero(Vincr);                                   // preset VReg with constant increment
2068       __ z_vleih(Vincr, increment, 7);                     // rightmost HW has ix = 7
2069     }
2070 
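     // z_vaq adds full 128-bit (quadword) values, so no explicit carry propagation
     // between the doubleword halves is needed here (compare the scalar variants above).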
2071     __ z_vlm(Vctr0, Vctr3, offset, counter);               // get the counter values
2072     __ z_vaq(Vctr0, Vctr0, Vincr);                         // increment them
2073     __ z_vaq(Vctr1, Vctr1, Vincr);
2074     __ z_vaq(Vctr2, Vctr2, Vincr);
2075     __ z_vaq(Vctr3, Vctr3, Vincr);
2076     __ z_vstm(Vctr0, Vctr3, offset, counter);              // store the counter values
2077   }
2078 
2079   unsigned int generate_counterMode_push_Block(int dataBlk_len, int parmBlk_len, int crypto_fCode,
2080                            Register parmBlk, Register msglen, Register fCode, Register key) {
2081 
2082     // Space for data blocks (src and dst, one each) used for partial block processing.
2083     AES_parmBlk_addspace = AES_stackSpace_incr             // spill space (temp data)
2084                          + AES_stackSpace_incr             // for argument save/restore
2085                          + AES_stackSpace_incr*2           // for work reg save/restore
2086                          ;
2087     AES_dataBlk_space    = roundup(2*dataBlk_len, AES_parmBlk_align);
2088     AES_dataBlk_offset   = -(AES_parmBlk_addspace+AES_dataBlk_space);
2089     const int key_len    = parmBlk_len;                    // The length of the unextended key (16, 24, 32)
2090 
2091     assert((AES_ctrVal_len == 0) || (AES_ctrVal_len == dataBlk_len), "varying dataBlk_len is not supported.");
2092     AES_ctrVal_len  = dataBlk_len;                         // ctr init value len (in bytes)
2093     AES_ctrArea_len = AES_ctrVec_len * AES_ctrVal_len;     // space required on stack for ctr vector
2094 
2095     // This len must be known at JIT compile time. Only then are we able to recalc the SP before resize.
2096     // We buy this knowledge by wasting some (up to AES_parmBlk_align) bytes of stack space.
2097     const int resize_len = AES_parmBlk_align               // room for alignment of parmBlk
2098                          + AES_parmBlk_align               // extra room for alignment
2099                          + AES_dataBlk_space               // one src and one dst data blk
2100                          + AES_parmBlk_addspace            // spill space for local data
2101                          + roundup(parmBlk_len, AES_parmBlk_align)  // aligned length of parmBlk
2102                          + AES_ctrArea_len                 // stack space for ctr vector
2103                          ;
2104     Register scratch     = fCode;  // We can use fCode as a scratch register. Its contents on entry
2105                                    // are irrelevant, and it is set at the very end of this code block.
2106 
2107     assert(key_len < 256, "excessive crypto key len: %d, limit: 256", key_len);
2108 
2109     BLOCK_COMMENT(err_msg("push_Block (%d bytes) counterMode_AESCrypt%d {", resize_len, parmBlk_len*8));
2110 
2111     // After the frame is resized, the parmBlk is positioned such
2112     // that it is octoword-aligned. This potentially creates some
2113     // alignment waste in addspace and/or in the gap area.
2114     // After resize_frame, scratch contains the frame pointer.
2115     __ resize_frame(-resize_len, scratch, true);
2116 #ifdef ASSERT
2117     __ clear_mem(Address(Z_SP, (intptr_t)8), resize_len - 8);
2118 #endif
2119 
2120     // calculate aligned parmBlk address from updated (resized) SP.
2121     __ add2reg(parmBlk, AES_parmBlk_addspace + AES_dataBlk_space + (2*AES_parmBlk_align-1), Z_SP);
2122     __ z_nill(parmBlk, (~(AES_parmBlk_align-1)) & 0xffff); // Align parameter block.
2123 
2124     // There is room to spill stuff in the range [parmBlk-AES_parmBlk_addspace+8, parmBlk).
2125     __ z_mviy(keylen_offset, parmBlk, key_len - 1);        // Spill crypto key length for later use. Decrement by one for direct use with xc template.
2126     __ z_mviy(fCode_offset,  parmBlk, crypto_fCode);       // Crypto function code, will be loaded into Z_R0 later.
2127     __ z_sty(msglen, msglen_offset, parmBlk);              // full plaintext/ciphertext len.
2128     __ z_sty(msglen, msglen_red_offset, parmBlk);          // save for main loop, may get updated in preLoop.
2129     __ z_sra(msglen, exact_log2(dataBlk_len));             // # full cipher blocks that can be formed from input text.
2130     __ z_sty(msglen, rem_msgblk_offset, parmBlk);
2131 
2132     __ add2reg(scratch, resize_len, Z_SP);                 // calculate (SP before resize) from resized SP.
2133     __ z_stg(scratch, unextSP_offset, parmBlk);            // Spill unextended SP for easy revert.
2134     __ z_stmg(Z_R10, Z_R13, regsave_offset, parmBlk);      // make some regs available as work registers
2135 
2136     // Fill parmBlk with all required data
2137     __ z_mvc(0, key_len-1, parmBlk, 0, key);               // Copy key. Need to do it here - key_len is only known here.
2138     BLOCK_COMMENT(err_msg("} push_Block (%d bytes) counterMode_AESCrypt%d", resize_len, parmBlk_len*8));
2139     return resize_len;
2140   }
2141 
2142 
2143   void generate_counterMode_pop_Block(Register parmBlk, Register msglen, Label& eraser) {
2144     // For added safety, clear the stack area where the crypto key was stored.
2145     Register scratch = msglen;
2146     assert_different_registers(scratch, Z_R0);             // can't use Z_R0 for exrl.
2147 
2148     // wipe out key on stack
2149     __ z_llgc(scratch, keylen_offset, parmBlk);            // get saved (key_len-1) value (we saved just one byte!)
2150     __ z_exrl(scratch, eraser);                            // template relies on parmBlk still pointing to key on stack
2151 
2152     // restore argument registers.
2153     //   ARG1(from) is Z_RET as well. Not restored - will hold return value anyway.
2154     //   ARG5(msglen) is restored further down.
2155     __ z_lmg(Z_ARG2, Z_ARG4, argsave_offset,    parmBlk);
2156 
2157     // restore work registers
2158     __ z_lmg(Z_R10, Z_R13, regsave_offset, parmBlk);       // restore the regs that were used as work registers
2159 
2160     __ z_lgf(msglen, msglen_offset,  parmBlk);             // Restore msglen, only low order FW is valid
2161 #ifdef ASSERT
2162     {
2163       Label skip2last, skip2done;
2164       // Z_RET (aka Z_R2) can be used as scratch as well. It will be set from msglen before return.
2165       __ z_lgr(Z_RET, Z_SP);                                 // save extended SP
2166       __ z_lg(Z_SP,    unextSP_offset, parmBlk);             // trim stack back to unextended size
2167       __ z_sgrk(Z_R1, Z_SP, Z_RET);
2168 
2169       __ z_cghi(Z_R1, 256);
2170       __ z_brl(skip2last);
2171       __ z_xc(0, 255, Z_RET, 0, Z_RET);
2172       __ z_aghi(Z_RET, 256);
2173       __ z_aghi(Z_R1, -256);
2174 
2175       __ z_cghi(Z_R1, 256);
2176       __ z_brl(skip2last);
2177       __ z_xc(0, 255, Z_RET, 0, Z_RET);
2178       __ z_aghi(Z_RET, 256);
2179       __ z_aghi(Z_R1, -256);
2180 
2181       __ z_cghi(Z_R1, 256);
2182       __ z_brl(skip2last);
2183       __ z_xc(0, 255, Z_RET, 0, Z_RET);
2184       __ z_aghi(Z_RET, 256);
2185       __ z_aghi(Z_R1, -256);
2186 
2187       __ bind(skip2last);
2188       __ z_lgr(Z_R0, Z_RET);
2189       __ z_aghik(Z_RET, Z_R1, -1);  // decrement for exrl
2190       __ z_brl(skip2done);
2191       __ z_lgr(parmBlk, Z_R0);      // parmBlk == Z_R1, used in eraser template
2192       __ z_exrl(Z_RET, eraser);
2193 
2194       __ bind(skip2done);
2195     }
2196 #else
2197     __ z_lg(Z_SP,    unextSP_offset, parmBlk);             // trim stack back to unextended size
2198 #endif
2199   }
2200 
2201 
2202   int generate_counterMode_push_parmBlk(Register parmBlk, Register msglen, Register fCode, Register key, bool is_decipher) {
2203     int       resize_len = 0;
2204     int       mode = is_decipher ? VM_Version::CipherMode::decipher : VM_Version::CipherMode::cipher;
2205     Label     parmBlk_128, parmBlk_192, parmBlk_256, parmBlk_set;
2206     Register  keylen = fCode;      // Expanded key length, as read from key array, Temp only.
2207                                    // use fCode as scratch; fCode receives its final value later.
2208 
2209     // Read key len of expanded key (in 4-byte words).
2210     __ z_lgf(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2211     __ z_cghi(keylen, 52);
2212     if (VM_Version::has_Crypto_AES_CTR256()) { __ z_brh(parmBlk_256); }  // keyLen >  52: AES256. Assume: most frequent
2213     if (VM_Version::has_Crypto_AES_CTR128()) { __ z_brl(parmBlk_128); }  // keyLen <  52: AES128.
2214     if (VM_Version::has_Crypto_AES_CTR192()) { __ z_bre(parmBlk_192); }  // keyLen == 52: AES192. Assume: least frequent
2215 
2216     // Safety net: the AES_CTR function for the requested keylen is not available on this CPU.
2217     __ stop_static("AES key strength not supported by CPU. Use -XX:-UseAESCTRIntrinsics as remedy.", 0);
2218 
2219     if (VM_Version::has_Crypto_AES_CTR128()) {
2220       __ bind(parmBlk_128);
2221       resize_len = generate_counterMode_push_Block(VM_Version::Cipher::_AES128_dataBlk,
2222                           VM_Version::Cipher::_AES128_parmBlk_G,
2223                           VM_Version::Cipher::_AES128 + mode,
2224                           parmBlk, msglen, fCode, key);
2225       if (VM_Version::has_Crypto_AES_CTR256() || VM_Version::has_Crypto_AES_CTR192()) {
2226         __ z_bru(parmBlk_set);  // Fallthru otherwise.
2227       }
2228     }
2229 
2230     if (VM_Version::has_Crypto_AES_CTR192()) {
2231       __ bind(parmBlk_192);
2232       resize_len = generate_counterMode_push_Block(VM_Version::Cipher::_AES192_dataBlk,
2233                           VM_Version::Cipher::_AES192_parmBlk_G,
2234                           VM_Version::Cipher::_AES192 + mode,
2235                           parmBlk, msglen, fCode, key);
2236       if (VM_Version::has_Crypto_AES_CTR256()) {
2237         __ z_bru(parmBlk_set);  // Fallthru otherwise.
2238       }
2239     }
2240 
2241     if (VM_Version::has_Crypto_AES_CTR256()) {
2242       __ bind(parmBlk_256);
2243       resize_len = generate_counterMode_push_Block(VM_Version::Cipher::_AES256_dataBlk,
2244                           VM_Version::Cipher::_AES256_parmBlk_G,
2245                           VM_Version::Cipher::_AES256 + mode,
2246                           parmBlk, msglen, fCode, key);
2247       // Fallthru
2248     }
2249 
2250     __ bind(parmBlk_set);
2251     return resize_len;
2252   }
2253 
2254 
2255   void generate_counterMode_pop_parmBlk(Register parmBlk, Register msglen, Label& eraser) {
2256 
2257     BLOCK_COMMENT("pop parmBlk counterMode_AESCrypt {");
2258 
2259     generate_counterMode_pop_Block(parmBlk, msglen, eraser);
2260 
2261     BLOCK_COMMENT("} pop parmBlk counterMode_AESCrypt");
2262   }
2263 
2264   // Implementation of counter-mode AES encrypt/decrypt function.
2265   //
2266   void generate_counterMode_AES_impl(bool is_decipher) {
2267 
2268     // On entry:
2269     // if there was a previous call to update(), and this previous call did not fully use
2270     // the current encrypted counter, that counter is available at arg6_Offset(Z_SP).
2271     // The index of the first unused byte in the encrypted counter is available at arg7_Offset(Z_SP).
2272     // The index is in the range [1..AES_ctrVal_len] ([1..16]), where index == 16 indicates a fully
2273     // used previous encrypted counter.
2274     // The unencrypted counter has already been incremented and is ready to be used for the next
2275     // data block, after the unused bytes from the previous call have been consumed.
2276     // The unencrypted counter follows the "increment-after use" principle.
2277 
2278     // On exit:
2279     // The index of the first unused byte of the encrypted counter is written back to arg7_Offset(Z_SP).
2280     // A value of AES_ctrVal_len (16) indicates there is no leftover byte.
2281     // If there is at least one leftover byte (1 <= index < AES_ctrVal_len), the encrypted counter value
2282     // is written back to arg6_Offset(Z_SP). If there is no leftover, nothing is written back.
2283     // The unencrypted counter value is written back after having been incremented.
2284 
2285     Register       from    = Z_ARG1; // byte[], source byte array (clear text)
2286     Register       to      = Z_ARG2; // byte[], destination byte array (ciphered)
2287     Register       key     = Z_ARG3; // byte[], expanded key array.
2288     Register       ctr     = Z_ARG4; // byte[], counter byte array.
2289     const Register msglen  = Z_ARG5; // int, Total length of the msg to be encrypted. Value must be
2290                                      // returned in Z_RET upon completion of this stub.
2291                                      // This is a jint. Negative values are illegal, but technically possible.
2292                                      // Do not rely on the high word; its contents are undefined.
2293                // encCtr   = Z_ARG6  - encrypted counter (byte array),
2294                //                      address passed on stack at _z_abi(remaining_cargs) + 0 * WordSize
2295                // cvIndex  = Z_ARG7  - # used (consumed) bytes of encrypted counter,
2296                //                      passed on stack at _z_abi(remaining_cargs) + 1 * WordSize
2297                //                      Caution: 4-byte value, right-justified in 8-byte stack word.
2298 
2299     const Register fCode   = Z_R0;   // crypto function code
2300     const Register parmBlk = Z_R1;   // parameter block address (points to crypto key)
2301     const Register src     = Z_ARG1; // is Z_R2, forms even/odd pair with srclen
2302     const Register srclen  = Z_ARG2; // Overwrites destination address.
2303     const Register dst     = Z_ARG3; // Overwrites key address.
2304     const Register counter = Z_ARG5; // Overwrites msglen. Must have counter array in an even register.
2305 
2306     Label srcMover, dstMover, fromMover, ctrXOR, dataEraser;  // EXRL (execution) templates.
2307     Label CryptoLoop, CryptoLoop_doit, CryptoLoop_end, CryptoLoop_setupAndDoLast, CryptoLoop_ctrVal_inc;
2308     Label allDone, allDone_noInc, popAndExit, Exit;
2309 
2310     int    arg6_Offset = _z_abi(remaining_cargs) + 0 * HeapWordSize;
2311     int    arg7_Offset = _z_abi(remaining_cargs) + 1 * HeapWordSize; // stack slot holds ptr to int value
2312     int   oldSP_Offset = 0;
2313 
2314     // Is there anything to do at all? Protect against negative len as well.
2315     __ z_ltr(msglen, msglen);
2316     __ z_brnh(Exit);
2317 
2318     // Expand stack, load parm block address into parmBlk (== Z_R1), copy crypto key to parm block.
2319     oldSP_Offset = generate_counterMode_push_parmBlk(parmBlk, msglen, fCode, key, is_decipher);
2320     arg6_Offset += oldSP_Offset;
2321     arg7_Offset += oldSP_Offset;
2322 
2323     // Check if there is a leftover, partially used encrypted counter from last invocation.
2324     // If so, use those leftover counter bytes first before starting the "normal" encryption.
2325 
2326     // We do not have access to the encrypted counter value. It is generated and used only
2327     // internally within the previous kmctr instruction. But, at the end of the call to this stub,
2328     // the last encrypted counter is extracted by ciphering a 0x00 byte stream. The result is
2329     // stored at the arg6 location for use with the subsequent call.
2330     //
2331     // The #used bytes of the encrypted counter (from a previous call) is provided via arg7.
2332     // It is used as an index into the encrypted counter to access the first byte available for ciphering.
2333     // To cipher the input text, we first copy as many input bytes to the output as there are unused
2334     // bytes left in the encrypted counter. Then we XOR those output bytes with the corresponding encrypted counter bytes.
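         //
         // A minimal C-like sketch of this preLoop step (illustration only; encCtr here denotes the
         // byte array passed via arg6, and the stub implements the copy and the XOR with MVC/XC via
         // the EXRL templates fromMover and ctrXOR defined further below):
         //
         //   n = min(AES_ctrVal_len - cvIndex, msglen);                  // consumable leftover keystream bytes
         //   memcpy(to, from, n);                                        // fromMover template
         //   for (int i = 0; i < n; i++) to[i] ^= encCtr[cvIndex + i];   // ctrXOR template
         //   cvIndex += n; from += n; to += n; msglen -= n;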
2335 
2336     Register cvIxAddr  = Z_R10;                  // Address of index into encCtr. Preserved for use @CryptoLoop_end.
2337     __ z_lg(cvIxAddr, arg7_Offset, Z_SP);        // arg7: addr of field encCTR_index.
2338 
2339     {
2340       Register cvUnused  = Z_R11;                // # unused bytes of encrypted counter value (= 16 - cvIndex)
2341       Register encCtr    = Z_R12;                // encrypted counter value, points to first unused byte.
2342       Register cvIndex   = Z_R13;                // index of the first unused byte of the encrypted counter value
2343       Label    preLoop_end;
2344 
2345       // preLoop is necessary only if there is a partially used encrypted counter (encCtr).
2346       // Partially used means cvIndex is in [1, dataBlk_len-1].
2347       // cvIndex == 0:           encCtr is set up but not used at all. Should not occur.
2348       // cvIndex == dataBlk_len: encCtr is exhausted, all bytes used.
2349       // Using unsigned compare protects against cases where (cvIndex < 0).
2350       __ z_clfhsi(0, cvIxAddr, AES_ctrVal_len);  // check #used bytes in encCtr against ctr len.
2351       __ z_brnl(preLoop_end);                    // if encCtr is fully used, skip to normal processing.
2352       __ z_ltgf(cvIndex, 0, Z_R0, cvIxAddr);     // # used bytes in encCTR.
2353       __ z_brz(preLoop_end);                     // if encCtr has no used bytes, skip to normal processing.
2354 
2355       __ z_lg(encCtr, arg6_Offset, Z_SP);        // encrypted counter from last call to update()
2356       __ z_agr(encCtr, cvIndex);                 // now points to first unused byte
2357 
2358       __ add2reg(cvUnused, -AES_ctrVal_len, cvIndex); // calculate #unused bytes in encCtr.
2359       __ z_lcgr(cvUnused, cvUnused);             // previous checks ensure cvUnused in range [1, dataBlk_len-1]
2360 
2361       __ z_lgf(msglen, msglen_offset, parmBlk);  // Restore msglen (jint value)
2362       __ z_cr(cvUnused, msglen);                 // check if msg can consume all unused encCtr bytes
2363       __ z_locr(cvUnused, msglen, Assembler::bcondHigh); // take the shorter length
2364       __ z_aghi(cvUnused, -1);                   // decrement # unused bytes by 1 for exrl instruction
2365                                                  // preceding checks ensure cvUnused in range [1, dataBlk_len-1]
2366       __ z_exrl(cvUnused, fromMover);
2367       __ z_exrl(cvUnused, ctrXOR);
2368 
2369       __ z_aghi(cvUnused, 1);                    // revert decrement from above
2370       __ z_agr(cvIndex, cvUnused);               // update index into encCtr (first unused byte)
2371       __ z_st(cvIndex, 0, cvIxAddr);             // write back arg7, cvIxAddr is still valid
2372 
2373       // update pointers and counters to prepare for main loop
2374       __ z_agr(from, cvUnused);
2375       __ z_agr(to, cvUnused);
2376       __ z_sr(msglen, cvUnused);                 // #bytes not yet processed
2377       __ z_sty(msglen, msglen_red_offset, parmBlk); // save for calculations in main loop
2378       __ z_srak(Z_R0, msglen, exact_log2(AES_ctrVal_len));// # full cipher blocks that can be formed from input text.
2379       __ z_sty(Z_R0, rem_msgblk_offset, parmBlk);
2380 
2381       // check remaining msglen. If zero, all msg bytes were processed in preLoop.
2382       __ z_ltr(msglen, msglen);
2383       __ z_brnh(popAndExit);
2384 
2385       __ bind(preLoop_end);
2386     }
2387 
2388     // Create count vector on stack to accommodate up to AES_ctrVec_len blocks.
2389     generate_counterMode_prepare_Stack(parmBlk, ctr, counter, fCode);
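         // Note: the counter vector prepared above provides room for AES_ctrVec_len counter values
         // (one per data block), so each CryptoLoop iteration below can cipher up to AES_ctrVec_len
         // blocks with a single kmctr invocation (see generate_counterMode_prepare_Stack).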
2390 
2391     // Prepare other registers for instruction.
2392     __ lgr_if_needed(src, from);     // Copy src address. Will not emit, src/from are identical.
2393     __ z_lgr(dst, to);
2394     __ z_llgc(fCode, fCode_offset, Z_R0, parmBlk);
2395 
2396     __ bind(CryptoLoop);
2397       __ z_lghi(srclen, AES_ctrArea_len);                     // preset len (#bytes) for next iteration: max possible.
2398       __ z_asi(rem_msgblk_offset, parmBlk, -AES_ctrVec_len);  // decrement #remaining blocks (16 bytes each). Range: [+127..-128]
2399       __ z_brl(CryptoLoop_setupAndDoLast);                    // Handling the last iteration (using less than max #blocks) out-of-line
2400 
2401       __ bind(CryptoLoop_doit);
2402       __ kmctr(dst, counter, src);   // Cipher the message.
2403 
2404       __ z_lt(srclen, rem_msgblk_offset, Z_R0, parmBlk);      // check if this was the last iteration
2405       __ z_brz(CryptoLoop_ctrVal_inc);                        // == 0: ctrVector fully used. Need to increment the first
2406                                                               //       vector element to encrypt remaining unprocessed bytes.
2407 //    __ z_brl(CryptoLoop_end);                               //  < 0: this was detected before and handled at CryptoLoop_setupAndDoLast
2408                                                               //  > 0: this is the fallthru case, need another iteration
2409 
2410       generate_counterMode_increment_ctrVector(parmBlk, counter, srclen, false); // srclen unused here (serves as scratch)
2411       __ z_bru(CryptoLoop);
2412 
2413     __ bind(CryptoLoop_end);
2414 
2415     // OK, when we arrive here, we have encrypted all of the "from" byte stream
2416     // except for the last few [0..dataBlk_len) bytes. In addition, we know that
2417     // there are no more unused bytes in the previously generated encrypted counter.
2418     // The (unencrypted) counter, however, is ready to use (it was incremented before).
2419 
2420     // To encrypt the few remaining bytes, we need to form an extra src and dst
2421     // data block of dataBlk_len each. This is because we can only process full
2422     // blocks but we must not read or write beyond the boundaries of the argument
2423     // arrays. Here is what we do:
2424     //  - The ctrVector has at least one unused element. This is ensured by CryptoLoop code.
2425     //  - The (first) unused element is pointed at by the counter register.
2426     //  - The src data block is filled with the remaining "from" bytes, remainder of block undefined.
2427     //  - The single src data block is encrypted into the dst data block.
2428     //  - The dst data block is copied into the "to" array, but only the leftmost few bytes
2429     //    (as many as were left in the source byte stream).
2430     //  - The counter value to be used is pointed at by the counter register.
2431     //  - Fortunately, the crypto instruction (kmctr) has updated all related addresses such that
2432     //    we know where to continue with "from" and "to" and which counter value to use next.
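         //
         // In C-like pseudocode (illustration only; tmpSrc denotes the stack block located right
         // behind tmpDst, srcMover/dstMover are the EXRL templates at the end of this function,
         // and residue is the leftover byte count in [1..15]):
         //
         //   memcpy(tmpSrc, from, residue);      // srcMover: copy remaining input to the stack block
         //   kmctr(tmpDst, counter, tmpSrc);     // cipher exactly one full block
         //   memcpy(to, tmpDst, residue);        // dstMover: copy only the residue bytes back
         //   // finally, cipher a zeroed block to recover the keystream (encrypted counter)
         //   // and store it at arg6 for the next call to pick up.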
2433 
2434     Register encCtr    = Z_R12;  // encrypted counter value, points to stub argument.
2435     Register tmpDst    = Z_R12;  // addr of temp destination (for last partial block encryption)
2436 
2437     __ z_lgf(srclen, msglen_red_offset, parmBlk);          // plaintext/ciphertext len after potential preLoop processing.
2438     __ z_nilf(srclen, AES_ctrVal_len - 1);                 // those rightmost bits indicate the unprocessed #bytes
2439     __ z_stg(srclen, localSpill_offset, parmBlk);          // save for later reuse
2440     __ z_mvhi(0, cvIxAddr, 16);                            // write back arg7 (default 16 in case of allDone).
2441     __ z_braz(allDone_noInc);                              // no unprocessed bytes? Then we are done.
2442                                                            // This also means the last block of data processed was
2443                                                            // a full-sized block (AES_ctrVal_len bytes) which results
2444                                                            // in no leftover encrypted counter bytes.
2445     __ z_st(srclen, 0, cvIxAddr);                          // This will be the index of the first unused byte in the encrypted counter.
2446     __ z_stg(counter, counter_offset, parmBlk);            // save counter location for easy later restore
2447 
2448     // calculate address (on stack) for final dst and src blocks.
2449     __ add2reg(tmpDst, AES_dataBlk_offset, parmBlk);       // tmp dst (on stack) is right before tmp src
2450 
2451     // We have a residue of [1..15] unprocessed bytes, srclen holds the exact number.
2452     // Residue == 0 was checked just above, residue == AES_ctrVal_len would be another
2453     // full-sized block and would have been handled by CryptoLoop.
2454 
2455     __ add2reg(srclen, -1);                                // decrement for exrl
2456     __ z_exrl(srclen, srcMover);                           // copy remaining bytes of src byte stream
2457     __ load_const_optimized(srclen, AES_ctrVal_len);       // kmctr processes only complete blocks
2458     __ add2reg(src, AES_ctrVal_len, tmpDst);               // tmp dst is right before tmp src
2459 
2460     __ kmctr(tmpDst, counter, src);                        // Cipher the remaining bytes.
2461 
2462     __ add2reg(tmpDst, -AES_ctrVal_len, tmpDst);           // restore tmp dst address
2463     __ z_lg(srclen, localSpill_offset, parmBlk);           // residual len, saved above
2464     __ add2reg(srclen, -1);                                // decrement for exrl
2465     __ z_exrl(srclen, dstMover);
2466 
2467     // Write back new encrypted counter
2468     __ add2reg(src, AES_dataBlk_offset, parmBlk);
2469     __ clear_mem(Address(src, RegisterOrConstant((intptr_t)0)), AES_ctrVal_len);
2470     __ load_const_optimized(srclen, AES_ctrVal_len);       // kmctr processes only complete blocks
2471     __ z_lg(encCtr, arg6_Offset, Z_SP);                    // write encrypted counter to arg6
2472     __ z_lg(counter, counter_offset, parmBlk);             // restore counter
2473     __ kmctr(encCtr, counter, src);
2474 
2475     // The last used element of the counter vector contains the latest counter value that was used.
2476     // As described above, the counter value on exit must be the one to be used next.
2477     __ bind(allDone);
2478     __ z_lg(counter, counter_offset, parmBlk);             // restore counter
2479     generate_increment128(counter, 0, 1, Z_R0);
2480 
2481     __ bind(allDone_noInc);
2482     __ z_mvc(0, AES_ctrVal_len, ctr, 0, counter);
2483 
2484     __ bind(popAndExit);
2485     generate_counterMode_pop_parmBlk(parmBlk, msglen, dataEraser);
2486 
2487     __ bind(Exit);
2488     __ z_lgfr(Z_RET, msglen);
2489 
2490     __ z_br(Z_R14);
2491 
2492     //----------------------------
2493     //---<  out-of-line code  >---
2494     //----------------------------
2495     __ bind(CryptoLoop_setupAndDoLast);
2496       __ z_lgf(srclen, rem_msgblk_offset, parmBlk);           // remaining #blocks in memory is < 0
2497       __ z_aghi(srclen, AES_ctrVec_len);                      // recalculate the actually remaining #blocks
2498       __ z_sllg(srclen, srclen, exact_log2(AES_ctrVal_len));  // convert to #bytes. Counter value is same length as data block
2499       __ kmctr(dst, counter, src);                            // Cipher the last integral blocks of the message.
2500       __ z_bru(CryptoLoop_end);                               // There is at least one unused counter vector element.
2501                                                               // no need to increment.
2502 
2503     __ bind(CryptoLoop_ctrVal_inc);
2504       generate_counterMode_increment_ctrVector(parmBlk, counter, srclen, true); // srclen unused here (serves as scratch)
2505       __ z_bru(CryptoLoop_end);
2506 
2507     //-------------------------------------------
2508     //---<  execution templates for preLoop  >---
2509     //-------------------------------------------
2510     __ bind(fromMover);
2511     __ z_mvc(0, 0, to, 0, from);               // Template instruction to move input data to dst.
2512     __ bind(ctrXOR);
2513     __ z_xc(0,  0, to, 0, encCtr);             // Template instruction to XOR input data (now in to) with encrypted counter.
2514 
2515     //-------------------------------
2516     //---<  execution templates  >---
2517     //-------------------------------
2518     __ bind(dataEraser);
2519     __ z_xc(0, 0, parmBlk, 0, parmBlk);  // Template instruction to erase crypto key on stack.
2520     __ bind(dstMover);
2521     __ z_mvc(0, 0, dst, 0, tmpDst);      // Template instruction to move encrypted remainder from stack to dst.
2522     __ bind(srcMover);
2523     __ z_mvc(AES_ctrVal_len, 0, tmpDst, 0, src); // Template instruction to move remainder of source byte stream to stack.
2524   }
2525 
2526 
2527   // Create two intrinsic variants, optimized for short and long plaintexts.
2528   void generate_counterMode_AES(bool is_decipher) {
2529 
2530     const Register msglen  = Z_ARG5;    // int, Total length of the msg to be encrypted. Value must be
2531                                         // returned in Z_RET upon completion of this stub.
2532     const int threshold = 256;          // above this length (in bytes), text is considered long.
2533     const int vec_short = threshold>>6; // that many blocks (16 bytes each) per iteration, max 4 loop iterations
2534     const int vec_long  = threshold>>2; // that many blocks (16 bytes each) per iteration.
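         // With threshold == 256, vec_short == 4 blocks (64 bytes of counter vector per
         // CryptoLoop iteration) and vec_long == 64 blocks (1024 bytes).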
2535 
2536     Label AESCTR_short, AESCTR_long;
2537 
2538     __ z_chi(msglen, threshold);
2539     __ z_brh(AESCTR_long);
2540 
2541     __ bind(AESCTR_short);
2542 
2543     BLOCK_COMMENT(err_msg("counterMode_AESCrypt (text len <= %d, block size = %d) {", threshold, vec_short*16));
2544 
2545     AES_ctrVec_len = vec_short;
2546     generate_counterMode_AES_impl(false);   // generated code ends with a return; it does not fall through
2547 
2548     BLOCK_COMMENT(err_msg("} counterMode_AESCrypt (text len <= %d, block size = %d)", threshold, vec_short*16));
2549 
2550     __ align(32); // Octoword alignment benefits branch targets.
2551 
2552     BLOCK_COMMENT(err_msg("counterMode_AESCrypt (text len > %d, block size = %d) {", threshold, vec_long*16));
2553 
2554     __ bind(AESCTR_long);
2555     AES_ctrVec_len = vec_long;
2556     generate_counterMode_AES_impl(false);   // generated code ends with a return; it does not fall through
2557 
2558     BLOCK_COMMENT(err_msg("} counterMode_AESCrypt (text len > %d, block size = %d)", threshold, vec_long*16));
2559   }
2560 
2561 
2562   // Compute AES-CTR crypto function.
2563   // Encrypt or decrypt is selected via parameters. Only one stub is necessary.
2564   address generate_counterMode_AESCrypt(const char* name) {
2565     __ align(CodeEntryAlignment);
2566     StubCodeMark mark(this, "StubRoutines", name);
2567     unsigned int   start_off = __ offset();  // Remember stub start address (is rtn value).
2568 
2569     generate_counterMode_AES(false);
2570 
2571     return __ addr_at(start_off);
2572   }
2573 
2574 // *****************************************************************************
2575 
2576   // Compute GHASH function.
2577   address generate_ghash_processBlocks() {
2578     __ align(CodeEntryAlignment);
2579     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
2580     unsigned int start_off = __ offset();   // Remember stub start address (is rtn value).
2581 
2582     const Register state   = Z_ARG1;
2583     const Register subkeyH = Z_ARG2;
2584     const Register data    = Z_ARG3; // 1st of even-odd register pair.
2585     const Register blocks  = Z_ARG4;
2586     const Register len     = blocks; // 2nd of even-odd register pair.
2587 
2588     const int param_block_size = 4 * 8;
2589     const int frame_resize = param_block_size + 8; // Extra space for copy of fp.
2590 
2591     // Reserve stack space for parameter block (R1).
2592     __ z_lgr(Z_R1, Z_SP);
2593     __ resize_frame(-frame_resize, Z_R0, true);
2594     __ z_aghi(Z_R1, -param_block_size);
2595 
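         // Parameter block layout for KIMD-GHASH, as filled in below:
         //   bytes  0..15 : initial chaining value (the current GHASH state)
         //   bytes 16..31 : hash subkey H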
2596     // Fill parameter block.
2597     __ z_mvc(Address(Z_R1)    , Address(state)  , 16);
2598     __ z_mvc(Address(Z_R1, 16), Address(subkeyH), 16);
2599 
2600     // R4+5: data pointer + length
2601     __ z_llgfr(len, blocks);  // Cast to 64-bit.
2602 
2603     // R0: function code
2604     __ load_const_optimized(Z_R0, (int)VM_Version::MsgDigest::_GHASH);
2605 
2606     // Compute.
2607     __ z_sllg(len, len, 4);  // In bytes.
2608     __ kimd(data);
2609 
2610     // Copy back result and free parameter block.
2611     __ z_mvc(Address(state), Address(Z_R1), 16);
2612     __ z_xc(Address(Z_R1), param_block_size, Address(Z_R1));
2613     __ z_aghi(Z_SP, frame_resize);
2614 
2615     __ z_br(Z_R14);
2616 
2617     return __ addr_at(start_off);
2618   }
2619 
2620 
2621   // Call interface for all SHA* stubs.
2622   //
2623   //   Z_ARG1 - source data block. Ptr to leftmost byte to be processed.
2624   //   Z_ARG2 - current SHA state. Ptr to state area. This area serves as
2625   //            parameter block as required by the crypto instruction.
2626   //   Z_ARG3 - current byte offset in source data block.
2627   //   Z_ARG4 - last byte offset in source data block.
2628   //            (Z_ARG4 - Z_ARG3) gives the #bytes remaining to be processed.
2629   //
2630   //   Z_RET  - return value. First unprocessed byte offset in src buffer.
2631   //
2632   //   A few notes on the call interface:
2633   //    - All stubs, whether they are single-block or multi-block, are assumed to
2634   //      digest an integer multiple of the data block length of data. All data
2635   //      blocks are digested using the intermediate message digest (KIMD) instruction.
2636   //      Special end processing, as done by the KLMD instruction, seems to be
2637   //      emulated by the calling code.
2638   //
2639   //    - Z_ARG1 addresses the first byte of source data. The offset (Z_ARG3) is
2640   //      already accounted for.
2641   //
2642   //    - The current SHA state (the intermediate message digest value) is contained
2643   //      in an area addressed by Z_ARG2. The area size depends on the SHA variant
2644   //      and is accessible via the enum VM_Version::MsgDigest::_SHA<n>_parmBlk_I
2645   //
2646   //    - The single-block stub is expected to digest exactly one data block, starting
2647   //      at the address passed in Z_ARG1.
2648   //
2649   //    - The multi-block stub is expected to digest all data blocks which start in
2650   //      the offset interval [srcOff(Z_ARG3), srcLimit(Z_ARG4)). The exact difference
2651   //      (srcLimit-srcOff), rounded up to the next multiple of the data block length,
2652   //      gives the number of blocks to digest. It must be assumed that the calling code
2653   //      provides for a large enough source data buffer.
2654   //
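       //
       // Illustrative sketch of the multi-block length bookkeeping implemented below with
       // SRK/AHI/NILL (resp. LGFR/SGFR/AGHI/NILL), in C-like pseudocode; dataBlk stands for
       // the respective VM_Version::MsgDigest::_SHA<n>_dataBlk constant:
       //
       //   srcBufLen = srcLimit - srcOff;                           // #bytes handed to the stub
       //   srcBufLen = (srcBufLen + dataBlk - 1) & ~(dataBlk - 1);  // round up to full blocks
       //   retVal    = srcOff + srcBufLen;                          // offset of first unprocessed byte
       //   kimd(srcBuff);                                           // digest srcBufLen bytes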
2655   // Compute SHA-1 function.
2656   address generate_SHA1_stub(bool multiBlock, const char* name) {
2657     __ align(CodeEntryAlignment);
2658     StubCodeMark mark(this, "StubRoutines", name);
2659     unsigned int start_off = __ offset();   // Remember stub start address (is rtn value).
2660 
2661     const Register srcBuff        = Z_ARG1; // Points to first block to process (offset already added).
2662     const Register SHAState       = Z_ARG2; // Only on entry. Reused soon thereafter for kimd register pairs.
2663     const Register srcOff         = Z_ARG3; // int
2664     const Register srcLimit       = Z_ARG4; // Only passed in multiBlock case. int
2665 
2666     const Register SHAState_local = Z_R1;
2667     const Register SHAState_save  = Z_ARG3;
2668     const Register srcBufLen      = Z_ARG2; // Destroys state address, must be copied before.
2669     Label useKLMD, rtn;
2670 
2671     __ load_const_optimized(Z_R0, (int)VM_Version::MsgDigest::_SHA1);   // function code
2672     __ z_lgr(SHAState_local, SHAState);                                 // SHAState == parameter block
2673 
2674     if (multiBlock) {  // Process everything from offset to limit.
2675 
2676       // The following description is valid if we get a raw (unmodified) source data buffer,
2677       // spanning the range between [srcOff(Z_ARG3), srcLimit(Z_ARG4)). As detailed above,
2678       // the calling convention for these stubs is different. We leave the description in
2679       // to inform the reader what must be happening hidden in the calling code.
2680       //
2681       // The data block to be processed can have arbitrary length, i.e. its length does not
2682       // need to be an integer multiple of SHA<n>_dataBlk. Therefore, we need to implement
2683       // two different paths. If the length is an integer multiple, we use KIMD, which saves
2684       // us from copying the SHA state back and forth. If the length is not an integer multiple,
2685       // we copy the SHA state to the stack, execute a KLMD instruction on it, and copy the
2686       // result back to the caller's SHA state location.
2687 
2688       // Total #srcBuff blocks to process.
2689       if (VM_Version::has_DistinctOpnds()) {
2690         __ z_srk(srcBufLen, srcLimit, srcOff); // exact difference
2691         __ z_ahi(srcBufLen, VM_Version::MsgDigest::_SHA1_dataBlk-1);   // round up
2692         __ z_nill(srcBufLen, (~(VM_Version::MsgDigest::_SHA1_dataBlk-1)) & 0xffff);
2693         __ z_ark(srcLimit, srcOff, srcBufLen); // Srclimit temporarily holds return value.
2694         __ z_llgfr(srcBufLen, srcBufLen);      // Cast to 64-bit.
2695       } else {
2696         __ z_lgfr(srcBufLen, srcLimit);        // Exact difference. srcLimit passed as int.
2697         __ z_sgfr(srcBufLen, srcOff);          // SrcOff passed as int, now properly casted to long.
2698         __ z_aghi(srcBufLen, VM_Version::MsgDigest::_SHA1_dataBlk-1);   // round up
2699         __ z_nill(srcBufLen, (~(VM_Version::MsgDigest::_SHA1_dataBlk-1)) & 0xffff);
2700         __ z_lgr(srcLimit, srcOff);            // SrcLimit temporarily holds return value.
2701         __ z_agr(srcLimit, srcBufLen);
2702       }
2703 
2704       // Integral #blocks to digest?
2705       // As a result of the calculations above, srcBufLen MUST be an integer
2706       // multiple of _SHA1_dataBlk, or else we are in big trouble.
2707       // We insert a guard (stop_static) into the KLMD case to protect against that.
2708       __ z_tmll(srcBufLen, VM_Version::MsgDigest::_SHA1_dataBlk-1);
2709       __ z_brc(Assembler::bcondNotAllZero, useKLMD);
2710 
2711       // Process all full blocks.
2712       __ kimd(srcBuff);
2713 
2714       __ z_lgr(Z_RET, srcLimit);  // Offset of first unprocessed byte in buffer.
2715     } else {  // Process one data block only.
2716       __ load_const_optimized(srcBufLen, (int)VM_Version::MsgDigest::_SHA1_dataBlk);   // #srcBuff bytes to process
2717       __ kimd(srcBuff);
2718       __ add2reg(Z_RET, (int)VM_Version::MsgDigest::_SHA1_dataBlk, srcOff);            // Offset of first unprocessed byte in buffer. No 32 to 64 bit extension needed.
2719     }
2720 
2721     __ bind(rtn);
2722     __ z_br(Z_R14);
2723 
2724     if (multiBlock) {
2725       __ bind(useKLMD);
2726 
2727 #if 1
2728       // Security net: this stub is believed to be called for full-sized data blocks only.
2729       // NOTE: The following code is believed to be correct, but it is not tested.
2730       __ stop_static("SHA-1 stub can digest full data blocks only. Use -XX:-UseSHA as remedy.", 0);
2731 #endif
2732     }
2733 
2734     return __ addr_at(start_off);
2735   }
2736 
2737   // Compute SHA-256 function.
2738   address generate_SHA256_stub(bool multiBlock, const char* name) {
2739     __ align(CodeEntryAlignment);
2740     StubCodeMark mark(this, "StubRoutines", name);
2741     unsigned int start_off = __ offset();   // Remember stub start address (is rtn value).
2742 
2743     const Register srcBuff        = Z_ARG1;
2744     const Register SHAState       = Z_ARG2; // Only on entry. Reused soon thereafter.
2745     const Register SHAState_local = Z_R1;
2746     const Register SHAState_save  = Z_ARG3;
2747     const Register srcOff         = Z_ARG3;
2748     const Register srcLimit       = Z_ARG4;
2749     const Register srcBufLen      = Z_ARG2; // Destroys state address, must be copied before.
2750     Label useKLMD, rtn;
2751 
2752     __ load_const_optimized(Z_R0, (int)VM_Version::MsgDigest::_SHA256); // function code
2753     __ z_lgr(SHAState_local, SHAState);                                 // SHAState == parameter block
2754 
2755     if (multiBlock) {  // Process everything from offset to limit.
2756       // The following description is valid if we get a raw (unmodified) source data buffer,
2757       // spanning the range between [srcOff(Z_ARG3), srcLimit(Z_ARG4)). As detailed above,
2758       // the calling convention for these stubs is different. We leave the description in
2759       // to inform the reader what must be happening hidden in the calling code.
2760       //
2761       // The data block to be processed can have arbitrary length, i.e. its length does not
2762       // need to be an integer multiple of SHA<n>_dataBlk. Therefore, we need to implement
2763       // two different paths. If the length is an integer multiple, we use KIMD, which saves
2764       // us from copying the SHA state back and forth. If the length is not an integer multiple,
2765       // we copy the SHA state to the stack, execute a KLMD instruction on it, and copy the
2766       // result back to the caller's SHA state location.
2767 
2768       // total #srcBuff blocks to process
2769       if (VM_Version::has_DistinctOpnds()) {
2770         __ z_srk(srcBufLen, srcLimit, srcOff);   // exact difference
2771         __ z_ahi(srcBufLen, VM_Version::MsgDigest::_SHA256_dataBlk-1); // round up
2772         __ z_nill(srcBufLen, (~(VM_Version::MsgDigest::_SHA256_dataBlk-1)) & 0xffff);
2773         __ z_ark(srcLimit, srcOff, srcBufLen);   // Srclimit temporarily holds return value.
2774         __ z_llgfr(srcBufLen, srcBufLen);        // Cast to 64-bit.
2775       } else {
2776         __ z_lgfr(srcBufLen, srcLimit);          // exact difference
2777         __ z_sgfr(srcBufLen, srcOff);
2778         __ z_aghi(srcBufLen, VM_Version::MsgDigest::_SHA256_dataBlk-1); // round up
2779         __ z_nill(srcBufLen, (~(VM_Version::MsgDigest::_SHA256_dataBlk-1)) & 0xffff);
2780         __ z_lgr(srcLimit, srcOff);              // Srclimit temporarily holds return value.
2781         __ z_agr(srcLimit, srcBufLen);
2782       }
2783 
2784       // Integral #blocks to digest?
2785       // As a result of the calculations above, srcBufLen MUST be an integer
2786       // multiple of _SHA256_dataBlk, or else we are in big trouble.
2787       // We insert a guard (stop_static) into the KLMD case to protect against that.
2788       __ z_tmll(srcBufLen, VM_Version::MsgDigest::_SHA256_dataBlk-1);
2789       __ z_brc(Assembler::bcondNotAllZero, useKLMD);
2790 
2791       // Process all full blocks.
2792       __ kimd(srcBuff);
2793 
2794       __ z_lgr(Z_RET, srcLimit);  // Offset of first unprocessed byte in buffer.
2795     } else {  // Process one data block only.
2796       __ load_const_optimized(srcBufLen, (int)VM_Version::MsgDigest::_SHA256_dataBlk); // #srcBuff bytes to process
2797       __ kimd(srcBuff);
2798       __ add2reg(Z_RET, (int)VM_Version::MsgDigest::_SHA256_dataBlk, srcOff);          // Offset of first unprocessed byte in buffer.
2799     }
2800 
2801     __ bind(rtn);
2802     __ z_br(Z_R14);
2803 
2804     if (multiBlock) {
2805       __ bind(useKLMD);
2806 #if 1
2807       // Security net: this stub is believed to be called for full-sized data blocks only.
2808       // NOTE:
2809       //   The following code is believed to be correct, but it is not tested.
2810       __ stop_static("SHA256 stub can digest full data blocks only. Use -XX:-UseSHA as remedy.", 0);
2811 #endif
2812     }
2813 
2814     return __ addr_at(start_off);
2815   }
2816 
2817   // Compute SHA-512 function.
2818   address generate_SHA512_stub(bool multiBlock, const char* name) {
2819     __ align(CodeEntryAlignment);
2820     StubCodeMark mark(this, "StubRoutines", name);
2821     unsigned int start_off = __ offset();   // Remember stub start address (is rtn value).
2822 
2823     const Register srcBuff        = Z_ARG1;
2824     const Register SHAState       = Z_ARG2; // Only on entry. Reused soon thereafter.
2825     const Register SHAState_local = Z_R1;
2826     const Register SHAState_save  = Z_ARG3;
2827     const Register srcOff         = Z_ARG3;
2828     const Register srcLimit       = Z_ARG4;
2829     const Register srcBufLen      = Z_ARG2; // Destroys state address, must be copied before.
2830     Label useKLMD, rtn;
2831 
2832     __ load_const_optimized(Z_R0, (int)VM_Version::MsgDigest::_SHA512); // function code
2833     __ z_lgr(SHAState_local, SHAState);                                 // SHAState == parameter block
2834 
2835     if (multiBlock) {  // Process everything from offset to limit.
2836       // The following description is valid if we get a raw (unmodified) source data buffer,
2837       // spanning the range between [srcOff(Z_ARG3), srcLimit(Z_ARG4)). As detailed above,
2838       // the calling convention for these stubs is different. We leave the description in
2839       // to inform the reader what must be happening hidden in the calling code.
2840       //
2841       // The data block to be processed can have arbitrary length, i.e. its length does not
2842       // need to be an integer multiple of SHA<n>_dataBlk. Therefore, we need to implement
2843       // two different paths. If the length is an integer multiple, we use KIMD, which saves
2844       // us from copying the SHA state back and forth. If the length is not an integer multiple,
2845       // we copy the SHA state to the stack, execute a KLMD instruction on it, and copy the
2846       // result back to the caller's SHA state location.
2847 
2848       // total #srcBuff blocks to process
2849       if (VM_Version::has_DistinctOpnds()) {
2850         __ z_srk(srcBufLen, srcLimit, srcOff);   // exact difference
2851         __ z_ahi(srcBufLen, VM_Version::MsgDigest::_SHA512_dataBlk-1); // round up
2852         __ z_nill(srcBufLen, (~(VM_Version::MsgDigest::_SHA512_dataBlk-1)) & 0xffff);
2853         __ z_ark(srcLimit, srcOff, srcBufLen);   // Srclimit temporarily holds return value.
2854         __ z_llgfr(srcBufLen, srcBufLen);        // Cast to 64-bit.
2855       } else {
2856         __ z_lgfr(srcBufLen, srcLimit);          // exact difference
2857         __ z_sgfr(srcBufLen, srcOff);
2858         __ z_aghi(srcBufLen, VM_Version::MsgDigest::_SHA512_dataBlk-1); // round up
2859         __ z_nill(srcBufLen, (~(VM_Version::MsgDigest::_SHA512_dataBlk-1)) & 0xffff);
2860         __ z_lgr(srcLimit, srcOff);              // Srclimit temporarily holds return value.
2861         __ z_agr(srcLimit, srcBufLen);
2862       }
2863 
2864       // integral #blocks to digest?
2865       // As a result of the calculations above, srcBufLen MUST be an integer
2866       // multiple of _SHA512_dataBlk, or else we are in big trouble.
2867       // We insert a guard (stop_static) into the KLMD case to protect against that.
2868       __ z_tmll(srcBufLen, VM_Version::MsgDigest::_SHA512_dataBlk-1);
2869       __ z_brc(Assembler::bcondNotAllZero, useKLMD);
2870 
2871       // Process all full blocks.
2872       __ kimd(srcBuff);
2873 
2874       __ z_lgr(Z_RET, srcLimit);  // Offset of first unprocessed byte in buffer.
2875     } else {  // Process one data block only.
2876       __ load_const_optimized(srcBufLen, (int)VM_Version::MsgDigest::_SHA512_dataBlk); // #srcBuff bytes to process
2877       __ kimd(srcBuff);
2878       __ add2reg(Z_RET, (int)VM_Version::MsgDigest::_SHA512_dataBlk, srcOff);          // Offset of first unprocessed byte in buffer.
2879     }
2880 
2881     __ bind(rtn);
2882     __ z_br(Z_R14);
2883 
2884     if (multiBlock) {
2885       __ bind(useKLMD);
2886 #if 1
2887       // Security net: this stub is believed to be called for full-sized data blocks only.
2888       // NOTE:
2889       //   The following code is believed to be correct, but it is not tested.
2890       __ stop_static("SHA512 stub can digest full data blocks only. Use -XX:-UseSHA as remedy.", 0);
2891 #endif
2892     }
2893 
2894     return __ addr_at(start_off);
2895   }
2896 
2897 
2898   /**
2899    *  Arguments:
2900    *
2901    * Inputs:
2902    *   Z_ARG1    - int   crc
2903    *   Z_ARG2    - byte* buf
2904    *   Z_ARG3    - int   length (of buffer)
2905    *
2906    * Result:
2907    *   Z_RET     - int   crc result
2908    **/
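       // For orientation, the classic byte-at-a-time form of the table-driven CRC update that
       // kernel_crc32_1word implements in optimized form (a sketch only; the emitted code
       // processes several bytes per step using multiple precomputed tables):
       //
       //   if (invertCRC) crc = ~crc;
       //   for (each byte b in buf[0..dataLen))  crc = table[(crc ^ b) & 0xff] ^ (crc >> 8);  // crc treated as unsigned
       //   if (invertCRC) crc = ~crc;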
2909   // Compute CRC function (generic, for all polynomials).
2910   void generate_CRC_updateBytes(const char* name, Register table, bool invertCRC) {
2911 
2912     // arguments to kernel_crc32:
2913     Register       crc     = Z_ARG1;  // Current checksum, preset by caller or result from previous call, int.
2914     Register       data    = Z_ARG2;  // source byte array
2915     Register       dataLen = Z_ARG3;  // #bytes to process, int
2916 //    Register       table   = Z_ARG4;  // crc table address. Preloaded and passed in by caller.
2917     const Register t0      = Z_R10;   // work reg for kernel* emitters
2918     const Register t1      = Z_R11;   // work reg for kernel* emitters
2919     const Register t2      = Z_R12;   // work reg for kernel* emitters
2920     const Register t3      = Z_R13;   // work reg for kernel* emitters
2921 
2922 
2923     assert_different_registers(crc, data, dataLen, table);
2924 
2925     // The values are passed as ints, not as the longs the C calling convention would require.
2926     // crc is used as an int; dataLen is zero-extended to 64 bits here.
2927     __ z_llgfr(dataLen, dataLen);
2928 
2929     __ resize_frame(-(6*8), Z_R0, true); // Resize frame to provide add'l space to spill 4 registers.
2930     __ z_stmg(Z_R10, Z_R13, 1*8, Z_SP);  // Spill regs 10..13 to make them available as work registers.
2931     __ kernel_crc32_1word(crc, data, dataLen, table, t0, t1, t2, t3, invertCRC);
2932     __ z_lmg(Z_R10, Z_R13, 1*8, Z_SP);   // Restore regs 10..13 from the stack.
2933     __ resize_frame(+(6*8), Z_R0, true); // Remove the additional frame space again.
2934 
2935     __ z_llgfr(Z_RET, crc);  // Updated crc is function result. No copying required, just zero upper 32 bits.
2936     __ z_br(Z_R14);          // Result already in Z_RET == Z_ARG1.
2937   }
2938 
2939 
2940   // Compute CRC32 function.
2941   address generate_CRC32_updateBytes(const char* name) {
2942     __ align(CodeEntryAlignment);
2943     StubCodeMark mark(this, "StubRoutines", name);
2944     unsigned int   start_off = __ offset();  // Remember stub start address (is rtn value).
2945 
2946     assert(UseCRC32Intrinsics, "should not generate this stub (%s) with CRC32 intrinsics disabled", name);
2947 
2948     BLOCK_COMMENT("CRC32_updateBytes {");
2949     Register       table   = Z_ARG4;  // crc32 table address.
2950     StubRoutines::zarch::generate_load_crc_table_addr(_masm, table);
2951 
2952     generate_CRC_updateBytes(name, table, true);
2953     BLOCK_COMMENT("} CRC32_updateBytes");
2954 
2955     return __ addr_at(start_off);
2956   }
2957 
2958 
2959   // Compute CRC32C function.
2960   address generate_CRC32C_updateBytes(const char* name) {
2961     __ align(CodeEntryAlignment);
2962     StubCodeMark mark(this, "StubRoutines", name);
2963     unsigned int   start_off = __ offset();  // Remember stub start address (is rtn value).
2964 
2965     assert(UseCRC32CIntrinsics, "should not generate this stub (%s) with CRC32C intrinsics disabled", name);
2966 
2967     BLOCK_COMMENT("CRC32C_updateBytes {");
2968     Register       table   = Z_ARG4;  // crc32c table address.
2969     StubRoutines::zarch::generate_load_crc32c_table_addr(_masm, table);
2970 
2971     generate_CRC_updateBytes(name, table, false);
2972     BLOCK_COMMENT("} CRC32C_updateBytes");
2973 
2974     return __ addr_at(start_off);
2975   }
2976 
2977 
2978   // Arguments:
2979   //   Z_ARG1    - x address
2980   //   Z_ARG2    - x length
2981   //   Z_ARG3    - y address
2982   //   Z_ARG4    - y length
2983   //   Z_ARG5    - z address
2984   //   160[Z_SP] - z length
2985   address generate_multiplyToLen() {
2986     __ align(CodeEntryAlignment);
2987     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
2988 
2989     address start = __ pc();
2990 
2991     const Register x    = Z_ARG1;
2992     const Register xlen = Z_ARG2;
2993     const Register y    = Z_ARG3;
2994     const Register ylen = Z_ARG4;
2995     const Register z    = Z_ARG5;
2996     // zlen is passed on the stack:
2997     // Address zlen(Z_SP, _z_abi(remaining_cargs));
2998 
2999     // Next registers will be saved on stack in multiply_to_len().
3000     const Register tmp1 = Z_tmp_1;
3001     const Register tmp2 = Z_tmp_2;
3002     const Register tmp3 = Z_tmp_3;
3003     const Register tmp4 = Z_tmp_4;
3004     const Register tmp5 = Z_R9;
3005 
3006     BLOCK_COMMENT("Entry:");
3007 
3008     __ z_llgfr(xlen, xlen);
3009     __ z_llgfr(ylen, ylen);
3010 
3011     __ multiply_to_len(x, xlen, y, ylen, z, tmp1, tmp2, tmp3, tmp4, tmp5);
3012 
3013     __ z_br(Z_R14);  // Return to caller.
3014 
3015     return start;
3016   }
3017 
3018   address generate_method_entry_barrier() {
3019     __ align(CodeEntryAlignment);
3020     StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier");
3021 
3022     address start = __ pc();
3023 
3024     int nbytes_volatile = (8 + 5) * BytesPerWord;
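         // Overall flow (sketch): save the volatile register state, call the VM barrier function
         // with a pointer to the caller's stored return_pc, restore the state, and either return
         // (barrier passed) or pop the frame and branch to the handle_wrong_method stub
         // (deoptimization required).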
3025 
3026     // VM-Call Prologue
3027     __ save_return_pc();
3028     __ push_frame_abi160(nbytes_volatile);
3029     __ save_volatile_regs(Z_SP, frame::z_abi_160_size, true, false);
3030 
3031     // Prep arg for VM call
3032     // Create ptr to stored return_pc in caller frame.
3033     __ z_la(Z_ARG1, _z_abi(return_pc) + frame::z_abi_160_size + nbytes_volatile, Z_R0, Z_SP);
3034 
3035     // VM-Call: BarrierSetNMethod::nmethod_stub_entry_barrier(address* return_address_ptr)
3036     __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSetNMethod::nmethod_stub_entry_barrier));
3037     __ z_ltr(Z_R0_scratch, Z_RET);
3038 
3039     // VM-Call Epilogue
3040     __ restore_volatile_regs(Z_SP, frame::z_abi_160_size, true, false);
3041     __ pop_frame();
3042     __ restore_return_pc();
3043 
3044     // Check return val of VM-Call
3045     __ z_bcr(Assembler::bcondZero, Z_R14);
3046 
3047     // Pop frame built in prologue.
3048     // Required so wrong_method_stub can deduce caller.
3049     __ pop_frame();
3050     __ restore_return_pc();
3051 
3052     // VM-Call indicates deoptimization required
3053     __ load_const_optimized(Z_R1_scratch, SharedRuntime::get_handle_wrong_method_stub());
3054     __ z_br(Z_R1_scratch);
3055 
3056     return start;
3057   }
3058 
3059   address generate_cont_thaw(bool return_barrier, bool exception) {
3060     if (!Continuations::enabled()) return nullptr;
3061     Unimplemented();
3062     return nullptr;
3063   }
3064 
3065   address generate_cont_thaw() {
3066     if (!Continuations::enabled()) return nullptr;
3067     Unimplemented();
3068     return nullptr;
3069   }
3070 
3071   address generate_cont_returnBarrier() {
3072     if (!Continuations::enabled()) return nullptr;
3073     Unimplemented();
3074     return nullptr;
3075   }
3076 
3077   address generate_cont_returnBarrier_exception() {
3078     if (!Continuations::enabled()) return nullptr;
3079     Unimplemented();
3080     return nullptr;
3081   }
3082 
3083   #if INCLUDE_JFR
3084   RuntimeStub* generate_jfr_write_checkpoint() {
3085     if (!Continuations::enabled()) return nullptr;
3086     Unimplemented();
3087     return nullptr;
3088   }
3089 
3090   RuntimeStub* generate_jfr_return_lease() {
3091     if (!Continuations::enabled()) return nullptr;
3092     Unimplemented();
3093     return nullptr;
3094   }
3095 
3096   #endif // INCLUDE_JFR
3097 
3098   // exception handler for upcall stubs
3099   address generate_upcall_stub_exception_handler() {
3100     StubCodeMark mark(this, "StubRoutines", "upcall stub exception handler");
3101     address start = __ pc();
3102 
3103     // The native caller has no idea how to handle exceptions,
3104     // so we just crash here. It is up to the callee to catch exceptions.
3105     __ verify_oop(Z_ARG1);
3106     __ load_const_optimized(Z_R1_scratch, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception));
3107     __ call_c(Z_R1_scratch);
3108     __ should_not_reach_here();
3109 
3110     return start;
3111   }
3112 
3113   void generate_initial_stubs() {
3114     // Generates all stubs and initializes the entry points.
3115 
3116     // Entry points that exist in all platforms.
3117     // Note: This is code that could be shared among different
3118     // platforms - however the benefit seems to be smaller than the
3119     // disadvantage of having a much more complicated generator
3120     // structure. See also comment in stubRoutines.hpp.
3121     StubRoutines::_forward_exception_entry                 = generate_forward_exception();
3122 
3123     StubRoutines::_call_stub_entry                         = generate_call_stub(StubRoutines::_call_stub_return_address);
3124     StubRoutines::_catch_exception_entry                   = generate_catch_exception();
3125 
3126     // Build this early so it's available for the interpreter.
3127     StubRoutines::_throw_StackOverflowError_entry          =
3128       generate_throw_exception("StackOverflowError throw_exception",
3129                                CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError), false);
3130     StubRoutines::_throw_delayed_StackOverflowError_entry  =
3131       generate_throw_exception("delayed StackOverflowError throw_exception",
3132                                CAST_FROM_FN_PTR(address, SharedRuntime::throw_delayed_StackOverflowError), false);
3133 
3134     //----------------------------------------------------------------------
3135     // Entry points that are platform specific.
3136 
3137     if (UseCRC32Intrinsics) {
3138       StubRoutines::_crc_table_adr     = (address)StubRoutines::zarch::_crc_table;
3139       StubRoutines::_updateBytesCRC32  = generate_CRC32_updateBytes("CRC32_updateBytes");
3140     }
3141 
3142     if (UseCRC32CIntrinsics) {
3143       StubRoutines::_crc32c_table_addr = (address)StubRoutines::zarch::_crc32c_table;
3144       StubRoutines::_updateBytesCRC32C = generate_CRC32C_updateBytes("CRC32C_updateBytes");
3145     }
3146 
3147     // Compact string intrinsics: Translate table for string inflate intrinsic. Used by trot instruction.
3148     StubRoutines::zarch::_trot_table_addr = (address)StubRoutines::zarch::_trot_table;
3149   }
3150 
3151   void generate_continuation_stubs() {
3152     if (!Continuations::enabled()) return;
3153 
3154     // Continuation stubs:
3155     StubRoutines::_cont_thaw          = generate_cont_thaw();
3156     StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
3157     StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
3158 
3159     JFR_ONLY(generate_jfr_stubs();)
3160   }
3161 
3162 #if INCLUDE_JFR
3163   void generate_jfr_stubs() {
3164     StubRoutines::_jfr_write_checkpoint_stub = generate_jfr_write_checkpoint();
3165     StubRoutines::_jfr_write_checkpoint = StubRoutines::_jfr_write_checkpoint_stub->entry_point();
3166     StubRoutines::_jfr_return_lease_stub = generate_jfr_return_lease();
3167     StubRoutines::_jfr_return_lease = StubRoutines::_jfr_return_lease_stub->entry_point();
3168   }
3169 #endif // INCLUDE_JFR
3170 
3171   void generate_final_stubs() {
3172     // Generates all stubs and initializes the entry points.
3173 
3174     StubRoutines::zarch::_partial_subtype_check            = generate_partial_subtype_check();
3175 
3176     // These entry points require SharedInfo::stack0 to be set up in non-core builds.
3177     StubRoutines::_throw_AbstractMethodError_entry         = generate_throw_exception("AbstractMethodError throw_exception",          CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError),  false);
3178     StubRoutines::_throw_IncompatibleClassChangeError_entry= generate_throw_exception("IncompatibleClassChangeError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_IncompatibleClassChangeError),  false);
3179     StubRoutines::_throw_NullPointerException_at_call_entry= generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call), false);
3180 
3181     // Support for verify_oop (must happen after universe_init).
3182     StubRoutines::_verify_oop_subroutine_entry             = generate_verify_oop_subroutine();
3183 
3184     // Arraycopy stubs used by compilers.
3185     generate_arraycopy_stubs();
3186 
3187     // nmethod entry barriers for concurrent class unloading
3188     BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
3189     if (bs_nm != nullptr) {
3190       StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
3191     }
3192 
3193     StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
3194   }
3195 
3196   void generate_compiler_stubs() {
3197 #if COMPILER2_OR_JVMCI
3198     // Generate AES intrinsics code.
3199     if (UseAESIntrinsics) {
3200       if (VM_Version::has_Crypto_AES()) {
3201         StubRoutines::_aescrypt_encryptBlock = generate_AES_encryptBlock("AES_encryptBlock");
3202         StubRoutines::_aescrypt_decryptBlock = generate_AES_decryptBlock("AES_decryptBlock");
3203         StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_AES_encrypt("AES_encryptBlock_chaining");
3204         StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_AES_decrypt("AES_decryptBlock_chaining");
3205       } else {
3206         // In PRODUCT builds, the function pointers will keep their initial (null) value.
3207         // LibraryCallKit::try_to_inline() will return false then, preventing the intrinsic from being called.
3208         assert(VM_Version::has_Crypto_AES(), "Inconsistent settings. Check vm_version_s390.cpp");
3209       }
3210     }
3211 
3212     if (UseAESCTRIntrinsics) {
3213       if (VM_Version::has_Crypto_AES_CTR()) {
3214         StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt("counterMode_AESCrypt");
3215       } else {
3216         // In PRODUCT builds, the function pointers will keep their initial (null) value.
3217         // LibraryCallKit::try_to_inline() will return false then, preventing the intrinsic from being called.
3218         assert(VM_Version::has_Crypto_AES_CTR(), "Inconsistent settings. Check vm_version_s390.cpp");
3219       }
3220     }
3221 
3222     // Generate GHASH intrinsics code
3223     if (UseGHASHIntrinsics) {
3224       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
3225     }
3226 
3227     // Generate SHA1/SHA256/SHA512 intrinsics code.
3228     if (UseSHA1Intrinsics) {
3229       StubRoutines::_sha1_implCompress     = generate_SHA1_stub(false,   "SHA1_singleBlock");
3230       StubRoutines::_sha1_implCompressMB   = generate_SHA1_stub(true,    "SHA1_multiBlock");
3231     }
3232     if (UseSHA256Intrinsics) {
3233       StubRoutines::_sha256_implCompress   = generate_SHA256_stub(false, "SHA256_singleBlock");
3234       StubRoutines::_sha256_implCompressMB = generate_SHA256_stub(true,  "SHA256_multiBlock");
3235     }
3236     if (UseSHA512Intrinsics) {
3237       StubRoutines::_sha512_implCompress   = generate_SHA512_stub(false, "SHA512_singleBlock");
3238       StubRoutines::_sha512_implCompressMB = generate_SHA512_stub(true,  "SHA512_multiBlock");
3239     }
3240 
3241 #ifdef COMPILER2
3242     if (UseMultiplyToLenIntrinsic) {
3243       StubRoutines::_multiplyToLen = generate_multiplyToLen();
3244     }
3245     if (UseMontgomeryMultiplyIntrinsic) {
3246       StubRoutines::_montgomeryMultiply
3247         = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_multiply);
3248     }
3249     if (UseMontgomerySquareIntrinsic) {
3250       StubRoutines::_montgomerySquare
3251         = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square);
3252     }
3253 #endif
3254 #endif // COMPILER2_OR_JVMCI
3255   }
3256 
3257  public:
3258   StubGenerator(CodeBuffer* code, StubsKind kind) : StubCodeGenerator(code) {
3259     switch(kind) {
3260     case Initial_stubs:
3261       generate_initial_stubs();
3262       break;
3263      case Continuation_stubs:
3264       generate_continuation_stubs();
3265       break;
3266     case Compiler_stubs:
3267       generate_compiler_stubs();
3268       break;
3269     case Final_stubs:
3270       generate_final_stubs();
3271       break;
3272     default:
3273       fatal("unexpected stubs kind: %d", kind);
3274       break;
3275     };
3276   }
3277 
3278  private:
3279   int _stub_count;
3280   void stub_prolog(StubCodeDesc* cdesc) {
3281 #ifdef ASSERT
3282     // Put extra information in the stub code, to make it more readable.
3283     // Write the high part of the address.
3284     // [RGV] Check if there is a dependency on the size of this prolog.
3285     __ emit_data((intptr_t)cdesc >> 32);
3286     __ emit_data((intptr_t)cdesc);
3287     __ emit_data(++_stub_count);
3288 #endif
3289     align(true);
3290   }
3291 
3292   void align(bool at_header = false) {
3293     // z/Architecture cache line size is 256 bytes.
3294     // There is no obvious benefit in aligning stub
3295     // code to cache lines. Use CodeEntryAlignment instead.
3296     const unsigned int icache_line_size      = CodeEntryAlignment;
3297     const unsigned int icache_half_line_size = MIN2<unsigned int>(32, CodeEntryAlignment);
3298 
3299     if (at_header) {
3300       while ((intptr_t)(__ pc()) % icache_line_size != 0) {
3301         __ z_illtrap();
3302       }
3303     } else {
3304       while ((intptr_t)(__ pc()) % icache_half_line_size != 0) {
3305         __ z_nop();
3306       }
3307     }
3308   }
3309 
3310 };
3311 
3312 void StubGenerator_generate(CodeBuffer* code, StubCodeGenerator::StubsKind kind) {
3313   StubGenerator g(code, kind);
3314 }