1 /*
   2  * Copyright (c) 2016, 2025, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2016, 2024 SAP SE. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "asm/macroAssembler.inline.hpp"
  27 #include "registerSaver_s390.hpp"
  28 #include "gc/shared/barrierSet.hpp"
  29 #include "gc/shared/barrierSetAssembler.hpp"
  30 #include "gc/shared/barrierSetNMethod.hpp"
  31 #include "interpreter/interpreter.hpp"
  32 #include "interpreter/interp_masm.hpp"
  33 #include "memory/universe.hpp"
  34 #include "nativeInst_s390.hpp"
  35 #include "oops/instanceOop.hpp"
  36 #include "oops/objArrayKlass.hpp"
  37 #include "oops/oop.inline.hpp"
  38 #include "prims/methodHandles.hpp"
  39 #include "prims/upcallLinker.hpp"
  40 #include "runtime/frame.inline.hpp"
  41 #include "runtime/handles.inline.hpp"
  42 #include "runtime/javaThread.hpp"
  43 #include "runtime/sharedRuntime.hpp"
  44 #include "runtime/stubCodeGenerator.hpp"
  45 #include "runtime/stubRoutines.hpp"
  46 #include "utilities/formatBuffer.hpp"
  47 #include "utilities/macros.hpp"
  48 #include "utilities/powerOfTwo.hpp"
  49 
  50 // Declaration and definition of StubGenerator (no .hpp file).
  51 // For a more detailed description of the stub routine structure
  52 // see the comment in stubRoutines.hpp.
  53 
  54 #ifdef PRODUCT
  55 #define __ _masm->
  56 #else
  57 #define __ (Verbose ? (_masm->block_comment(FILE_AND_LINE),_masm):_masm)->
  58 #endif
  59 
  60 #define BLOCK_COMMENT(str) if (PrintAssembly || PrintStubCode) __ block_comment(str)
  61 #define BIND(label)        bind(label); BLOCK_COMMENT(#label ":")
  62 
  63 
  64   // These static, partially const, variables are for the AES intrinsics.
  65   // They are declared/initialized here to make them available across function bodies.
  66 
  67       static const int AES_parmBlk_align    = 32;                  // octoword alignment.
  68       static const int AES_stackSpace_incr  = AES_parmBlk_align;   // add'l stack space is allocated in such increments.
  69                                                                    // Must be multiple of AES_parmBlk_align.
  70 
  71       static int AES_ctrVal_len  = 0;                              // ctr init value len (in bytes), expected: length of dataBlk (16)
  72       static int AES_ctrVec_len  = 0;                              // # of ctr vector elements. That many block can be ciphered with one instruction execution
  73       static int AES_ctrArea_len = 0;                              // reserved stack space (in bytes) for ctr (= ctrVal_len * ctrVec_len)
  74 
  75       static int AES_parmBlk_addspace = 0;  // Must be multiple of AES_parmblk_align.
  76                                             // Will be set by stub generator to stub specific value.
  77       static int AES_dataBlk_space    = 0;  // Must be multiple of AES_parmblk_align.
  78                                             // Will be set by stub generator to stub specific value.
  79       static int AES_dataBlk_offset   = 0;  // offset of the local src and dst dataBlk buffers
  80                                             // Will be set by stub generator to stub specific value.
  81 
  82       // These offsets are relative to the parameter block address (Register parmBlk = Z_R1)
  83       static const int keylen_offset     =  -1;
  84       static const int fCode_offset      =  -2;
  85       static const int ctrVal_len_offset =  -4;
  86       static const int msglen_offset     =  -8;
  87       static const int unextSP_offset    = -16;
  88       static const int rem_msgblk_offset = -20;
  89       static const int argsave_offset    = -2*AES_parmBlk_align;
  90       static const int regsave_offset    = -4*AES_parmBlk_align; // save space for work regs (Z_R10..13)
  91       static const int msglen_red_offset = regsave_offset + AES_parmBlk_align; // reduced len after preLoop;
  92       static const int counter_offset    = msglen_red_offset+8;  // current counter vector position.
  93       static const int localSpill_offset = argsave_offset + 24;  // arg2..arg4 are saved
  94 
  95 
  96       // -----------------------------------------------------------------------
  97 // Stub Code definitions
  98 
  99 class StubGenerator: public StubCodeGenerator {
 100  private:
 101 
 102   //----------------------------------------------------------------------
 103   // Call stubs are used to call Java from C.
 104 
 105   //
 106   // Arguments:
 107   //
 108   //   R2        - call wrapper address     : address
 109   //   R3        - result                   : intptr_t*
 110   //   R4        - result type              : BasicType
 111   //   R5        - method                   : method
 112   //   R6        - frame mgr entry point    : address
 113   //   [SP+160]  - parameter block          : intptr_t*
 114   //   [SP+172]  - parameter count in words : int
 115   //   [SP+176]  - thread                   : Thread*
 116   //
 117   address generate_call_stub(address& return_address) {
 118     // Set up a new C frame, copy Java arguments, call frame manager
 119     // or native_entry, and process result.
 120 
 121     StubGenStubId stub_id = StubGenStubId::call_stub_id;
 122     StubCodeMark mark(this, stub_id);
 123     address start = __ pc();
 124 
 125     Register r_arg_call_wrapper_addr   = Z_ARG1;
 126     Register r_arg_result_addr         = Z_ARG2;
 127     Register r_arg_result_type         = Z_ARG3;
 128     Register r_arg_method              = Z_ARG4;
 129     Register r_arg_entry               = Z_ARG5;
 130 
 131     // offsets to fp
 132     #define d_arg_thread 176
 133     #define d_arg_argument_addr 160
 134     #define d_arg_argument_count 168+4
 135 
 136     Register r_entryframe_fp           = Z_tmp_1;
 137     Register r_top_of_arguments_addr   = Z_ARG4;
 138     Register r_new_arg_entry = Z_R14;
 139 
 140     // macros for frame offsets
 141     #define call_wrapper_address_offset \
 142                _z_entry_frame_locals_neg(call_wrapper_address)
 143     #define result_address_offset \
 144               _z_entry_frame_locals_neg(result_address)
 145     #define result_type_offset \
 146               _z_entry_frame_locals_neg(result_type)
 147     #define arguments_tos_address_offset \
 148               _z_entry_frame_locals_neg(arguments_tos_address)
 149 
 150     {
 151       //
 152       // STACK on entry to call_stub:
 153       //
 154       //     F1      [C_FRAME]
 155       //            ...
 156       //
 157 
 158       Register r_argument_addr              = Z_tmp_3;
 159       Register r_argumentcopy_addr          = Z_tmp_4;
 160       Register r_argument_size_in_bytes     = Z_ARG5;
 161       Register r_frame_size                 = Z_R1;
 162 
 163       Label arguments_copied;
 164 
 165       // Save non-volatile registers to ABI of caller frame.
 166       BLOCK_COMMENT("save registers, push frame {");
 167       __ z_stmg(Z_R6, Z_R14, 16, Z_SP);
 168       __ z_std(Z_F8, 96, Z_SP);
 169       __ z_std(Z_F9, 104, Z_SP);
 170       __ z_std(Z_F10, 112, Z_SP);
 171       __ z_std(Z_F11, 120, Z_SP);
 172       __ z_std(Z_F12, 128, Z_SP);
 173       __ z_std(Z_F13, 136, Z_SP);
 174       __ z_std(Z_F14, 144, Z_SP);
 175       __ z_std(Z_F15, 152, Z_SP);
 176 
 177       //
 178       // Push ENTRY_FRAME including arguments:
 179       //
 180       //     F0      [TOP_IJAVA_FRAME_ABI]
 181       //             [outgoing Java arguments]
 182       //             [ENTRY_FRAME_LOCALS]
 183       //     F1      [C_FRAME]
 184       //             ...
 185       //
 186 
 187       // Calculate new frame size and push frame.
 188       #define abi_plus_locals_size \
 189                 (frame::z_top_ijava_frame_abi_size + frame::z_entry_frame_locals_size)
 190       if (abi_plus_locals_size % BytesPerWord == 0) {
 191         // Preload constant part of frame size.
 192         __ load_const_optimized(r_frame_size, -abi_plus_locals_size/BytesPerWord);
 193         // Keep copy of our frame pointer (caller's SP).
 194         __ z_lgr(r_entryframe_fp, Z_SP);
 195         // Add space required by arguments to frame size.
 196         __ z_slgf(r_frame_size, d_arg_argument_count, Z_R0, Z_SP);
 197         // Move Z_ARG5 early, it will be used as a local.
 198         __ z_lgr(r_new_arg_entry, r_arg_entry);
 199         // Convert frame size from words to bytes.
 200         __ z_sllg(r_frame_size, r_frame_size, LogBytesPerWord);
 201         __ push_frame(r_frame_size, r_entryframe_fp,
 202                       false/*don't copy SP*/, true /*frame size sign inverted*/);
 203       } else {
 204         guarantee(false, "frame sizes should be multiples of word size (BytesPerWord)");
 205       }
 206       BLOCK_COMMENT("} save, push");
 207 
 208       // Load argument registers for call.
 209       BLOCK_COMMENT("prepare/copy arguments {");
 210       __ z_lgr(Z_method, r_arg_method);
 211       __ z_lg(Z_thread, d_arg_thread, r_entryframe_fp);
 212 
 213       // Calculate top_of_arguments_addr which will be tos (not prepushed) later.
 214       // Wimply use SP + frame::top_ijava_frame_size.
 215       __ add2reg(r_top_of_arguments_addr,
 216                  frame::z_top_ijava_frame_abi_size - BytesPerWord, Z_SP);
 217 
 218       // Initialize call_stub locals (step 1).
 219       if ((call_wrapper_address_offset + BytesPerWord == result_address_offset) &&
 220           (result_address_offset + BytesPerWord == result_type_offset)          &&
 221           (result_type_offset + BytesPerWord == arguments_tos_address_offset)) {
 222 
 223         __ z_stmg(r_arg_call_wrapper_addr, r_top_of_arguments_addr,
 224                   call_wrapper_address_offset, r_entryframe_fp);
 225       } else {
 226         __ z_stg(r_arg_call_wrapper_addr,
 227                  call_wrapper_address_offset, r_entryframe_fp);
 228         __ z_stg(r_arg_result_addr,
 229                  result_address_offset, r_entryframe_fp);
 230         __ z_stg(r_arg_result_type,
 231                  result_type_offset, r_entryframe_fp);
 232         __ z_stg(r_top_of_arguments_addr,
 233                  arguments_tos_address_offset, r_entryframe_fp);
 234       }
 235 
 236       // Copy Java arguments.
 237 
 238       // Any arguments to copy?
 239       __ load_and_test_int2long(Z_R1, Address(r_entryframe_fp, d_arg_argument_count));
 240       __ z_bre(arguments_copied);
 241 
 242       // Prepare loop and copy arguments in reverse order.
 243       {
 244         // Calculate argument size in bytes.
 245         __ z_sllg(r_argument_size_in_bytes, Z_R1, LogBytesPerWord);
 246 
 247         // Get addr of first incoming Java argument.
 248         __ z_lg(r_argument_addr, d_arg_argument_addr, r_entryframe_fp);
 249 
 250         // Let r_argumentcopy_addr point to last outgoing Java argument.
 251         __ add2reg(r_argumentcopy_addr, BytesPerWord, r_top_of_arguments_addr); // = Z_SP+160 effectively.
 252 
 253         // Let r_argument_addr point to last incoming Java argument.
 254         __ add2reg_with_index(r_argument_addr, -BytesPerWord,
 255                               r_argument_size_in_bytes, r_argument_addr);
 256 
 257         // Now loop while Z_R1 > 0 and copy arguments.
 258         {
 259           Label next_argument;
 260           __ bind(next_argument);
 261           // Mem-mem move.
 262           __ z_mvc(0, BytesPerWord-1, r_argumentcopy_addr, 0, r_argument_addr);
 263           __ add2reg(r_argument_addr,    -BytesPerWord);
 264           __ add2reg(r_argumentcopy_addr, BytesPerWord);
 265           __ z_brct(Z_R1, next_argument);
 266         }
 267       }  // End of argument copy loop.
 268 
 269       __ bind(arguments_copied);
 270     }
 271     BLOCK_COMMENT("} arguments");
 272 
 273     BLOCK_COMMENT("call {");
 274     {
 275       // Call frame manager or native entry.
 276 
 277       //
 278       // Register state on entry to frame manager / native entry:
 279       //
 280       //   Z_ARG1 = r_top_of_arguments_addr  - intptr_t *sender tos (prepushed)
 281       //                                       Lesp = (SP) + copied_arguments_offset - 8
 282       //   Z_method                          - method
 283       //   Z_thread                          - JavaThread*
 284       //
 285 
 286       // Here, the usual SP is the initial_caller_sp.
 287       __ z_lgr(Z_R10, Z_SP);
 288 
 289       // Z_esp points to the slot below the last argument.
 290       __ z_lgr(Z_esp, r_top_of_arguments_addr);
 291 
 292       //
 293       // Stack on entry to frame manager / native entry:
 294       //
 295       //     F0      [TOP_IJAVA_FRAME_ABI]
 296       //             [outgoing Java arguments]
 297       //             [ENTRY_FRAME_LOCALS]
 298       //     F1      [C_FRAME]
 299       //             ...
 300       //
 301 
 302       // Do a light-weight C-call here, r_new_arg_entry holds the address
 303       // of the interpreter entry point (frame manager or native entry)
 304       // and save runtime-value of return_pc in return_address
 305       // (call by reference argument).
 306       return_address = __ call_stub(r_new_arg_entry);
 307     }
 308     BLOCK_COMMENT("} call");
 309 
 310     {
 311       BLOCK_COMMENT("restore registers {");
 312       // Returned from frame manager or native entry.
 313       // Now pop frame, process result, and return to caller.
 314 
 315       //
 316       // Stack on exit from frame manager / native entry:
 317       //
 318       //     F0      [ABI]
 319       //             ...
 320       //             [ENTRY_FRAME_LOCALS]
 321       //     F1      [C_FRAME]
 322       //             ...
 323       //
 324       // Just pop the topmost frame ...
 325       //
 326 
 327       // Restore frame pointer.
 328       __ z_lg(r_entryframe_fp, _z_abi(callers_sp), Z_SP);
 329       // Pop frame. Done here to minimize stalls.
 330       __ pop_frame();
 331 
 332       // Reload some volatile registers which we've spilled before the call
 333       // to frame manager / native entry.
 334       // Access all locals via frame pointer, because we know nothing about
 335       // the topmost frame's size.
 336       __ z_lg(r_arg_result_addr, result_address_offset, r_entryframe_fp);
 337       __ z_lg(r_arg_result_type, result_type_offset, r_entryframe_fp);
 338 
 339       // Restore non-volatiles.
 340       __ z_lmg(Z_R6, Z_R14, 16, Z_SP);
 341       __ z_ld(Z_F8, 96, Z_SP);
 342       __ z_ld(Z_F9, 104, Z_SP);
 343       __ z_ld(Z_F10, 112, Z_SP);
 344       __ z_ld(Z_F11, 120, Z_SP);
 345       __ z_ld(Z_F12, 128, Z_SP);
 346       __ z_ld(Z_F13, 136, Z_SP);
 347       __ z_ld(Z_F14, 144, Z_SP);
 348       __ z_ld(Z_F15, 152, Z_SP);
 349       BLOCK_COMMENT("} restore");
 350 
 351       //
 352       // Stack on exit from call_stub:
 353       //
 354       //     0       [C_FRAME]
 355       //             ...
 356       //
 357       // No call_stub frames left.
 358       //
 359 
 360       // All non-volatiles have been restored at this point!!
 361 
 362       //------------------------------------------------------------------------
 363       // The following code makes some assumptions on the T_<type> enum values.
 364       // The enum is defined in globalDefinitions.hpp.
 365       // The validity of the assumptions is tested as far as possible.
 366       //   The assigned values should not be shuffled
 367       //   T_BOOLEAN==4    - lowest used enum value
 368       //   T_NARROWOOP==16 - largest used enum value
 369       //------------------------------------------------------------------------
 370       BLOCK_COMMENT("process result {");
 371       Label firstHandler;
 372       int   handlerLen= 8;
 373 #ifdef ASSERT
 374       char  assertMsg[] = "check BasicType definition in globalDefinitions.hpp";
 375       __ z_chi(r_arg_result_type, T_BOOLEAN);
 376       __ asm_assert(Assembler::bcondNotLow, assertMsg, 0x0234);
 377       __ z_chi(r_arg_result_type, T_NARROWOOP);
 378       __ asm_assert(Assembler::bcondNotHigh, assertMsg, 0x0235);
 379 #endif
 380       __ add2reg(r_arg_result_type, -T_BOOLEAN);          // Remove offset.
 381       __ z_larl(Z_R1, firstHandler);                      // location of first handler
 382       __ z_sllg(r_arg_result_type, r_arg_result_type, 3); // Each handler is 8 bytes long.
 383       __ z_bc(MacroAssembler::bcondAlways, 0, r_arg_result_type, Z_R1);
 384 
 385       __ align(handlerLen);
 386       __ bind(firstHandler);
 387       // T_BOOLEAN:
 388         guarantee(T_BOOLEAN == 4, "check BasicType definition in globalDefinitions.hpp");
 389         __ z_st(Z_RET, 0, r_arg_result_addr);
 390         __ z_br(Z_R14); // Return to caller.
 391         __ align(handlerLen);
 392       // T_CHAR:
 393         guarantee(T_CHAR == T_BOOLEAN+1, "check BasicType definition in globalDefinitions.hpp");
 394         __ z_st(Z_RET, 0, r_arg_result_addr);
 395         __ z_br(Z_R14); // Return to caller.
 396         __ align(handlerLen);
 397       // T_FLOAT:
 398         guarantee(T_FLOAT == T_CHAR+1, "check BasicType definition in globalDefinitions.hpp");
 399         __ z_ste(Z_FRET, 0, r_arg_result_addr);
 400         __ z_br(Z_R14); // Return to caller.
 401         __ align(handlerLen);
 402       // T_DOUBLE:
 403         guarantee(T_DOUBLE == T_FLOAT+1, "check BasicType definition in globalDefinitions.hpp");
 404         __ z_std(Z_FRET, 0, r_arg_result_addr);
 405         __ z_br(Z_R14); // Return to caller.
 406         __ align(handlerLen);
 407       // T_BYTE:
 408         guarantee(T_BYTE == T_DOUBLE+1, "check BasicType definition in globalDefinitions.hpp");
 409         __ z_st(Z_RET, 0, r_arg_result_addr);
 410         __ z_br(Z_R14); // Return to caller.
 411         __ align(handlerLen);
 412       // T_SHORT:
 413         guarantee(T_SHORT == T_BYTE+1, "check BasicType definition in globalDefinitions.hpp");
 414         __ z_st(Z_RET, 0, r_arg_result_addr);
 415         __ z_br(Z_R14); // Return to caller.
 416         __ align(handlerLen);
 417       // T_INT:
 418         guarantee(T_INT == T_SHORT+1, "check BasicType definition in globalDefinitions.hpp");
 419         __ z_st(Z_RET, 0, r_arg_result_addr);
 420         __ z_br(Z_R14); // Return to caller.
 421         __ align(handlerLen);
 422       // T_LONG:
 423         guarantee(T_LONG == T_INT+1, "check BasicType definition in globalDefinitions.hpp");
 424         __ z_stg(Z_RET, 0, r_arg_result_addr);
 425         __ z_br(Z_R14); // Return to caller.
 426         __ align(handlerLen);
 427       // T_OBJECT:
 428         guarantee(T_OBJECT == T_LONG+1, "check BasicType definition in globalDefinitions.hpp");
 429         __ z_stg(Z_RET, 0, r_arg_result_addr);
 430         __ z_br(Z_R14); // Return to caller.
 431         __ align(handlerLen);
 432       // T_ARRAY:
 433         guarantee(T_ARRAY == T_OBJECT+1, "check BasicType definition in globalDefinitions.hpp");
 434         __ z_stg(Z_RET, 0, r_arg_result_addr);
 435         __ z_br(Z_R14); // Return to caller.
 436         __ align(handlerLen);
 437       // T_VOID:
 438         guarantee(T_VOID == T_ARRAY+1, "check BasicType definition in globalDefinitions.hpp");
 439         __ z_stg(Z_RET, 0, r_arg_result_addr);
 440         __ z_br(Z_R14); // Return to caller.
 441         __ align(handlerLen);
 442       // T_ADDRESS:
 443         guarantee(T_ADDRESS == T_VOID+1, "check BasicType definition in globalDefinitions.hpp");
 444         __ z_stg(Z_RET, 0, r_arg_result_addr);
 445         __ z_br(Z_R14); // Return to caller.
 446         __ align(handlerLen);
 447       // T_NARROWOOP:
 448         guarantee(T_NARROWOOP == T_ADDRESS+1, "check BasicType definition in globalDefinitions.hpp");
 449         __ z_st(Z_RET, 0, r_arg_result_addr);
 450         __ z_br(Z_R14); // Return to caller.
 451         __ align(handlerLen);
 452       BLOCK_COMMENT("} process result");
 453     }
 454     return start;
 455   }
 456 
 457   // Return point for a Java call if there's an exception thrown in
 458   // Java code. The exception is caught and transformed into a
 459   // pending exception stored in JavaThread that can be tested from
 460   // within the VM.
 461   address generate_catch_exception() {
 462     StubGenStubId stub_id = StubGenStubId::catch_exception_id;
 463     StubCodeMark mark(this, stub_id);
 464 
 465     address start = __ pc();
 466 
 467     //
 468     // Registers alive
 469     //
 470     //   Z_thread
 471     //   Z_ARG1 - address of pending exception
 472     //   Z_ARG2 - return address in call stub
 473     //
 474 
 475     const Register exception_file = Z_R0;
 476     const Register exception_line = Z_R1;
 477 
 478     __ load_const_optimized(exception_file, (void*)__FILE__);
 479     __ load_const_optimized(exception_line, (void*)__LINE__);
 480 
 481     __ z_stg(Z_ARG1, thread_(pending_exception));
 482     // Store into `char *'.
 483     __ z_stg(exception_file, thread_(exception_file));
 484     // Store into `int'.
 485     __ z_st(exception_line, thread_(exception_line));
 486 
 487     // Complete return to VM.
 488     assert(StubRoutines::_call_stub_return_address != nullptr, "must have been generated before");
 489 
 490     // Continue in call stub.
 491     __ z_br(Z_ARG2);
 492 
 493     return start;
 494   }
 495 
 496   // Continuation point for runtime calls returning with a pending
 497   // exception. The pending exception check happened in the runtime
 498   // or native call stub. The pending exception in Thread is
 499   // converted into a Java-level exception.
 500   //
 501   // Read:
 502   //   Z_R14: pc the runtime library callee wants to return to.
 503   //   Since the exception occurred in the callee, the return pc
 504   //   from the point of view of Java is the exception pc.
 505   //
 506   // Invalidate:
 507   //   Volatile registers (except below).
 508   //
 509   // Update:
 510   //   Z_ARG1: exception
 511   //   (Z_R14 is unchanged and is live out).
 512   //
 513   address generate_forward_exception() {
 514     StubGenStubId stub_id = StubGenStubId::forward_exception_id;
 515     StubCodeMark mark(this, stub_id);
 516     address start = __ pc();
 517 
 518     #define pending_exception_offset in_bytes(Thread::pending_exception_offset())
 519 #ifdef ASSERT
 520     // Get pending exception oop.
 521     __ z_lg(Z_ARG1, pending_exception_offset, Z_thread);
 522 
 523     // Make sure that this code is only executed if there is a pending exception.
 524     {
 525       Label L;
 526       __ z_ltgr(Z_ARG1, Z_ARG1);
 527       __ z_brne(L);
 528       __ stop("StubRoutines::forward exception: no pending exception (1)");
 529       __ bind(L);
 530     }
 531 
 532     __ verify_oop(Z_ARG1, "StubRoutines::forward exception: not an oop");
 533 #endif
 534 
 535     __ z_lgr(Z_ARG2, Z_R14); // Copy exception pc into Z_ARG2.
 536     __ save_return_pc();
 537     __ push_frame_abi160(0);
 538     // Find exception handler.
 539     __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address),
 540                     Z_thread,
 541                     Z_ARG2);
 542     // Copy handler's address.
 543     __ z_lgr(Z_R1, Z_RET);
 544     __ pop_frame();
 545     __ restore_return_pc();
 546 
 547     // Set up the arguments for the exception handler:
 548     // - Z_ARG1: exception oop
 549     // - Z_ARG2: exception pc
 550 
 551     // Load pending exception oop.
 552     __ z_lg(Z_ARG1, pending_exception_offset, Z_thread);
 553 
 554     // The exception pc is the return address in the caller,
 555     // must load it into Z_ARG2
 556     __ z_lgr(Z_ARG2, Z_R14);
 557 
 558 #ifdef ASSERT
 559     // Make sure exception is set.
 560     { Label L;
 561       __ z_ltgr(Z_ARG1, Z_ARG1);
 562       __ z_brne(L);
 563       __ stop("StubRoutines::forward exception: no pending exception (2)");
 564       __ bind(L);
 565     }
 566 #endif
 567     // Clear the pending exception.
 568     __ clear_mem(Address(Z_thread, pending_exception_offset), sizeof(void *));
 569     // Jump to exception handler
 570     __ z_br(Z_R1 /*handler address*/);
 571 
 572     return start;
 573 
 574     #undef pending_exception_offset
 575   }
 576 
 577 #undef __
 578 #ifdef PRODUCT
 579 #define __ _masm->
 580 #else
 581 #define __ (Verbose ? (_masm->block_comment(FILE_AND_LINE),_masm):_masm)->
 582 #endif
 583 
 584   // Support for uint StubRoutine::zarch::partial_subtype_check(Klass
 585   // sub, Klass super);
 586   //
 587   // Arguments:
 588   //   ret  : Z_RET, returned
 589   //   sub  : Z_ARG2, argument, not changed
 590   //   super: Z_ARG3, argument, not changed
 591   //
 592   //   raddr: Z_R14, blown by call
 593   //
 594   address generate_partial_subtype_check() {
 595     StubGenStubId stub_id = StubGenStubId::partial_subtype_check_id;
 596     StubCodeMark mark(this, stub_id);
 597     Label miss;
 598 
 599     address start = __ pc();
 600 
 601     const Register Rsubklass   = Z_ARG2; // subklass
 602     const Register Rsuperklass = Z_ARG3; // superklass
 603 
 604     // No args, but tmp registers that are killed.
 605     const Register Rlength     = Z_ARG4; // cache array length
 606     const Register Rarray_ptr  = Z_ARG5; // Current value from cache array.
 607 
 608     if (UseCompressedOops) {
 609       assert(Universe::heap() != nullptr, "java heap must be initialized to generate partial_subtype_check stub");
 610     }
 611 
 612     // Always take the slow path.
 613     __ check_klass_subtype_slow_path(Rsubklass, Rsuperklass,
 614                                      Rarray_ptr, Rlength, nullptr, &miss);
 615 
 616     // Match falls through here.
 617     __ clear_reg(Z_RET);               // Zero indicates a match. Set EQ flag in CC.
 618     __ z_br(Z_R14);
 619 
 620     __ BIND(miss);
 621     __ load_const_optimized(Z_RET, 1); // One indicates a miss.
 622     __ z_ltgr(Z_RET, Z_RET);           // Set NE flag in CR.
 623     __ z_br(Z_R14);
 624 
 625     return start;
 626   }
 627 
 628   void generate_lookup_secondary_supers_table_stub() {
 629     StubGenStubId stub_id = StubGenStubId::lookup_secondary_supers_table_id;
 630     StubCodeMark mark(this, stub_id);
 631 
 632     const Register
 633         r_super_klass  = Z_ARG1,
 634         r_sub_klass    = Z_ARG2,
 635         r_array_index  = Z_ARG3,
 636         r_array_length = Z_ARG4,
 637         r_array_base   = Z_ARG5,
 638         r_bitmap       = Z_R10,
 639         r_result       = Z_R11;
 640     for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
 641       StubRoutines::_lookup_secondary_supers_table_stubs[slot] = __ pc();
 642       __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass,
 643                                              r_array_base, r_array_length, r_array_index,
 644                                              r_bitmap, r_result, slot);
 645 
 646       __ z_br(Z_R14);
 647     }
 648   }
 649 
 650   // Slow path implementation for UseSecondarySupersTable.
 651   address generate_lookup_secondary_supers_table_slow_path_stub() {
 652     StubGenStubId stub_id = StubGenStubId::lookup_secondary_supers_table_slow_path_id;
 653     StubCodeMark mark(this, stub_id);
 654 
 655     address start = __ pc();
 656 
 657     const Register
 658         r_super_klass  = Z_ARG1,
 659         r_array_base   = Z_ARG5,
 660         r_temp1        = Z_ARG4,
 661         r_array_index  = Z_ARG3,
 662         r_bitmap       = Z_R10,
 663         r_result       = Z_R11;
 664 
 665     __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base,
 666                                                r_array_index, r_bitmap, r_temp1, r_result, /* is_stub */ true);
 667 
 668     __ z_br(Z_R14);
 669 
 670     return start;
 671   }
 672 
 673 #if !defined(PRODUCT)
 674   // Wrapper which calls oopDesc::is_oop_or_null()
 675   // Only called by MacroAssembler::verify_oop
 676   static void verify_oop_helper(const char* message, oopDesc* o) {
 677     if (!oopDesc::is_oop_or_null(o)) {
 678       fatal("%s. oop: " PTR_FORMAT, message, p2i(o));
 679     }
 680     ++ StubRoutines::_verify_oop_count;
 681   }
 682 #endif
 683 
 684   // Return address of code to be called from code generated by
 685   // MacroAssembler::verify_oop.
 686   //
 687   // Don't generate, rather use C++ code.
 688   address generate_verify_oop_subroutine() {
 689     // Don't generate a StubCodeMark, because no code is generated!
 690     // Generating the mark triggers notifying the oprofile jvmti agent
 691     // about the dynamic code generation, but the stub without
 692     // code (code_size == 0) confuses opjitconv
 693     // StubCodeMark mark(this, "StubRoutines", "verify_oop_stub");
 694 
 695     address start = nullptr;
 696 
 697 #if !defined(PRODUCT)
 698     start = CAST_FROM_FN_PTR(address, verify_oop_helper);
 699 #endif
 700 
 701     return start;
 702   }
 703 
 704   // This is to test that the count register contains a positive int value.
 705   // Required because C2 does not respect int to long conversion for stub calls.
 706   void assert_positive_int(Register count) {
 707 #ifdef ASSERT
 708     __ z_srag(Z_R0, count, 31);  // Just leave the sign (must be zero) in Z_R0.
 709     __ asm_assert(Assembler::bcondZero, "missing zero extend", 0xAFFE);
 710 #endif
 711   }
 712 
 713   //  Generate overlap test for array copy stubs.
 714   //  If no actual overlap is detected, control is transferred to the
 715   //  "normal" copy stub (entry address passed in disjoint_copy_target).
 716   //  Otherwise, execution continues with the code generated by the
 717   //  caller of array_overlap_test.
 718   //
 719   //  Input:
 720   //    Z_ARG1    - from
 721   //    Z_ARG2    - to
 722   //    Z_ARG3    - element count
 723   void array_overlap_test(address disjoint_copy_target, int log2_elem_size) {
 724     __ MacroAssembler::compare_and_branch_optimized(Z_ARG2, Z_ARG1, Assembler::bcondNotHigh,
 725                                                     disjoint_copy_target, /*len64=*/true, /*has_sign=*/false);
 726 
 727     Register index = Z_ARG3;
 728     if (log2_elem_size > 0) {
 729       __ z_sllg(Z_R1, Z_ARG3, log2_elem_size);  // byte count
 730       index = Z_R1;
 731     }
 732     __ add2reg_with_index(Z_R1, 0, index, Z_ARG1);  // First byte after "from" range.
 733 
 734     __ MacroAssembler::compare_and_branch_optimized(Z_R1, Z_ARG2, Assembler::bcondNotHigh,
 735                                                     disjoint_copy_target, /*len64=*/true, /*has_sign=*/false);
 736 
 737     // Destructive overlap: let caller generate code for that.
 738   }
 739 
 740   //  Generate stub for disjoint array copy. If "aligned" is true, the
 741   //  "from" and "to" addresses are assumed to be heapword aligned.
 742   //
 743   //  Arguments for generated stub:
 744   //      from:  Z_ARG1
 745   //      to:    Z_ARG2
 746   //      count: Z_ARG3 treated as signed
 747   void generate_disjoint_copy(bool aligned, int element_size,
 748                               bool branchToEnd,
 749                               bool restoreArgs) {
 750     // This is the zarch specific stub generator for general array copy tasks.
 751     // It has the following prereqs and features:
 752     //
 753     // - No destructive overlap allowed (else unpredictable results).
 754     // - Destructive overlap does not exist if the leftmost byte of the target
 755     //   does not coincide with any of the source bytes (except the leftmost).
 756     //
 757     //   Register usage upon entry:
 758     //      Z_ARG1 == Z_R2 :   address of source array
 759     //      Z_ARG2 == Z_R3 :   address of target array
 760     //      Z_ARG3 == Z_R4 :   length of operands (# of elements on entry)
 761     //
 762     // Register usage within the generator:
 763     // - Z_R0 and Z_R1 are KILLed by the stub routine (target addr/len).
 764     //                 Used as pair register operand in complex moves, scratch registers anyway.
 765     // - Z_R5 is KILLed by the stub routine (source register pair addr/len) (even/odd reg).
 766     //                  Same as R0/R1, but no scratch register.
 767     // - Z_ARG1, Z_ARG2, Z_ARG3 are USEd but preserved by the stub routine,
 768     //                          but they might get temporarily overwritten.
 769 
 770     Register  save_reg    = Z_ARG4;   // (= Z_R5), holds original target operand address for restore.
 771 
 772     {
 773       Register   llen_reg = Z_R1;     // Holds left operand len (odd reg).
 774       Register  laddr_reg = Z_R0;     // Holds left operand addr (even reg), overlaps with data_reg.
 775       Register   rlen_reg = Z_R5;     // Holds right operand len (odd reg), overlaps with save_reg.
 776       Register  raddr_reg = Z_R4;     // Holds right operand addr (even reg), overlaps with len_reg.
 777 
 778       Register   data_reg = Z_R0;     // Holds copied data chunk in alignment process and copy loop.
 779       Register    len_reg = Z_ARG3;   // Holds operand len (#elements at entry, #bytes shortly after).
 780       Register    dst_reg = Z_ARG2;   // Holds left (target)  operand addr.
 781       Register    src_reg = Z_ARG1;   // Holds right (source) operand addr.
 782 
 783       Label     doMVCLOOP, doMVCLOOPcount, doMVCLOOPiterate;
 784       Label     doMVCUnrolled;
 785       NearLabel doMVC,  doMVCgeneral, done;
 786       Label     MVC_template;
 787       address   pcMVCblock_b, pcMVCblock_e;
 788 
 789       bool      usedMVCLE       = true;
 790       bool      usedMVCLOOP     = true;
 791       bool      usedMVCUnrolled = false;
 792       bool      usedMVC         = false;
 793       bool      usedMVCgeneral  = false;
 794 
 795       int       stride;
 796       Register  stride_reg;
 797       Register  ix_reg;
 798 
 799       assert((element_size<=256) && (256%element_size == 0), "element size must be <= 256, power of 2");
 800       unsigned int log2_size = exact_log2(element_size);
 801 
 802       switch (element_size) {
 803         case 1:  BLOCK_COMMENT("ARRAYCOPY DISJOINT byte  {"); break;
 804         case 2:  BLOCK_COMMENT("ARRAYCOPY DISJOINT short {"); break;
 805         case 4:  BLOCK_COMMENT("ARRAYCOPY DISJOINT int   {"); break;
 806         case 8:  BLOCK_COMMENT("ARRAYCOPY DISJOINT long  {"); break;
 807         default: BLOCK_COMMENT("ARRAYCOPY DISJOINT       {"); break;
 808       }
 809 
 810       assert_positive_int(len_reg);
 811 
 812       BLOCK_COMMENT("preparation {");
 813 
 814       // No copying if len <= 0.
 815       if (branchToEnd) {
 816         __ compare64_and_branch(len_reg, (intptr_t) 0, Assembler::bcondNotHigh, done);
 817       } else {
 818         if (VM_Version::has_CompareBranch()) {
 819           __ z_cgib(len_reg, 0, Assembler::bcondNotHigh, 0, Z_R14);
 820         } else {
 821           __ z_ltgr(len_reg, len_reg);
 822           __ z_bcr(Assembler::bcondNotPositive, Z_R14);
 823         }
 824       }
 825 
 826       // Prefetch just one cache line. Speculative opt for short arrays.
 827       // Do not use Z_R1 in prefetch. Is undefined here.
 828       if (VM_Version::has_Prefetch()) {
 829         __ z_pfd(0x01, 0, Z_R0, src_reg); // Fetch access.
 830         __ z_pfd(0x02, 0, Z_R0, dst_reg); // Store access.
 831       }
 832 
 833       BLOCK_COMMENT("} preparation");
 834 
 835       // Save args only if really needed.
 836       // Keep len test local to branch. Is generated only once.
 837 
 838       BLOCK_COMMENT("mode selection {");
 839 
 840       // Special handling for arrays with only a few elements.
 841       // Nothing fancy: just an executed MVC.
 842       if (log2_size > 0) {
 843         __ z_sllg(Z_R1, len_reg, log2_size); // Remember #bytes in Z_R1.
 844       }
 845       if (element_size != 8) {
 846         __ z_cghi(len_reg, 256/element_size);
 847         __ z_brnh(doMVC);
 848         usedMVC = true;
 849       }
 850       if (element_size == 8) { // Long and oop arrays are always aligned.
 851         __ z_cghi(len_reg, 256/element_size);
 852         __ z_brnh(doMVCUnrolled);
 853         usedMVCUnrolled = true;
 854       }
 855 
 856       // Prefetch another cache line. We, for sure, have more than one line to copy.
 857       if (VM_Version::has_Prefetch()) {
 858         __ z_pfd(0x01, 256, Z_R0, src_reg); // Fetch access.
 859         __ z_pfd(0x02, 256, Z_R0, dst_reg); // Store access.
 860       }
 861 
 862       if (restoreArgs) {
 863         // Remember entry value of ARG2 to restore all arguments later from that knowledge.
 864         __ z_lgr(save_reg, dst_reg);
 865       }
 866 
 867       __ z_cghi(len_reg, 4096/element_size);
 868       if (log2_size == 0) {
 869         __ z_lgr(Z_R1, len_reg); // Init Z_R1 with #bytes
 870       }
 871       __ z_brnh(doMVCLOOP);
 872 
 873       // Fall through to MVCLE case.
 874 
 875       BLOCK_COMMENT("} mode selection");
 876 
 877       // MVCLE: for long arrays
 878       //   DW aligned: Best performance for sizes > 4kBytes.
 879       //   unaligned:  Least complex for sizes > 256 bytes.
 880       if (usedMVCLE) {
 881         BLOCK_COMMENT("mode MVCLE {");
 882 
 883         // Setup registers for mvcle.
 884         //__ z_lgr(llen_reg, len_reg);// r1 <- r4  #bytes already in Z_R1, aka llen_reg.
 885         __ z_lgr(laddr_reg, dst_reg); // r0 <- r3
 886         __ z_lgr(raddr_reg, src_reg); // r4 <- r2
 887         __ z_lgr(rlen_reg, llen_reg); // r5 <- r1
 888 
 889         __ MacroAssembler::move_long_ext(laddr_reg, raddr_reg, 0xb0);    // special: bypass cache
 890         // __ MacroAssembler::move_long_ext(laddr_reg, raddr_reg, 0xb8); // special: Hold data in cache.
 891         // __ MacroAssembler::move_long_ext(laddr_reg, raddr_reg, 0);
 892 
 893         if (restoreArgs) {
 894           // MVCLE updates the source (Z_R4,Z_R5) and target (Z_R0,Z_R1) register pairs.
 895           // Dst_reg (Z_ARG2) and src_reg (Z_ARG1) are left untouched. No restore required.
 896           // Len_reg (Z_ARG3) is destroyed and must be restored.
 897           __ z_slgr(laddr_reg, dst_reg);    // copied #bytes
 898           if (log2_size > 0) {
 899             __ z_srag(Z_ARG3, laddr_reg, log2_size); // Convert back to #elements.
 900           } else {
 901             __ z_lgr(Z_ARG3, laddr_reg);
 902           }
 903         }
 904         if (branchToEnd) {
 905           __ z_bru(done);
 906         } else {
 907           __ z_br(Z_R14);
 908         }
 909         BLOCK_COMMENT("} mode MVCLE");
 910       }
 911       // No fallthru possible here.
 912 
 913       //  MVCUnrolled: for short, aligned arrays.
 914 
 915       if (usedMVCUnrolled) {
 916         BLOCK_COMMENT("mode MVC unrolled {");
 917         stride = 8;
 918 
 919         // Generate unrolled MVC instructions.
 920         for (int ii = 32; ii > 1; ii--) {
 921           __ z_mvc(0, ii * stride-1, dst_reg, 0, src_reg); // ii*8 byte copy
 922           if (branchToEnd) {
 923             __ z_bru(done);
 924           } else {
 925             __ z_br(Z_R14);
 926           }
 927         }
 928 
 929         pcMVCblock_b = __ pc();
 930         __ z_mvc(0, 1 * stride-1, dst_reg, 0, src_reg); // 8 byte copy
 931         if (branchToEnd) {
 932           __ z_bru(done);
 933         } else {
 934           __ z_br(Z_R14);
 935         }
 936 
 937         pcMVCblock_e = __ pc();
 938         Label MVC_ListEnd;
 939         __ bind(MVC_ListEnd);
 940 
 941         // This is an absolute fast path:
 942         // - Array len in bytes must be not greater than 256.
 943         // - Array len in bytes must be an integer mult of DW
 944         //   to save expensive handling of trailing bytes.
 945         // - Argument restore is not done,
 946         //   i.e. previous code must not alter arguments (this code doesn't either).
 947 
 948         __ bind(doMVCUnrolled);
 949 
 950         // Avoid mul, prefer shift where possible.
 951         // Combine shift right (for #DW) with shift left (for block size).
 952         // Set CC for zero test below (asm_assert).
 953         // Note: #bytes comes in Z_R1, #DW in len_reg.
 954         unsigned int MVCblocksize    = pcMVCblock_e - pcMVCblock_b;
 955         unsigned int logMVCblocksize = 0xffffffffU; // Pacify compiler ("used uninitialized" warning).
 956 
 957         if (log2_size > 0) { // Len was scaled into Z_R1.
 958           switch (MVCblocksize) {
 959 
 960             case  8: logMVCblocksize = 3;
 961                      __ z_ltgr(Z_R0, Z_R1); // #bytes is index
 962                      break;                 // reasonable size, use shift
 963 
 964             case 16: logMVCblocksize = 4;
 965                      __ z_slag(Z_R0, Z_R1, logMVCblocksize-log2_size);
 966                      break;                 // reasonable size, use shift
 967 
 968             default: logMVCblocksize = 0;
 969                      __ z_ltgr(Z_R0, len_reg); // #DW for mul
 970                      break;                 // all other sizes: use mul
 971           }
 972         } else {
 973           guarantee(log2_size, "doMVCUnrolled: only for DW entities");
 974         }
 975 
 976         // This test (and branch) is redundant. Previous code makes sure that
 977         //  - element count > 0
 978         //  - element size == 8.
 979         // Thus, len reg should never be zero here. We insert an asm_assert() here,
 980         // just to double-check and to be on the safe side.
 981         __ asm_assert(false, "zero len cannot occur", 99);
 982 
 983         __ z_larl(Z_R1, MVC_ListEnd);        // Get addr of last instr block.
 984         // Avoid mul, prefer shift where possible.
 985         if (logMVCblocksize == 0) {
 986           __ z_mghi(Z_R0, MVCblocksize);
 987         }
 988         __ z_slgr(Z_R1, Z_R0);
 989         __ z_br(Z_R1);
 990         BLOCK_COMMENT("} mode MVC unrolled");
 991       }
 992       // No fallthru possible here.
 993 
 994       // MVC execute template
 995       // Must always generate. Usage may be switched on below.
 996       // There is no suitable place after here to put the template.
 997       __ bind(MVC_template);
 998       __ z_mvc(0,0,dst_reg,0,src_reg);      // Instr template, never exec directly!
 999 
1000 
1001       // MVC Loop: for medium-sized arrays
1002 
1003       // Only for DW aligned arrays (src and dst).
1004       // #bytes to copy must be at least 256!!!
1005       // Non-aligned cases handled separately.
1006       stride     = 256;
1007       stride_reg = Z_R1;   // Holds #bytes when control arrives here.
1008       ix_reg     = Z_ARG3; // Alias for len_reg.
1009 
1010 
1011       if (usedMVCLOOP) {
1012         BLOCK_COMMENT("mode MVC loop {");
1013         __ bind(doMVCLOOP);
1014 
1015         __ z_lcgr(ix_reg, Z_R1);         // Ix runs from -(n-2)*stride to 1*stride (inclusive).
1016         __ z_llill(stride_reg, stride);
1017         __ add2reg(ix_reg, 2*stride);    // Thus: increment ix by 2*stride.
1018 
1019         __ bind(doMVCLOOPiterate);
1020           __ z_mvc(0, stride-1, dst_reg, 0, src_reg);
1021           __ add2reg(dst_reg, stride);
1022           __ add2reg(src_reg, stride);
1023           __ bind(doMVCLOOPcount);
1024           __ z_brxlg(ix_reg, stride_reg, doMVCLOOPiterate);
1025 
1026         // Don 't use add2reg() here, since we must set the condition code!
1027         __ z_aghi(ix_reg, -2*stride);       // Compensate incr from above: zero diff means "all copied".
1028 
1029         if (restoreArgs) {
1030           __ z_lcgr(Z_R1, ix_reg);          // Prepare ix_reg for copy loop, #bytes expected in Z_R1.
1031           __ z_brnz(doMVCgeneral);          // We're not done yet, ix_reg is not zero.
1032 
1033           // ARG1, ARG2, and ARG3 were altered by the code above, so restore them building on save_reg.
1034           __ z_slgr(dst_reg, save_reg);     // copied #bytes
1035           __ z_slgr(src_reg, dst_reg);      // = ARG1 (now restored)
1036           if (log2_size) {
1037             __ z_srag(Z_ARG3, dst_reg, log2_size); // Convert back to #elements to restore ARG3.
1038           } else {
1039             __ z_lgr(Z_ARG3, dst_reg);
1040           }
1041           __ z_lgr(Z_ARG2, save_reg);       // ARG2 now restored.
1042 
1043           if (branchToEnd) {
1044             __ z_bru(done);
1045           } else {
1046             __ z_br(Z_R14);
1047           }
1048 
1049         } else {
1050             if (branchToEnd) {
1051               __ z_brz(done);                        // CC set by aghi instr.
1052           } else {
1053               __ z_bcr(Assembler::bcondZero, Z_R14); // We're all done if zero.
1054             }
1055 
1056           __ z_lcgr(Z_R1, ix_reg);    // Prepare ix_reg for copy loop, #bytes expected in Z_R1.
1057           // __ z_bru(doMVCgeneral);  // fallthru
1058         }
1059         usedMVCgeneral = true;
1060         BLOCK_COMMENT("} mode MVC loop");
1061       }
1062       // Fallthru to doMVCgeneral
1063 
1064       // MVCgeneral: for short, unaligned arrays, after other copy operations
1065 
1066       // Somewhat expensive due to use of EX instruction, but simple.
1067       if (usedMVCgeneral) {
1068         BLOCK_COMMENT("mode MVC general {");
1069         __ bind(doMVCgeneral);
1070 
1071         __ add2reg(len_reg, -1, Z_R1);             // Get #bytes-1 for EXECUTE.
1072         if (VM_Version::has_ExecuteExtensions()) {
1073           __ z_exrl(len_reg, MVC_template);        // Execute MVC with variable length.
1074         } else {
1075           __ z_larl(Z_R1, MVC_template);           // Get addr of instr template.
1076           __ z_ex(len_reg, 0, Z_R0, Z_R1);         // Execute MVC with variable length.
1077         }                                          // penalty: 9 ticks
1078 
1079         if (restoreArgs) {
1080           // ARG1, ARG2, and ARG3 were altered by code executed before, so restore them building on save_reg
1081           __ z_slgr(dst_reg, save_reg);            // Copied #bytes without the "doMVCgeneral" chunk
1082           __ z_slgr(src_reg, dst_reg);             // = ARG1 (now restored), was not advanced for "doMVCgeneral" chunk
1083           __ add2reg_with_index(dst_reg, 1, len_reg, dst_reg); // Len of executed MVC was not accounted for, yet.
1084           if (log2_size) {
1085             __ z_srag(Z_ARG3, dst_reg, log2_size); // Convert back to #elements to restore ARG3
1086           } else {
1087              __ z_lgr(Z_ARG3, dst_reg);
1088           }
1089           __ z_lgr(Z_ARG2, save_reg);              // ARG2 now restored.
1090         }
1091 
1092         if (usedMVC) {
1093           if (branchToEnd) {
1094             __ z_bru(done);
1095           } else {
1096             __ z_br(Z_R14);
1097         }
1098         } else {
1099           if (!branchToEnd) __ z_br(Z_R14);
1100         }
1101         BLOCK_COMMENT("} mode MVC general");
1102       }
1103       // Fallthru possible if following block not generated.
1104 
1105       // MVC: for short, unaligned arrays
1106 
1107       // Somewhat expensive due to use of EX instruction, but simple. penalty: 9 ticks.
1108       // Differs from doMVCgeneral in reconstruction of ARG2, ARG3, and ARG4.
1109       if (usedMVC) {
1110         BLOCK_COMMENT("mode MVC {");
1111         __ bind(doMVC);
1112 
1113         // get #bytes-1 for EXECUTE
1114         if (log2_size) {
1115           __ add2reg(Z_R1, -1);                // Length was scaled into Z_R1.
1116         } else {
1117           __ add2reg(Z_R1, -1, len_reg);       // Length was not scaled.
1118         }
1119 
1120         if (VM_Version::has_ExecuteExtensions()) {
1121           __ z_exrl(Z_R1, MVC_template);       // Execute MVC with variable length.
1122         } else {
1123           __ z_lgr(Z_R0, Z_R5);                // Save ARG4, may be unnecessary.
1124           __ z_larl(Z_R5, MVC_template);       // Get addr of instr template.
1125           __ z_ex(Z_R1, 0, Z_R0, Z_R5);        // Execute MVC with variable length.
1126           __ z_lgr(Z_R5, Z_R0);                // Restore ARG4, may be unnecessary.
1127         }
1128 
1129         if (!branchToEnd) {
1130           __ z_br(Z_R14);
1131         }
1132         BLOCK_COMMENT("} mode MVC");
1133       }
1134 
1135       __ bind(done);
1136 
1137       switch (element_size) {
1138         case 1:  BLOCK_COMMENT("} ARRAYCOPY DISJOINT byte "); break;
1139         case 2:  BLOCK_COMMENT("} ARRAYCOPY DISJOINT short"); break;
1140         case 4:  BLOCK_COMMENT("} ARRAYCOPY DISJOINT int  "); break;
1141         case 8:  BLOCK_COMMENT("} ARRAYCOPY DISJOINT long "); break;
1142         default: BLOCK_COMMENT("} ARRAYCOPY DISJOINT      "); break;
1143       }
1144     }
1145   }
1146 
1147   // Generate stub for conjoint array copy. If "aligned" is true, the
1148   // "from" and "to" addresses are assumed to be heapword aligned.
1149   //
1150   // Arguments for generated stub:
1151   //   from:  Z_ARG1
1152   //   to:    Z_ARG2
1153   //   count: Z_ARG3 treated as signed
1154   void generate_conjoint_copy(bool aligned, int element_size, bool branchToEnd) {
1155 
1156     // This is the zarch specific stub generator for general array copy tasks.
1157     // It has the following prereqs and features:
1158     //
1159     // - Destructive overlap exists and is handled by reverse copy.
1160     // - Destructive overlap exists if the leftmost byte of the target
1161     //   does coincide with any of the source bytes (except the leftmost).
1162     // - Z_R0 and Z_R1 are KILLed by the stub routine (data and stride)
1163     // - Z_ARG1 and Z_ARG2 are USEd but preserved by the stub routine.
1164     // - Z_ARG3 is USED but preserved by the stub routine.
1165     // - Z_ARG4 is used as index register and is thus KILLed.
1166     //
1167     {
1168       Register stride_reg = Z_R1;     // Stride & compare value in loop (negative element_size).
1169       Register   data_reg = Z_R0;     // Holds value of currently processed element.
1170       Register     ix_reg = Z_ARG4;   // Holds byte index of currently processed element.
1171       Register    len_reg = Z_ARG3;   // Holds length (in #elements) of arrays.
1172       Register    dst_reg = Z_ARG2;   // Holds left  operand addr.
1173       Register    src_reg = Z_ARG1;   // Holds right operand addr.
1174 
1175       assert(256%element_size == 0, "Element size must be power of 2.");
1176       assert(element_size     <= 8, "Can't handle more than DW units.");
1177 
1178       switch (element_size) {
1179         case 1:  BLOCK_COMMENT("ARRAYCOPY CONJOINT byte  {"); break;
1180         case 2:  BLOCK_COMMENT("ARRAYCOPY CONJOINT short {"); break;
1181         case 4:  BLOCK_COMMENT("ARRAYCOPY CONJOINT int   {"); break;
1182         case 8:  BLOCK_COMMENT("ARRAYCOPY CONJOINT long  {"); break;
1183         default: BLOCK_COMMENT("ARRAYCOPY CONJOINT       {"); break;
1184       }
1185 
1186       assert_positive_int(len_reg);
1187 
1188       if (VM_Version::has_Prefetch()) {
1189         __ z_pfd(0x01, 0, Z_R0, src_reg); // Fetch access.
1190         __ z_pfd(0x02, 0, Z_R0, dst_reg); // Store access.
1191       }
1192 
1193       unsigned int log2_size = exact_log2(element_size);
1194       if (log2_size) {
1195         __ z_sllg(ix_reg, len_reg, log2_size);
1196       } else {
1197         __ z_lgr(ix_reg, len_reg);
1198       }
1199 
1200       // Optimize reverse copy loop.
1201       // Main loop copies DW units which may be unaligned. Unaligned access adds some penalty ticks.
1202       // Unaligned DW access (neither fetch nor store) is DW-atomic, but should be alignment-atomic.
1203       // Preceding the main loop, some bytes are copied to obtain a DW-multiple remaining length.
1204 
1205       Label countLoop1;
1206       Label copyLoop1;
1207       Label skipBY;
1208       Label skipHW;
1209       int   stride = -8;
1210 
1211       __ load_const_optimized(stride_reg, stride); // Prepare for DW copy loop.
1212 
1213       if (element_size == 8)    // Nothing to do here.
1214         __ z_bru(countLoop1);
1215       else {                    // Do not generate dead code.
1216         __ z_tmll(ix_reg, 7);   // Check the "odd" bits.
1217         __ z_bre(countLoop1);   // There are none, very good!
1218       }
1219 
1220       if (log2_size == 0) {     // Handle leftover Byte.
1221         __ z_tmll(ix_reg, 1);
1222         __ z_bre(skipBY);
1223         __ z_lb(data_reg,   -1, ix_reg, src_reg);
1224         __ z_stcy(data_reg, -1, ix_reg, dst_reg);
1225         __ add2reg(ix_reg, -1); // Decrement delayed to avoid AGI.
1226         __ bind(skipBY);
1227         // fallthru
1228       }
1229       if (log2_size <= 1) {     // Handle leftover HW.
1230         __ z_tmll(ix_reg, 2);
1231         __ z_bre(skipHW);
1232         __ z_lhy(data_reg,  -2, ix_reg, src_reg);
1233         __ z_sthy(data_reg, -2, ix_reg, dst_reg);
1234         __ add2reg(ix_reg, -2); // Decrement delayed to avoid AGI.
1235         __ bind(skipHW);
1236         __ z_tmll(ix_reg, 4);
1237         __ z_bre(countLoop1);
1238         // fallthru
1239       }
1240       if (log2_size <= 2) {     // There are just 4 bytes (left) that need to be copied.
1241         __ z_ly(data_reg,  -4, ix_reg, src_reg);
1242         __ z_sty(data_reg, -4, ix_reg, dst_reg);
1243         __ add2reg(ix_reg, -4); // Decrement delayed to avoid AGI.
1244         __ z_bru(countLoop1);
1245       }
1246 
1247       // Control can never get to here. Never! Never ever!
1248       __ z_illtrap(0x99);
1249       __ bind(copyLoop1);
1250       __ z_lg(data_reg,  0, ix_reg, src_reg);
1251       __ z_stg(data_reg, 0, ix_reg, dst_reg);
1252       __ bind(countLoop1);
1253       __ z_brxhg(ix_reg, stride_reg, copyLoop1);
1254 
1255       if (!branchToEnd)
1256         __ z_br(Z_R14);
1257 
1258       switch (element_size) {
1259         case 1:  BLOCK_COMMENT("} ARRAYCOPY CONJOINT byte "); break;
1260         case 2:  BLOCK_COMMENT("} ARRAYCOPY CONJOINT short"); break;
1261         case 4:  BLOCK_COMMENT("} ARRAYCOPY CONJOINT int  "); break;
1262         case 8:  BLOCK_COMMENT("} ARRAYCOPY CONJOINT long "); break;
1263         default: BLOCK_COMMENT("} ARRAYCOPY CONJOINT      "); break;
1264       }
1265     }
1266   }
1267 
1268   address generate_disjoint_nonoop_copy(StubGenStubId stub_id) {
1269     bool aligned;
1270     int element_size;
1271     switch (stub_id) {
1272     case jbyte_disjoint_arraycopy_id:
1273       aligned = false;
1274       element_size = 1;
1275       break;
1276     case arrayof_jbyte_disjoint_arraycopy_id:
1277       aligned = true;
1278       element_size = 1;
1279       break;
1280     case jshort_disjoint_arraycopy_id:
1281       aligned = false;
1282       element_size = 2;
1283       break;
1284     case arrayof_jshort_disjoint_arraycopy_id:
1285       aligned = true;
1286       element_size = 2;
1287       break;
1288     case jint_disjoint_arraycopy_id:
1289       aligned = false;
1290       element_size = 4;
1291       break;
1292     case arrayof_jint_disjoint_arraycopy_id:
1293       aligned = true;
1294       element_size = 4;
1295       break;
1296     case jlong_disjoint_arraycopy_id:
1297       aligned = false;
1298       element_size = 8;
1299       break;
1300     case arrayof_jlong_disjoint_arraycopy_id:
1301       aligned = true;
1302       element_size = 8;
1303       break;
1304     default:
1305       ShouldNotReachHere();
1306     }
1307     StubCodeMark mark(this, stub_id);
1308     unsigned int start_off = __ offset();  // Remember stub start address (is rtn value).
1309     generate_disjoint_copy(aligned, element_size, false, false);
1310     return __ addr_at(start_off);
1311   }
1312 
1313   address generate_disjoint_oop_copy(StubGenStubId stub_id) {
1314     bool aligned;
1315     bool dest_uninitialized;
1316     switch (stub_id) {
1317     case oop_disjoint_arraycopy_id:
1318       aligned = false;
1319       dest_uninitialized = false;
1320       break;
1321     case arrayof_oop_disjoint_arraycopy_id:
1322       aligned = true;
1323       dest_uninitialized = false;
1324       break;
1325     case oop_disjoint_arraycopy_uninit_id:
1326       aligned = false;
1327       dest_uninitialized = true;
1328       break;
1329     case arrayof_oop_disjoint_arraycopy_uninit_id:
1330       aligned = true;
1331       dest_uninitialized = true;
1332       break;
1333     default:
1334       ShouldNotReachHere();
1335     }
1336     StubCodeMark mark(this, stub_id);
1337     // This is the zarch specific stub generator for oop array copy.
1338     // Refer to generate_disjoint_copy for a list of prereqs and features.
1339     unsigned int start_off = __ offset();  // Remember stub start address (is rtn value).
1340     unsigned int size      = UseCompressedOops ? 4 : 8;
1341 
1342     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1343     if (dest_uninitialized) {
1344       decorators |= IS_DEST_UNINITIALIZED;
1345     }
1346     if (aligned) {
1347       decorators |= ARRAYCOPY_ALIGNED;
1348     }
1349 
1350     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1351     bs->arraycopy_prologue(_masm, decorators, T_OBJECT, Z_ARG1, Z_ARG2, Z_ARG3);
1352 
1353     generate_disjoint_copy(aligned, size, true, true);
1354 
1355     bs->arraycopy_epilogue(_masm, decorators, T_OBJECT, Z_ARG2, Z_ARG3, true);
1356 
1357     return __ addr_at(start_off);
1358   }
1359 
1360   address generate_conjoint_nonoop_copy(StubGenStubId stub_id) {
1361     bool aligned;
1362     int shift; // i.e. log2(element size)
1363     address nooverlap_target;
1364     switch (stub_id) {
1365     case jbyte_arraycopy_id:
1366       aligned = false;
1367       shift = 0;
1368       nooverlap_target = StubRoutines::jbyte_disjoint_arraycopy();
1369       break;
1370     case arrayof_jbyte_arraycopy_id:
1371       aligned = true;
1372       shift = 0;
1373       nooverlap_target = StubRoutines::arrayof_jbyte_disjoint_arraycopy();
1374       break;
1375     case jshort_arraycopy_id:
1376       aligned = false;
1377       shift = 1;
1378       nooverlap_target = StubRoutines::jshort_disjoint_arraycopy();
1379       break;
1380     case arrayof_jshort_arraycopy_id:
1381       aligned = true;
1382       shift = 1;
1383       nooverlap_target = StubRoutines::arrayof_jshort_disjoint_arraycopy();
1384       break;
1385     case jint_arraycopy_id:
1386       aligned = false;
1387       shift = 2;
1388       nooverlap_target = StubRoutines::jint_disjoint_arraycopy();
1389       break;
1390     case arrayof_jint_arraycopy_id:
1391       aligned = true;
1392       shift = 2;
1393       nooverlap_target = StubRoutines::arrayof_jint_disjoint_arraycopy();
1394       break;
1395     case jlong_arraycopy_id:
1396       aligned = false;
1397       shift = 3;
1398       nooverlap_target = StubRoutines::jlong_disjoint_arraycopy();
1399       break;
1400     case arrayof_jlong_arraycopy_id:
1401       aligned = true;
1402       shift = 3;
1403       nooverlap_target = StubRoutines::arrayof_jlong_disjoint_arraycopy();
1404       break;
1405     default:
1406       ShouldNotReachHere();
1407     }
1408     StubCodeMark mark(this, stub_id);
1409     unsigned int start_off = __ offset();  // Remember stub start address (is rtn value).
1410     array_overlap_test(nooverlap_target, shift); // Branch away to nooverlap_target if disjoint.
1411     generate_conjoint_copy(aligned, 1 << shift, false);
1412     return __ addr_at(start_off);
1413   }
1414 
1415   address generate_conjoint_oop_copy(StubGenStubId stub_id) {
1416     bool aligned;
1417     bool dest_uninitialized;
1418     address nooverlap_target;
1419     switch (stub_id) {
1420     case oop_arraycopy_id:
1421       aligned = false;
1422       dest_uninitialized = false;
1423       nooverlap_target = StubRoutines::oop_disjoint_arraycopy(dest_uninitialized);
1424       break;
1425     case arrayof_oop_arraycopy_id:
1426       aligned = true;
1427       dest_uninitialized = false;
1428       nooverlap_target = StubRoutines::arrayof_oop_disjoint_arraycopy(dest_uninitialized);
1429       break;
1430     case oop_arraycopy_uninit_id:
1431       aligned = false;
1432       dest_uninitialized = true;
1433       nooverlap_target = StubRoutines::oop_disjoint_arraycopy(dest_uninitialized);
1434       break;
1435     case arrayof_oop_arraycopy_uninit_id:
1436       aligned = true;
1437       dest_uninitialized = true;
1438       nooverlap_target = StubRoutines::arrayof_oop_disjoint_arraycopy(dest_uninitialized);
1439       break;
1440     default:
1441       ShouldNotReachHere();
1442     }
1443     StubCodeMark mark(this, stub_id);
1444     // This is the zarch specific stub generator for overlapping oop array copy.
1445     // Refer to generate_conjoint_copy for a list of prereqs and features.
1446     unsigned int start_off = __ offset();  // Remember stub start address (is rtn value).
1447     unsigned int size      = UseCompressedOops ? 4 : 8;
1448     unsigned int shift     = UseCompressedOops ? 2 : 3;
1449 
1450     // Branch to disjoint_copy (if applicable) before pre_barrier to avoid double pre_barrier.
1451     array_overlap_test(nooverlap_target, shift);  // Branch away to nooverlap_target if disjoint.
1452 
1453     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1454     if (dest_uninitialized) {
1455       decorators |= IS_DEST_UNINITIALIZED;
1456     }
1457     if (aligned) {
1458       decorators |= ARRAYCOPY_ALIGNED;
1459     }
1460 
1461     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1462     bs->arraycopy_prologue(_masm, decorators, T_OBJECT, Z_ARG1, Z_ARG2, Z_ARG3);
1463 
1464     generate_conjoint_copy(aligned, size, true);  // Must preserve ARG2, ARG3.
1465 
1466     bs->arraycopy_epilogue(_masm, decorators, T_OBJECT, Z_ARG2, Z_ARG3, true);
1467 
1468     return __ addr_at(start_off);
1469   }
1470 
1471 
1472   void generate_arraycopy_stubs() {
1473 
1474     // Note: the disjoint stubs must be generated first, some of
1475     // the conjoint stubs use them.
1476     StubRoutines::_jbyte_disjoint_arraycopy      = generate_disjoint_nonoop_copy (StubGenStubId::jbyte_disjoint_arraycopy_id);
1477     StubRoutines::_jshort_disjoint_arraycopy     = generate_disjoint_nonoop_copy(StubGenStubId::jshort_disjoint_arraycopy_id);
1478     StubRoutines::_jint_disjoint_arraycopy       = generate_disjoint_nonoop_copy  (StubGenStubId::jint_disjoint_arraycopy_id);
1479     StubRoutines::_jlong_disjoint_arraycopy      = generate_disjoint_nonoop_copy (StubGenStubId::jlong_disjoint_arraycopy_id);
1480     StubRoutines::_oop_disjoint_arraycopy        = generate_disjoint_oop_copy  (StubGenStubId::oop_disjoint_arraycopy_id);
1481     StubRoutines::_oop_disjoint_arraycopy_uninit = generate_disjoint_oop_copy  (StubGenStubId::oop_disjoint_arraycopy_uninit_id);
1482 
1483     StubRoutines::_arrayof_jbyte_disjoint_arraycopy      = generate_disjoint_nonoop_copy (StubGenStubId::arrayof_jbyte_disjoint_arraycopy_id);
1484     StubRoutines::_arrayof_jshort_disjoint_arraycopy     = generate_disjoint_nonoop_copy(StubGenStubId::arrayof_jshort_disjoint_arraycopy_id);
1485     StubRoutines::_arrayof_jint_disjoint_arraycopy       = generate_disjoint_nonoop_copy  (StubGenStubId::arrayof_jint_disjoint_arraycopy_id);
1486     StubRoutines::_arrayof_jlong_disjoint_arraycopy      = generate_disjoint_nonoop_copy (StubGenStubId::arrayof_jlong_disjoint_arraycopy_id);
1487     StubRoutines::_arrayof_oop_disjoint_arraycopy        = generate_disjoint_oop_copy  (StubGenStubId::arrayof_oop_disjoint_arraycopy_id);
1488     StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = generate_disjoint_oop_copy  (StubGenStubId::arrayof_oop_disjoint_arraycopy_uninit_id);
1489 
1490     StubRoutines::_jbyte_arraycopy           = generate_conjoint_nonoop_copy(StubGenStubId::jbyte_arraycopy_id);
1491     StubRoutines::_jshort_arraycopy          = generate_conjoint_nonoop_copy(StubGenStubId::jshort_arraycopy_id);
1492     StubRoutines::_jint_arraycopy            = generate_conjoint_nonoop_copy(StubGenStubId::jint_arraycopy_id);
1493     StubRoutines::_jlong_arraycopy           = generate_conjoint_nonoop_copy(StubGenStubId::jlong_arraycopy_id);
1494     StubRoutines::_oop_arraycopy             = generate_conjoint_oop_copy(StubGenStubId::oop_arraycopy_id);
1495     StubRoutines::_oop_arraycopy_uninit      = generate_conjoint_oop_copy(StubGenStubId::oop_arraycopy_uninit_id);
1496 
1497     StubRoutines::_arrayof_jbyte_arraycopy      = generate_conjoint_nonoop_copy(StubGenStubId::arrayof_jbyte_arraycopy_id);
1498     StubRoutines::_arrayof_jshort_arraycopy     = generate_conjoint_nonoop_copy(StubGenStubId::arrayof_jshort_arraycopy_id);
1499     StubRoutines::_arrayof_jint_arraycopy       = generate_conjoint_nonoop_copy (StubGenStubId::arrayof_jint_arraycopy_id);
1500     StubRoutines::_arrayof_jlong_arraycopy      = generate_conjoint_nonoop_copy(StubGenStubId::arrayof_jlong_arraycopy_id);
1501     StubRoutines::_arrayof_oop_arraycopy        = generate_conjoint_oop_copy(StubGenStubId::arrayof_oop_arraycopy_id);
1502     StubRoutines::_arrayof_oop_arraycopy_uninit = generate_conjoint_oop_copy(StubGenStubId::arrayof_oop_arraycopy_uninit_id);
1503   }
1504 
1505   // Call interface for AES_encryptBlock, AES_decryptBlock stubs.
1506   //
1507   //   Z_ARG1 - source data block. Ptr to leftmost byte to be processed.
1508   //   Z_ARG2 - destination data block. Ptr to leftmost byte to be stored.
1509   //            For in-place encryption/decryption, ARG1 and ARG2 can point
1510   //            to the same piece of storage.
1511   //   Z_ARG3 - Crypto key address (expanded key). The first n bits of
1512   //            the expanded key constitute the original AES-<n> key (see below).
1513   //
1514   //   Z_RET  - return value. First unprocessed byte offset in src buffer.
1515   //
1516   // Some remarks:
1517   //   The crypto key, as passed from the caller to these encryption stubs,
1518   //   is a so-called expanded key. It is derived from the original key
1519   //   by the Rijndael key schedule, see http://en.wikipedia.org/wiki/Rijndael_key_schedule
1520   //   With the expanded key, the cipher/decipher task is decomposed in
1521   //   multiple, less complex steps, called rounds. Sun SPARC and Intel
1522   //   processors obviously implement support for those less complex steps.
1523   //   z/Architecture provides instructions for full cipher/decipher complexity.
1524   //   Therefore, we need the original, not the expanded key here.
1525   //   Luckily, the first n bits of an AES-<n> expanded key are formed
1526   //   by the original key itself. That takes us out of trouble. :-)
1527   //   The key length (in bytes) relation is as follows:
1528   //     original    expanded   rounds  key bit     keylen
1529   //    key bytes   key bytes            length   in words
1530   //           16         176       11      128         44
1531   //           24         208       13      192         52
1532   //           32         240       15      256         60
1533   //
1534   // The crypto instructions used in the AES* stubs have some specific register requirements.
1535   //   Z_R0   holds the crypto function code. Please refer to the KM/KMC instruction
1536   //          description in the "z/Architecture Principles of Operation" manual for details.
1537   //   Z_R1   holds the parameter block address. The parameter block contains the cryptographic key
1538   //          (KM instruction) and the chaining value (KMC instruction).
1539   //   dst    must designate an even-numbered register, holding the address of the output message.
1540   //   src    must designate an even/odd register pair, holding the address/length of the original message
1541 
1542   // Helper function which generates code to
1543   //  - load the function code in register fCode (== Z_R0).
1544   //  - load the data block length (depends on cipher function) into register srclen if requested.
1545   //  - is_decipher switches between cipher/decipher function codes
1546   //  - set_len requests (if true) loading the data block length in register srclen
1547   void generate_load_AES_fCode(Register keylen, Register fCode, Register srclen, bool is_decipher) {
1548 
1549     BLOCK_COMMENT("Set fCode {"); {
1550       Label fCode_set;
1551       int   mode = is_decipher ? VM_Version::CipherMode::decipher : VM_Version::CipherMode::cipher;
1552       bool  identical_dataBlk_len =  (VM_Version::Cipher::_AES128_dataBlk == VM_Version::Cipher::_AES192_dataBlk)
1553                                   && (VM_Version::Cipher::_AES128_dataBlk == VM_Version::Cipher::_AES256_dataBlk);
1554       // Expanded key length is 44/52/60 * 4 bytes for AES-128/AES-192/AES-256.
1555       __ z_cghi(keylen, 52); // Check only once at the beginning. keylen and fCode may share the same register.
1556 
1557       __ z_lghi(fCode, VM_Version::Cipher::_AES128 + mode);
1558       if (!identical_dataBlk_len) {
1559         __ z_lghi(srclen, VM_Version::Cipher::_AES128_dataBlk);
1560       }
1561       __ z_brl(fCode_set);  // keyLen <  52: AES128
1562 
1563       __ z_lghi(fCode, VM_Version::Cipher::_AES192 + mode);
1564       if (!identical_dataBlk_len) {
1565         __ z_lghi(srclen, VM_Version::Cipher::_AES192_dataBlk);
1566       }
1567       __ z_bre(fCode_set);  // keyLen == 52: AES192
1568 
1569       __ z_lghi(fCode, VM_Version::Cipher::_AES256 + mode);
1570       if (!identical_dataBlk_len) {
1571         __ z_lghi(srclen, VM_Version::Cipher::_AES256_dataBlk);
1572       }
1573       // __ z_brh(fCode_set);  // keyLen <  52: AES128           // fallthru
1574 
1575       __ bind(fCode_set);
1576       if (identical_dataBlk_len) {
1577         __ z_lghi(srclen, VM_Version::Cipher::_AES128_dataBlk);
1578       }
1579     }
1580     BLOCK_COMMENT("} Set fCode");
1581   }
1582 
1583   // Push a parameter block for the cipher/decipher instruction on the stack.
1584   // Layout of the additional stack space allocated for AES_cipherBlockChaining:
1585   //
1586   //   |        |
1587   //   +--------+ <-- SP before expansion
1588   //   |        |
1589   //   :        :  alignment loss (part 2), 0..(AES_parmBlk_align-1) bytes
1590   //   |        |
1591   //   +--------+
1592   //   |        |
1593   //   :        :  space for parameter block, size VM_Version::Cipher::_AES*_parmBlk_C
1594   //   |        |
1595   //   +--------+ <-- parmBlk, octoword-aligned, start of parameter block
1596   //   |        |
1597   //   :        :  additional stack space for spills etc., size AES_parmBlk_addspace, DW @ Z_SP not usable!!!
1598   //   |        |
1599   //   +--------+ <-- Z_SP + alignment loss, octoword-aligned
1600   //   |        |
1601   //   :        :  alignment loss (part 1), 0..(AES_parmBlk_align-1) bytes. DW @ Z_SP not usable!!!
1602   //   |        |
1603   //   +--------+ <-- Z_SP after expansion
1604 
1605   void generate_push_Block(int dataBlk_len, int parmBlk_len, int crypto_fCode,
1606                            Register parmBlk, Register keylen, Register fCode, Register cv, Register key) {
1607 
1608     AES_parmBlk_addspace = AES_parmBlk_align; // Must be multiple of AES_parmblk_align.
1609                                               // spill space for regs etc., don't use DW @SP!
1610     const int cv_len     = dataBlk_len;
1611     const int key_len    = parmBlk_len - cv_len;
1612     // This len must be known at JIT compile time. Only then are we able to recalc the SP before resize.
1613     // We buy this knowledge by wasting some (up to AES_parmBlk_align) bytes of stack space.
1614     const int resize_len = cv_len + key_len + AES_parmBlk_align + AES_parmBlk_addspace;
1615 
1616     // Use parmBlk as temp reg here to hold the frame pointer.
1617     __ resize_frame(-resize_len, parmBlk, true);
1618 
1619     // calculate parmBlk address from updated (resized) SP.
1620     __ add2reg(parmBlk, resize_len - (cv_len + key_len), Z_SP);
1621     __ z_nill(parmBlk, (~(AES_parmBlk_align-1)) & 0xffff); // Align parameter block.
1622 
1623     // There is room for stuff in the range [parmBlk-AES_parmBlk_addspace+8, parmBlk).
1624     __ z_stg(keylen,  -8, parmBlk);                        // Spill keylen for later use.
1625 
1626     // calculate (SP before resize) from updated SP.
1627     __ add2reg(keylen, resize_len, Z_SP);                  // keylen holds prev SP for now.
1628     __ z_stg(keylen, -16, parmBlk);                        // Spill prev SP for easy revert.
1629 
1630     __ z_mvc(0,      cv_len-1,  parmBlk, 0, cv);     // Copy cv.
1631     __ z_mvc(cv_len, key_len-1, parmBlk, 0, key);    // Copy key.
1632     __ z_lghi(fCode, crypto_fCode);
1633   }
1634 
1635   // NOTE:
1636   //   Before returning, the stub has to copy the chaining value from
1637   //   the parmBlk, where it was updated by the crypto instruction, back
1638   //   to the chaining value array the address of which was passed in the cv argument.
1639   //   As all the available registers are used and modified by KMC, we need to save
1640   //   the key length across the KMC instruction. We do so by spilling it to the stack,
1641   //   just preceding the parmBlk (at (parmBlk - 8)).
1642   void generate_push_parmBlk(Register keylen, Register fCode, Register parmBlk, Register key, Register cv, bool is_decipher) {
1643     int       mode = is_decipher ? VM_Version::CipherMode::decipher : VM_Version::CipherMode::cipher;
1644     Label     parmBlk_128, parmBlk_192, parmBlk_256, parmBlk_set;
1645 
1646     BLOCK_COMMENT("push parmBlk {");
1647     // We have just three cipher strengths which translates into three
1648     // possible extended key lengths: 44, 52, and 60 bytes.
1649     // We therefore can compare the actual length against the "middle" length
1650     // and get: lt -> len=44, eq -> len=52, gt -> len=60.
1651     __ z_cghi(keylen, 52);
1652     if (VM_Version::has_Crypto_AES128()) { __ z_brl(parmBlk_128); }  // keyLen <  52: AES128
1653     if (VM_Version::has_Crypto_AES192()) { __ z_bre(parmBlk_192); }  // keyLen == 52: AES192
1654     if (VM_Version::has_Crypto_AES256()) { __ z_brh(parmBlk_256); }  // keyLen >  52: AES256
1655 
1656     // Security net: requested AES function not available on this CPU.
1657     // NOTE:
1658     //   As of now (March 2015), this safety net is not required. JCE policy files limit the
1659     //   cryptographic strength of the keys used to 128 bit. If we have AES hardware support
1660     //   at all, we have at least AES-128.
1661     __ stop_static("AES key strength not supported by CPU. Use -XX:-UseAES as remedy.", 0);
1662 
1663     if (VM_Version::has_Crypto_AES256()) {
1664       __ bind(parmBlk_256);
1665       generate_push_Block(VM_Version::Cipher::_AES256_dataBlk,
1666                           VM_Version::Cipher::_AES256_parmBlk_C,
1667                           VM_Version::Cipher::_AES256 + mode,
1668                           parmBlk, keylen, fCode, cv, key);
1669       if (VM_Version::has_Crypto_AES128() || VM_Version::has_Crypto_AES192()) {
1670         __ z_bru(parmBlk_set);  // Fallthru otherwise.
1671       }
1672     }
1673 
1674     if (VM_Version::has_Crypto_AES192()) {
1675       __ bind(parmBlk_192);
1676       generate_push_Block(VM_Version::Cipher::_AES192_dataBlk,
1677                           VM_Version::Cipher::_AES192_parmBlk_C,
1678                           VM_Version::Cipher::_AES192 + mode,
1679                           parmBlk, keylen, fCode, cv, key);
1680       if (VM_Version::has_Crypto_AES128()) {
1681         __ z_bru(parmBlk_set);  // Fallthru otherwise.
1682       }
1683     }
1684 
1685     if (VM_Version::has_Crypto_AES128()) {
1686       __ bind(parmBlk_128);
1687       generate_push_Block(VM_Version::Cipher::_AES128_dataBlk,
1688                           VM_Version::Cipher::_AES128_parmBlk_C,
1689                           VM_Version::Cipher::_AES128 + mode,
1690                           parmBlk, keylen, fCode, cv, key);
1691       // Fallthru
1692     }
1693 
1694     __ bind(parmBlk_set);
1695     BLOCK_COMMENT("} push parmBlk");
1696   }
1697 
1698   // Pop a parameter block from the stack. The chaining value portion of the parameter block
1699   // is copied back to the cv array as it is needed for subsequent cipher steps.
1700   // The keylen value as well as the original SP (before resizing) was pushed to the stack
1701   // when pushing the parameter block.
1702   void generate_pop_parmBlk(Register keylen, Register parmBlk, Register key, Register cv) {
1703 
1704     BLOCK_COMMENT("pop parmBlk {");
1705     bool identical_dataBlk_len =  (VM_Version::Cipher::_AES128_dataBlk == VM_Version::Cipher::_AES192_dataBlk) &&
1706                                   (VM_Version::Cipher::_AES128_dataBlk == VM_Version::Cipher::_AES256_dataBlk);
1707     if (identical_dataBlk_len) {
1708       int cv_len = VM_Version::Cipher::_AES128_dataBlk;
1709       __ z_mvc(0, cv_len-1, cv, 0, parmBlk);  // Copy cv.
1710     } else {
1711       int cv_len;
1712       Label parmBlk_128, parmBlk_192, parmBlk_256, parmBlk_set;
1713       __ z_lg(keylen, -8, parmBlk);  // restore keylen
1714       __ z_cghi(keylen, 52);
1715       if (VM_Version::has_Crypto_AES256()) __ z_brh(parmBlk_256);  // keyLen >  52: AES256
1716       if (VM_Version::has_Crypto_AES192()) __ z_bre(parmBlk_192);  // keyLen == 52: AES192
1717       // if (VM_Version::has_Crypto_AES128()) __ z_brl(parmBlk_128);  // keyLen <  52: AES128  // fallthru
1718 
1719       // Security net: there is no one here. If we would need it, we should have
1720       // fallen into it already when pushing the parameter block.
1721       if (VM_Version::has_Crypto_AES128()) {
1722         __ bind(parmBlk_128);
1723         cv_len = VM_Version::Cipher::_AES128_dataBlk;
1724         __ z_mvc(0, cv_len-1, cv, 0, parmBlk);  // Copy cv.
1725         if (VM_Version::has_Crypto_AES192() || VM_Version::has_Crypto_AES256()) {
1726           __ z_bru(parmBlk_set);
1727         }
1728       }
1729 
1730       if (VM_Version::has_Crypto_AES192()) {
1731         __ bind(parmBlk_192);
1732         cv_len = VM_Version::Cipher::_AES192_dataBlk;
1733         __ z_mvc(0, cv_len-1, cv, 0, parmBlk);  // Copy cv.
1734         if (VM_Version::has_Crypto_AES256()) {
1735           __ z_bru(parmBlk_set);
1736         }
1737       }
1738 
1739       if (VM_Version::has_Crypto_AES256()) {
1740         __ bind(parmBlk_256);
1741         cv_len = VM_Version::Cipher::_AES256_dataBlk;
1742         __ z_mvc(0, cv_len-1, cv, 0, parmBlk);  // Copy cv.
1743         // __ z_bru(parmBlk_set);  // fallthru
1744       }
1745       __ bind(parmBlk_set);
1746     }
1747     __ z_lg(Z_SP, -16, parmBlk); // Revert resize_frame_absolute. Z_SP saved by push_parmBlk.
1748     BLOCK_COMMENT("} pop parmBlk");
1749   }
1750 
1751   // Compute AES encrypt/decrypt function.
1752   void generate_AES_cipherBlock(bool is_decipher) {
1753     // Incoming arguments.
1754     Register       from    = Z_ARG1; // source byte array
1755     Register       to      = Z_ARG2; // destination byte array
1756     Register       key     = Z_ARG3; // expanded key array
1757 
1758     const Register keylen  = Z_R0;   // Temporarily (until fCode is set) holds the expanded key array length.
1759 
1760     // Register definitions as required by KM instruction.
1761     const Register fCode   = Z_R0;   // crypto function code
1762     const Register parmBlk = Z_R1;   // parameter block address (points to crypto key)
1763     const Register src     = Z_ARG1; // Must be even reg (KM requirement).
1764     const Register srclen  = Z_ARG2; // Must be odd reg and pair with src. Overwrites destination address.
1765     const Register dst     = Z_ARG3; // Must be even reg (KM requirement). Overwrites expanded key address.
1766 
1767     // Read key len of expanded key (in 4-byte words).
1768     __ z_lgf(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
1769 
1770     // Copy arguments to registers as required by crypto instruction.
1771     __ z_lgr(parmBlk, key);          // crypto key (in T_INT array).
1772     __ lgr_if_needed(src, from);     // Copy src address. Will not emit, src/from are identical.
1773     __ z_lgr(dst, to);               // Copy dst address, even register required.
1774 
1775     // Construct function code into fCode(Z_R0), data block length into srclen(Z_ARG2).
1776     generate_load_AES_fCode(keylen, fCode, srclen, is_decipher);
1777 
1778     __ km(dst, src);                 // Cipher the message.
1779 
1780     __ z_br(Z_R14);
1781   }
1782 
1783   // Compute AES encrypt function.
1784   address generate_AES_encryptBlock() {
1785     __ align(CodeEntryAlignment);
1786     StubGenStubId stub_id = StubGenStubId::aescrypt_encryptBlock_id;
1787     StubCodeMark mark(this, stub_id);
1788     unsigned int start_off = __ offset();  // Remember stub start address (is rtn value).
1789 
1790     generate_AES_cipherBlock(false);
1791 
1792     return __ addr_at(start_off);
1793   }
1794 
1795   // Compute AES decrypt function.
1796   address generate_AES_decryptBlock() {
1797     __ align(CodeEntryAlignment);
1798     StubGenStubId stub_id = StubGenStubId::aescrypt_decryptBlock_id;
1799     StubCodeMark mark(this, stub_id);
1800     unsigned int start_off = __ offset();  // Remember stub start address (is rtn value).
1801 
1802     generate_AES_cipherBlock(true);
1803 
1804     return __ addr_at(start_off);
1805   }
1806 
1807   // These stubs receive the addresses of the cryptographic key and of the chaining value as two separate
1808   // arguments (registers "key" and "cv", respectively). The KMC instruction, on the other hand, requires
1809   // chaining value and key to be, in this sequence, adjacent in storage. Thus, we need to allocate some
1810   // thread-local working storage. Using heap memory incurs all the hassles of allocating/freeing.
1811   // Stack space, on the contrary, is deallocated automatically when we return from the stub to the caller.
1812   // *** WARNING ***
1813   // Please note that we do not formally allocate stack space, nor do we
1814   // update the stack pointer. Therefore, no function calls are allowed
1815   // and nobody else must use the stack range where the parameter block
1816   // is located.
1817   // We align the parameter block to the next available octoword.
1818   //
1819   // Compute chained AES encrypt function.
1820   void generate_AES_cipherBlockChaining(bool is_decipher) {
1821 
1822     Register       from    = Z_ARG1; // source byte array (clear text)
1823     Register       to      = Z_ARG2; // destination byte array (ciphered)
1824     Register       key     = Z_ARG3; // expanded key array.
1825     Register       cv      = Z_ARG4; // chaining value
1826     const Register msglen  = Z_ARG5; // Total length of the msg to be encrypted. Value must be returned
1827                                      // in Z_RET upon completion of this stub. Is 32-bit integer.
1828 
1829     const Register keylen  = Z_R0;   // Expanded key length, as read from key array. Temp only.
1830     const Register fCode   = Z_R0;   // crypto function code
1831     const Register parmBlk = Z_R1;   // parameter block address (points to crypto key)
1832     const Register src     = Z_ARG1; // is Z_R2
1833     const Register srclen  = Z_ARG2; // Overwrites destination address.
1834     const Register dst     = Z_ARG3; // Overwrites key address.
1835 
1836     // Read key len of expanded key (in 4-byte words).
1837     __ z_lgf(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
1838 
1839     // Construct parm block address in parmBlk (== Z_R1), copy cv and key to parm block.
1840     // Construct function code in fCode (Z_R0).
1841     generate_push_parmBlk(keylen, fCode, parmBlk, key, cv, is_decipher);
1842 
1843     // Prepare other registers for instruction.
1844     __ lgr_if_needed(src, from);     // Copy src address. Will not emit, src/from are identical.
1845     __ z_lgr(dst, to);
1846     __ z_llgfr(srclen, msglen);      // We pass the offsets as ints, not as longs as required.
1847 
1848     __ kmc(dst, src);                // Cipher the message.
1849 
1850     generate_pop_parmBlk(keylen, parmBlk, key, cv);
1851 
1852     __ z_llgfr(Z_RET, msglen);       // We pass the offsets as ints, not as longs as required.
1853     __ z_br(Z_R14);
1854   }
1855 
1856   // Compute chained AES encrypt function.
1857   address generate_cipherBlockChaining_AES_encrypt() {
1858     __ align(CodeEntryAlignment);
1859     StubGenStubId stub_id = StubGenStubId::cipherBlockChaining_encryptAESCrypt_id;
1860     StubCodeMark mark(this, stub_id);
1861     unsigned int   start_off = __ offset();  // Remember stub start address (is rtn value).
1862 
1863     generate_AES_cipherBlockChaining(false);
1864 
1865     return __ addr_at(start_off);
1866   }
1867 
1868   // Compute chained AES decrypt function.
1869   address generate_cipherBlockChaining_AES_decrypt() {
1870     __ align(CodeEntryAlignment);
1871     StubGenStubId stub_id = StubGenStubId::cipherBlockChaining_decryptAESCrypt_id;
1872     StubCodeMark mark(this, stub_id);
1873     unsigned int   start_off = __ offset();  // Remember stub start address (is rtn value).
1874 
1875     generate_AES_cipherBlockChaining(true);
1876 
1877     return __ addr_at(start_off);
1878   }
1879 
1880 
1881   // *****************************************************************************
1882 
1883   // AES CounterMode
1884   // Push a parameter block for the cipher/decipher instruction on the stack.
1885   // Layout of the additional stack space allocated for counterMode_AES_cipherBlock
1886   //
1887   //   |        |
1888   //   +--------+ <-- SP before expansion
1889   //   |        |
1890   //   :        :  alignment loss (part 2), 0..(AES_parmBlk_align-1) bytes.
1891   //   |        |
1892   //   +--------+ <-- gap = parmBlk + parmBlk_len + ctrArea_len
1893   //   |        |
1894   //   :        :  byte[] ctr - kmctr expects a counter vector the size of the input vector.
1895   //   :        :         The interface only provides byte[16] iv, the init vector.
1896   //   :        :         The size of this area is a tradeoff between stack space, init effort, and speed.
1897   //   |        |         Each counter is a 128bit int. Vector element [0] is a copy of iv.
1898   //   |        |         Vector element [i] is formed by incrementing element [i-1].
1899   //   +--------+ <-- ctr = parmBlk + parmBlk_len
1900   //   |        |
1901   //   :        :  space for parameter block, size VM_Version::Cipher::_AES*_parmBlk_G
1902   //   |        |
1903   //   +--------+ <-- parmBlk = Z_SP + (alignment loss (part 1+2)) + AES_dataBlk_space + AES_parmBlk_addSpace, octoword-aligned, start of parameter block
1904   //   |        |
1905   //   :        :  additional stack space for spills etc., min. size AES_parmBlk_addspace, all bytes usable.
1906   //   |        |
1907   //   +--------+ <-- Z_SP + alignment loss (part 1+2) + AES_dataBlk_space, octoword-aligned
1908   //   |        |
1909   //   :        :  space for one source data block and one dest data block.
1910   //   |        |
1911   //   +--------+ <-- Z_SP + alignment loss (part 1+2), octoword-aligned
1912   //   |        |
1913   //   :        :  additional alignment loss. Blocks above can't tolerate unusable DW @SP.
1914   //   |        |
1915   //   +--------+ <-- Z_SP + alignment loss (part 1), octoword-aligned
1916   //   |        |
1917   //   :        :  alignment loss (part 1), 0..(AES_parmBlk_align-1) bytes. DW @ Z_SP holds frame ptr.
1918   //   |        |
1919   //   +--------+ <-- Z_SP after expansion
1920   //
1921   //   additional space allocation (per DW):
1922   //    spillSpace = parmBlk - AES_parmBlk_addspace
1923   //    dataBlocks = spillSpace - AES_dataBlk_space
1924   //
1925   //    parmBlk-8  various fields of various lengths
1926   //               parmBlk-1: key_len (only one byte is stored at parmBlk-1)
1927   //               parmBlk-2: fCode (only one byte is stored at parmBlk-2)
1928   //               parmBlk-4: ctrVal_len (as retrieved from iv array), in bytes, as HW
1929   //               parmBlk-8: msglen length (in bytes) of crypto msg, as passed in by caller
1930   //                          return value is calculated from this: rv = msglen - processed.
1931   //    parmBlk-16 old_SP (SP before resize)
1932   //    parmBlk-24 temp values
1933   //                up to and including main loop in generate_counterMode_AES
1934   //                 - parmBlk-20: remmsg_len remaining msg len (aka unprocessed msg bytes)
1935   //                after main loop in generate_counterMode_AES
1936   //                 - parmBlk-24: spill slot for various address values
1937   //
1938   //    parmBlk-40 free spill slot, used for local spills.
1939   //    parmBlk-64 ARG2(dst) ptr spill slot
1940   //    parmBlk-56 ARG3(crypto key) ptr spill slot
1941   //    parmBlk-48 ARG4(icv value) ptr spill slot
1942   //
1943   //    parmBlk-72
1944   //    parmBlk-80
1945   //    parmBlk-88 counter vector current position
1946   //    parmBlk-96 reduced msg len (after preLoop processing)
1947   //
1948   //    parmBlk-104 Z_R13 spill slot (preLoop only)
1949   //    parmBlk-112 Z_R12 spill slot (preLoop only)
1950   //    parmBlk-120 Z_R11 spill slot (preLoop only)
1951   //    parmBlk-128 Z_R10 spill slot (preLoop only)
1952   //
1953   //
1954   // Layout of the parameter block (instruction KMCTR, function KMCTR-AES*
1955   //
1956   //   +--------+ key_len: +16 (AES-128), +24 (AES-192), +32 (AES-256)
1957   //   |        |
1958   //   |        |  cryptographic key
1959   //   |        |
1960   //   +--------+ <-- parmBlk
1961   //
1962   // On exit:
1963   //   Z_SP     points to resized frame
1964   //            Z_SP before resize available from -16(parmBlk)
1965   //   parmBlk  points to crypto instruction parameter block
1966   //            parameter block is filled with crypto key.
1967   //   msglen   unchanged, saved for later at -24(parmBlk)
1968   //   fCode    contains function code for instruction
1969   //   key      unchanged
1970   //
1971   void generate_counterMode_prepare_Stack(Register parmBlk, Register ctr, Register counter, Register scratch) {
1972 
1973     BLOCK_COMMENT("prepare stack counterMode_AESCrypt {");
1974 
1975     // save argument registers.
1976     //   ARG1(from) is Z_RET as well. Not saved or restored.
1977     //   ARG5(msglen) is restored by other means.
1978     __ z_stmg(Z_ARG2, Z_ARG4, argsave_offset,    parmBlk);
1979 
1980     assert(AES_ctrVec_len > 0, "sanity. We need a counter vector");
1981     __ add2reg(counter, AES_parmBlk_align, parmBlk);       // counter array is located behind crypto key. Available range is disp12 only.
1982     __ z_mvc(0, AES_ctrVal_len-1, counter, 0, ctr);        // move first copy of iv
1983     for (int j = 1; j < AES_ctrVec_len; j+=j) {            // j (and amount of moved data) doubles with every iteration
1984       int offset = j * AES_ctrVal_len;
1985       if (offset <= 256) {
1986         __ z_mvc(offset, offset-1, counter, 0, counter);   // move iv
1987       } else {
1988         for (int k = 0; k < offset; k += 256) {
1989           __ z_mvc(offset+k, 255, counter, 0, counter);
1990         }
1991       }
1992     }
1993 
1994     Label noCarry, done;
1995     __ z_lg(scratch, Address(ctr, 8));                     // get low-order DW of initial counter.
1996     __ z_algfi(scratch, AES_ctrVec_len);                   // check if we will overflow during init.
1997     __ z_brc(Assembler::bcondLogNoCarry, noCarry);         // No, 64-bit increment is sufficient.
1998 
1999     for (int j = 1; j < AES_ctrVec_len; j++) {             // start with j = 1; no need to add 0 to the first counter value.
2000       int offset = j * AES_ctrVal_len;
2001       generate_increment128(counter, offset, j, scratch);  // increment iv by index value
2002     }
2003     __ z_bru(done);
2004 
2005     __ bind(noCarry);
2006     for (int j = 1; j < AES_ctrVec_len; j++) {             // start with j = 1; no need to add 0 to the first counter value.
2007       int offset = j * AES_ctrVal_len;
2008       generate_increment64(counter, offset, j);            // increment iv by index value
2009     }
2010 
2011     __ bind(done);
2012 
2013     BLOCK_COMMENT("} prepare stack counterMode_AESCrypt");
2014   }
2015 
2016 
2017   void generate_counterMode_increment_ctrVector(Register parmBlk, Register counter, Register scratch, bool v0_only) {
2018 
2019     BLOCK_COMMENT("increment ctrVector counterMode_AESCrypt {");
2020 
2021     __ add2reg(counter, AES_parmBlk_align, parmBlk);       // ptr to counter array needs to be restored
2022 
2023     if (v0_only) {
2024       int offset = 0;
2025       generate_increment128(counter, offset, AES_ctrVec_len, scratch); // increment iv by # vector elements
2026     } else {
2027       int j = 0;
2028       if (VM_Version::has_VectorFacility()) {
2029         bool first_call = true;
2030         for (; j < (AES_ctrVec_len - 3); j+=4) {                       // increment blocks of 4 iv elements
2031           int offset = j * AES_ctrVal_len;
2032           generate_increment128x4(counter, offset, AES_ctrVec_len, first_call);
2033           first_call = false;
2034         }
2035       }
2036       for (; j < AES_ctrVec_len; j++) {
2037         int offset = j * AES_ctrVal_len;
2038         generate_increment128(counter, offset, AES_ctrVec_len, scratch); // increment iv by # vector elements
2039       }
2040     }
2041 
2042     BLOCK_COMMENT("} increment ctrVector counterMode_AESCrypt");
2043   }
2044 
2045   // IBM s390 (IBM z/Architecture, to be more exact) uses Big-Endian number representation.
2046   // Therefore, the bits are ordered from most significant to least significant. The address
2047   // of a number in memory points to its lowest location where the most significant bit is stored.
2048   void generate_increment64(Register counter, int offset, int increment) {
2049     __ z_algsi(offset + 8, counter, increment);            // increment, no overflow check
2050   }
2051 
2052   void generate_increment128(Register counter, int offset, int increment, Register scratch) {
2053     __ clear_reg(scratch);                                 // prepare to add carry to high-order DW
2054     __ z_algsi(offset + 8, counter, increment);            // increment low order DW
2055     __ z_alcg(scratch, Address(counter, offset));          // add carry to high-order DW
2056     __ z_stg(scratch, Address(counter, offset));           // store back
2057   }
2058 
2059   void generate_increment128(Register counter, int offset, Register increment, Register scratch) {
2060     __ clear_reg(scratch);                                 // prepare to add carry to high-order DW
2061     __ z_alg(increment, Address(counter, offset + 8));     // increment low order DW
2062     __ z_stg(increment, Address(counter, offset + 8));     // store back
2063     __ z_alcg(scratch, Address(counter, offset));          // add carry to high-order DW
2064     __ z_stg(scratch, Address(counter, offset));           // store back
2065   }
2066 
2067   // This is the vector variant of increment128, incrementing 4 ctr vector elements per call.
2068   void generate_increment128x4(Register counter, int offset, int increment, bool init) {
2069     VectorRegister Vincr      = Z_V16;
2070     VectorRegister Vctr0      = Z_V20;
2071     VectorRegister Vctr1      = Z_V21;
2072     VectorRegister Vctr2      = Z_V22;
2073     VectorRegister Vctr3      = Z_V23;
2074 
2075     // Initialize the increment value only once for a series of increments.
2076     // It must be assured that the non-initializing generator calls are
2077     // immediately subsequent. Otherwise, there is no guarantee for Vincr to be unchanged.
2078     if (init) {
2079       __ z_vzero(Vincr);                                   // preset VReg with constant increment
2080       __ z_vleih(Vincr, increment, 7);                     // rightmost HW has ix = 7
2081     }
2082 
2083     __ z_vlm(Vctr0, Vctr3, offset, counter);               // get the counter values
2084     __ z_vaq(Vctr0, Vctr0, Vincr);                         // increment them
2085     __ z_vaq(Vctr1, Vctr1, Vincr);
2086     __ z_vaq(Vctr2, Vctr2, Vincr);
2087     __ z_vaq(Vctr3, Vctr3, Vincr);
2088     __ z_vstm(Vctr0, Vctr3, offset, counter);              // store the counter values
2089   }
2090 
2091   unsigned int generate_counterMode_push_Block(int dataBlk_len, int parmBlk_len, int crypto_fCode,
2092                            Register parmBlk, Register msglen, Register fCode, Register key) {
2093 
2094     // space for data blocks (src and dst, one each) for partial block processing)
2095     AES_parmBlk_addspace = AES_stackSpace_incr             // spill space (temp data)
2096                          + AES_stackSpace_incr             // for argument save/restore
2097                          + AES_stackSpace_incr*2           // for work reg save/restore
2098                          ;
2099     AES_dataBlk_space    = roundup(2*dataBlk_len, AES_parmBlk_align);
2100     AES_dataBlk_offset   = -(AES_parmBlk_addspace+AES_dataBlk_space);
2101     const int key_len    = parmBlk_len;                    // The length of the unextended key (16, 24, 32)
2102 
2103     assert((AES_ctrVal_len == 0) || (AES_ctrVal_len == dataBlk_len), "varying dataBlk_len is not supported.");
2104     AES_ctrVal_len  = dataBlk_len;                         // ctr init value len (in bytes)
2105     AES_ctrArea_len = AES_ctrVec_len * AES_ctrVal_len;     // space required on stack for ctr vector
2106 
2107     // This len must be known at JIT compile time. Only then are we able to recalc the SP before resize.
2108     // We buy this knowledge by wasting some (up to AES_parmBlk_align) bytes of stack space.
2109     const int resize_len = AES_parmBlk_align               // room for alignment of parmBlk
2110                          + AES_parmBlk_align               // extra room for alignment
2111                          + AES_dataBlk_space               // one src and one dst data blk
2112                          + AES_parmBlk_addspace            // spill space for local data
2113                          + roundup(parmBlk_len, AES_parmBlk_align)  // aligned length of parmBlk
2114                          + AES_ctrArea_len                 // stack space for ctr vector
2115                          ;
2116     Register scratch     = fCode;  // We can use fCode as a scratch register. It's contents on entry
2117                                    // is irrelevant and it is set at the very end of this code block.
2118 
2119     assert(key_len < 256, "excessive crypto key len: %d, limit: 256", key_len);
2120 
2121     BLOCK_COMMENT(err_msg("push_Block (%d bytes) counterMode_AESCrypt%d {", resize_len, parmBlk_len*8));
2122 
2123     // After the frame is resized, the parmBlk is positioned such
2124     // that it is octoword-aligned. This potentially creates some
2125     // alignment waste in addspace and/or in the gap area.
2126     // After resize_frame, scratch contains the frame pointer.
2127     __ resize_frame(-resize_len, scratch, true);
2128 #ifdef ASSERT
2129     __ clear_mem(Address(Z_SP, (intptr_t)8), resize_len - 8);
2130 #endif
2131 
2132     // calculate aligned parmBlk address from updated (resized) SP.
2133     __ add2reg(parmBlk, AES_parmBlk_addspace + AES_dataBlk_space + (2*AES_parmBlk_align-1), Z_SP);
2134     __ z_nill(parmBlk, (~(AES_parmBlk_align-1)) & 0xffff); // Align parameter block.
2135 
2136     // There is room to spill stuff in the range [parmBlk-AES_parmBlk_addspace+8, parmBlk).
2137     __ z_mviy(keylen_offset, parmBlk, key_len - 1);        // Spill crypto key length for later use. Decrement by one for direct use with xc template.
2138     __ z_mviy(fCode_offset,  parmBlk, crypto_fCode);       // Crypto function code, will be loaded into Z_R0 later.
2139     __ z_sty(msglen, msglen_offset, parmBlk);              // full plaintext/ciphertext len.
2140     __ z_sty(msglen, msglen_red_offset, parmBlk);          // save for main loop, may get updated in preLoop.
2141     __ z_sra(msglen, exact_log2(dataBlk_len));             // # full cipher blocks that can be formed from input text.
2142     __ z_sty(msglen, rem_msgblk_offset, parmBlk);
2143 
2144     __ add2reg(scratch, resize_len, Z_SP);                 // calculate (SP before resize) from resized SP.
2145     __ z_stg(scratch, unextSP_offset, parmBlk);            // Spill unextended SP for easy revert.
2146     __ z_stmg(Z_R10, Z_R13, regsave_offset, parmBlk);      // make some regs available as work registers
2147 
2148     // Fill parmBlk with all required data
2149     __ z_mvc(0, key_len-1, parmBlk, 0, key);               // Copy key. Need to do it here - key_len is only known here.
2150     BLOCK_COMMENT(err_msg("} push_Block (%d bytes) counterMode_AESCrypt%d", resize_len, parmBlk_len*8));
2151     return resize_len;
2152   }
2153 
2154 
2155   void generate_counterMode_pop_Block(Register parmBlk, Register msglen, Label& eraser) {
2156     // For added safety, clear the stack area where the crypto key was stored.
2157     Register scratch = msglen;
2158     assert_different_registers(scratch, Z_R0);             // can't use Z_R0 for exrl.
2159 
2160     // wipe out key on stack
2161     __ z_llgc(scratch, keylen_offset, parmBlk);            // get saved (key_len-1) value (we saved just one byte!)
2162     __ z_exrl(scratch, eraser);                            // template relies on parmBlk still pointing to key on stack
2163 
2164     // restore argument registers.
2165     //   ARG1(from) is Z_RET as well. Not restored - will hold return value anyway.
2166     //   ARG5(msglen) is restored further down.
2167     __ z_lmg(Z_ARG2, Z_ARG4, argsave_offset,    parmBlk);
2168 
2169     // restore work registers
2170     __ z_lmg(Z_R10, Z_R13, regsave_offset, parmBlk);       // make some regs available as work registers
2171 
2172     __ z_lgf(msglen, msglen_offset,  parmBlk);             // Restore msglen, only low order FW is valid
2173 #ifdef ASSERT
2174     {
2175       Label skip2last, skip2done;
2176       // Z_RET (aka Z_R2) can be used as scratch as well. It will be set from msglen before return.
2177       __ z_lgr(Z_RET, Z_SP);                                 // save extended SP
2178       __ z_lg(Z_SP,    unextSP_offset, parmBlk);             // trim stack back to unextended size
2179       __ z_sgrk(Z_R1, Z_SP, Z_RET);
2180 
2181       __ z_cghi(Z_R1, 256);
2182       __ z_brl(skip2last);
2183       __ z_xc(0, 255, Z_RET, 0, Z_RET);
2184       __ z_aghi(Z_RET, 256);
2185       __ z_aghi(Z_R1, -256);
2186 
2187       __ z_cghi(Z_R1, 256);
2188       __ z_brl(skip2last);
2189       __ z_xc(0, 255, Z_RET, 0, Z_RET);
2190       __ z_aghi(Z_RET, 256);
2191       __ z_aghi(Z_R1, -256);
2192 
2193       __ z_cghi(Z_R1, 256);
2194       __ z_brl(skip2last);
2195       __ z_xc(0, 255, Z_RET, 0, Z_RET);
2196       __ z_aghi(Z_RET, 256);
2197       __ z_aghi(Z_R1, -256);
2198 
2199       __ bind(skip2last);
2200       __ z_lgr(Z_R0, Z_RET);
2201       __ z_aghik(Z_RET, Z_R1, -1);  // decrement for exrl
2202       __ z_brl(skip2done);
2203       __ z_lgr(parmBlk, Z_R0);      // parmBlk == Z_R1, used in eraser template
2204       __ z_exrl(Z_RET, eraser);
2205 
2206       __ bind(skip2done);
2207     }
2208 #else
2209     __ z_lg(Z_SP,    unextSP_offset, parmBlk);             // trim stack back to unextended size
2210 #endif
2211   }
2212 
2213 
2214   int generate_counterMode_push_parmBlk(Register parmBlk, Register msglen, Register fCode, Register key, bool is_decipher) {
2215     int       resize_len = 0;
2216     int       mode = is_decipher ? VM_Version::CipherMode::decipher : VM_Version::CipherMode::cipher;
2217     Label     parmBlk_128, parmBlk_192, parmBlk_256, parmBlk_set;
2218     Register  keylen = fCode;      // Expanded key length, as read from key array, Temp only.
2219                                    // use fCode as scratch; fCode receives its final value later.
2220 
2221     // Read key len of expanded key (in 4-byte words).
2222     __ z_lgf(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2223     __ z_cghi(keylen, 52);
2224     if (VM_Version::has_Crypto_AES_CTR256()) { __ z_brh(parmBlk_256); }  // keyLen >  52: AES256. Assume: most frequent
2225     if (VM_Version::has_Crypto_AES_CTR128()) { __ z_brl(parmBlk_128); }  // keyLen <  52: AES128.
2226     if (VM_Version::has_Crypto_AES_CTR192()) { __ z_bre(parmBlk_192); }  // keyLen == 52: AES192. Assume: least frequent
2227 
2228     // Safety net: requested AES_CTR function for requested keylen not available on this CPU.
2229     __ stop_static("AES key strength not supported by CPU. Use -XX:-UseAESCTRIntrinsics as remedy.", 0);
2230 
2231     if (VM_Version::has_Crypto_AES_CTR128()) {
2232       __ bind(parmBlk_128);
2233       resize_len = generate_counterMode_push_Block(VM_Version::Cipher::_AES128_dataBlk,
2234                           VM_Version::Cipher::_AES128_parmBlk_G,
2235                           VM_Version::Cipher::_AES128 + mode,
2236                           parmBlk, msglen, fCode, key);
2237       if (VM_Version::has_Crypto_AES_CTR256() || VM_Version::has_Crypto_AES_CTR192()) {
2238         __ z_bru(parmBlk_set);  // Fallthru otherwise.
2239       }
2240     }
2241 
2242     if (VM_Version::has_Crypto_AES_CTR192()) {
2243       __ bind(parmBlk_192);
2244       resize_len = generate_counterMode_push_Block(VM_Version::Cipher::_AES192_dataBlk,
2245                           VM_Version::Cipher::_AES192_parmBlk_G,
2246                           VM_Version::Cipher::_AES192 + mode,
2247                           parmBlk, msglen, fCode, key);
2248       if (VM_Version::has_Crypto_AES_CTR256()) {
2249         __ z_bru(parmBlk_set);  // Fallthru otherwise.
2250       }
2251     }
2252 
2253     if (VM_Version::has_Crypto_AES_CTR256()) {
2254       __ bind(parmBlk_256);
2255       resize_len = generate_counterMode_push_Block(VM_Version::Cipher::_AES256_dataBlk,
2256                           VM_Version::Cipher::_AES256_parmBlk_G,
2257                           VM_Version::Cipher::_AES256 + mode,
2258                           parmBlk, msglen, fCode, key);
2259       // Fallthru
2260     }
2261 
2262     __ bind(parmBlk_set);
2263     return resize_len;
2264   }
2265 
2266 
2267   void generate_counterMode_pop_parmBlk(Register parmBlk, Register msglen, Label& eraser) {
2268 
2269     BLOCK_COMMENT("pop parmBlk counterMode_AESCrypt {");
2270 
2271     generate_counterMode_pop_Block(parmBlk, msglen, eraser);
2272 
2273     BLOCK_COMMENT("} pop parmBlk counterMode_AESCrypt");
2274   }
2275 
2276   // Implementation of counter-mode AES encrypt/decrypt function.
2277   //
2278   void generate_counterMode_AES_impl(bool is_decipher) {
2279 
2280     // On entry:
2281     // if there was a previous call to update(), and this previous call did not fully use
2282     // the current encrypted counter, that counter is available at arg6_Offset(Z_SP).
2283     // The index of the first unused bayte in the encrypted counter is available at arg7_Offset(Z_SP).
2284     // The index is in the range [1..AES_ctrVal_len] ([1..16]), where index == 16 indicates a fully
2285     // used previous encrypted counter.
2286     // The unencrypted counter has already been incremented and is ready to be used for the next
2287     // data block, after the unused bytes from the previous call have been consumed.
2288     // The unencrypted counter follows the "increment-after use" principle.
2289 
2290     // On exit:
2291     // The index of the first unused byte of the encrypted counter is written back to arg7_Offset(Z_SP).
2292     // A value of AES_ctrVal_len (16) indicates there is no leftover byte.
2293     // If there is at least one leftover byte (1 <= index < AES_ctrVal_len), the encrypted counter value
2294     // is written back to arg6_Offset(Z_SP). If there is no leftover, nothing is written back.
2295     // The unencrypted counter value is written back after having been incremented.
2296 
2297     Register       from    = Z_ARG1; // byte[], source byte array (clear text)
2298     Register       to      = Z_ARG2; // byte[], destination byte array (ciphered)
2299     Register       key     = Z_ARG3; // byte[], expanded key array.
2300     Register       ctr     = Z_ARG4; // byte[], counter byte array.
2301     const Register msglen  = Z_ARG5; // int, Total length of the msg to be encrypted. Value must be
2302                                      // returned in Z_RET upon completion of this stub.
2303                                      // This is a jint. Negative values are illegal, but technically possible.
2304                                      // Do not rely on high word. Contents is undefined.
2305                // encCtr   = Z_ARG6  - encrypted counter (byte array),
2306                //                      address passed on stack at _z_abi(remaining_cargs) + 0 * WordSize
2307                // cvIndex  = Z_ARG7  - # used (consumed) bytes of encrypted counter,
2308                //                      passed on stack at _z_abi(remaining_cargs) + 1 * WordSize
2309                //                      Caution:4-byte value, right-justified in 8-byte stack word
2310 
2311     const Register fCode   = Z_R0;   // crypto function code
2312     const Register parmBlk = Z_R1;   // parameter block address (points to crypto key)
2313     const Register src     = Z_ARG1; // is Z_R2, forms even/odd pair with srclen
2314     const Register srclen  = Z_ARG2; // Overwrites destination address.
2315     const Register dst     = Z_ARG3; // Overwrites key address.
2316     const Register counter = Z_ARG5; // Overwrites msglen. Must have counter array in an even register.
2317 
2318     Label srcMover, dstMover, fromMover, ctrXOR, dataEraser;  // EXRL (execution) templates.
2319     Label CryptoLoop, CryptoLoop_doit, CryptoLoop_end, CryptoLoop_setupAndDoLast, CryptoLoop_ctrVal_inc;
2320     Label allDone, allDone_noInc, popAndExit, Exit;
2321 
2322     int    arg6_Offset = _z_abi(remaining_cargs) + 0 * HeapWordSize;
2323     int    arg7_Offset = _z_abi(remaining_cargs) + 1 * HeapWordSize; // stack slot holds ptr to int value
2324     int   oldSP_Offset = 0;
2325 
2326     // Is there anything to do at all? Protect against negative len as well.
2327     __ z_ltr(msglen, msglen);
2328     __ z_brnh(Exit);
2329 
2330     // Expand stack, load parm block address into parmBlk (== Z_R1), copy crypto key to parm block.
2331     oldSP_Offset = generate_counterMode_push_parmBlk(parmBlk, msglen, fCode, key, is_decipher);
2332     arg6_Offset += oldSP_Offset;
2333     arg7_Offset += oldSP_Offset;
2334 
2335     // Check if there is a leftover, partially used encrypted counter from last invocation.
2336     // If so, use those leftover counter bytes first before starting the "normal" encryption.
2337 
2338     // We do not have access to the encrypted counter value. It is generated and used only
2339     // internally within the previous kmctr instruction. But, at the end of call to this stub,
2340     // the last encrypted couner is extracted by ciphering a 0x00 byte stream. The result is
2341     // stored at the arg6 location for use with the subsequent call.
2342     //
2343     // The #used bytes of the encrypted counter (from a previous call) is provided via arg7.
2344     // It is used as index into the encrypted counter to access the first byte availabla for ciphering.
2345     // To cipher the input text, we move the number of remaining bytes in the encrypted counter from
2346     // input to output. Then we simply XOR the output bytes with the associated encrypted counter bytes.
2347 
2348     Register cvIxAddr  = Z_R10;                  // Address of index into encCtr. Preserved for use @CryptoLoop_end.
2349     __ z_lg(cvIxAddr, arg7_Offset, Z_SP);        // arg7: addr of field encCTR_index.
2350 
2351     {
2352       Register cvUnused  = Z_R11;                // # unused bytes of encrypted counter value (= 16 - cvIndex)
2353       Register encCtr    = Z_R12;                // encrypted counter value, points to first ununsed byte.
2354       Register cvIndex   = Z_R13;                // # index of first unused byte of encrypted counter value
2355       Label    preLoop_end;
2356 
2357       // preLoop is necessary only if there is a partially used encrypted counter (encCtr).
2358       // Partially used means cvIndex is in [1, dataBlk_len-1].
2359       // cvIndex == 0:           encCtr is set up but not used at all. Should not occur.
2360       // cvIndex == dataBlk_len: encCtr is exhausted, all bytes used.
2361       // Using unsigned compare protects against cases where (cvIndex < 0).
2362       __ z_clfhsi(0, cvIxAddr, AES_ctrVal_len);  // check #used bytes in encCtr against ctr len.
2363       __ z_brnl(preLoop_end);                    // if encCtr is fully used, skip to normal processing.
2364       __ z_ltgf(cvIndex, 0, Z_R0, cvIxAddr);     // # used bytes in encCTR.
2365       __ z_brz(preLoop_end);                     // if encCtr has no used bytes, skip to normal processing.
2366 
2367       __ z_lg(encCtr, arg6_Offset, Z_SP);        // encrypted counter from last call to update()
2368       __ z_agr(encCtr, cvIndex);                 // now points to first unused byte
2369 
2370       __ add2reg(cvUnused, -AES_ctrVal_len, cvIndex); // calculate #unused bytes in encCtr.
2371       __ z_lcgr(cvUnused, cvUnused);             // previous checks ensure cvUnused in range [1, dataBlk_len-1]
2372 
2373       __ z_lgf(msglen, msglen_offset, parmBlk);  // Restore msglen (jint value)
2374       __ z_cr(cvUnused, msglen);                 // check if msg can consume all unused encCtr bytes
2375       __ z_locr(cvUnused, msglen, Assembler::bcondHigh); // take the shorter length
2376       __ z_aghi(cvUnused, -1);                   // decrement # unused bytes by 1 for exrl instruction
2377                                                  // preceding checks ensure cvUnused in range [1, dataBlk_len-1]
2378       __ z_exrl(cvUnused, fromMover);
2379       __ z_exrl(cvUnused, ctrXOR);
2380 
2381       __ z_aghi(cvUnused, 1);                    // revert decrement from above
2382       __ z_agr(cvIndex, cvUnused);               // update index into encCtr (first unused byte)
2383       __ z_st(cvIndex, 0, cvIxAddr);             // write back arg7, cvIxAddr is still valid
2384 
2385       // update pointers and counters to prepare for main loop
2386       __ z_agr(from, cvUnused);
2387       __ z_agr(to, cvUnused);
2388       __ z_sr(msglen, cvUnused);                 // #bytes not yet processed
2389       __ z_sty(msglen, msglen_red_offset, parmBlk); // save for calculations in main loop
2390       __ z_srak(Z_R0, msglen, exact_log2(AES_ctrVal_len));// # full cipher blocks that can be formed from input text.
2391       __ z_sty(Z_R0, rem_msgblk_offset, parmBlk);
2392 
2393       // check remaining msglen. If zero, all msg bytes were processed in preLoop.
2394       __ z_ltr(msglen, msglen);
2395       __ z_brnh(popAndExit);
2396 
2397       __ bind(preLoop_end);
2398     }
2399 
2400     // Create count vector on stack to accommodate up to AES_ctrVec_len blocks.
2401     generate_counterMode_prepare_Stack(parmBlk, ctr, counter, fCode);
2402 
2403     // Prepare other registers for instruction.
2404     __ lgr_if_needed(src, from);     // Copy src address. Will not emit, src/from are identical.
2405     __ z_lgr(dst, to);
2406     __ z_llgc(fCode, fCode_offset, Z_R0, parmBlk);
2407 
2408     __ bind(CryptoLoop);
2409       __ z_lghi(srclen, AES_ctrArea_len);                     // preset len (#bytes) for next iteration: max possible.
2410       __ z_asi(rem_msgblk_offset, parmBlk, -AES_ctrVec_len);  // decrement #remaining blocks (16 bytes each). Range: [+127..-128]
2411       __ z_brl(CryptoLoop_setupAndDoLast);                    // Handling the last iteration (using less than max #blocks) out-of-line
2412 
2413       __ bind(CryptoLoop_doit);
2414       __ kmctr(dst, counter, src);   // Cipher the message.
2415 
2416       __ z_lt(srclen, rem_msgblk_offset, Z_R0, parmBlk);      // check if this was the last iteration
2417       __ z_brz(CryptoLoop_ctrVal_inc);                        // == 0: ctrVector fully used. Need to increment the first
2418                                                               //       vector element to encrypt remaining unprocessed bytes.
2419 //    __ z_brl(CryptoLoop_end);                               //  < 0: this was detected before and handled at CryptoLoop_setupAndDoLast
2420                                                               //  > 0: this is the fallthru case, need another iteration
2421 
2422       generate_counterMode_increment_ctrVector(parmBlk, counter, srclen, false); // srclen unused here (serves as scratch)
2423       __ z_bru(CryptoLoop);
2424 
2425     __ bind(CryptoLoop_end);
2426 
2427     // OK, when we arrive here, we have encrypted all of the "from" byte stream
2428     // except for the last few [0..dataBlk_len) bytes. In addition, we know that
2429     // there are no more unused bytes in the previously generated encrypted counter.
2430     // The (unencrypted) counter, however, is ready to use (it was incremented before).
2431 
2432     // To encrypt the few remaining bytes, we need to form an extra src and dst
2433     // data block of dataBlk_len each. This is because we can only process full
2434     // blocks but we must not read or write beyond the boundaries of the argument
2435     // arrays. Here is what we do:
2436     //  - The ctrVector has at least one unused element. This is ensured by CryptoLoop code.
2437     //  - The (first) unused element is pointed at by the counter register.
2438     //  - The src data block is filled with the remaining "from" bytes, remainder of block undefined.
2439     //  - The single src data block is encrypted into the dst data block.
2440     //  - The dst data block is copied into the "to" array, but only the leftmost few bytes
2441     //    (as many as were left in the source byte stream).
2442     //  - The counter value to be used is pointed at by the counter register.
2443     //  - Fortunately, the crypto instruction (kmctr) has updated all related addresses such that
2444     //    we know where to continue with "from" and "to" and which counter value to use next.
2445 
2446     Register encCtr    = Z_R12;  // encrypted counter value, points to stub argument.
2447     Register tmpDst    = Z_R12;  // addr of temp destination (for last partial block encryption)
2448 
2449     __ z_lgf(srclen, msglen_red_offset, parmBlk);          // plaintext/ciphertext len after potential preLoop processing.
2450     __ z_nilf(srclen, AES_ctrVal_len - 1);                 // those rightmost bits indicate the unprocessed #bytes
2451     __ z_stg(srclen, localSpill_offset, parmBlk);          // save for later reuse
2452     __ z_mvhi(0, cvIxAddr, 16);                            // write back arg7 (default 16 in case of allDone).
2453     __ z_braz(allDone_noInc);                              // no unprocessed bytes? Then we are done.
2454                                                            // This also means the last block of data processed was
2455                                                            // a full-sized block (AES_ctrVal_len bytes) which results
2456                                                            // in no leftover encrypted counter bytes.
2457     __ z_st(srclen, 0, cvIxAddr);                          // This will be the index of the first unused byte in the encrypted counter.
2458     __ z_stg(counter, counter_offset, parmBlk);            // save counter location for easy later restore
2459 
2460     // calculate address (on stack) for final dst and src blocks.
2461     __ add2reg(tmpDst, AES_dataBlk_offset, parmBlk);       // tmp dst (on stack) is right before tmp src
2462 
2463     // We have a residue of [1..15] unprocessed bytes, srclen holds the exact number.
2464     // Residue == 0 was checked just above, residue == AES_ctrVal_len would be another
2465     // full-sized block and would have been handled by CryptoLoop.
2466 
2467     __ add2reg(srclen, -1);                                // decrement for exrl
2468     __ z_exrl(srclen, srcMover);                           // copy remaining bytes of src byte stream
2469     __ load_const_optimized(srclen, AES_ctrVal_len);       // kmctr processes only complete blocks
2470     __ add2reg(src, AES_ctrVal_len, tmpDst);               // tmp dst is right before tmp src
2471 
2472     __ kmctr(tmpDst, counter, src);                        // Cipher the remaining bytes.
2473 
2474     __ add2reg(tmpDst, -AES_ctrVal_len, tmpDst);           // restore tmp dst address
2475     __ z_lg(srclen, localSpill_offset, parmBlk);           // residual len, saved above
2476     __ add2reg(srclen, -1);                                // decrement for exrl
2477     __ z_exrl(srclen, dstMover);
2478 
2479     // Write back new encrypted counter
2480     __ add2reg(src, AES_dataBlk_offset, parmBlk);
2481     __ clear_mem(Address(src, RegisterOrConstant((intptr_t)0)), AES_ctrVal_len);
2482     __ load_const_optimized(srclen, AES_ctrVal_len);       // kmctr processes only complete blocks
2483     __ z_lg(encCtr, arg6_Offset, Z_SP);                    // write encrypted counter to arg6
2484     __ z_lg(counter, counter_offset, parmBlk);             // restore counter
2485     __ kmctr(encCtr, counter, src);
2486 
2487     // The last used element of the counter vector contains the latest counter value that was used.
2488     // As described above, the counter value on exit must be the one to be used next.
2489     __ bind(allDone);
2490     __ z_lg(counter, counter_offset, parmBlk);             // restore counter
2491     generate_increment128(counter, 0, 1, Z_R0);
2492 
2493     __ bind(allDone_noInc);
2494     __ z_mvc(0, AES_ctrVal_len, ctr, 0, counter);
2495 
2496     __ bind(popAndExit);
2497     generate_counterMode_pop_parmBlk(parmBlk, msglen, dataEraser);
2498 
2499     __ bind(Exit);
2500     __ z_lgfr(Z_RET, msglen);
2501 
2502     __ z_br(Z_R14);
2503 
2504     //----------------------------
2505     //---<  out-of-line code  >---
2506     //----------------------------
2507     __ bind(CryptoLoop_setupAndDoLast);
2508       __ z_lgf(srclen, rem_msgblk_offset, parmBlk);           // remaining #blocks in memory is < 0
2509       __ z_aghi(srclen, AES_ctrVec_len);                      // recalculate the actually remaining #blocks
2510       __ z_sllg(srclen, srclen, exact_log2(AES_ctrVal_len));  // convert to #bytes. Counter value is same length as data block
2511       __ kmctr(dst, counter, src);                            // Cipher the last integral blocks of the message.
2512       __ z_bru(CryptoLoop_end);                               // There is at least one unused counter vector element.
2513                                                               // no need to increment.
2514 
2515     __ bind(CryptoLoop_ctrVal_inc);
2516       generate_counterMode_increment_ctrVector(parmBlk, counter, srclen, true); // srclen unused here (serves as scratch)
2517       __ z_bru(CryptoLoop_end);
2518 
2519     //-------------------------------------------
2520     //---<  execution templates for preLoop  >---
2521     //-------------------------------------------
2522     __ bind(fromMover);
2523     __ z_mvc(0, 0, to, 0, from);               // Template instruction to move input data to dst.
2524     __ bind(ctrXOR);
2525     __ z_xc(0,  0, to, 0, encCtr);             // Template instruction to XOR input data (now in to) with encrypted counter.
2526 
2527     //-------------------------------
2528     //---<  execution templates  >---
2529     //-------------------------------
2530     __ bind(dataEraser);
2531     __ z_xc(0, 0, parmBlk, 0, parmBlk);  // Template instruction to erase crypto key on stack.
2532     __ bind(dstMover);
2533     __ z_mvc(0, 0, dst, 0, tmpDst);      // Template instruction to move encrypted reminder from stack to dst.
2534     __ bind(srcMover);
2535     __ z_mvc(AES_ctrVal_len, 0, tmpDst, 0, src); // Template instruction to move reminder of source byte stream to stack.
2536   }
2537 
2538 
2539   // Create two intrinsic variants, optimized for short and long plaintexts.
2540   void generate_counterMode_AES(bool is_decipher) {
2541 
2542     const Register msglen  = Z_ARG5;    // int, Total length of the msg to be encrypted. Value must be
2543                                         // returned in Z_RET upon completion of this stub.
2544     const int threshold = 256;          // above this length (in bytes), text is considered long.
2545     const int vec_short = threshold>>6; // that many blocks (16 bytes each) per iteration, max 4 loop iterations
2546     const int vec_long  = threshold>>2; // that many blocks (16 bytes each) per iteration.
2547 
2548     Label AESCTR_short, AESCTR_long;
2549 
2550     __ z_chi(msglen, threshold);
2551     __ z_brh(AESCTR_long);
2552 
2553     __ bind(AESCTR_short);
2554 
2555     BLOCK_COMMENT(err_msg("counterMode_AESCrypt (text len <= %d, block size = %d) {", threshold, vec_short*16));
2556 
2557     AES_ctrVec_len = vec_short;
2558     generate_counterMode_AES_impl(false);   // control of generated code will not return
2559 
2560     BLOCK_COMMENT(err_msg("} counterMode_AESCrypt (text len <= %d, block size = %d)", threshold, vec_short*16));
2561 
2562     __ align(32); // Octoword alignment benefits branch targets.
2563 
2564     BLOCK_COMMENT(err_msg("counterMode_AESCrypt (text len > %d, block size = %d) {", threshold, vec_long*16));
2565 
2566     __ bind(AESCTR_long);
2567     AES_ctrVec_len = vec_long;
2568     generate_counterMode_AES_impl(false);   // control of generated code will not return
2569 
2570     BLOCK_COMMENT(err_msg("} counterMode_AESCrypt (text len > %d, block size = %d)", threshold, vec_long*16));
2571   }
2572 
2573 
2574   // Compute AES-CTR crypto function.
2575   // Encrypt or decrypt is selected via parameters. Only one stub is necessary.
2576   address generate_counterMode_AESCrypt() {
2577     __ align(CodeEntryAlignment);
2578     StubGenStubId stub_id = StubGenStubId::counterMode_AESCrypt_id;
2579     StubCodeMark mark(this, stub_id);
2580     unsigned int   start_off = __ offset();  // Remember stub start address (is rtn value).
2581 
2582     generate_counterMode_AES(false);
2583 
2584     return __ addr_at(start_off);
2585   }
2586 
2587 // *****************************************************************************
2588 
2589   // Compute GHASH function.
2590   address generate_ghash_processBlocks() {
2591     __ align(CodeEntryAlignment);
2592     StubGenStubId stub_id = StubGenStubId::ghash_processBlocks_id;
2593     StubCodeMark mark(this, stub_id);
2594     unsigned int start_off = __ offset();   // Remember stub start address (is rtn value).
2595 
2596     const Register state   = Z_ARG1;
2597     const Register subkeyH = Z_ARG2;
2598     const Register data    = Z_ARG3; // 1st of even-odd register pair.
2599     const Register blocks  = Z_ARG4;
2600     const Register len     = blocks; // 2nd of even-odd register pair.
2601 
2602     const int param_block_size = 4 * 8;
2603     const int frame_resize = param_block_size + 8; // Extra space for copy of fp.
2604 
2605     // Reserve stack space for parameter block (R1).
2606     __ z_lgr(Z_R1, Z_SP);
2607     __ resize_frame(-frame_resize, Z_R0, true);
2608     __ z_aghi(Z_R1, -param_block_size);
2609 
2610     // Fill parameter block.
2611     __ z_mvc(Address(Z_R1)    , Address(state)  , 16);
2612     __ z_mvc(Address(Z_R1, 16), Address(subkeyH), 16);
2613 
2614     // R4+5: data pointer + length
2615     __ z_llgfr(len, blocks);  // Cast to 64-bit.
2616 
2617     // R0: function code
2618     __ load_const_optimized(Z_R0, (int)VM_Version::MsgDigest::_GHASH);
2619 
2620     // Compute.
2621     __ z_sllg(len, len, 4);  // In bytes.
2622     __ kimd(data);
2623 
2624     // Copy back result and free parameter block.
2625     __ z_mvc(Address(state), Address(Z_R1), 16);
2626     __ z_xc(Address(Z_R1), param_block_size, Address(Z_R1));
2627     __ z_aghi(Z_SP, frame_resize);
2628 
2629     __ z_br(Z_R14);
2630 
2631     return __ addr_at(start_off);
2632   }
2633 
2634 
2635   // Call interface for all SHA* stubs.
2636   //
2637   //   Z_ARG1 - source data block. Ptr to leftmost byte to be processed.
2638   //   Z_ARG2 - current SHA state. Ptr to state area. This area serves as
2639   //            parameter block as required by the crypto instruction.
2640   //   Z_ARG3 - current byte offset in source data block.
2641   //   Z_ARG4 - last byte offset in source data block.
2642   //            (Z_ARG4 - Z_ARG3) gives the #bytes remaining to be processed.
2643   //
2644   //   Z_RET  - return value. First unprocessed byte offset in src buffer.
2645   //
2646   //   A few notes on the call interface:
2647   //    - All stubs, whether they are single-block or multi-block, are assumed to
2648   //      digest an integer multiple of the data block length of data. All data
2649   //      blocks are digested using the intermediate message digest (KIMD) instruction.
2650   //      Special end processing, as done by the KLMD instruction, seems to be
2651   //      emulated by the calling code.
2652   //
2653   //    - Z_ARG1 addresses the first byte of source data. The offset (Z_ARG3) is
2654   //      already accounted for.
2655   //
2656   //    - The current SHA state (the intermediate message digest value) is contained
2657   //      in an area addressed by Z_ARG2. The area size depends on the SHA variant
2658   //      and is accessible via the enum VM_Version::MsgDigest::_SHA<n>_parmBlk_I
2659   //
2660   //    - The single-block stub is expected to digest exactly one data block, starting
2661   //      at the address passed in Z_ARG1.
2662   //
2663   //    - The multi-block stub is expected to digest all data blocks which start in
2664   //      the offset interval [srcOff(Z_ARG3), srcLimit(Z_ARG4)). The exact difference
2665   //      (srcLimit-srcOff), rounded up to the next multiple of the data block length,
2666   //      gives the number of blocks to digest. It must be assumed that the calling code
2667   //      provides for a large enough source data buffer.
2668   //
2669   // Compute SHA-1 function.
2670   address generate_SHA1_stub(StubGenStubId stub_id) {
2671     bool multiBlock;
2672     switch (stub_id) {
2673     case sha1_implCompress_id:
2674       multiBlock = false;
2675       break;
2676     case sha1_implCompressMB_id:
2677       multiBlock = true;
2678       break;
2679     default:
2680       ShouldNotReachHere();
2681     }
2682     __ align(CodeEntryAlignment);
2683     StubCodeMark mark(this, stub_id);
2684     unsigned int start_off = __ offset();   // Remember stub start address (is rtn value).
2685 
2686     const Register srcBuff        = Z_ARG1; // Points to first block to process (offset already added).
2687     const Register SHAState       = Z_ARG2; // Only on entry. Reused soon thereafter for kimd register pairs.
2688     const Register srcOff         = Z_ARG3; // int
2689     const Register srcLimit       = Z_ARG4; // Only passed in multiBlock case. int
2690 
2691     const Register SHAState_local = Z_R1;
2692     const Register SHAState_save  = Z_ARG3;
2693     const Register srcBufLen      = Z_ARG2; // Destroys state address, must be copied before.
2694     Label useKLMD, rtn;
2695 
2696     __ load_const_optimized(Z_R0, (int)VM_Version::MsgDigest::_SHA1);   // function code
2697     __ z_lgr(SHAState_local, SHAState);                                 // SHAState == parameter block
2698 
2699     if (multiBlock) {  // Process everything from offset to limit.
2700 
2701       // The following description is valid if we get a raw (unpimped) source data buffer,
2702       // spanning the range between [srcOff(Z_ARG3), srcLimit(Z_ARG4)). As detailed above,
2703       // the calling convention for these stubs is different. We leave the description in
2704       // to inform the reader what must be happening hidden in the calling code.
2705       //
2706       // The data block to be processed can have arbitrary length, i.e. its length does not
2707       // need to be an integer multiple of SHA<n>_datablk. Therefore, we need to implement
2708       // two different paths. If the length is an integer multiple, we use KIMD, saving us
2709       // to copy the SHA state back and forth. If the length is odd, we copy the SHA state
2710       // to the stack, execute a KLMD instruction on it and copy the result back to the
2711       // caller's SHA state location.
2712 
2713       // Total #srcBuff blocks to process.
2714       if (VM_Version::has_DistinctOpnds()) {
2715         __ z_srk(srcBufLen, srcLimit, srcOff); // exact difference
2716         __ z_ahi(srcBufLen, VM_Version::MsgDigest::_SHA1_dataBlk-1);   // round up
2717         __ z_nill(srcBufLen, (~(VM_Version::MsgDigest::_SHA1_dataBlk-1)) & 0xffff);
2718         __ z_ark(srcLimit, srcOff, srcBufLen); // Srclimit temporarily holds return value.
2719         __ z_llgfr(srcBufLen, srcBufLen);      // Cast to 64-bit.
2720       } else {
2721         __ z_lgfr(srcBufLen, srcLimit);        // Exact difference. srcLimit passed as int.
2722         __ z_sgfr(srcBufLen, srcOff);          // SrcOff passed as int, now properly casted to long.
2723         __ z_aghi(srcBufLen, VM_Version::MsgDigest::_SHA1_dataBlk-1);   // round up
2724         __ z_nill(srcBufLen, (~(VM_Version::MsgDigest::_SHA1_dataBlk-1)) & 0xffff);
2725         __ z_lgr(srcLimit, srcOff);            // SrcLimit temporarily holds return value.
2726         __ z_agr(srcLimit, srcBufLen);
2727       }
2728 
2729       // Integral #blocks to digest?
2730       // As a result of the calculations above, srcBufLen MUST be an integer
2731       // multiple of _SHA1_dataBlk, or else we are in big trouble.
2732       // We insert an asm_assert into the KLMD case to guard against that.
2733       __ z_tmll(srcBufLen, VM_Version::MsgDigest::_SHA1_dataBlk-1);
2734       __ z_brc(Assembler::bcondNotAllZero, useKLMD);
2735 
2736       // Process all full blocks.
2737       __ kimd(srcBuff);
2738 
2739       __ z_lgr(Z_RET, srcLimit);  // Offset of first unprocessed byte in buffer.
2740     } else {  // Process one data block only.
2741       __ load_const_optimized(srcBufLen, (int)VM_Version::MsgDigest::_SHA1_dataBlk);   // #srcBuff bytes to process
2742       __ kimd(srcBuff);
2743       __ add2reg(Z_RET, (int)VM_Version::MsgDigest::_SHA1_dataBlk, srcOff);            // Offset of first unprocessed byte in buffer. No 32 to 64 bit extension needed.
2744     }
2745 
2746     __ bind(rtn);
2747     __ z_br(Z_R14);
2748 
2749     if (multiBlock) {
2750       __ bind(useKLMD);
2751 
2752 #if 1
2753       // Security net: this stub is believed to be called for full-sized data blocks only
2754       // NOTE: The following code is believed to be correct, but is is not tested.
2755       __ stop_static("SHA128 stub can digest full data blocks only. Use -XX:-UseSHA as remedy.", 0);
2756 #endif
2757     }
2758 
2759     return __ addr_at(start_off);
2760   }
2761 
2762   // Compute SHA-256 function.
2763   address generate_SHA256_stub(StubGenStubId stub_id) {
2764     bool multiBlock;
2765     switch (stub_id) {
2766     case sha256_implCompress_id:
2767       multiBlock = false;
2768       break;
2769     case sha256_implCompressMB_id:
2770       multiBlock = true;
2771       break;
2772     default:
2773       ShouldNotReachHere();
2774     }
2775     __ align(CodeEntryAlignment);
2776     StubCodeMark mark(this, stub_id);
2777     unsigned int start_off = __ offset();   // Remember stub start address (is rtn value).
2778 
2779     const Register srcBuff        = Z_ARG1;
2780     const Register SHAState       = Z_ARG2; // Only on entry. Reused soon thereafter.
2781     const Register SHAState_local = Z_R1;
2782     const Register SHAState_save  = Z_ARG3;
2783     const Register srcOff         = Z_ARG3;
2784     const Register srcLimit       = Z_ARG4;
2785     const Register srcBufLen      = Z_ARG2; // Destroys state address, must be copied before.
2786     Label useKLMD, rtn;
2787 
2788     __ load_const_optimized(Z_R0, (int)VM_Version::MsgDigest::_SHA256); // function code
2789     __ z_lgr(SHAState_local, SHAState);                                 // SHAState == parameter block
2790 
2791     if (multiBlock) {  // Process everything from offset to limit.
2792       // The following description is valid if we get a raw (unpimped) source data buffer,
2793       // spanning the range between [srcOff(Z_ARG3), srcLimit(Z_ARG4)). As detailed above,
2794       // the calling convention for these stubs is different. We leave the description in
2795       // to inform the reader what must be happening hidden in the calling code.
2796       //
2797       // The data block to be processed can have arbitrary length, i.e. its length does not
2798       // need to be an integer multiple of SHA<n>_datablk. Therefore, we need to implement
2799       // two different paths. If the length is an integer multiple, we use KIMD, saving us
2800       // to copy the SHA state back and forth. If the length is odd, we copy the SHA state
2801       // to the stack, execute a KLMD instruction on it and copy the result back to the
2802       // caller's SHA state location.
2803 
2804       // total #srcBuff blocks to process
2805       if (VM_Version::has_DistinctOpnds()) {
2806         __ z_srk(srcBufLen, srcLimit, srcOff);   // exact difference
2807         __ z_ahi(srcBufLen, VM_Version::MsgDigest::_SHA256_dataBlk-1); // round up
2808         __ z_nill(srcBufLen, (~(VM_Version::MsgDigest::_SHA256_dataBlk-1)) & 0xffff);
2809         __ z_ark(srcLimit, srcOff, srcBufLen);   // Srclimit temporarily holds return value.
2810         __ z_llgfr(srcBufLen, srcBufLen);        // Cast to 64-bit.
2811       } else {
2812         __ z_lgfr(srcBufLen, srcLimit);          // exact difference
2813         __ z_sgfr(srcBufLen, srcOff);
2814         __ z_aghi(srcBufLen, VM_Version::MsgDigest::_SHA256_dataBlk-1); // round up
2815         __ z_nill(srcBufLen, (~(VM_Version::MsgDigest::_SHA256_dataBlk-1)) & 0xffff);
2816         __ z_lgr(srcLimit, srcOff);              // Srclimit temporarily holds return value.
2817         __ z_agr(srcLimit, srcBufLen);
2818       }
2819 
2820       // Integral #blocks to digest?
2821       // As a result of the calculations above, srcBufLen MUST be an integer
2822       // multiple of _SHA1_dataBlk, or else we are in big trouble.
2823       // We insert an asm_assert into the KLMD case to guard against that.
2824       __ z_tmll(srcBufLen, VM_Version::MsgDigest::_SHA256_dataBlk-1);
2825       __ z_brc(Assembler::bcondNotAllZero, useKLMD);
2826 
2827       // Process all full blocks.
2828       __ kimd(srcBuff);
2829 
2830       __ z_lgr(Z_RET, srcLimit);  // Offset of first unprocessed byte in buffer.
2831     } else {  // Process one data block only.
2832       __ load_const_optimized(srcBufLen, (int)VM_Version::MsgDigest::_SHA256_dataBlk); // #srcBuff bytes to process
2833       __ kimd(srcBuff);
2834       __ add2reg(Z_RET, (int)VM_Version::MsgDigest::_SHA256_dataBlk, srcOff);          // Offset of first unprocessed byte in buffer.
2835     }
2836 
2837     __ bind(rtn);
2838     __ z_br(Z_R14);
2839 
2840     if (multiBlock) {
2841       __ bind(useKLMD);
2842 #if 1
2843       // Security net: this stub is believed to be called for full-sized data blocks only.
2844       // NOTE:
2845       //   The following code is believed to be correct, but is is not tested.
2846       __ stop_static("SHA256 stub can digest full data blocks only. Use -XX:-UseSHA as remedy.", 0);
2847 #endif
2848     }
2849 
2850     return __ addr_at(start_off);
2851   }
2852 
2853   // Compute SHA-512 function.
2854   address generate_SHA512_stub(StubGenStubId stub_id) {
2855     bool multiBlock;
2856     switch (stub_id) {
2857     case sha512_implCompress_id:
2858       multiBlock = false;
2859       break;
2860     case sha512_implCompressMB_id:
2861       multiBlock = true;
2862       break;
2863     default:
2864       ShouldNotReachHere();
2865     }
2866     __ align(CodeEntryAlignment);
2867     StubCodeMark mark(this, stub_id);
2868     unsigned int start_off = __ offset();   // Remember stub start address (is rtn value).
2869 
2870     const Register srcBuff        = Z_ARG1;
2871     const Register SHAState       = Z_ARG2; // Only on entry. Reused soon thereafter.
2872     const Register SHAState_local = Z_R1;
2873     const Register SHAState_save  = Z_ARG3;
2874     const Register srcOff         = Z_ARG3;
2875     const Register srcLimit       = Z_ARG4;
2876     const Register srcBufLen      = Z_ARG2; // Destroys state address, must be copied before.
2877     Label useKLMD, rtn;
2878 
2879     __ load_const_optimized(Z_R0, (int)VM_Version::MsgDigest::_SHA512); // function code
2880     __ z_lgr(SHAState_local, SHAState);                                 // SHAState == parameter block
2881 
2882     if (multiBlock) {  // Process everything from offset to limit.
2883       // The following description is valid if we get a raw (unpimped) source data buffer,
2884       // spanning the range between [srcOff(Z_ARG3), srcLimit(Z_ARG4)). As detailed above,
2885       // the calling convention for these stubs is different. We leave the description in
2886       // to inform the reader what must be happening hidden in the calling code.
2887       //
2888       // The data block to be processed can have arbitrary length, i.e. its length does not
2889       // need to be an integer multiple of SHA<n>_datablk. Therefore, we need to implement
2890       // two different paths. If the length is an integer multiple, we use KIMD, saving us
2891       // to copy the SHA state back and forth. If the length is odd, we copy the SHA state
2892       // to the stack, execute a KLMD instruction on it and copy the result back to the
2893       // caller's SHA state location.
2894 
2895       // total #srcBuff blocks to process
2896       if (VM_Version::has_DistinctOpnds()) {
2897         __ z_srk(srcBufLen, srcLimit, srcOff);   // exact difference
2898         __ z_ahi(srcBufLen, VM_Version::MsgDigest::_SHA512_dataBlk-1); // round up
2899         __ z_nill(srcBufLen, (~(VM_Version::MsgDigest::_SHA512_dataBlk-1)) & 0xffff);
2900         __ z_ark(srcLimit, srcOff, srcBufLen);   // Srclimit temporarily holds return value.
2901         __ z_llgfr(srcBufLen, srcBufLen);        // Cast to 64-bit.
2902       } else {
2903         __ z_lgfr(srcBufLen, srcLimit);          // exact difference
2904         __ z_sgfr(srcBufLen, srcOff);
2905         __ z_aghi(srcBufLen, VM_Version::MsgDigest::_SHA512_dataBlk-1); // round up
2906         __ z_nill(srcBufLen, (~(VM_Version::MsgDigest::_SHA512_dataBlk-1)) & 0xffff);
2907         __ z_lgr(srcLimit, srcOff);              // Srclimit temporarily holds return value.
2908         __ z_agr(srcLimit, srcBufLen);
2909       }
2910 
2911       // integral #blocks to digest?
2912       // As a result of the calculations above, srcBufLen MUST be an integer
2913       // multiple of _SHA1_dataBlk, or else we are in big trouble.
2914       // We insert an asm_assert into the KLMD case to guard against that.
2915       __ z_tmll(srcBufLen, VM_Version::MsgDigest::_SHA512_dataBlk-1);
2916       __ z_brc(Assembler::bcondNotAllZero, useKLMD);
2917 
2918       // Process all full blocks.
2919       __ kimd(srcBuff);
2920 
2921       __ z_lgr(Z_RET, srcLimit);  // Offset of first unprocessed byte in buffer.
2922     } else {  // Process one data block only.
2923       __ load_const_optimized(srcBufLen, (int)VM_Version::MsgDigest::_SHA512_dataBlk); // #srcBuff bytes to process
2924       __ kimd(srcBuff);
2925       __ add2reg(Z_RET, (int)VM_Version::MsgDigest::_SHA512_dataBlk, srcOff);          // Offset of first unprocessed byte in buffer.
2926     }
2927 
2928     __ bind(rtn);
2929     __ z_br(Z_R14);
2930 
2931     if (multiBlock) {
2932       __ bind(useKLMD);
2933 #if 1
2934       // Security net: this stub is believed to be called for full-sized data blocks only
2935       // NOTE:
2936       //   The following code is believed to be correct, but is is not tested.
2937       __ stop_static("SHA512 stub can digest full data blocks only. Use -XX:-UseSHA as remedy.", 0);
2938 #endif
2939     }
2940 
2941     return __ addr_at(start_off);
2942   }
2943 
2944 
2945   /**
2946    *  Arguments:
2947    *
2948    * Inputs:
2949    *   Z_ARG1    - int   crc
2950    *   Z_ARG2    - byte* buf
2951    *   Z_ARG3    - int   length (of buffer)
2952    *
2953    * Result:
2954    *   Z_RET     - int   crc result
2955    **/
2956   // Compute CRC function (generic, for all polynomials).
2957   void generate_CRC_updateBytes(Register table, bool invertCRC) {
2958 
2959     // arguments to kernel_crc32:
2960     Register       crc     = Z_ARG1;  // Current checksum, preset by caller or result from previous call, int.
2961     Register       data    = Z_ARG2;  // source byte array
2962     Register       dataLen = Z_ARG3;  // #bytes to process, int
2963 //    Register       table   = Z_ARG4;  // crc table address. Preloaded and passed in by caller.
2964     const Register t0      = Z_R10;   // work reg for kernel* emitters
2965     const Register t1      = Z_R11;   // work reg for kernel* emitters
2966     const Register t2      = Z_R12;   // work reg for kernel* emitters
2967     const Register t3      = Z_R13;   // work reg for kernel* emitters
2968 
2969 
2970     assert_different_registers(crc, data, dataLen, table);
2971 
2972     // We pass these values as ints, not as longs as required by C calling convention.
2973     // Crc used as int.
2974     __ z_llgfr(dataLen, dataLen);
2975 
2976     __ resize_frame(-(6*8), Z_R0, true); // Resize frame to provide add'l space to spill 5 registers.
2977     __ z_stmg(Z_R10, Z_R13, 1*8, Z_SP);  // Spill regs 10..11 to make them available as work registers.
2978     __ kernel_crc32_1word(crc, data, dataLen, table, t0, t1, t2, t3, invertCRC);
2979     __ z_lmg(Z_R10, Z_R13, 1*8, Z_SP);   // Spill regs 10..11 back from stack.
2980     __ resize_frame(+(6*8), Z_R0, true); // Resize frame to provide add'l space to spill 5 registers.
2981 
2982     __ z_llgfr(Z_RET, crc);  // Updated crc is function result. No copying required, just zero upper 32 bits.
2983     __ z_br(Z_R14);          // Result already in Z_RET == Z_ARG1.
2984   }
2985 
2986 
2987   // Compute CRC32 function.
2988   address generate_CRC32_updateBytes() {
2989     __ align(CodeEntryAlignment);
2990     StubGenStubId stub_id =  StubGenStubId::updateBytesCRC32_id;
2991     StubCodeMark mark(this, stub_id);
2992     unsigned int   start_off = __ offset();  // Remember stub start address (is rtn value).
2993 
2994     assert(UseCRC32Intrinsics, "should not generate this stub (%s) with CRC32 intrinsics disabled", StubRoutines::get_stub_name(stub_id));
2995 
2996     BLOCK_COMMENT("CRC32_updateBytes {");
2997     Register       table   = Z_ARG4;  // crc32 table address.
2998     StubRoutines::zarch::generate_load_crc_table_addr(_masm, table);
2999 
3000     generate_CRC_updateBytes(table, true);
3001     BLOCK_COMMENT("} CRC32_updateBytes");
3002 
3003     return __ addr_at(start_off);
3004   }
3005 
3006 
3007   // Compute CRC32C function.
3008   address generate_CRC32C_updateBytes() {
3009     __ align(CodeEntryAlignment);
3010     StubGenStubId stub_id =  StubGenStubId::updateBytesCRC32C_id;
3011     StubCodeMark mark(this, stub_id);
3012     unsigned int   start_off = __ offset();  // Remember stub start address (is rtn value).
3013 
3014     assert(UseCRC32CIntrinsics, "should not generate this stub (%s) with CRC32C intrinsics disabled", StubRoutines::get_stub_name(stub_id));
3015 
3016     BLOCK_COMMENT("CRC32C_updateBytes {");
3017     Register       table   = Z_ARG4;  // crc32c table address.
3018     StubRoutines::zarch::generate_load_crc32c_table_addr(_masm, table);
3019 
3020     generate_CRC_updateBytes(table, false);
3021     BLOCK_COMMENT("} CRC32C_updateBytes");
3022 
3023     return __ addr_at(start_off);
3024   }
3025 
3026 
3027   // Arguments:
3028   //   Z_ARG1    - x address
3029   //   Z_ARG2    - x length
3030   //   Z_ARG3    - y address
3031   //   Z_ARG4    - y length
3032   //   Z_ARG5    - z address
3033   address generate_multiplyToLen() {
3034     __ align(CodeEntryAlignment);
3035     StubGenStubId stub_id =  StubGenStubId::multiplyToLen_id;
3036     StubCodeMark mark(this, stub_id);
3037 
3038     address start = __ pc();
3039 
3040     const Register x    = Z_ARG1;
3041     const Register xlen = Z_ARG2;
3042     const Register y    = Z_ARG3;
3043     const Register ylen = Z_ARG4;
3044     const Register z    = Z_ARG5;
3045 
3046     // Next registers will be saved on stack in multiply_to_len().
3047     const Register tmp1 = Z_tmp_1;
3048     const Register tmp2 = Z_tmp_2;
3049     const Register tmp3 = Z_tmp_3;
3050     const Register tmp4 = Z_tmp_4;
3051     const Register tmp5 = Z_R9;
3052 
3053     BLOCK_COMMENT("Entry:");
3054 
3055     __ z_llgfr(xlen, xlen);
3056     __ z_llgfr(ylen, ylen);
3057 
3058     __ multiply_to_len(x, xlen, y, ylen, z, tmp1, tmp2, tmp3, tmp4, tmp5);
3059 
3060     __ z_br(Z_R14);  // Return to caller.
3061 
3062     return start;
3063   }
3064 
3065   address generate_method_entry_barrier() {
3066     __ align(CodeEntryAlignment);
3067     StubGenStubId stub_id =  StubGenStubId::method_entry_barrier_id;
3068     StubCodeMark mark(this, stub_id);
3069 
3070     address start = __ pc();
3071 
3072     int nbytes_volatile = (8 + 5) * BytesPerWord;
3073 
3074     // VM-Call Prologue
3075     __ save_return_pc();
3076     __ push_frame_abi160(nbytes_volatile);
3077     __ save_volatile_regs(Z_SP, frame::z_abi_160_size, true, false);
3078 
3079     // Prep arg for VM call
3080     // Create ptr to stored return_pc in caller frame.
3081     __ z_la(Z_ARG1, _z_abi(return_pc) + frame::z_abi_160_size + nbytes_volatile, Z_R0, Z_SP);
3082 
3083     // VM-Call: BarrierSetNMethod::nmethod_stub_entry_barrier(address* return_address_ptr)
3084     __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSetNMethod::nmethod_stub_entry_barrier));
3085     __ z_ltr(Z_R0_scratch, Z_RET);
3086 
3087     // VM-Call Epilogue
3088     __ restore_volatile_regs(Z_SP, frame::z_abi_160_size, true, false);
3089     __ pop_frame();
3090     __ restore_return_pc();
3091 
3092     // Check return val of VM-Call
3093     __ z_bcr(Assembler::bcondZero, Z_R14);
3094 
3095     // Pop frame built in prologue.
3096     // Required so wrong_method_stub can deduce caller.
3097     __ pop_frame();
3098     __ restore_return_pc();
3099 
3100     // VM-Call indicates deoptimization required
3101     __ load_const_optimized(Z_R1_scratch, SharedRuntime::get_handle_wrong_method_stub());
3102     __ z_br(Z_R1_scratch);
3103 
3104     return start;
3105   }
3106 
3107   address generate_cont_thaw(bool return_barrier, bool exception) {
3108     if (!Continuations::enabled()) return nullptr;
3109     Unimplemented();
3110     return nullptr;
3111   }
3112 
3113   address generate_cont_thaw() {
3114     if (!Continuations::enabled()) return nullptr;
3115     Unimplemented();
3116     return nullptr;
3117   }
3118 
3119   address generate_cont_returnBarrier() {
3120     if (!Continuations::enabled()) return nullptr;
3121     Unimplemented();
3122     return nullptr;
3123   }
3124 
3125   address generate_cont_returnBarrier_exception() {
3126     if (!Continuations::enabled()) return nullptr;
3127     Unimplemented();
3128     return nullptr;
3129   }
3130 
3131   // exception handler for upcall stubs
3132   address generate_upcall_stub_exception_handler() {
3133     StubGenStubId stub_id =  StubGenStubId::upcall_stub_exception_handler_id;
3134     StubCodeMark mark(this, stub_id);
3135     address start = __ pc();
3136 
3137     // Native caller has no idea how to handle exceptions,
3138     // so we just crash here. Up to callee to catch exceptions.
3139     __ verify_oop(Z_ARG1);
3140     __ load_const_optimized(Z_R1_scratch, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception));
3141     __ call_c(Z_R1_scratch);
3142     __ should_not_reach_here();
3143 
3144     return start;
3145   }
3146 
3147   // load Method* target of MethodHandle
3148   // Z_ARG1 = jobject receiver
3149   // Z_method = Method* result
3150   address generate_upcall_stub_load_target() {
3151     StubGenStubId stub_id =  StubGenStubId::upcall_stub_load_target_id;
3152     StubCodeMark mark(this, stub_id);
3153     address start = __ pc();
3154 
3155     __ resolve_global_jobject(Z_ARG1, Z_tmp_1, Z_tmp_2);
3156       // Load target method from receiver
3157     __ load_heap_oop(Z_method, Address(Z_ARG1, java_lang_invoke_MethodHandle::form_offset()),
3158                     noreg, noreg, IS_NOT_NULL);
3159     __ load_heap_oop(Z_method, Address(Z_method, java_lang_invoke_LambdaForm::vmentry_offset()),
3160                     noreg, noreg, IS_NOT_NULL);
3161     __ load_heap_oop(Z_method, Address(Z_method, java_lang_invoke_MemberName::method_offset()),
3162                     noreg, noreg, IS_NOT_NULL);
3163     __ z_lg(Z_method, Address(Z_method, java_lang_invoke_ResolvedMethodName::vmtarget_offset()));
3164     __ z_stg(Z_method, Address(Z_thread, JavaThread::callee_target_offset())); // just in case callee is deoptimized
3165 
3166     __ z_br(Z_R14);
3167 
3168     return start;
3169   }
3170 
3171   void generate_initial_stubs() {
3172     // Generates all stubs and initializes the entry points.
3173 
3174     // Entry points that exist in all platforms.
3175     // Note: This is code that could be shared among different
3176     // platforms - however the benefit seems to be smaller than the
3177     // disadvantage of having a much more complicated generator
3178     // structure. See also comment in stubRoutines.hpp.
3179     StubRoutines::_forward_exception_entry                 = generate_forward_exception();
3180 
3181     StubRoutines::_call_stub_entry                         = generate_call_stub(StubRoutines::_call_stub_return_address);
3182     StubRoutines::_catch_exception_entry                   = generate_catch_exception();
3183 
3184     //----------------------------------------------------------------------
3185     // Entry points that are platform specific.
3186 
3187     if (UseCRC32Intrinsics) {
3188       StubRoutines::_crc_table_adr     = (address)StubRoutines::zarch::_crc_table;
3189       StubRoutines::_updateBytesCRC32  = generate_CRC32_updateBytes();
3190     }
3191 
3192     if (UseCRC32CIntrinsics) {
3193       StubRoutines::_crc32c_table_addr = (address)StubRoutines::zarch::_crc32c_table;
3194       StubRoutines::_updateBytesCRC32C = generate_CRC32C_updateBytes();
3195     }
3196 
3197     // Comapct string intrinsics: Translate table for string inflate intrinsic. Used by trot instruction.
3198     StubRoutines::zarch::_trot_table_addr = (address)StubRoutines::zarch::_trot_table;
3199   }
3200 
3201   void generate_continuation_stubs() {
3202     if (!Continuations::enabled()) return;
3203 
3204     // Continuation stubs:
3205     StubRoutines::_cont_thaw          = generate_cont_thaw();
3206     StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
3207     StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
3208   }
3209 
3210   void generate_final_stubs() {
3211     // Generates all stubs and initializes the entry points.
3212 
3213     // Support for verify_oop (must happen after universe_init).
3214     StubRoutines::_verify_oop_subroutine_entry             = generate_verify_oop_subroutine();
3215 
3216     // Arraycopy stubs used by compilers.
3217     generate_arraycopy_stubs();
3218 
3219     // nmethod entry barriers for concurrent class unloading
3220     BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
3221     if (bs_nm != nullptr) {
3222       StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
3223     }
3224 
3225 #ifdef COMPILER2
3226     if (UseSecondarySupersTable) {
3227       StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
3228       if (!InlineSecondarySupersTest) {
3229         generate_lookup_secondary_supers_table_stub();
3230       }
3231     }
3232 #endif // COMPILER2
3233 
3234     StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
3235     StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();
3236   }
3237 
3238   void generate_compiler_stubs() {
3239 
3240     StubRoutines::zarch::_partial_subtype_check            = generate_partial_subtype_check();
3241 
3242 #if COMPILER2_OR_JVMCI
3243     // Generate AES intrinsics code.
3244     if (UseAESIntrinsics) {
3245       if (VM_Version::has_Crypto_AES()) {
3246         StubRoutines::_aescrypt_encryptBlock = generate_AES_encryptBlock();
3247         StubRoutines::_aescrypt_decryptBlock = generate_AES_decryptBlock();
3248         StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_AES_encrypt();
3249         StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_AES_decrypt();
3250       } else {
3251         // In PRODUCT builds, the function pointers will keep their initial (null) value.
3252         // LibraryCallKit::try_to_inline() will return false then, preventing the intrinsic to be called.
3253         assert(VM_Version::has_Crypto_AES(), "Inconsistent settings. Check vm_version_s390.cpp");
3254       }
3255     }
3256 
3257     if (UseAESCTRIntrinsics) {
3258       if (VM_Version::has_Crypto_AES_CTR()) {
3259         StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
3260       } else {
3261         // In PRODUCT builds, the function pointers will keep their initial (null) value.
3262         // LibraryCallKit::try_to_inline() will return false then, preventing the intrinsic to be called.
3263         assert(VM_Version::has_Crypto_AES_CTR(), "Inconsistent settings. Check vm_version_s390.cpp");
3264       }
3265     }
3266 
3267     // Generate GHASH intrinsics code
3268     if (UseGHASHIntrinsics) {
3269       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
3270     }
3271 
3272     // Generate SHA1/SHA256/SHA512 intrinsics code.
3273     if (UseSHA1Intrinsics) {
3274       StubRoutines::_sha1_implCompress     = generate_SHA1_stub(StubGenStubId::sha1_implCompress_id);
3275       StubRoutines::_sha1_implCompressMB   = generate_SHA1_stub(StubGenStubId::sha1_implCompressMB_id);
3276     }
3277     if (UseSHA256Intrinsics) {
3278       StubRoutines::_sha256_implCompress   = generate_SHA256_stub(StubGenStubId::sha256_implCompress_id);
3279       StubRoutines::_sha256_implCompressMB = generate_SHA256_stub(StubGenStubId::sha256_implCompressMB_id);
3280     }
3281     if (UseSHA512Intrinsics) {
3282       StubRoutines::_sha512_implCompress   = generate_SHA512_stub(StubGenStubId::sha512_implCompress_id);
3283       StubRoutines::_sha512_implCompressMB = generate_SHA512_stub(StubGenStubId::sha512_implCompressMB_id);
3284     }
3285 
3286 #ifdef COMPILER2
3287     if (UseMultiplyToLenIntrinsic) {
3288       StubRoutines::_multiplyToLen = generate_multiplyToLen();
3289     }
3290     if (UseMontgomeryMultiplyIntrinsic) {
3291       StubRoutines::_montgomeryMultiply
3292         = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_multiply);
3293     }
3294     if (UseMontgomerySquareIntrinsic) {
3295       StubRoutines::_montgomerySquare
3296         = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square);
3297     }
3298 #endif
3299 #endif // COMPILER2_OR_JVMCI
3300   }
3301 
3302  public:
3303   StubGenerator(CodeBuffer* code, StubGenBlobId blob_id) : StubCodeGenerator(code, blob_id) {
3304     switch(blob_id) {
3305     case initial_id:
3306       generate_initial_stubs();
3307       break;
3308      case continuation_id:
3309       generate_continuation_stubs();
3310       break;
3311     case compiler_id:
3312       generate_compiler_stubs();
3313       break;
3314     case final_id:
3315       generate_final_stubs();
3316       break;
3317     default:
3318       fatal("unexpected blob id: %d", blob_id);
3319       break;
3320     };
3321   }
3322 
3323  private:
3324   int _stub_count;
3325   void stub_prolog(StubCodeDesc* cdesc) {
3326 #ifdef ASSERT
3327     // Put extra information in the stub code, to make it more readable.
3328     // Write the high part of the address.
3329     // [RGV] Check if there is a dependency on the size of this prolog.
3330     __ emit_data((intptr_t)cdesc >> 32);
3331     __ emit_data((intptr_t)cdesc);
3332     __ emit_data(++_stub_count);
3333 #endif
3334     align(true);
3335   }
3336 
3337   void align(bool at_header = false) {
3338     // z/Architecture cache line size is 256 bytes.
3339     // There is no obvious benefit in aligning stub
3340     // code to cache lines. Use CodeEntryAlignment instead.
3341     const unsigned int icache_line_size      = CodeEntryAlignment;
3342     const unsigned int icache_half_line_size = MIN2<unsigned int>(32, CodeEntryAlignment);
3343 
3344     if (at_header) {
3345       while ((intptr_t)(__ pc()) % icache_line_size != 0) {
3346         __ z_illtrap();
3347       }
3348     } else {
3349       while ((intptr_t)(__ pc()) % icache_half_line_size != 0) {
3350         __ z_nop();
3351       }
3352     }
3353   }
3354 
3355 };
3356 
3357 void StubGenerator_generate(CodeBuffer* code, StubGenBlobId blob_id) {
3358   StubGenerator g(code, blob_id);
3359 }