1 /*
   2  * Copyright (c) 2016, 2020, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2016, 2019 SAP SE. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.inline.hpp"
  28 #include "registerSaver_s390.hpp"
  29 #include "gc/shared/barrierSet.hpp"
  30 #include "gc/shared/barrierSetAssembler.hpp"
  31 #include "interpreter/interpreter.hpp"
  32 #include "interpreter/interp_masm.hpp"
  33 #include "memory/universe.hpp"
  34 #include "nativeInst_s390.hpp"
  35 #include "oops/instanceOop.hpp"
  36 #include "oops/objArrayKlass.hpp"
  37 #include "oops/oop.inline.hpp"
  38 #include "prims/methodHandles.hpp"
  39 #include "runtime/frame.inline.hpp"
  40 #include "runtime/handles.inline.hpp"
  41 #include "runtime/sharedRuntime.hpp"
  42 #include "runtime/stubCodeGenerator.hpp"
  43 #include "runtime/stubRoutines.hpp"
  44 #include "runtime/thread.inline.hpp"
  45 #include "utilities/powerOfTwo.hpp"
  46 
  47 // Declaration and definition of StubGenerator (no .hpp file).
  48 // For a more detailed description of the stub routine structure
  49 // see the comment in stubRoutines.hpp.
  50 
  51 #ifdef PRODUCT
  52 #define __ _masm->
  53 #else
  54 #define __ (Verbose ? (_masm->block_comment(FILE_AND_LINE),_masm):_masm)->
  55 #endif
  56 
  57 #define BLOCK_COMMENT(str) if (PrintAssembly) __ block_comment(str)
  58 #define BIND(label)        bind(label); BLOCK_COMMENT(#label ":")
  59 
  60 // -----------------------------------------------------------------------
  61 // Stub Code definitions
  62 
  63 class StubGenerator: public StubCodeGenerator {
  64  private:
  65 
  66   //----------------------------------------------------------------------
  67   // Call stubs are used to call Java from C.
  68 
  69   //
  70   // Arguments:
  71   //
  72   //   R2        - call wrapper address     : address
  73   //   R3        - result                   : intptr_t*
  74   //   R4        - result type              : BasicType
  75   //   R5        - method                   : method
  76   //   R6        - frame mgr entry point    : address
  77   //   [SP+160]  - parameter block          : intptr_t*
  78   //   [SP+172]  - parameter count in words : int
  79   //   [SP+176]  - thread                   : Thread*
  80   //
  81   address generate_call_stub(address& return_address) {
  82     // Set up a new C frame, copy Java arguments, call frame manager
  83     // or native_entry, and process result.
  84 
  85     StubCodeMark mark(this, "StubRoutines", "call_stub");
  86     address start = __ pc();
  87 
  88     Register r_arg_call_wrapper_addr   = Z_ARG1;
  89     Register r_arg_result_addr         = Z_ARG2;
  90     Register r_arg_result_type         = Z_ARG3;
  91     Register r_arg_method              = Z_ARG4;
  92     Register r_arg_entry               = Z_ARG5;
  93 
  94     // offsets to fp
  95     #define d_arg_thread 176
  96     #define d_arg_argument_addr 160
  97     #define d_arg_argument_count 168+4
  98 
  99     Register r_entryframe_fp           = Z_tmp_1;
 100     Register r_top_of_arguments_addr   = Z_ARG4;
 101     Register r_new_arg_entry = Z_R14;
 102 
 103     // macros for frame offsets
 104     #define call_wrapper_address_offset \
 105                _z_entry_frame_locals_neg(call_wrapper_address)
 106     #define result_address_offset \
 107               _z_entry_frame_locals_neg(result_address)
 108     #define result_type_offset \
 109               _z_entry_frame_locals_neg(result_type)
 110     #define arguments_tos_address_offset \
 111               _z_entry_frame_locals_neg(arguments_tos_address)
 112 
 113     {
 114       //
 115       // STACK on entry to call_stub:
 116       //
 117       //     F1      [C_FRAME]
 118       //            ...
 119       //
 120 
 121       Register r_argument_addr              = Z_tmp_3;
 122       Register r_argumentcopy_addr          = Z_tmp_4;
 123       Register r_argument_size_in_bytes     = Z_ARG5;
 124       Register r_frame_size                 = Z_R1;
 125 
 126       Label arguments_copied;
 127 
 128       // Save non-volatile registers to ABI of caller frame.
 129       BLOCK_COMMENT("save registers, push frame {");
 130       __ z_stmg(Z_R6, Z_R14, 16, Z_SP);
 131       __ z_std(Z_F8, 96, Z_SP);
 132       __ z_std(Z_F9, 104, Z_SP);
 133       __ z_std(Z_F10, 112, Z_SP);
 134       __ z_std(Z_F11, 120, Z_SP);
 135       __ z_std(Z_F12, 128, Z_SP);
 136       __ z_std(Z_F13, 136, Z_SP);
 137       __ z_std(Z_F14, 144, Z_SP);
 138       __ z_std(Z_F15, 152, Z_SP);
 139 
 140       //
 141       // Push ENTRY_FRAME including arguments:
 142       //
 143       //     F0      [TOP_IJAVA_FRAME_ABI]
 144       //             [outgoing Java arguments]
 145       //             [ENTRY_FRAME_LOCALS]
 146       //     F1      [C_FRAME]
 147       //             ...
 148       //
 149 
 150       // Calculate new frame size and push frame.
 151       #define abi_plus_locals_size \
 152                 (frame::z_top_ijava_frame_abi_size + frame::z_entry_frame_locals_size)
 153       if (abi_plus_locals_size % BytesPerWord == 0) {
 154         // Preload constant part of frame size.
 155         __ load_const_optimized(r_frame_size, -abi_plus_locals_size/BytesPerWord);
 156         // Keep copy of our frame pointer (caller's SP).
 157         __ z_lgr(r_entryframe_fp, Z_SP);
 158         // Add space required by arguments to frame size.
 159         __ z_slgf(r_frame_size, d_arg_argument_count, Z_R0, Z_SP);
 160         // Move Z_ARG5 early, it will be used as a local.
 161         __ z_lgr(r_new_arg_entry, r_arg_entry);
 162         // Convert frame size from words to bytes.
 163         __ z_sllg(r_frame_size, r_frame_size, LogBytesPerWord);
 164         __ push_frame(r_frame_size, r_entryframe_fp,
 165                       false/*don't copy SP*/, true /*frame size sign inverted*/);
 166       } else {
 167         guarantee(false, "frame sizes should be multiples of word size (BytesPerWord)");
 168       }
 169       BLOCK_COMMENT("} save, push");
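           // The frame size computed above (in r_frame_size) is, in effect,
           //   -(abi_plus_locals_size + parameter_words * BytesPerWord)
           // i.e. the new frame size in bytes, negated. That negated value is what
           // gets passed to push_frame() together with the "frame size sign inverted" flag.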
 170 
 171       // Load argument registers for call.
 172       BLOCK_COMMENT("prepare/copy arguments {");
 173       __ z_lgr(Z_method, r_arg_method);
 174       __ z_lg(Z_thread, d_arg_thread, r_entryframe_fp);
 175 
 176       // Calculate top_of_arguments_addr which will be tos (not prepushed) later.
 177       // Simply use SP + frame::top_ijava_frame_size.
 178       __ add2reg(r_top_of_arguments_addr,
 179                  frame::z_top_ijava_frame_abi_size - BytesPerWord, Z_SP);
 180 
 181       // Initialize call_stub locals (step 1).
 182       if ((call_wrapper_address_offset + BytesPerWord == result_address_offset) &&
 183           (result_address_offset + BytesPerWord == result_type_offset)          &&
 184           (result_type_offset + BytesPerWord == arguments_tos_address_offset)) {
 185 
 186         __ z_stmg(r_arg_call_wrapper_addr, r_top_of_arguments_addr,
 187                   call_wrapper_address_offset, r_entryframe_fp);
 188       } else {
 189         __ z_stg(r_arg_call_wrapper_addr,
 190                  call_wrapper_address_offset, r_entryframe_fp);
 191         __ z_stg(r_arg_result_addr,
 192                  result_address_offset, r_entryframe_fp);
 193         __ z_stg(r_arg_result_type,
 194                  result_type_offset, r_entryframe_fp);
 195         __ z_stg(r_top_of_arguments_addr,
 196                  arguments_tos_address_offset, r_entryframe_fp);
 197       }
 198 
 199       // Copy Java arguments.
 200 
 201       // Any arguments to copy?
 202       __ load_and_test_int2long(Z_R1, Address(r_entryframe_fp, d_arg_argument_count));
 203       __ z_bre(arguments_copied);
 204 
 205       // Prepare loop and copy arguments in reverse order.
 206       {
 207         // Calculate argument size in bytes.
 208         __ z_sllg(r_argument_size_in_bytes, Z_R1, LogBytesPerWord);
 209 
 210         // Get addr of first incoming Java argument.
 211         __ z_lg(r_argument_addr, d_arg_argument_addr, r_entryframe_fp);
 212 
 213         // Let r_argumentcopy_addr point to last outgoing Java argument.
 214         __ add2reg(r_argumentcopy_addr, BytesPerWord, r_top_of_arguments_addr); // = Z_SP+160 effectively.
 215 
 216         // Let r_argument_addr point to last incoming Java argument.
 217         __ add2reg_with_index(r_argument_addr, -BytesPerWord,
 218                               r_argument_size_in_bytes, r_argument_addr);
 219 
 220         // Now loop while Z_R1 > 0 and copy arguments.
 221         {
 222           Label next_argument;
 223           __ bind(next_argument);
 224           // Mem-mem move.
 225           __ z_mvc(0, BytesPerWord-1, r_argumentcopy_addr, 0, r_argument_addr);
 226           __ add2reg(r_argument_addr,    -BytesPerWord);
 227           __ add2reg(r_argumentcopy_addr, BytesPerWord);
 228           __ z_brct(Z_R1, next_argument);
 229         }
 230       }  // End of argument copy loop.
 231 
 232       __ bind(arguments_copied);
 233     }
 234     BLOCK_COMMENT("} arguments");
 235 
 236     BLOCK_COMMENT("call {");
 237     {
 238       // Call frame manager or native entry.
 239 
 240       //
 241       // Register state on entry to frame manager / native entry:
 242       //
 243       //   Z_ARG1 = r_top_of_arguments_addr  - intptr_t *sender tos (prepushed)
 244       //                                       Lesp = (SP) + copied_arguments_offset - 8
 245       //   Z_method                          - method
 246       //   Z_thread                          - JavaThread*
 247       //
 248 
 249       // Here, the usual SP is the initial_caller_sp.
 250       __ z_lgr(Z_R10, Z_SP);
 251 
 252       // Z_esp points to the slot below the last argument.
 253       __ z_lgr(Z_esp, r_top_of_arguments_addr);
 254 
 255       //
 256       // Stack on entry to frame manager / native entry:
 257       //
 258       //     F0      [TOP_IJAVA_FRAME_ABI]
 259       //             [outgoing Java arguments]
 260       //             [ENTRY_FRAME_LOCALS]
 261       //     F1      [C_FRAME]
 262       //             ...
 263       //
 264 
 265       // Do a light-weight C-call here, r_new_arg_entry holds the address
 266       // of the interpreter entry point (frame manager or native entry)
 267       // and save runtime-value of return_pc in return_address
 268       // (call by reference argument).
 269       return_address = __ call_stub(r_new_arg_entry);
 270     }
 271     BLOCK_COMMENT("} call");
 272 
 273     {
 274       BLOCK_COMMENT("restore registers {");
 275       // Returned from frame manager or native entry.
 276       // Now pop frame, process result, and return to caller.
 277 
 278       //
 279       // Stack on exit from frame manager / native entry:
 280       //
 281       //     F0      [ABI]
 282       //             ...
 283       //             [ENTRY_FRAME_LOCALS]
 284       //     F1      [C_FRAME]
 285       //             ...
 286       //
 287       // Just pop the topmost frame ...
 288       //
 289 
 290       // Restore frame pointer.
 291       __ z_lg(r_entryframe_fp, _z_abi(callers_sp), Z_SP);
 292       // Pop frame. Done here to minimize stalls.
 293       __ pop_frame();
 294 
 295       // Reload some volatile registers which we've spilled before the call
 296       // to frame manager / native entry.
 297       // Access all locals via frame pointer, because we know nothing about
 298       // the topmost frame's size.
 299       __ z_lg(r_arg_result_addr, result_address_offset, r_entryframe_fp);
 300       __ z_lg(r_arg_result_type, result_type_offset, r_entryframe_fp);
 301 
 302       // Restore non-volatiles.
 303       __ z_lmg(Z_R6, Z_R14, 16, Z_SP);
 304       __ z_ld(Z_F8, 96, Z_SP);
 305       __ z_ld(Z_F9, 104, Z_SP);
 306       __ z_ld(Z_F10, 112, Z_SP);
 307       __ z_ld(Z_F11, 120, Z_SP);
 308       __ z_ld(Z_F12, 128, Z_SP);
 309       __ z_ld(Z_F13, 136, Z_SP);
 310       __ z_ld(Z_F14, 144, Z_SP);
 311       __ z_ld(Z_F15, 152, Z_SP);
 312       BLOCK_COMMENT("} restore");
 313 
 314       //
 315       // Stack on exit from call_stub:
 316       //
 317       //     0       [C_FRAME]
 318       //             ...
 319       //
 320       // No call_stub frames left.
 321       //
 322 
 323       // All non-volatiles have been restored at this point!!
 324 
 325       //------------------------------------------------------------------------
 326       // The following code makes some assumptions on the T_<type> enum values.
 327       // The enum is defined in globalDefinitions.hpp.
 328       // The validity of the assumptions is tested as far as possible.
 329       //   The assigned values should not be shuffled:
 330       //   T_BOOLEAN==4    - lowest used enum value
 331       //   T_NARROWOOP==16 - largest used enum value
 332       //------------------------------------------------------------------------
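           // The dispatch code below computes, in effect,
           //   handler_address = firstHandler + (result_type - T_BOOLEAN) * handlerLen
           // and branches there. Each handler below is aligned to handlerLen (8) bytes,
           // so the adjusted result type can serve directly as a scaled index.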
 333       BLOCK_COMMENT("process result {");
 334       Label firstHandler;
 335       int   handlerLen= 8;
 336 #ifdef ASSERT
 337       char  assertMsg[] = "check BasicType definition in globalDefinitions.hpp";
 338       __ z_chi(r_arg_result_type, T_BOOLEAN);
 339       __ asm_assert_low(assertMsg, 0x0234);
 340       __ z_chi(r_arg_result_type, T_NARROWOOP);
 341       __ asm_assert_high(assertMsg, 0x0235);
 342 #endif
 343       __ add2reg(r_arg_result_type, -T_BOOLEAN);          // Remove offset.
 344       __ z_larl(Z_R1, firstHandler);                      // location of first handler
 345       __ z_sllg(r_arg_result_type, r_arg_result_type, 3); // Each handler is 8 bytes long.
 346       __ z_bc(MacroAssembler::bcondAlways, 0, r_arg_result_type, Z_R1);
 347 
 348       __ align(handlerLen);
 349       __ bind(firstHandler);
 350       // T_BOOLEAN:
 351         guarantee(T_BOOLEAN == 4, "check BasicType definition in globalDefinitions.hpp");
 352         __ z_st(Z_RET, 0, r_arg_result_addr);
 353         __ z_br(Z_R14); // Return to caller.
 354         __ align(handlerLen);
 355       // T_CHAR:
 356         guarantee(T_CHAR == T_BOOLEAN+1, "check BasicType definition in globalDefinitions.hpp");
 357         __ z_st(Z_RET, 0, r_arg_result_addr);
 358         __ z_br(Z_R14); // Return to caller.
 359         __ align(handlerLen);
 360       // T_FLOAT:
 361         guarantee(T_FLOAT == T_CHAR+1, "check BasicType definition in globalDefinitions.hpp");
 362         __ z_ste(Z_FRET, 0, r_arg_result_addr);
 363         __ z_br(Z_R14); // Return to caller.
 364         __ align(handlerLen);
 365       // T_DOUBLE:
 366         guarantee(T_DOUBLE == T_FLOAT+1, "check BasicType definition in globalDefinitions.hpp");
 367         __ z_std(Z_FRET, 0, r_arg_result_addr);
 368         __ z_br(Z_R14); // Return to caller.
 369         __ align(handlerLen);
 370       // T_BYTE:
 371         guarantee(T_BYTE == T_DOUBLE+1, "check BasicType definition in globalDefinitions.hpp");
 372         __ z_st(Z_RET, 0, r_arg_result_addr);
 373         __ z_br(Z_R14); // Return to caller.
 374         __ align(handlerLen);
 375       // T_SHORT:
 376         guarantee(T_SHORT == T_BYTE+1, "check BasicType definition in globalDefinitions.hpp");
 377         __ z_st(Z_RET, 0, r_arg_result_addr);
 378         __ z_br(Z_R14); // Return to caller.
 379         __ align(handlerLen);
 380       // T_INT:
 381         guarantee(T_INT == T_SHORT+1, "check BasicType definition in globalDefinitions.hpp");
 382         __ z_st(Z_RET, 0, r_arg_result_addr);
 383         __ z_br(Z_R14); // Return to caller.
 384         __ align(handlerLen);
 385       // T_LONG:
 386         guarantee(T_LONG == T_INT+1, "check BasicType definition in globalDefinitions.hpp");
 387         __ z_stg(Z_RET, 0, r_arg_result_addr);
 388         __ z_br(Z_R14); // Return to caller.
 389         __ align(handlerLen);
 390       // T_OBJECT:
 391         guarantee(T_OBJECT == T_LONG+1, "check BasicType definition in globalDefinitions.hpp");
 392         __ z_stg(Z_RET, 0, r_arg_result_addr);
 393         __ z_br(Z_R14); // Return to caller.
 394         __ align(handlerLen);
 395       // T_ARRAY:
 396         guarantee(T_ARRAY == T_OBJECT+1, "check BasicType definition in globalDefinitions.hpp");
 397         __ z_stg(Z_RET, 0, r_arg_result_addr);
 398         __ z_br(Z_R14); // Return to caller.
 399         __ align(handlerLen);
 400       // T_VOID:
 401         guarantee(T_VOID == T_ARRAY+1, "check BasicType definition in globalDefinitions.hpp");
 402         __ z_stg(Z_RET, 0, r_arg_result_addr);
 403         __ z_br(Z_R14); // Return to caller.
 404         __ align(handlerLen);
 405       // T_ADDRESS:
 406         guarantee(T_ADDRESS == T_VOID+1, "check BasicType definition in globalDefinitions.hpp");
 407         __ z_stg(Z_RET, 0, r_arg_result_addr);
 408         __ z_br(Z_R14); // Return to caller.
 409         __ align(handlerLen);
 410       // T_NARROWOOP:
 411         guarantee(T_NARROWOOP == T_ADDRESS+1, "check BasicType definition in globalDefinitions.hpp");
 412         __ z_st(Z_RET, 0, r_arg_result_addr);
 413         __ z_br(Z_R14); // Return to caller.
 414         __ align(handlerLen);
 415       BLOCK_COMMENT("} process result");
 416     }
 417     return start;
 418   }
 419 
 420   // Return point for a Java call if there's an exception thrown in
 421   // Java code. The exception is caught and transformed into a
 422   // pending exception stored in JavaThread that can be tested from
 423   // within the VM.
 424   address generate_catch_exception() {
 425     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 426 
 427     address start = __ pc();
 428 
 429     //
 430     // Registers alive
 431     //
 432     //   Z_thread
 433     //   Z_ARG1 - address of pending exception
 434     //   Z_ARG2 - return address in call stub
 435     //
 436 
 437     const Register exception_file = Z_R0;
 438     const Register exception_line = Z_R1;
 439 
 440     __ load_const_optimized(exception_file, (void*)__FILE__);
 441     __ load_const_optimized(exception_line, (void*)__LINE__);
 442 
 443     __ z_stg(Z_ARG1, thread_(pending_exception));
 444     // Store into `char *'.
 445     __ z_stg(exception_file, thread_(exception_file));
 446     // Store into `int'.
 447     __ z_st(exception_line, thread_(exception_line));
 448 
 449     // Complete return to VM.
 450     assert(StubRoutines::_call_stub_return_address != NULL, "must have been generated before");
 451 
 452     // Continue in call stub.
 453     __ z_br(Z_ARG2);
 454 
 455     return start;
 456   }
 457 
 458   // Continuation point for runtime calls returning with a pending
 459   // exception. The pending exception check happened in the runtime
 460   // or native call stub. The pending exception in Thread is
 461   // converted into a Java-level exception.
 462   //
 463   // Read:
 464   //   Z_R14: pc the runtime library callee wants to return to.
 465   //   Since the exception occurred in the callee, the return pc
 466   //   from the point of view of Java is the exception pc.
 467   //
 468   // Invalidate:
 469   //   Volatile registers (except below).
 470   //
 471   // Update:
 472   //   Z_ARG1: exception
 473   //   (Z_R14 is unchanged and is live out).
 474   //
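       // In essence, the generated code
       //   1. saves the return pc (== exception pc) and calls
       //      SharedRuntime::exception_handler_for_return_address(thread, return_pc)
       //      to look up the handler for that pc, then
       //   2. reloads the pending exception oop into Z_ARG1, copies the exception pc
       //      into Z_ARG2, clears thread->_pending_exception, and jumps to the handler.
       //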
 475   address generate_forward_exception() {
 476     StubCodeMark mark(this, "StubRoutines", "forward_exception");
 477     address start = __ pc();
 478 
 479     #define pending_exception_offset in_bytes(Thread::pending_exception_offset())
 480 #ifdef ASSERT
 481     // Get pending exception oop.
 482     __ z_lg(Z_ARG1, pending_exception_offset, Z_thread);
 483 
 484     // Make sure that this code is only executed if there is a pending exception.
 485     {
 486       Label L;
 487       __ z_ltgr(Z_ARG1, Z_ARG1);
 488       __ z_brne(L);
 489       __ stop("StubRoutines::forward exception: no pending exception (1)");
 490       __ bind(L);
 491     }
 492 
 493     __ verify_oop(Z_ARG1, "StubRoutines::forward exception: not an oop");
 494 #endif
 495 
 496     __ z_lgr(Z_ARG2, Z_R14); // Copy exception pc into Z_ARG2.
 497     __ save_return_pc();
 498     __ push_frame_abi160(0);
 499     // Find exception handler.
 500     __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address),
 501                     Z_thread,
 502                     Z_ARG2);
 503     // Copy handler's address.
 504     __ z_lgr(Z_R1, Z_RET);
 505     __ pop_frame();
 506     __ restore_return_pc();
 507 
 508     // Set up the arguments for the exception handler:
 509     // - Z_ARG1: exception oop
 510     // - Z_ARG2: exception pc
 511 
 512     // Load pending exception oop.
 513     __ z_lg(Z_ARG1, pending_exception_offset, Z_thread);
 514 
 515     // The exception pc is the return address in the caller,
 516     // must load it into Z_ARG2
 517     __ z_lgr(Z_ARG2, Z_R14);
 518 
 519 #ifdef ASSERT
 520     // Make sure exception is set.
 521     { Label L;
 522       __ z_ltgr(Z_ARG1, Z_ARG1);
 523       __ z_brne(L);
 524       __ stop("StubRoutines::forward exception: no pending exception (2)");
 525       __ bind(L);
 526     }
 527 #endif
 528     // Clear the pending exception.
 529     __ clear_mem(Address(Z_thread, pending_exception_offset), sizeof(void *));
 530     // Jump to exception handler
 531     __ z_br(Z_R1 /*handler address*/);
 532 
 533     return start;
 534 
 535     #undef pending_exception_offset
 536   }
 537 
 538   // Continuation point for throwing of implicit exceptions that are
 539   // not handled in the current activation. Fabricates an exception
 540   // oop and initiates normal exception dispatching in this
 541   // frame. Only callee-saved registers are preserved (through the
 542   // normal RegisterMap handling). If the compiler
 543   // needs all registers to be preserved between the fault point and
 544   // the exception handler then it must assume responsibility for that
 545   // in AbstractCompiler::continuation_for_implicit_null_exception or
 546   // continuation_for_implicit_division_by_zero_exception. All other
 547   // implicit exceptions (e.g., NullPointerException or
 548   // AbstractMethodError on entry) are either at call sites or
 549   // otherwise assume that stack unwinding will be initiated, so
 550   // caller saved registers were assumed volatile in the compiler.
 551 
 552   // Note that we generate only this stub into a RuntimeStub, because
 553   // it needs to be properly traversed and ignored during GC, so we
 554   // change the meaning of the "__" macro within this method.
 555 
 556   // Note: the routine set_pc_not_at_call_for_caller in
 557   // SharedRuntime.cpp requires that this code be generated into a
 558   // RuntimeStub.
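       // Typical use (in the StubRoutines setup code elsewhere in this file) is
       // roughly along these lines:
       //
       //   StubRoutines::_throw_StackOverflowError_entry =
       //     generate_throw_exception("StackOverflowError throw_exception",
       //                              CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError),
       //                              false);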
 559 #undef __
 560 #define __ masm->
 561 
 562   address generate_throw_exception(const char* name, address runtime_entry,
 563                                    bool restore_saved_exception_pc,
 564                                    Register arg1 = noreg, Register arg2 = noreg) {
 565     assert_different_registers(arg1, Z_R0_scratch);  // would be destroyed by push_frame()
 566     assert_different_registers(arg2, Z_R0_scratch);  // would be destroyed by push_frame()
 567 
 568     int insts_size = 256;
 569     int locs_size  = 0;
 570     CodeBuffer      code(name, insts_size, locs_size);
 571     MacroAssembler* masm = new MacroAssembler(&code);
 572     int framesize_in_bytes;
 573     address start = __ pc();
 574 
 575     __ save_return_pc();
 576     framesize_in_bytes = __ push_frame_abi160(0);
 577 
 578     address frame_complete_pc = __ pc();
 579     if (restore_saved_exception_pc) {
 580       __ unimplemented("StubGenerator::throw_exception", 74);
 581     }
 582 
 583     // Note that we always have a runtime stub frame on the top of stack at this point.
 584     __ get_PC(Z_R1);
 585     __ set_last_Java_frame(/*sp*/Z_SP, /*pc*/Z_R1);
 586 
 587     // Do the call.
 588     BLOCK_COMMENT("call runtime_entry");
 589     __ call_VM_leaf(runtime_entry, Z_thread, arg1, arg2);
 590 
 591     __ reset_last_Java_frame();
 592 
 593 #ifdef ASSERT
 594     // Make sure that this code is only executed if there is a pending exception.
 595     { Label L;
 596       __ z_lg(Z_R0,
 597                 in_bytes(Thread::pending_exception_offset()),
 598                 Z_thread);
 599       __ z_ltgr(Z_R0, Z_R0);
 600       __ z_brne(L);
 601       __ stop("StubRoutines::throw_exception: no pending exception");
 602       __ bind(L);
 603     }
 604 #endif
 605 
 606     __ pop_frame();
 607     __ restore_return_pc();
 608 
 609     __ load_const_optimized(Z_R1, StubRoutines::forward_exception_entry());
 610     __ z_br(Z_R1);
 611 
 612     RuntimeStub* stub =
 613       RuntimeStub::new_runtime_stub(name, &code,
 614                                     frame_complete_pc - start,
 615                                     framesize_in_bytes/wordSize,
 616                                     NULL /*oop_maps*/, false);
 617 
 618     return stub->entry_point();
 619   }
 620 
 621 #undef __
 622 #ifdef PRODUCT
 623 #define __ _masm->
 624 #else
 625 #define __ (Verbose ? (_masm->block_comment(FILE_AND_LINE),_masm):_masm)->
 626 #endif
 627 
 628   // Support for uint StubRoutine::zarch::partial_subtype_check(Klass
 629   // sub, Klass super);
 630   //
 631   // Arguments:
 632   //   ret  : Z_RET, returned
 633   //   sub  : Z_ARG2, argument, not changed
 634   //   super: Z_ARG3, argument, not changed
 635   //
 636   //   raddr: Z_R14, blown by call
 637   //
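       //   Z_RET == 0 (CC "equal")     - sub is a subtype of super
       //   Z_RET == 1 (CC "not equal") - not a subtype (miss)
       //   The condition code is set so a caller can branch on it directly,
       //   without re-testing Z_RET.
       //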
 638   address generate_partial_subtype_check() {
 639     StubCodeMark mark(this, "StubRoutines", "partial_subtype_check");
 640     Label miss;
 641 
 642     address start = __ pc();
 643 
 644     const Register Rsubklass   = Z_ARG2; // subklass
 645     const Register Rsuperklass = Z_ARG3; // superklass
 646 
 647     // No args, but tmp registers that are killed.
 648     const Register Rlength     = Z_ARG4; // cache array length
 649     const Register Rarray_ptr  = Z_ARG5; // Current value from cache array.
 650 
 651     if (UseCompressedOops) {
 652       assert(Universe::heap() != NULL, "java heap must be initialized to generate partial_subtype_check stub");
 653     }
 654 
 655     // Always take the slow path.
 656     __ check_klass_subtype_slow_path(Rsubklass, Rsuperklass,
 657                                      Rarray_ptr, Rlength, NULL, &miss);
 658 
 659     // Match falls through here.
 660     __ clear_reg(Z_RET);               // Zero indicates a match. Set EQ flag in CC.
 661     __ z_br(Z_R14);
 662 
 663     __ BIND(miss);
 664     __ load_const_optimized(Z_RET, 1); // One indicates a miss.
 665     __ z_ltgr(Z_RET, Z_RET);           // Set NE flag in CC.
 666     __ z_br(Z_R14);
 667 
 668     return start;
 669   }
 670 
 671 #if !defined(PRODUCT)
 672   // Wrapper which calls oopDesc::is_oop_or_null()
 673   // Only called by MacroAssembler::verify_oop
 674   static void verify_oop_helper(const char* message, oopDesc* o) {
 675     if (!oopDesc::is_oop_or_null(o)) {
 676       fatal("%s. oop: " PTR_FORMAT, message, p2i(o));
 677     }
 678     ++ StubRoutines::_verify_oop_count;
 679   }
 680 #endif
 681 
 682   // Return address of code to be called from code generated by
 683   // MacroAssembler::verify_oop.
 684   //
 685   // Don't generate, rather use C++ code.
 686   address generate_verify_oop_subroutine() {
 687     // Don't generate a StubCodeMark, because no code is generated!
 688     // Generating the mark triggers notifying the oprofile jvmti agent
 689     // about the dynamic code generation, but the stub without
 690     // code (code_size == 0) confuses opjitconv
 691     // StubCodeMark mark(this, "StubRoutines", "verify_oop_stub");
 692 
 693     address start = 0;
 694 
 695 #if !defined(PRODUCT)
 696     start = CAST_FROM_FN_PTR(address, verify_oop_helper);
 697 #endif
 698 
 699     return start;
 700   }
 701 
 702   // This is to test that the count register contains a positive int value.
 703   // Required because C2 does not respect int to long conversion for stub calls.
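       // In other words: the caller may have set only the low 32 bits of the count
       // register. The arithmetic shift below leaves only bits 63..31 of the count
       // in Z_R0; for a properly zero-extended, non-negative int they are all zero.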
 704   void assert_positive_int(Register count) {
 705 #ifdef ASSERT
 706     __ z_srag(Z_R0, count, 31);  // Just leave the sign (must be zero) in Z_R0.
 707     __ asm_assert_eq("missing zero extend", 0xAFFE);
 708 #endif
 709   }
 710 
 711   //  Generate overlap test for array copy stubs.
 712   //  If no actual overlap is detected, control is transferred to the
 713   //  "normal" copy stub (entry address passed in disjoint_copy_target).
 714   //  Otherwise, execution continues with the code generated by the
 715   //  caller of array_overlap_test.
 716   //
 717   //  Input:
 718   //    Z_ARG1    - from
 719   //    Z_ARG2    - to
 720   //    Z_ARG3    - element count
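       //  In effect, the test below implements:
       //    disjoint if (to <= from) || (from + count*elem_size <= to)
       //    overlap  otherwise, i.e. if (from < to < from + count*elem_size)
       //  with elem_size == 1 << log2_elem_size.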
 721   void array_overlap_test(address disjoint_copy_target, int log2_elem_size) {
 722     __ MacroAssembler::compare_and_branch_optimized(Z_ARG2, Z_ARG1, Assembler::bcondNotHigh,
 723                                                     disjoint_copy_target, /*len64=*/true, /*has_sign=*/false);
 724 
 725     Register index = Z_ARG3;
 726     if (log2_elem_size > 0) {
 727       __ z_sllg(Z_R1, Z_ARG3, log2_elem_size);  // byte count
 728       index = Z_R1;
 729     }
 730     __ add2reg_with_index(Z_R1, 0, index, Z_ARG1);  // First byte after "from" range.
 731 
 732     __ MacroAssembler::compare_and_branch_optimized(Z_R1, Z_ARG2, Assembler::bcondNotHigh,
 733                                                     disjoint_copy_target, /*len64=*/true, /*has_sign=*/false);
 734 
 735     // Destructive overlap: let caller generate code for that.
 736   }
 737 
 738   //  Generate stub for disjoint array copy. If "aligned" is true, the
 739   //  "from" and "to" addresses are assumed to be heapword aligned.
 740   //
 741   //  Arguments for generated stub:
 742   //      from:  Z_ARG1
 743   //      to:    Z_ARG2
 744   //      count: Z_ARG3 treated as signed
 745   void generate_disjoint_copy(bool aligned, int element_size,
 746                               bool branchToEnd,
 747                               bool restoreArgs) {
 748     // This is the zarch specific stub generator for general array copy tasks.
 749     // It has the following prereqs and features:
 750     //
 751     // - No destructive overlap allowed (else unpredictable results).
 752     // - Destructive overlap does not exist if the leftmost byte of the target
 753     //   does not coincide with any of the source bytes (except the leftmost).
 754     //
 755     //   Register usage upon entry:
 756     //      Z_ARG1 == Z_R2 :   address of source array
 757     //      Z_ARG2 == Z_R3 :   address of target array
 758     //      Z_ARG3 == Z_R4 :   length of operands (# of elements on entry)
 759     //
 760     // Register usage within the generator:
 761     // - Z_R0 and Z_R1 are KILLed by the stub routine (target addr/len).
 762     //                 Used as pair register operand in complex moves, scratch registers anyway.
 763     // - Z_R5 is KILLed by the stub routine (source register pair addr/len) (even/odd reg).
 764     //                  Same as R0/R1, but no scratch register.
 765     // - Z_ARG1, Z_ARG2, Z_ARG3 are USEd but preserved by the stub routine,
 766     //                          but they might get temporarily overwritten.
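         // Copy-mode selection (see "mode selection" below), roughly:
         // - byte count <= 256:          one executed MVC (unrolled MVCs for 8-byte elements)
         // - 256 < byte count <= 4096:   MVC loop in 256-byte strides (+ MVCgeneral for the rest)
         // - byte count > 4096:          MVCLE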
 767 
 768     Register  save_reg    = Z_ARG4;   // (= Z_R5), holds original target operand address for restore.
 769 
 770     {
 771       Register   llen_reg = Z_R1;     // Holds left operand len (odd reg).
 772       Register  laddr_reg = Z_R0;     // Holds left operand addr (even reg), overlaps with data_reg.
 773       Register   rlen_reg = Z_R5;     // Holds right operand len (odd reg), overlaps with save_reg.
 774       Register  raddr_reg = Z_R4;     // Holds right operand addr (even reg), overlaps with len_reg.
 775 
 776       Register   data_reg = Z_R0;     // Holds copied data chunk in alignment process and copy loop.
 777       Register    len_reg = Z_ARG3;   // Holds operand len (#elements at entry, #bytes shortly after).
 778       Register    dst_reg = Z_ARG2;   // Holds left (target)  operand addr.
 779       Register    src_reg = Z_ARG1;   // Holds right (source) operand addr.
 780 
 781       Label     doMVCLOOP, doMVCLOOPcount, doMVCLOOPiterate;
 782       Label     doMVCUnrolled;
 783       NearLabel doMVC,  doMVCgeneral, done;
 784       Label     MVC_template;
 785       address   pcMVCblock_b, pcMVCblock_e;
 786 
 787       bool      usedMVCLE       = true;
 788       bool      usedMVCLOOP     = true;
 789       bool      usedMVCUnrolled = false;
 790       bool      usedMVC         = false;
 791       bool      usedMVCgeneral  = false;
 792 
 793       int       stride;
 794       Register  stride_reg;
 795       Register  ix_reg;
 796 
 797       assert((element_size<=256) && (256%element_size == 0), "element size must be <= 256, power of 2");
 798       unsigned int log2_size = exact_log2(element_size);
 799 
 800       switch (element_size) {
 801         case 1:  BLOCK_COMMENT("ARRAYCOPY DISJOINT byte  {"); break;
 802         case 2:  BLOCK_COMMENT("ARRAYCOPY DISJOINT short {"); break;
 803         case 4:  BLOCK_COMMENT("ARRAYCOPY DISJOINT int   {"); break;
 804         case 8:  BLOCK_COMMENT("ARRAYCOPY DISJOINT long  {"); break;
 805         default: BLOCK_COMMENT("ARRAYCOPY DISJOINT       {"); break;
 806       }
 807 
 808       assert_positive_int(len_reg);
 809 
 810       BLOCK_COMMENT("preparation {");
 811 
 812       // No copying if len <= 0.
 813       if (branchToEnd) {
 814         __ compare64_and_branch(len_reg, (intptr_t) 0, Assembler::bcondNotHigh, done);
 815       } else {
 816         if (VM_Version::has_CompareBranch()) {
 817           __ z_cgib(len_reg, 0, Assembler::bcondNotHigh, 0, Z_R14);
 818         } else {
 819           __ z_ltgr(len_reg, len_reg);
 820           __ z_bcr(Assembler::bcondNotPositive, Z_R14);
 821         }
 822       }
 823 
 824       // Prefetch just one cache line. Speculative opt for short arrays.
 825       // Do not use Z_R1 in prefetch. Its value is undefined here.
 826       if (VM_Version::has_Prefetch()) {
 827         __ z_pfd(0x01, 0, Z_R0, src_reg); // Fetch access.
 828         __ z_pfd(0x02, 0, Z_R0, dst_reg); // Store access.
 829       }
 830 
 831       BLOCK_COMMENT("} preparation");
 832 
 833       // Save args only if really needed.
 834       // Keep len test local to branch. Is generated only once.
 835 
 836       BLOCK_COMMENT("mode selection {");
 837 
 838       // Special handling for arrays with only a few elements.
 839       // Nothing fancy: just an executed MVC.
 840       if (log2_size > 0) {
 841         __ z_sllg(Z_R1, len_reg, log2_size); // Remember #bytes in Z_R1.
 842       }
 843       if (element_size != 8) {
 844         __ z_cghi(len_reg, 256/element_size);
 845         __ z_brnh(doMVC);
 846         usedMVC = true;
 847       }
 848       if (element_size == 8) { // Long and oop arrays are always aligned.
 849         __ z_cghi(len_reg, 256/element_size);
 850         __ z_brnh(doMVCUnrolled);
 851         usedMVCUnrolled = true;
 852       }
 853 
 854       // Prefetch another cache line. We, for sure, have more than one line to copy.
 855       if (VM_Version::has_Prefetch()) {
 856         __ z_pfd(0x01, 256, Z_R0, src_reg); // Fetch access.
 857         __ z_pfd(0x02, 256, Z_R0, dst_reg); // Store access.
 858       }
 859 
 860       if (restoreArgs) {
 861         // Remember entry value of ARG2 to restore all arguments later from that knowledge.
 862         __ z_lgr(save_reg, dst_reg);
 863       }
 864 
 865       __ z_cghi(len_reg, 4096/element_size);
 866       if (log2_size == 0) {
 867         __ z_lgr(Z_R1, len_reg); // Init Z_R1 with #bytes
 868       }
 869       __ z_brnh(doMVCLOOP);
 870 
 871       // Fall through to MVCLE case.
 872 
 873       BLOCK_COMMENT("} mode selection");
 874 
 875       // MVCLE: for long arrays
 876       //   DW aligned: Best performance for sizes > 4kBytes.
 877       //   unaligned:  Least complex for sizes > 256 bytes.
 878       if (usedMVCLE) {
 879         BLOCK_COMMENT("mode MVCLE {");
 880 
 881         // Setup registers for mvcle.
 882         //__ z_lgr(llen_reg, len_reg);// r1 <- r4  #bytes already in Z_R1, aka llen_reg.
 883         __ z_lgr(laddr_reg, dst_reg); // r0 <- r3
 884         __ z_lgr(raddr_reg, src_reg); // r4 <- r2
 885         __ z_lgr(rlen_reg, llen_reg); // r5 <- r1
 886 
 887         __ MacroAssembler::move_long_ext(laddr_reg, raddr_reg, 0xb0);    // special: bypass cache
 888         // __ MacroAssembler::move_long_ext(laddr_reg, raddr_reg, 0xb8); // special: Hold data in cache.
 889         // __ MacroAssembler::move_long_ext(laddr_reg, raddr_reg, 0);
 890 
 891         if (restoreArgs) {
 892           // MVCLE updates the source (Z_R4,Z_R5) and target (Z_R0,Z_R1) register pairs.
 893           // Dst_reg (Z_ARG2) and src_reg (Z_ARG1) are left untouched. No restore required.
 894           // Len_reg (Z_ARG3) is destroyed and must be restored.
 895           __ z_slgr(laddr_reg, dst_reg);    // copied #bytes
 896           if (log2_size > 0) {
 897             __ z_srag(Z_ARG3, laddr_reg, log2_size); // Convert back to #elements.
 898           } else {
 899             __ z_lgr(Z_ARG3, laddr_reg);
 900           }
 901         }
 902         if (branchToEnd) {
 903           __ z_bru(done);
 904         } else {
 905           __ z_br(Z_R14);
 906         }
 907         BLOCK_COMMENT("} mode MVCLE");
 908       }
 909       // No fallthru possible here.
 910 
 911       //  MVCUnrolled: for short, aligned arrays.
 912 
 913       if (usedMVCUnrolled) {
 914         BLOCK_COMMENT("mode MVC unrolled {");
 915         stride = 8;
 916 
 917         // Generate unrolled MVC instructions.
 918         for (int ii = 32; ii > 1; ii--) {
 919           __ z_mvc(0, ii * stride-1, dst_reg, 0, src_reg); // ii*8 byte copy
 920           if (branchToEnd) {
 921             __ z_bru(done);
 922           } else {
 923             __ z_br(Z_R14);
 924           }
 925         }
 926 
 927         pcMVCblock_b = __ pc();
 928         __ z_mvc(0, 1 * stride-1, dst_reg, 0, src_reg); // 8 byte copy
 929         if (branchToEnd) {
 930           __ z_bru(done);
 931         } else {
 932           __ z_br(Z_R14);
 933         }
 934 
 935         pcMVCblock_e = __ pc();
 936         Label MVC_ListEnd;
 937         __ bind(MVC_ListEnd);
 938 
 939         // This is an absolute fast path:
 940         // - Array len in bytes must not be greater than 256.
 941         // - Array len in bytes must be an integer multiple of DW
 942         //   to save expensive handling of trailing bytes.
 943         // - Argument restore is not done,
 944         //   i.e. previous code must not alter arguments (this code doesn't either).
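             // The dispatch code after the doMVCUnrolled label effectively computes
             //   branch_target = MVC_ListEnd - (#DW * MVCblocksize)
             // so a copy of n doublewords enters the list n blocks before its end, at
             // the MVC instruction that moves exactly n*8 bytes.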
 945 
 946         __ bind(doMVCUnrolled);
 947 
 948         // Avoid mul, prefer shift where possible.
 949         // Combine shift right (for #DW) with shift left (for block size).
 950         // Set CC for zero test below (asm_assert).
 951         // Note: #bytes comes in Z_R1, #DW in len_reg.
 952         unsigned int MVCblocksize    = pcMVCblock_e - pcMVCblock_b;
 953         unsigned int logMVCblocksize = 0xffffffffU; // Pacify compiler ("used uninitialized" warning).
 954 
 955         if (log2_size > 0) { // Len was scaled into Z_R1.
 956           switch (MVCblocksize) {
 957 
 958             case  8: logMVCblocksize = 3;
 959                      __ z_ltgr(Z_R0, Z_R1); // #bytes is index
 960                      break;                 // reasonable size, use shift
 961 
 962             case 16: logMVCblocksize = 4;
 963                      __ z_slag(Z_R0, Z_R1, logMVCblocksize-log2_size);
 964                      break;                 // reasonable size, use shift
 965 
 966             default: logMVCblocksize = 0;
 967                      __ z_ltgr(Z_R0, len_reg); // #DW for mul
 968                      break;                 // all other sizes: use mul
 969           }
 970         } else {
 971           guarantee(log2_size, "doMVCUnrolled: only for DW entities");
 972         }
 973 
 974         // This test (and branch) is redundant. Previous code makes sure that
 975         //  - element count > 0
 976         //  - element size == 8.
 977         // Thus, len reg should never be zero here. We insert an asm_assert() here,
 978         // just to double-check and to be on the safe side.
 979         __ asm_assert(false, "zero len cannot occur", 99);
 980 
 981         __ z_larl(Z_R1, MVC_ListEnd);        // Get addr of last instr block.
 982         // Avoid mul, prefer shift where possible.
 983         if (logMVCblocksize == 0) {
 984           __ z_mghi(Z_R0, MVCblocksize);
 985         }
 986         __ z_slgr(Z_R1, Z_R0);
 987         __ z_br(Z_R1);
 988         BLOCK_COMMENT("} mode MVC unrolled");
 989       }
 990       // No fallthru possible here.
 991 
 992       // MVC execute template
 993       // Must always generate. Usage may be switched on below.
 994       // There is no suitable place after here to put the template.
 995       __ bind(MVC_template);
 996       __ z_mvc(0,0,dst_reg,0,src_reg);      // Instr template, never exec directly!
 997 
 998 
 999       // MVC Loop: for medium-sized arrays
1000 
1001       // Only for DW aligned arrays (src and dst).
1002       // #bytes to copy must be at least 256!!!
1003       // Non-aligned cases handled separately.
1004       stride     = 256;
1005       stride_reg = Z_R1;   // Holds #bytes when control arrives here.
1006       ix_reg     = Z_ARG3; // Alias for len_reg.
1007 
1008 
1009       if (usedMVCLOOP) {
1010         BLOCK_COMMENT("mode MVC loop {");
1011         __ bind(doMVCLOOP);
1012 
1013         __ z_lcgr(ix_reg, Z_R1);         // Ix runs from -(n-2)*stride to 1*stride (inclusive).
1014         __ z_llill(stride_reg, stride);
1015         __ add2reg(ix_reg, 2*stride);    // Thus: increment ix by 2*stride.
1016 
1017         __ bind(doMVCLOOPiterate);
1018           __ z_mvc(0, stride-1, dst_reg, 0, src_reg);
1019           __ add2reg(dst_reg, stride);
1020           __ add2reg(src_reg, stride);
1021           __ bind(doMVCLOOPcount);
1022           __ z_brxlg(ix_reg, stride_reg, doMVCLOOPiterate);
1023 
1024         // Don't use add2reg() here, since we must set the condition code!
1025         __ z_aghi(ix_reg, -2*stride);       // Compensate incr from above: zero diff means "all copied".
1026 
1027         if (restoreArgs) {
1028           __ z_lcgr(Z_R1, ix_reg);          // Prepare ix_reg for copy loop, #bytes expected in Z_R1.
1029           __ z_brnz(doMVCgeneral);          // We're not done yet, ix_reg is not zero.
1030 
1031           // ARG1, ARG2, and ARG3 were altered by the code above, so restore them building on save_reg.
1032           __ z_slgr(dst_reg, save_reg);     // copied #bytes
1033           __ z_slgr(src_reg, dst_reg);      // = ARG1 (now restored)
1034           if (log2_size) {
1035             __ z_srag(Z_ARG3, dst_reg, log2_size); // Convert back to #elements to restore ARG3.
1036           } else {
1037             __ z_lgr(Z_ARG3, dst_reg);
1038           }
1039           __ z_lgr(Z_ARG2, save_reg);       // ARG2 now restored.
1040 
1041           if (branchToEnd) {
1042             __ z_bru(done);
1043           } else {
1044             __ z_br(Z_R14);
1045           }
1046 
1047         } else {
1048           if (branchToEnd) {
1049             __ z_brz(done);                        // CC set by aghi instr.
1050           } else {
1051             __ z_bcr(Assembler::bcondZero, Z_R14); // We're all done if zero.
1052           }
1053 
1054           __ z_lcgr(Z_R1, ix_reg);    // Prepare ix_reg for copy loop, #bytes expected in Z_R1.
1055           // __ z_bru(doMVCgeneral);  // fallthru
1056         }
1057         usedMVCgeneral = true;
1058         BLOCK_COMMENT("} mode MVC loop");
1059       }
1060       // Fallthru to doMVCgeneral
1061 
1062       // MVCgeneral: for short, unaligned arrays, after other copy operations
1063 
1064       // Somewhat expensive due to use of EX instruction, but simple.
1065       if (usedMVCgeneral) {
1066         BLOCK_COMMENT("mode MVC general {");
1067         __ bind(doMVCgeneral);
1068 
1069         __ add2reg(len_reg, -1, Z_R1);             // Get #bytes-1 for EXECUTE.
1070         if (VM_Version::has_ExecuteExtensions()) {
1071           __ z_exrl(len_reg, MVC_template);        // Execute MVC with variable length.
1072         } else {
1073           __ z_larl(Z_R1, MVC_template);           // Get addr of instr template.
1074           __ z_ex(len_reg, 0, Z_R0, Z_R1);         // Execute MVC with variable length.
1075         }                                          // penalty: 9 ticks
1076 
1077         if (restoreArgs) {
1078           // ARG1, ARG2, and ARG3 were altered by code executed before, so restore them building on save_reg
1079           __ z_slgr(dst_reg, save_reg);            // Copied #bytes without the "doMVCgeneral" chunk
1080           __ z_slgr(src_reg, dst_reg);             // = ARG1 (now restored), was not advanced for "doMVCgeneral" chunk
1081           __ add2reg_with_index(dst_reg, 1, len_reg, dst_reg); // Len of executed MVC was not accounted for, yet.
1082           if (log2_size) {
1083             __ z_srag(Z_ARG3, dst_reg, log2_size); // Convert back to #elements to restore ARG3
1084           } else {
1085              __ z_lgr(Z_ARG3, dst_reg);
1086           }
1087           __ z_lgr(Z_ARG2, save_reg);              // ARG2 now restored.
1088         }
1089 
1090         if (usedMVC) {
1091           if (branchToEnd) {
1092             __ z_bru(done);
1093           } else {
1094             __ z_br(Z_R14);
1095           }
1096         } else {
1097           if (!branchToEnd) __ z_br(Z_R14);
1098         }
1099         BLOCK_COMMENT("} mode MVC general");
1100       }
1101       // Fallthru possible if following block not generated.
1102 
1103       // MVC: for short, unaligned arrays
1104 
1105       // Somewhat expensive due to use of EX instruction, but simple. penalty: 9 ticks.
1106       // Differs from doMVCgeneral in reconstruction of ARG2, ARG3, and ARG4.
1107       if (usedMVC) {
1108         BLOCK_COMMENT("mode MVC {");
1109         __ bind(doMVC);
1110 
1111         // get #bytes-1 for EXECUTE
1112         if (log2_size) {
1113           __ add2reg(Z_R1, -1);                // Length was scaled into Z_R1.
1114         } else {
1115           __ add2reg(Z_R1, -1, len_reg);       // Length was not scaled.
1116         }
1117 
1118         if (VM_Version::has_ExecuteExtensions()) {
1119           __ z_exrl(Z_R1, MVC_template);       // Execute MVC with variable length.
1120         } else {
1121           __ z_lgr(Z_R0, Z_R5);                // Save ARG4, may be unnecessary.
1122           __ z_larl(Z_R5, MVC_template);       // Get addr of instr template.
1123           __ z_ex(Z_R1, 0, Z_R0, Z_R5);        // Execute MVC with variable length.
1124           __ z_lgr(Z_R5, Z_R0);                // Restore ARG4, may be unnecessary.
1125         }
1126 
1127         if (!branchToEnd) {
1128           __ z_br(Z_R14);
1129         }
1130         BLOCK_COMMENT("} mode MVC");
1131       }
1132 
1133       __ bind(done);
1134 
1135       switch (element_size) {
1136         case 1:  BLOCK_COMMENT("} ARRAYCOPY DISJOINT byte "); break;
1137         case 2:  BLOCK_COMMENT("} ARRAYCOPY DISJOINT short"); break;
1138         case 4:  BLOCK_COMMENT("} ARRAYCOPY DISJOINT int  "); break;
1139         case 8:  BLOCK_COMMENT("} ARRAYCOPY DISJOINT long "); break;
1140         default: BLOCK_COMMENT("} ARRAYCOPY DISJOINT      "); break;
1141       }
1142     }
1143   }
1144 
1145   // Generate stub for conjoint array copy. If "aligned" is true, the
1146   // "from" and "to" addresses are assumed to be heapword aligned.
1147   //
1148   // Arguments for generated stub:
1149   //   from:  Z_ARG1
1150   //   to:    Z_ARG2
1151   //   count: Z_ARG3 treated as signed
1152   void generate_conjoint_copy(bool aligned, int element_size, bool branchToEnd) {
1153 
1154     // This is the zarch specific stub generator for general array copy tasks.
1155     // It has the following prereqs and features:
1156     //
1157     // - Destructive overlap exists and is handled by reverse copy.
1158     // - Destructive overlap exists if the leftmost byte of the target
1159     //   coincides with one of the source bytes (except the leftmost).
1160     // - Z_R0 and Z_R1 are KILLed by the stub routine (data and stride)
1161     // - Z_ARG1 and Z_ARG2 are USEd but preserved by the stub routine.
1162     // - Z_ARG3 is USED but preserved by the stub routine.
1163     // - Z_ARG4 is used as index register and is thus KILLed.
1164     //
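         // The copy runs backwards: ix_reg starts at the byte length and counts down
         // to zero. A short prologue copies up to 1+2+4 trailing bytes so that the
         // remaining length is a multiple of 8; the main loop then moves one
         // doubleword per iteration, stepping ix_reg by -8 via BRXHG.
         //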
1165     {
1166       Register stride_reg = Z_R1;     // Stride & compare value in loop (negative element_size).
1167       Register   data_reg = Z_R0;     // Holds value of currently processed element.
1168       Register     ix_reg = Z_ARG4;   // Holds byte index of currently processed element.
1169       Register    len_reg = Z_ARG3;   // Holds length (in #elements) of arrays.
1170       Register    dst_reg = Z_ARG2;   // Holds left  operand addr.
1171       Register    src_reg = Z_ARG1;   // Holds right operand addr.
1172 
1173       assert(256%element_size == 0, "Element size must be power of 2.");
1174       assert(element_size     <= 8, "Can't handle more than DW units.");
1175 
1176       switch (element_size) {
1177         case 1:  BLOCK_COMMENT("ARRAYCOPY CONJOINT byte  {"); break;
1178         case 2:  BLOCK_COMMENT("ARRAYCOPY CONJOINT short {"); break;
1179         case 4:  BLOCK_COMMENT("ARRAYCOPY CONJOINT int   {"); break;
1180         case 8:  BLOCK_COMMENT("ARRAYCOPY CONJOINT long  {"); break;
1181         default: BLOCK_COMMENT("ARRAYCOPY CONJOINT       {"); break;
1182       }
1183 
1184       assert_positive_int(len_reg);
1185 
1186       if (VM_Version::has_Prefetch()) {
1187         __ z_pfd(0x01, 0, Z_R0, src_reg); // Fetch access.
1188         __ z_pfd(0x02, 0, Z_R0, dst_reg); // Store access.
1189       }
1190 
1191       unsigned int log2_size = exact_log2(element_size);
1192       if (log2_size) {
1193         __ z_sllg(ix_reg, len_reg, log2_size);
1194       } else {
1195         __ z_lgr(ix_reg, len_reg);
1196       }
1197 
1198       // Optimize reverse copy loop.
1199       // Main loop copies DW units which may be unaligned. Unaligned access adds some penalty ticks.
1200       // Unaligned DW access (neither fetch nor store) is DW-atomic, but should be alignment-atomic.
1201       // Preceding the main loop, some bytes are copied to obtain a DW-multiple remaining length.
1202 
1203       Label countLoop1;
1204       Label copyLoop1;
1205       Label skipBY;
1206       Label skipHW;
1207       int   stride = -8;
1208 
1209       __ load_const_optimized(stride_reg, stride); // Prepare for DW copy loop.
1210 
1211       if (element_size == 8)    // Nothing to do here.
1212         __ z_bru(countLoop1);
1213       else {                    // Do not generate dead code.
1214         __ z_tmll(ix_reg, 7);   // Check the "odd" bits.
1215         __ z_bre(countLoop1);   // There are none, very good!
1216       }
1217 
1218       if (log2_size == 0) {     // Handle leftover Byte.
1219         __ z_tmll(ix_reg, 1);
1220         __ z_bre(skipBY);
1221         __ z_lb(data_reg,   -1, ix_reg, src_reg);
1222         __ z_stcy(data_reg, -1, ix_reg, dst_reg);
1223         __ add2reg(ix_reg, -1); // Decrement delayed to avoid AGI.
1224         __ bind(skipBY);
1225         // fallthru
1226       }
1227       if (log2_size <= 1) {     // Handle leftover HW.
1228         __ z_tmll(ix_reg, 2);
1229         __ z_bre(skipHW);
1230         __ z_lhy(data_reg,  -2, ix_reg, src_reg);
1231         __ z_sthy(data_reg, -2, ix_reg, dst_reg);
1232         __ add2reg(ix_reg, -2); // Decrement delayed to avoid AGI.
1233         __ bind(skipHW);
1234         __ z_tmll(ix_reg, 4);
1235         __ z_bre(countLoop1);
1236         // fallthru
1237       }
1238       if (log2_size <= 2) {     // There are just 4 bytes (left) that need to be copied.
1239         __ z_ly(data_reg,  -4, ix_reg, src_reg);
1240         __ z_sty(data_reg, -4, ix_reg, dst_reg);
1241         __ add2reg(ix_reg, -4); // Decrement delayed to avoid AGI.
1242         __ z_bru(countLoop1);
1243       }
1244 
1245       // Control can never get to here. Never! Never ever!
1246       __ z_illtrap(0x99);
1247       __ bind(copyLoop1);
1248       __ z_lg(data_reg,  0, ix_reg, src_reg);
1249       __ z_stg(data_reg, 0, ix_reg, dst_reg);
1250       __ bind(countLoop1);
1251       __ z_brxhg(ix_reg, stride_reg, copyLoop1);
1252 
1253       if (!branchToEnd)
1254         __ z_br(Z_R14);
1255 
1256       switch (element_size) {
1257         case 1:  BLOCK_COMMENT("} ARRAYCOPY CONJOINT byte "); break;
1258         case 2:  BLOCK_COMMENT("} ARRAYCOPY CONJOINT short"); break;
1259         case 4:  BLOCK_COMMENT("} ARRAYCOPY CONJOINT int  "); break;
1260         case 8:  BLOCK_COMMENT("} ARRAYCOPY CONJOINT long "); break;
1261         default: BLOCK_COMMENT("} ARRAYCOPY CONJOINT      "); break;
1262       }
1263     }
1264   }
1265 
1266   // Generate stub for disjoint byte copy. If "aligned" is true, the
1267   // "from" and "to" addresses are assumed to be heapword aligned.
1268   address generate_disjoint_byte_copy(bool aligned, const char * name) {
1269     StubCodeMark mark(this, "StubRoutines", name);
1270 
1271     // This is the zarch specific stub generator for byte array copy.
1272     // Refer to generate_disjoint_copy for a list of prereqs and features:
1273     unsigned int start_off = __ offset();  // Remember stub start address (is rtn value).
1274     generate_disjoint_copy(aligned, 1, false, false);
1275     return __ addr_at(start_off);
1276   }
1277 
1278 
1279   address generate_disjoint_short_copy(bool aligned, const char * name) {
1280     StubCodeMark mark(this, "StubRoutines", name);
1281     // This is the zarch specific stub generator for short array copy.
1282     // Refer to generate_disjoint_copy for a list of prereqs and features:
1283     unsigned int start_off = __ offset();  // Remember stub start address (is rtn value).
1284     generate_disjoint_copy(aligned, 2, false, false);
1285     return __ addr_at(start_off);
1286   }
1287 
1288 
1289   address generate_disjoint_int_copy(bool aligned, const char * name) {
1290     StubCodeMark mark(this, "StubRoutines", name);
1291     // This is the zarch specific stub generator for int array copy.
1292     // Refer to generate_disjoint_copy for a list of prereqs and features:
1293     unsigned int start_off = __ offset();  // Remember stub start address (is rtn value).
1294     generate_disjoint_copy(aligned, 4, false, false);
1295     return __ addr_at(start_off);
1296   }
1297 
1298 
1299   address generate_disjoint_long_copy(bool aligned, const char * name) {
1300     StubCodeMark mark(this, "StubRoutines", name);
1301     // This is the zarch specific stub generator for long array copy.
1302     // Refer to generate_disjoint_copy for a list of prereqs and features:
1303     unsigned int start_off = __ offset();  // Remember stub start address (is rtn value).
1304     generate_disjoint_copy(aligned, 8, false, false);
1305     return __ addr_at(start_off);
1306   }
1307 
1308 
1309   address generate_disjoint_oop_copy(bool aligned, const char * name, bool dest_uninitialized) {
1310     StubCodeMark mark(this, "StubRoutines", name);
1311     // This is the zarch specific stub generator for oop array copy.
1312     // Refer to generate_disjoint_copy for a list of prereqs and features.
1313     unsigned int start_off = __ offset();  // Remember stub start address (is rtn value).
1314     unsigned int size      = UseCompressedOops ? 4 : 8;
1315 
1316     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1317     if (dest_uninitialized) {
1318       decorators |= IS_DEST_UNINITIALIZED;
1319     }
1320     if (aligned) {
1321       decorators |= ARRAYCOPY_ALIGNED;
1322     }
1323 
1324     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1325     bs->arraycopy_prologue(_masm, decorators, T_OBJECT, Z_ARG1, Z_ARG2, Z_ARG3);
1326 
1327     generate_disjoint_copy(aligned, size, true, true);
1328 
1329     bs->arraycopy_epilogue(_masm, decorators, T_OBJECT, Z_ARG2, Z_ARG3, true);
1330 
1331     return __ addr_at(start_off);
1332   }
1333 
1334 
1335   address generate_conjoint_byte_copy(bool aligned, const char * name) {
1336     StubCodeMark mark(this, "StubRoutines", name);
1337     // This is the zarch specific stub generator for overlapping byte array copy.
1338     // Refer to generate_conjoint_copy for a list of prereqs and features:
1339     unsigned int   start_off = __ offset();  // Remember stub start address (is rtn value).
1340     address nooverlap_target = aligned ? StubRoutines::arrayof_jbyte_disjoint_arraycopy()
1341                                        : StubRoutines::jbyte_disjoint_arraycopy();
1342 
1343     array_overlap_test(nooverlap_target, 0); // Branch away to nooverlap_target if disjoint.
1344     generate_conjoint_copy(aligned, 1, false);
1345 
1346     return __ addr_at(start_off);
1347   }
1348 
1349 
1350   address generate_conjoint_short_copy(bool aligned, const char * name) {
1351     StubCodeMark mark(this, "StubRoutines", name);
1352     // This is the zarch specific stub generator for overlapping short array copy.
1353     // Refer to generate_conjoint_copy for a list of prereqs and features:
1354     unsigned int   start_off = __ offset();  // Remember stub start address (is rtn value).
1355     address nooverlap_target = aligned ? StubRoutines::arrayof_jshort_disjoint_arraycopy()
1356                                        : StubRoutines::jshort_disjoint_arraycopy();
1357 
1358     array_overlap_test(nooverlap_target, 1); // Branch away to nooverlap_target if disjoint.
1359     generate_conjoint_copy(aligned, 2, false);
1360 
1361     return __ addr_at(start_off);
1362   }
1363 
1364   address generate_conjoint_int_copy(bool aligned, const char * name) {
1365     StubCodeMark mark(this, "StubRoutines", name);
1366     // This is the zarch specific stub generator for overlapping int array copy.
1367     // Refer to generate_conjoint_copy for a list of prereqs and features:
1368 
1369     unsigned int   start_off = __ offset();  // Remember stub start address (is rtn value).
1370     address nooverlap_target = aligned ? StubRoutines::arrayof_jint_disjoint_arraycopy()
1371                                        : StubRoutines::jint_disjoint_arraycopy();
1372 
1373     array_overlap_test(nooverlap_target, 2); // Branch away to nooverlap_target if disjoint.
1374     generate_conjoint_copy(aligned, 4, false);
1375 
1376     return __ addr_at(start_off);
1377   }
1378 
1379   address generate_conjoint_long_copy(bool aligned, const char * name) {
1380     StubCodeMark mark(this, "StubRoutines", name);
1381     // This is the zarch specific stub generator for overlapping long array copy.
1382     // Refer to generate_conjoint_copy for a list of prereqs and features:
1383 
1384     unsigned int start_off   = __ offset();  // Remember stub start address (is rtn value).
1385     address nooverlap_target = aligned ? StubRoutines::arrayof_jlong_disjoint_arraycopy()
1386                                        : StubRoutines::jlong_disjoint_arraycopy();
1387 
1388     array_overlap_test(nooverlap_target, 3); // Branch away to nooverlap_target if disjoint.
1389     generate_conjoint_copy(aligned, 8, false);
1390 
1391     return __ addr_at(start_off);
1392   }
1393 
1394   address generate_conjoint_oop_copy(bool aligned, const char * name, bool dest_uninitialized) {
1395     StubCodeMark mark(this, "StubRoutines", name);
1396     // This is the zarch specific stub generator for overlapping oop array copy.
1397     // Refer to generate_conjoint_copy for a list of prereqs and features.
1398     unsigned int start_off = __ offset();  // Remember stub start address (is rtn value).
1399     unsigned int size      = UseCompressedOops ? 4 : 8;
1400     unsigned int shift     = UseCompressedOops ? 2 : 3;
1401 
1402     address nooverlap_target = aligned ? StubRoutines::arrayof_oop_disjoint_arraycopy(dest_uninitialized)
1403                                        : StubRoutines::oop_disjoint_arraycopy(dest_uninitialized);
1404 
1405     // Branch to disjoint_copy (if applicable) before pre_barrier to avoid double pre_barrier.
1406     array_overlap_test(nooverlap_target, shift);  // Branch away to nooverlap_target if disjoint.
1407 
1408     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1409     if (dest_uninitialized) {
1410       decorators |= IS_DEST_UNINITIALIZED;
1411     }
1412     if (aligned) {
1413       decorators |= ARRAYCOPY_ALIGNED;
1414     }
1415 
1416     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1417     bs->arraycopy_prologue(_masm, decorators, T_OBJECT, Z_ARG1, Z_ARG2, Z_ARG3);
1418 
1419     generate_conjoint_copy(aligned, size, true);  // Must preserve ARG2, ARG3.
1420 
1421     bs->arraycopy_epilogue(_masm, decorators, T_OBJECT, Z_ARG2, Z_ARG3, true);
1422 
1423     return __ addr_at(start_off);
1424   }
1425 
1426 
1427   void generate_arraycopy_stubs() {
1428 
1429     // Note: the disjoint stubs must be generated first, some of
1430     // the conjoint stubs use them.
1431     StubRoutines::_jbyte_disjoint_arraycopy      = generate_disjoint_byte_copy (false, "jbyte_disjoint_arraycopy");
1432     StubRoutines::_jshort_disjoint_arraycopy     = generate_disjoint_short_copy(false, "jshort_disjoint_arraycopy");
1433     StubRoutines::_jint_disjoint_arraycopy       = generate_disjoint_int_copy  (false, "jint_disjoint_arraycopy");
1434     StubRoutines::_jlong_disjoint_arraycopy      = generate_disjoint_long_copy (false, "jlong_disjoint_arraycopy");
1435     StubRoutines::_oop_disjoint_arraycopy        = generate_disjoint_oop_copy  (false, "oop_disjoint_arraycopy", false);
1436     StubRoutines::_oop_disjoint_arraycopy_uninit = generate_disjoint_oop_copy  (false, "oop_disjoint_arraycopy_uninit", true);
1437 
1438     StubRoutines::_arrayof_jbyte_disjoint_arraycopy      = generate_disjoint_byte_copy (true, "arrayof_jbyte_disjoint_arraycopy");
1439     StubRoutines::_arrayof_jshort_disjoint_arraycopy     = generate_disjoint_short_copy(true, "arrayof_jshort_disjoint_arraycopy");
1440     StubRoutines::_arrayof_jint_disjoint_arraycopy       = generate_disjoint_int_copy  (true, "arrayof_jint_disjoint_arraycopy");
1441     StubRoutines::_arrayof_jlong_disjoint_arraycopy      = generate_disjoint_long_copy (true, "arrayof_jlong_disjoint_arraycopy");
1442     StubRoutines::_arrayof_oop_disjoint_arraycopy        = generate_disjoint_oop_copy  (true, "arrayof_oop_disjoint_arraycopy", false);
1443     StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = generate_disjoint_oop_copy  (true, "arrayof_oop_disjoint_arraycopy_uninit", true);
1444 
1445     StubRoutines::_jbyte_arraycopy           = generate_conjoint_byte_copy (false, "jbyte_arraycopy");
1446     StubRoutines::_jshort_arraycopy          = generate_conjoint_short_copy(false, "jshort_arraycopy");
1447     StubRoutines::_jint_arraycopy            = generate_conjoint_int_copy  (false, "jint_arraycopy");
1448     StubRoutines::_jlong_arraycopy           = generate_conjoint_long_copy (false, "jlong_arraycopy");
1449     StubRoutines::_oop_arraycopy             = generate_conjoint_oop_copy  (false, "oop_arraycopy", false);
1450     StubRoutines::_oop_arraycopy_uninit      = generate_conjoint_oop_copy  (false, "oop_arraycopy_uninit", true);
1451 
1452     StubRoutines::_arrayof_jbyte_arraycopy      = generate_conjoint_byte_copy (true, "arrayof_jbyte_arraycopy");
1453     StubRoutines::_arrayof_jshort_arraycopy     = generate_conjoint_short_copy(true, "arrayof_jshort_arraycopy");
1454     StubRoutines::_arrayof_jint_arraycopy       = generate_conjoint_int_copy  (true, "arrayof_jint_arraycopy");
1455     StubRoutines::_arrayof_jlong_arraycopy      = generate_conjoint_long_copy (true, "arrayof_jlong_arraycopy");
1456     StubRoutines::_arrayof_oop_arraycopy        = generate_conjoint_oop_copy  (true, "arrayof_oop_arraycopy", false);
1457     StubRoutines::_arrayof_oop_arraycopy_uninit = generate_conjoint_oop_copy  (true, "arrayof_oop_arraycopy_uninit", true);
1458   }
1459 
1460   void generate_safefetch(const char* name, int size, address* entry, address* fault_pc, address* continuation_pc) {
1461 
1462     // safefetch signatures:
1463     //   int      SafeFetch32(int*      adr, int      errValue);
1464     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
1465     //
1466     // arguments:
1467     //   Z_ARG1 = adr
1468     //   Z_ARG2 = errValue
1469     //
1470     // result:
1471     //   Z_RET  = *adr or errValue
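    //
    // Usage sketch (illustration only; variable names are hypothetical):
    //   int v = SafeFetch32(possibly_unmapped_ptr, -1);
    //   // v is *possibly_unmapped_ptr if the load succeeded, or -1 if it faulted.
    //   // On a fault, the signal handler resumes at continuation_pc; Z_ARG2 then
    //   // still holds errValue, which is copied to Z_RET.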
1472 
1473     StubCodeMark mark(this, "StubRoutines", name);
1474 
1475     // entry point
1476     // Load *adr into Z_ARG2, may fault.
1477     *entry = *fault_pc = __ pc();
1478     switch (size) {
1479       case 4:
1480         // Sign extended int32_t.
1481         __ z_lgf(Z_ARG2, 0, Z_ARG1);
1482         break;
1483       case 8:
1484         // int64_t
1485         __ z_lg(Z_ARG2, 0, Z_ARG1);
1486         break;
1487       default:
1488         ShouldNotReachHere();
1489     }
1490 
1491     // Return errValue or *adr.
1492     *continuation_pc = __ pc();
1493     __ z_lgr(Z_RET, Z_ARG2);
1494     __ z_br(Z_R14);
1495 
1496   }
1497 
1498   // Call interface for AES_encryptBlock, AES_decryptBlock stubs.
1499   //
1500   //   Z_ARG1 - source data block. Ptr to leftmost byte to be processed.
1501   //   Z_ARG2 - destination data block. Ptr to leftmost byte to be stored.
1502   //            For in-place encryption/decryption, ARG1 and ARG2 can point
1503   //            to the same piece of storage.
1504   //   Z_ARG3 - Crypto key address (expanded key). The first n bits of
1505   //            the expanded key constitute the original AES-<n> key (see below).
1506   //
1507   //   Z_RET  - return value. First unprocessed byte offset in src buffer.
1508   //
1509   // Some remarks:
1510   //   The crypto key, as passed from the caller to these encryption stubs,
1511   //   is a so-called expanded key. It is derived from the original key
1512   //   by the Rijndael key schedule, see http://en.wikipedia.org/wiki/Rijndael_key_schedule
  //   With the expanded key, the cipher/decipher task is decomposed into
1514   //   multiple, less complex steps, called rounds. Sun SPARC and Intel
1515   //   processors obviously implement support for those less complex steps.
1516   //   z/Architecture provides instructions for full cipher/decipher complexity.
1517   //   Therefore, we need the original, not the expanded key here.
1518   //   Luckily, the first n bits of an AES-<n> expanded key are formed
1519   //   by the original key itself. That takes us out of trouble. :-)
1520   //   The key length (in bytes) relation is as follows:
1521   //     original    expanded   rounds  key bit     keylen
1522   //    key bytes   key bytes            length   in words
1523   //           16         176       11      128         44
1524   //           24         208       13      192         52
1525   //           32         240       15      256         60
1526   //
1527   // The crypto instructions used in the AES* stubs have some specific register requirements.
1528   //   Z_R0   holds the crypto function code. Please refer to the KM/KMC instruction
1529   //          description in the "z/Architecture Principles of Operation" manual for details.
1530   //   Z_R1   holds the parameter block address. The parameter block contains the cryptographic key
1531   //          (KM instruction) and the chaining value (KMC instruction).
1532   //   dst    must designate an even-numbered register, holding the address of the output message.
1533   //   src    must designate an even/odd register pair, holding the address/length of the original message
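  //
  // For illustration, a minimal sketch of the key-length dispatch performed by
  // generate_load_AES_fCode below (the C helper and its name are hypothetical):
  //
  //   int aes_bits_for_keylen_words(int keylen_words) {
  //     if (keylen_words <  52) return 128;  // 44 words -> AES-128
  //     if (keylen_words == 52) return 192;  // 52 words -> AES-192
  //     return 256;                          // 60 words -> AES-256
  //   }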
1534 
  // Helper function which generates code to
  //  - load the function code into register fCode (== Z_R0).
  //  - load the data block length (depends on the cipher function) into register srclen.
  //  - is_decipher selects between the cipher and decipher function codes.
1540   void generate_load_AES_fCode(Register keylen, Register fCode, Register srclen, bool is_decipher) {
1541 
1542     BLOCK_COMMENT("Set fCode {"); {
1543       Label fCode_set;
1544       int   mode = is_decipher ? VM_Version::CipherMode::decipher : VM_Version::CipherMode::cipher;
1545       bool  identical_dataBlk_len =  (VM_Version::Cipher::_AES128_dataBlk == VM_Version::Cipher::_AES192_dataBlk)
1546                                   && (VM_Version::Cipher::_AES128_dataBlk == VM_Version::Cipher::_AES256_dataBlk);
1547       // Expanded key length is 44/52/60 * 4 bytes for AES-128/AES-192/AES-256.
1548       __ z_cghi(keylen, 52); // Check only once at the beginning. keylen and fCode may share the same register.
1549 
1550       __ z_lghi(fCode, VM_Version::Cipher::_AES128 + mode);
1551       if (!identical_dataBlk_len) {
1552         __ z_lghi(srclen, VM_Version::Cipher::_AES128_dataBlk);
1553       }
1554       __ z_brl(fCode_set);  // keyLen <  52: AES128
1555 
1556       __ z_lghi(fCode, VM_Version::Cipher::_AES192 + mode);
1557       if (!identical_dataBlk_len) {
1558         __ z_lghi(srclen, VM_Version::Cipher::_AES192_dataBlk);
1559       }
1560       __ z_bre(fCode_set);  // keyLen == 52: AES192
1561 
1562       __ z_lghi(fCode, VM_Version::Cipher::_AES256 + mode);
1563       if (!identical_dataBlk_len) {
1564         __ z_lghi(srclen, VM_Version::Cipher::_AES256_dataBlk);
1565       }
      // __ z_brh(fCode_set);  // keyLen >  52: AES256           // fallthru
1567 
1568       __ bind(fCode_set);
1569       if (identical_dataBlk_len) {
1570         __ z_lghi(srclen, VM_Version::Cipher::_AES128_dataBlk);
1571       }
1572     }
1573     BLOCK_COMMENT("} Set fCode");
1574   }
1575 
1576   // Push a parameter block for the cipher/decipher instruction on the stack.
1577   // Layout of the additional stack space allocated for AES_cipherBlockChaining:
1578   //
1579   //   |        |
1580   //   +--------+ <-- SP before expansion
1581   //   |        |
1582   //   :        :  alignment loss, 0..(AES_parmBlk_align-8) bytes
1583   //   |        |
1584   //   +--------+
1585   //   |        |
1586   //   :        :  space for parameter block, size VM_Version::Cipher::_AES*_parmBlk_C
1587   //   |        |
1588   //   +--------+ <-- parmBlk, octoword-aligned, start of parameter block
1589   //   |        |
1590   //   :        :  additional stack space for spills etc., size AES_parmBlk_addspace, DW @ Z_SP not usable!!!
1591   //   |        |
1592   //   +--------+ <-- Z_SP after expansion
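  //
  // For illustration, the address arithmetic performed by generate_push_Block below,
  // written as a plain C sketch (hypothetical variable names, not the generated code):
  //
  //   long resize_len = cv_len + key_len + AES_parmBlk_align + AES_parmBlk_addspace;
  //   long new_sp     = old_sp - resize_len;                   // resize_frame
  //   long parmBlk    = (new_sp + resize_len - (cv_len + key_len))
  //                     & ~(long)(AES_parmBlk_align - 1);      // align down to octoword
  //   // parmBlk - 8 and parmBlk - 16 receive the spilled keylen and the previous SP.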
1593 
1594   void generate_push_Block(int dataBlk_len, int parmBlk_len, int crypto_fCode,
1595                            Register parmBlk, Register keylen, Register fCode, Register cv, Register key) {
1596     const int AES_parmBlk_align    = 32;  // octoword alignment.
1597     const int AES_parmBlk_addspace = 24;  // Must be sufficiently large to hold all spilled registers
1598                                           // (currently 2) PLUS 1 DW for the frame pointer.
1599 
1600     const int cv_len     = dataBlk_len;
1601     const int key_len    = parmBlk_len - cv_len;
    // This len must be known at stub generation time. Only then can we recompute the SP value
    // from before the resize. We buy this knowledge by wasting some (up to AES_parmBlk_align) bytes of stack space.
1604     const int resize_len = cv_len + key_len + AES_parmBlk_align + AES_parmBlk_addspace;
1605 
1606     // Use parmBlk as temp reg here to hold the frame pointer.
1607     __ resize_frame(-resize_len, parmBlk, true);
1608 
1609     // calculate parmBlk address from updated (resized) SP.
1610     __ add2reg(parmBlk, resize_len - (cv_len + key_len), Z_SP);
1611     __ z_nill(parmBlk, (~(AES_parmBlk_align-1)) & 0xffff); // Align parameter block.
1612 
    // There is spill space available in the range [parmBlk-AES_parmBlk_addspace+8, parmBlk).
1614     __ z_stg(keylen,  -8, parmBlk);                        // Spill keylen for later use.
1615 
1616     // calculate (SP before resize) from updated SP.
1617     __ add2reg(keylen, resize_len, Z_SP);                  // keylen holds prev SP for now.
1618     __ z_stg(keylen, -16, parmBlk);                        // Spill prev SP for easy revert.
1619 
1620     __ z_mvc(0,      cv_len-1,  parmBlk, 0, cv);     // Copy cv.
1621     __ z_mvc(cv_len, key_len-1, parmBlk, 0, key);    // Copy key.
1622     __ z_lghi(fCode, crypto_fCode);
1623   }
1624 
1625   // NOTE:
1626   //   Before returning, the stub has to copy the chaining value from
1627   //   the parmBlk, where it was updated by the crypto instruction, back
1628   //   to the chaining value array the address of which was passed in the cv argument.
1629   //   As all the available registers are used and modified by KMC, we need to save
1630   //   the key length across the KMC instruction. We do so by spilling it to the stack,
1631   //   just preceding the parmBlk (at (parmBlk - 8)).
1632   void generate_push_parmBlk(Register keylen, Register fCode, Register parmBlk, Register key, Register cv, bool is_decipher) {
1633     int       mode = is_decipher ? VM_Version::CipherMode::decipher : VM_Version::CipherMode::cipher;
1634     Label     parmBlk_128, parmBlk_192, parmBlk_256, parmBlk_set;
1635 
1636     BLOCK_COMMENT("push parmBlk {");
1637     if (VM_Version::has_Crypto_AES()   ) { __ z_cghi(keylen, 52); }
1638     if (VM_Version::has_Crypto_AES128()) { __ z_brl(parmBlk_128); }  // keyLen <  52: AES128
1639     if (VM_Version::has_Crypto_AES192()) { __ z_bre(parmBlk_192); }  // keyLen == 52: AES192
1640     if (VM_Version::has_Crypto_AES256()) { __ z_brh(parmBlk_256); }  // keyLen >  52: AES256
1641 
1642     // Security net: requested AES function not available on this CPU.
1643     // NOTE:
1644     //   As of now (March 2015), this safety net is not required. JCE policy files limit the
1645     //   cryptographic strength of the keys used to 128 bit. If we have AES hardware support
1646     //   at all, we have at least AES-128.
1647     __ stop_static("AES key strength not supported by CPU. Use -XX:-UseAES as remedy.", 0);
1648 
1649     if (VM_Version::has_Crypto_AES256()) {
1650       __ bind(parmBlk_256);
1651       generate_push_Block(VM_Version::Cipher::_AES256_dataBlk,
1652                           VM_Version::Cipher::_AES256_parmBlk_C,
1653                           VM_Version::Cipher::_AES256 + mode,
1654                           parmBlk, keylen, fCode, cv, key);
1655       if (VM_Version::has_Crypto_AES128() || VM_Version::has_Crypto_AES192()) {
1656         __ z_bru(parmBlk_set);  // Fallthru otherwise.
1657       }
1658     }
1659 
1660     if (VM_Version::has_Crypto_AES192()) {
1661       __ bind(parmBlk_192);
1662       generate_push_Block(VM_Version::Cipher::_AES192_dataBlk,
1663                           VM_Version::Cipher::_AES192_parmBlk_C,
1664                           VM_Version::Cipher::_AES192 + mode,
1665                           parmBlk, keylen, fCode, cv, key);
1666       if (VM_Version::has_Crypto_AES128()) {
1667         __ z_bru(parmBlk_set);  // Fallthru otherwise.
1668       }
1669     }
1670 
1671     if (VM_Version::has_Crypto_AES128()) {
1672       __ bind(parmBlk_128);
1673       generate_push_Block(VM_Version::Cipher::_AES128_dataBlk,
1674                           VM_Version::Cipher::_AES128_parmBlk_C,
1675                           VM_Version::Cipher::_AES128 + mode,
1676                           parmBlk, keylen, fCode, cv, key);
1677       // Fallthru
1678     }
1679 
1680     __ bind(parmBlk_set);
1681     BLOCK_COMMENT("} push parmBlk");
1682   }
1683 
1684   // Pop a parameter block from the stack. The chaining value portion of the parameter block
1685   // is copied back to the cv array as it is needed for subsequent cipher steps.
1686   // The keylen value as well as the original SP (before resizing) was pushed to the stack
1687   // when pushing the parameter block.
1688   void generate_pop_parmBlk(Register keylen, Register parmBlk, Register key, Register cv) {
1689 
1690     BLOCK_COMMENT("pop parmBlk {");
1691     bool identical_dataBlk_len =  (VM_Version::Cipher::_AES128_dataBlk == VM_Version::Cipher::_AES192_dataBlk) &&
1692                                   (VM_Version::Cipher::_AES128_dataBlk == VM_Version::Cipher::_AES256_dataBlk);
1693     if (identical_dataBlk_len) {
1694       int cv_len = VM_Version::Cipher::_AES128_dataBlk;
1695       __ z_mvc(0, cv_len-1, cv, 0, parmBlk);  // Copy cv.
1696     } else {
1697       int cv_len;
1698       Label parmBlk_128, parmBlk_192, parmBlk_256, parmBlk_set;
1699       __ z_lg(keylen, -8, parmBlk);  // restore keylen
1700       __ z_cghi(keylen, 52);
1701       if (VM_Version::has_Crypto_AES256()) __ z_brh(parmBlk_256);  // keyLen >  52: AES256
1702       if (VM_Version::has_Crypto_AES192()) __ z_bre(parmBlk_192);  // keyLen == 52: AES192
1703       // if (VM_Version::has_Crypto_AES128()) __ z_brl(parmBlk_128);  // keyLen <  52: AES128  // fallthru
1704 
      // Security net: none is needed here. If one were needed, we would already have
      // fallen into the one in generate_push_parmBlk.
1707       if (VM_Version::has_Crypto_AES128()) {
1708         __ bind(parmBlk_128);
1709         cv_len = VM_Version::Cipher::_AES128_dataBlk;
1710         __ z_mvc(0, cv_len-1, cv, 0, parmBlk);  // Copy cv.
1711         if (VM_Version::has_Crypto_AES192() || VM_Version::has_Crypto_AES256()) {
1712           __ z_bru(parmBlk_set);
1713         }
1714       }
1715 
1716       if (VM_Version::has_Crypto_AES192()) {
1717         __ bind(parmBlk_192);
1718         cv_len = VM_Version::Cipher::_AES192_dataBlk;
1719         __ z_mvc(0, cv_len-1, cv, 0, parmBlk);  // Copy cv.
1720         if (VM_Version::has_Crypto_AES256()) {
1721           __ z_bru(parmBlk_set);
1722         }
1723       }
1724 
1725       if (VM_Version::has_Crypto_AES256()) {
1726         __ bind(parmBlk_256);
1727         cv_len = VM_Version::Cipher::_AES256_dataBlk;
1728         __ z_mvc(0, cv_len-1, cv, 0, parmBlk);  // Copy cv.
1729         // __ z_bru(parmBlk_set);  // fallthru
1730       }
1731       __ bind(parmBlk_set);
1732     }
1733     __ z_lg(Z_SP, -16, parmBlk); // Revert resize_frame_absolute. Z_SP saved by push_parmBlk.
1734     BLOCK_COMMENT("} pop parmBlk");
1735   }
1736 
1737   // Compute AES encrypt/decrypt function.
1738   void generate_AES_cipherBlock(bool is_decipher) {
1739     // Incoming arguments.
1740     Register       from    = Z_ARG1; // source byte array
1741     Register       to      = Z_ARG2; // destination byte array
1742     Register       key     = Z_ARG3; // expanded key array
1743 
1744     const Register keylen  = Z_R0;   // Temporarily (until fCode is set) holds the expanded key array length.
1745 
1746     // Register definitions as required by KM instruction.
1747     const Register fCode   = Z_R0;   // crypto function code
1748     const Register parmBlk = Z_R1;   // parameter block address (points to crypto key)
1749     const Register src     = Z_ARG1; // Must be even reg (KM requirement).
1750     const Register srclen  = Z_ARG2; // Must be odd reg and pair with src. Overwrites destination address.
1751     const Register dst     = Z_ARG3; // Must be even reg (KM requirement). Overwrites expanded key address.
1752 
1753     // Read key len of expanded key (in 4-byte words).
1754     __ z_lgf(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
1755 
1756     // Copy arguments to registers as required by crypto instruction.
1757     __ z_lgr(parmBlk, key);          // crypto key (in T_INT array).
1758     __ lgr_if_needed(src, from);     // Copy src address. Will not emit, src/from are identical.
1759     __ z_lgr(dst, to);               // Copy dst address, even register required.
1760 
1761     // Construct function code into fCode(Z_R0), data block length into srclen(Z_ARG2).
1762     generate_load_AES_fCode(keylen, fCode, srclen, is_decipher);
1763 
1764     __ km(dst, src);                 // Cipher the message.
1765 
1766     __ z_br(Z_R14);
1767   }
1768 
1769   // Compute AES encrypt function.
1770   address generate_AES_encryptBlock(const char* name) {
1771     __ align(CodeEntryAlignment);
1772     StubCodeMark mark(this, "StubRoutines", name);
1773     unsigned int start_off = __ offset();  // Remember stub start address (is rtn value).
1774 
1775     generate_AES_cipherBlock(false);
1776 
1777     return __ addr_at(start_off);
1778   }
1779 
1780   // Compute AES decrypt function.
1781   address generate_AES_decryptBlock(const char* name) {
1782     __ align(CodeEntryAlignment);
1783     StubCodeMark mark(this, "StubRoutines", name);
1784     unsigned int start_off = __ offset();  // Remember stub start address (is rtn value).
1785 
1786     generate_AES_cipherBlock(true);
1787 
1788     return __ addr_at(start_off);
1789   }
1790 
1791   // These stubs receive the addresses of the cryptographic key and of the chaining value as two separate
1792   // arguments (registers "key" and "cv", respectively). The KMC instruction, on the other hand, requires
1793   // chaining value and key to be, in this sequence, adjacent in storage. Thus, we need to allocate some
1794   // thread-local working storage. Using heap memory incurs all the hassles of allocating/freeing.
1795   // Stack space, on the contrary, is deallocated automatically when we return from the stub to the caller.
1796   // *** WARNING ***
1797   // Please note that we do not formally allocate stack space, nor do we
1798   // update the stack pointer. Therefore, no function calls are allowed
1799   // and nobody else must use the stack range where the parameter block
1800   // is located.
1801   // We align the parameter block to the next available octoword.
1802   //
1803   // Compute chained AES encrypt function.
1804   void generate_AES_cipherBlockChaining(bool is_decipher) {
1805 
1806     Register       from    = Z_ARG1; // source byte array (clear text)
1807     Register       to      = Z_ARG2; // destination byte array (ciphered)
1808     Register       key     = Z_ARG3; // expanded key array.
1809     Register       cv      = Z_ARG4; // chaining value
1810     const Register msglen  = Z_ARG5; // Total length of the msg to be encrypted. Value must be returned
1811                                      // in Z_RET upon completion of this stub. Is 32-bit integer.
1812 
1813     const Register keylen  = Z_R0;   // Expanded key length, as read from key array. Temp only.
1814     const Register fCode   = Z_R0;   // crypto function code
1815     const Register parmBlk = Z_R1;   // parameter block address (points to crypto key)
1816     const Register src     = Z_ARG1; // is Z_R2
1817     const Register srclen  = Z_ARG2; // Overwrites destination address.
1818     const Register dst     = Z_ARG3; // Overwrites key address.
1819 
1820     // Read key len of expanded key (in 4-byte words).
1821     __ z_lgf(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
1822 
1823     // Construct parm block address in parmBlk (== Z_R1), copy cv and key to parm block.
1824     // Construct function code in fCode (Z_R0).
1825     generate_push_parmBlk(keylen, fCode, parmBlk, key, cv, is_decipher);
1826 
1827     // Prepare other registers for instruction.
1828     __ lgr_if_needed(src, from);     // Copy src address. Will not emit, src/from are identical.
1829     __ z_lgr(dst, to);
    __ z_llgfr(srclen, msglen);      // msglen is passed as an int; zero-extend to the 64-bit length required by KMC.
1831 
1832     __ kmc(dst, src);                // Cipher the message.
1833 
1834     generate_pop_parmBlk(keylen, parmBlk, key, cv);
1835 
    __ z_llgfr(Z_RET, msglen);       // Return msglen (passed as int), zero-extended to 64 bits.
1837     __ z_br(Z_R14);
1838   }
1839 
1840   // Compute chained AES encrypt function.
1841   address generate_cipherBlockChaining_AES_encrypt(const char* name) {
1842     __ align(CodeEntryAlignment);
1843     StubCodeMark mark(this, "StubRoutines", name);
1844     unsigned int   start_off = __ offset();  // Remember stub start address (is rtn value).
1845 
1846     generate_AES_cipherBlockChaining(false);
1847 
1848     return __ addr_at(start_off);
1849   }
1850 
1851   // Compute chained AES encrypt function.
1852   address generate_cipherBlockChaining_AES_decrypt(const char* name) {
1853     __ align(CodeEntryAlignment);
1854     StubCodeMark mark(this, "StubRoutines", name);
1855     unsigned int   start_off = __ offset();  // Remember stub start address (is rtn value).
1856 
1857     generate_AES_cipherBlockChaining(true);
1858 
1859     return __ addr_at(start_off);
1860   }
1861 
1862 
1863   // Compute GHASH function.
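  //
  // Conceptual reference (standard GHASH over 16-byte blocks; not the generated code):
  //   for each block B:  state = (state ^ B) * H   // carry-less multiplication in GF(2^128)
  // The KIMD-GHASH parameter block filled below holds the current state at offset 0 and
  // the hash subkey H at offset 16 (16 bytes each); the data length handed to the
  // instruction is blocks * 16 bytes.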
1864   address generate_ghash_processBlocks() {
1865     __ align(CodeEntryAlignment);
1866     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
1867     unsigned int start_off = __ offset();   // Remember stub start address (is rtn value).
1868 
1869     const Register state   = Z_ARG1;
1870     const Register subkeyH = Z_ARG2;
1871     const Register data    = Z_ARG3; // 1st of even-odd register pair.
1872     const Register blocks  = Z_ARG4;
1873     const Register len     = blocks; // 2nd of even-odd register pair.
1874 
1875     const int param_block_size = 4 * 8;
1876     const int frame_resize = param_block_size + 8; // Extra space for copy of fp.
1877 
1878     // Reserve stack space for parameter block (R1).
1879     __ z_lgr(Z_R1, Z_SP);
1880     __ resize_frame(-frame_resize, Z_R0, true);
1881     __ z_aghi(Z_R1, -param_block_size);
1882 
1883     // Fill parameter block.
1884     __ z_mvc(Address(Z_R1)    , Address(state)  , 16);
1885     __ z_mvc(Address(Z_R1, 16), Address(subkeyH), 16);
1886 
1887     // R4+5: data pointer + length
1888     __ z_llgfr(len, blocks);  // Cast to 64-bit.
1889 
1890     // R0: function code
1891     __ load_const_optimized(Z_R0, (int)VM_Version::MsgDigest::_GHASH);
1892 
1893     // Compute.
1894     __ z_sllg(len, len, 4);  // In bytes.
1895     __ kimd(data);
1896 
1897     // Copy back result and free parameter block.
1898     __ z_mvc(Address(state), Address(Z_R1), 16);
    __ z_xc(Address(Z_R1), param_block_size, Address(Z_R1)); // Wipe the parameter block (XOR with itself zeroes state and subkey).
1900     __ z_aghi(Z_SP, frame_resize);
1901 
1902     __ z_br(Z_R14);
1903 
1904     return __ addr_at(start_off);
1905   }
1906 
1907 
1908   // Call interface for all SHA* stubs.
1909   //
1910   //   Z_ARG1 - source data block. Ptr to leftmost byte to be processed.
1911   //   Z_ARG2 - current SHA state. Ptr to state area. This area serves as
1912   //            parameter block as required by the crypto instruction.
1913   //   Z_ARG3 - current byte offset in source data block.
1914   //   Z_ARG4 - last byte offset in source data block.
1915   //            (Z_ARG4 - Z_ARG3) gives the #bytes remaining to be processed.
1916   //
1917   //   Z_RET  - return value. First unprocessed byte offset in src buffer.
1918   //
1919   //   A few notes on the call interface:
1920   //    - All stubs, whether they are single-block or multi-block, are assumed to
1921   //      digest an integer multiple of the data block length of data. All data
1922   //      blocks are digested using the intermediate message digest (KIMD) instruction.
1923   //      Special end processing, as done by the KLMD instruction, seems to be
1924   //      emulated by the calling code.
1925   //
1926   //    - Z_ARG1 addresses the first byte of source data. The offset (Z_ARG3) is
1927   //      already accounted for.
1928   //
1929   //    - The current SHA state (the intermediate message digest value) is contained
1930   //      in an area addressed by Z_ARG2. The area size depends on the SHA variant
1931   //      and is accessible via the enum VM_Version::MsgDigest::_SHA<n>_parmBlk_I
1932   //
1933   //    - The single-block stub is expected to digest exactly one data block, starting
1934   //      at the address passed in Z_ARG1.
1935   //
1936   //    - The multi-block stub is expected to digest all data blocks which start in
1937   //      the offset interval [srcOff(Z_ARG3), srcLimit(Z_ARG4)). The exact difference
1938   //      (srcLimit-srcOff), rounded up to the next multiple of the data block length,
1939   //      gives the number of blocks to digest. It must be assumed that the calling code
1940   //      provides for a large enough source data buffer.
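  //
  //      For illustration, the rounding performed by the multi-block stubs, written as
  //      a plain C sketch (variable names are illustrative only):
  //
  //        int len     = srcLimit - srcOff;                           // need not be a block multiple
  //        int rounded = (len + dataBlkLen - 1) & ~(dataBlkLen - 1);  // round up; dataBlkLen is a power of 2
  //        int blocks  = rounded / dataBlkLen;                        // #blocks digested by KIMD
  //        int result  = srcOff + rounded;                            // first unprocessed offset, returned in Z_RET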
1941   //
1942   // Compute SHA-1 function.
1943   address generate_SHA1_stub(bool multiBlock, const char* name) {
1944     __ align(CodeEntryAlignment);
1945     StubCodeMark mark(this, "StubRoutines", name);
1946     unsigned int start_off = __ offset();   // Remember stub start address (is rtn value).
1947 
1948     const Register srcBuff        = Z_ARG1; // Points to first block to process (offset already added).
1949     const Register SHAState       = Z_ARG2; // Only on entry. Reused soon thereafter for kimd register pairs.
1950     const Register srcOff         = Z_ARG3; // int
1951     const Register srcLimit       = Z_ARG4; // Only passed in multiBlock case. int
1952 
1953     const Register SHAState_local = Z_R1;
1954     const Register SHAState_save  = Z_ARG3;
1955     const Register srcBufLen      = Z_ARG2; // Destroys state address, must be copied before.
1956     Label useKLMD, rtn;
1957 
1958     __ load_const_optimized(Z_R0, (int)VM_Version::MsgDigest::_SHA1);   // function code
1959     __ z_lgr(SHAState_local, SHAState);                                 // SHAState == parameter block
1960 
1961     if (multiBlock) {  // Process everything from offset to limit.
1962 
      // The following description is valid if we get a raw (unpreprocessed) source data buffer,
      // spanning the range [srcOff(Z_ARG3), srcLimit(Z_ARG4)). As detailed above,
      // the calling convention for these stubs is different. We leave the description in place
      // to document what must be happening, hidden in the calling code.
      //
      // The data block to be processed can have arbitrary length, i.e. its length does not
      // need to be an integer multiple of SHA<n>_dataBlk. Therefore, we need to implement
      // two different paths. If the length is an integer multiple, we use KIMD, which saves us
      // from copying the SHA state back and forth. Otherwise, we copy the SHA state
      // to the stack, execute a KLMD instruction on it, and copy the result back to the
      // caller's SHA state location.
1974 
1975       // Total #srcBuff blocks to process.
1976       if (VM_Version::has_DistinctOpnds()) {
1977         __ z_srk(srcBufLen, srcLimit, srcOff); // exact difference
1978         __ z_ahi(srcBufLen, VM_Version::MsgDigest::_SHA1_dataBlk-1);   // round up
1979         __ z_nill(srcBufLen, (~(VM_Version::MsgDigest::_SHA1_dataBlk-1)) & 0xffff);
1980         __ z_ark(srcLimit, srcOff, srcBufLen); // Srclimit temporarily holds return value.
1981         __ z_llgfr(srcBufLen, srcBufLen);      // Cast to 64-bit.
1982       } else {
1983         __ z_lgfr(srcBufLen, srcLimit);        // Exact difference. srcLimit passed as int.
        __ z_sgfr(srcBufLen, srcOff);          // SrcOff passed as int, now properly cast to long.
1985         __ z_aghi(srcBufLen, VM_Version::MsgDigest::_SHA1_dataBlk-1);   // round up
1986         __ z_nill(srcBufLen, (~(VM_Version::MsgDigest::_SHA1_dataBlk-1)) & 0xffff);
1987         __ z_lgr(srcLimit, srcOff);            // SrcLimit temporarily holds return value.
1988         __ z_agr(srcLimit, srcBufLen);
1989       }
1990 
1991       // Integral #blocks to digest?
1992       // As a result of the calculations above, srcBufLen MUST be an integer
1993       // multiple of _SHA1_dataBlk, or else we are in big trouble.
      // The safety net in the KLMD case below guards against that.
1995       __ z_tmll(srcBufLen, VM_Version::MsgDigest::_SHA1_dataBlk-1);
1996       __ z_brc(Assembler::bcondNotAllZero, useKLMD);
1997 
1998       // Process all full blocks.
1999       __ kimd(srcBuff);
2000 
2001       __ z_lgr(Z_RET, srcLimit);  // Offset of first unprocessed byte in buffer.
2002     } else {  // Process one data block only.
2003       __ load_const_optimized(srcBufLen, (int)VM_Version::MsgDigest::_SHA1_dataBlk);   // #srcBuff bytes to process
2004       __ kimd(srcBuff);
2005       __ add2reg(Z_RET, (int)VM_Version::MsgDigest::_SHA1_dataBlk, srcOff);            // Offset of first unprocessed byte in buffer. No 32 to 64 bit extension needed.
2006     }
2007 
2008     __ bind(rtn);
2009     __ z_br(Z_R14);
2010 
2011     if (multiBlock) {
2012       __ bind(useKLMD);
2013 
2014 #if 1
      // Security net: this stub is believed to be called for full-sized data blocks only.
      // NOTE: The following code is believed to be correct, but it is not tested.
      __ stop_static("SHA-1 stub can digest full data blocks only. Use -XX:-UseSHA as remedy.", 0);
2018 #endif
2019     }
2020 
2021     return __ addr_at(start_off);
2022   }
2023 
2024   // Compute SHA-256 function.
2025   address generate_SHA256_stub(bool multiBlock, const char* name) {
2026     __ align(CodeEntryAlignment);
2027     StubCodeMark mark(this, "StubRoutines", name);
2028     unsigned int start_off = __ offset();   // Remember stub start address (is rtn value).
2029 
2030     const Register srcBuff        = Z_ARG1;
2031     const Register SHAState       = Z_ARG2; // Only on entry. Reused soon thereafter.
2032     const Register SHAState_local = Z_R1;
2033     const Register SHAState_save  = Z_ARG3;
2034     const Register srcOff         = Z_ARG3;
2035     const Register srcLimit       = Z_ARG4;
2036     const Register srcBufLen      = Z_ARG2; // Destroys state address, must be copied before.
2037     Label useKLMD, rtn;
2038 
2039     __ load_const_optimized(Z_R0, (int)VM_Version::MsgDigest::_SHA256); // function code
2040     __ z_lgr(SHAState_local, SHAState);                                 // SHAState == parameter block
2041 
2042     if (multiBlock) {  // Process everything from offset to limit.
      // The following description is valid if we get a raw (unpreprocessed) source data buffer,
      // spanning the range [srcOff(Z_ARG3), srcLimit(Z_ARG4)). As detailed above,
      // the calling convention for these stubs is different. We leave the description in place
      // to document what must be happening, hidden in the calling code.
      //
      // The data block to be processed can have arbitrary length, i.e. its length does not
      // need to be an integer multiple of SHA<n>_dataBlk. Therefore, we need to implement
      // two different paths. If the length is an integer multiple, we use KIMD, which saves us
      // from copying the SHA state back and forth. Otherwise, we copy the SHA state
      // to the stack, execute a KLMD instruction on it, and copy the result back to the
      // caller's SHA state location.
2054 
2055       // total #srcBuff blocks to process
2056       if (VM_Version::has_DistinctOpnds()) {
2057         __ z_srk(srcBufLen, srcLimit, srcOff);   // exact difference
2058         __ z_ahi(srcBufLen, VM_Version::MsgDigest::_SHA256_dataBlk-1); // round up
2059         __ z_nill(srcBufLen, (~(VM_Version::MsgDigest::_SHA256_dataBlk-1)) & 0xffff);
2060         __ z_ark(srcLimit, srcOff, srcBufLen);   // Srclimit temporarily holds return value.
2061         __ z_llgfr(srcBufLen, srcBufLen);        // Cast to 64-bit.
2062       } else {
2063         __ z_lgfr(srcBufLen, srcLimit);          // exact difference
2064         __ z_sgfr(srcBufLen, srcOff);
2065         __ z_aghi(srcBufLen, VM_Version::MsgDigest::_SHA256_dataBlk-1); // round up
2066         __ z_nill(srcBufLen, (~(VM_Version::MsgDigest::_SHA256_dataBlk-1)) & 0xffff);
2067         __ z_lgr(srcLimit, srcOff);              // Srclimit temporarily holds return value.
2068         __ z_agr(srcLimit, srcBufLen);
2069       }
2070 
2071       // Integral #blocks to digest?
      // As a result of the calculations above, srcBufLen MUST be an integer
      // multiple of _SHA256_dataBlk, or else we are in big trouble.
      // The safety net in the KLMD case below guards against that.
2075       __ z_tmll(srcBufLen, VM_Version::MsgDigest::_SHA256_dataBlk-1);
2076       __ z_brc(Assembler::bcondNotAllZero, useKLMD);
2077 
2078       // Process all full blocks.
2079       __ kimd(srcBuff);
2080 
2081       __ z_lgr(Z_RET, srcLimit);  // Offset of first unprocessed byte in buffer.
2082     } else {  // Process one data block only.
2083       __ load_const_optimized(srcBufLen, (int)VM_Version::MsgDigest::_SHA256_dataBlk); // #srcBuff bytes to process
2084       __ kimd(srcBuff);
2085       __ add2reg(Z_RET, (int)VM_Version::MsgDigest::_SHA256_dataBlk, srcOff);          // Offset of first unprocessed byte in buffer.
2086     }
2087 
2088     __ bind(rtn);
2089     __ z_br(Z_R14);
2090 
2091     if (multiBlock) {
2092       __ bind(useKLMD);
2093 #if 1
2094       // Security net: this stub is believed to be called for full-sized data blocks only.
2095       // NOTE:
      //   The following code is believed to be correct, but it is not tested.
2097       __ stop_static("SHA256 stub can digest full data blocks only. Use -XX:-UseSHA as remedy.", 0);
2098 #endif
2099     }
2100 
2101     return __ addr_at(start_off);
2102   }
2103 
2104   // Compute SHA-512 function.
2105   address generate_SHA512_stub(bool multiBlock, const char* name) {
2106     __ align(CodeEntryAlignment);
2107     StubCodeMark mark(this, "StubRoutines", name);
2108     unsigned int start_off = __ offset();   // Remember stub start address (is rtn value).
2109 
2110     const Register srcBuff        = Z_ARG1;
2111     const Register SHAState       = Z_ARG2; // Only on entry. Reused soon thereafter.
2112     const Register SHAState_local = Z_R1;
2113     const Register SHAState_save  = Z_ARG3;
2114     const Register srcOff         = Z_ARG3;
2115     const Register srcLimit       = Z_ARG4;
2116     const Register srcBufLen      = Z_ARG2; // Destroys state address, must be copied before.
2117     Label useKLMD, rtn;
2118 
2119     __ load_const_optimized(Z_R0, (int)VM_Version::MsgDigest::_SHA512); // function code
2120     __ z_lgr(SHAState_local, SHAState);                                 // SHAState == parameter block
2121 
2122     if (multiBlock) {  // Process everything from offset to limit.
      // The following description is valid if we get a raw (unpreprocessed) source data buffer,
      // spanning the range [srcOff(Z_ARG3), srcLimit(Z_ARG4)). As detailed above,
      // the calling convention for these stubs is different. We leave the description in place
      // to document what must be happening, hidden in the calling code.
      //
      // The data block to be processed can have arbitrary length, i.e. its length does not
      // need to be an integer multiple of SHA<n>_dataBlk. Therefore, we need to implement
      // two different paths. If the length is an integer multiple, we use KIMD, which saves us
      // from copying the SHA state back and forth. Otherwise, we copy the SHA state
      // to the stack, execute a KLMD instruction on it, and copy the result back to the
      // caller's SHA state location.
2134 
2135       // total #srcBuff blocks to process
2136       if (VM_Version::has_DistinctOpnds()) {
2137         __ z_srk(srcBufLen, srcLimit, srcOff);   // exact difference
2138         __ z_ahi(srcBufLen, VM_Version::MsgDigest::_SHA512_dataBlk-1); // round up
2139         __ z_nill(srcBufLen, (~(VM_Version::MsgDigest::_SHA512_dataBlk-1)) & 0xffff);
2140         __ z_ark(srcLimit, srcOff, srcBufLen);   // Srclimit temporarily holds return value.
2141         __ z_llgfr(srcBufLen, srcBufLen);        // Cast to 64-bit.
2142       } else {
2143         __ z_lgfr(srcBufLen, srcLimit);          // exact difference
2144         __ z_sgfr(srcBufLen, srcOff);
2145         __ z_aghi(srcBufLen, VM_Version::MsgDigest::_SHA512_dataBlk-1); // round up
2146         __ z_nill(srcBufLen, (~(VM_Version::MsgDigest::_SHA512_dataBlk-1)) & 0xffff);
2147         __ z_lgr(srcLimit, srcOff);              // Srclimit temporarily holds return value.
2148         __ z_agr(srcLimit, srcBufLen);
2149       }
2150 
      // Integral #blocks to digest?
      // As a result of the calculations above, srcBufLen MUST be an integer
      // multiple of _SHA512_dataBlk, or else we are in big trouble.
      // The safety net in the KLMD case below guards against that.
2155       __ z_tmll(srcBufLen, VM_Version::MsgDigest::_SHA512_dataBlk-1);
2156       __ z_brc(Assembler::bcondNotAllZero, useKLMD);
2157 
2158       // Process all full blocks.
2159       __ kimd(srcBuff);
2160 
2161       __ z_lgr(Z_RET, srcLimit);  // Offset of first unprocessed byte in buffer.
2162     } else {  // Process one data block only.
2163       __ load_const_optimized(srcBufLen, (int)VM_Version::MsgDigest::_SHA512_dataBlk); // #srcBuff bytes to process
2164       __ kimd(srcBuff);
2165       __ add2reg(Z_RET, (int)VM_Version::MsgDigest::_SHA512_dataBlk, srcOff);          // Offset of first unprocessed byte in buffer.
2166     }
2167 
2168     __ bind(rtn);
2169     __ z_br(Z_R14);
2170 
2171     if (multiBlock) {
2172       __ bind(useKLMD);
2173 #if 1
      // Security net: this stub is believed to be called for full-sized data blocks only.
      // NOTE:
      //   The following code is believed to be correct, but it is not tested.
2177       __ stop_static("SHA512 stub can digest full data blocks only. Use -XX:-UseSHA as remedy.", 0);
2178 #endif
2179     }
2180 
2181     return __ addr_at(start_off);
2182   }
2183 
2184 
2185   /**
2186    *  Arguments:
2187    *
2188    * Inputs:
2189    *   Z_ARG1    - int   crc
2190    *   Z_ARG2    - byte* buf
2191    *   Z_ARG3    - int   length (of buffer)
2192    *
2193    * Result:
2194    *   Z_RET     - int   crc result
2195    **/
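  //
  // Conceptual reference (plain byte-at-a-time, single-table CRC update; the stub below
  // uses the much faster kernel_crc32_1word kernel, so this is illustration only and
  // assumes a single 256-entry, reflected lookup table):
  //
  //   uint32_t crc_update(uint32_t crc, const uint8_t* buf, int len,
  //                       const uint32_t table[256], bool invertCRC) {
  //     if (invertCRC) crc = ~crc;   // CRC32 complements the checksum on entry and exit.
  //     for (int i = 0; i < len; i++) {
  //       crc = (crc >> 8) ^ table[(crc ^ buf[i]) & 0xff];
  //     }
  //     if (invertCRC) crc = ~crc;
  //     return crc;
  //   }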
2196   // Compute CRC function (generic, for all polynomials).
2197   void generate_CRC_updateBytes(const char* name, Register table, bool invertCRC) {
2198 
2199     // arguments to kernel_crc32:
2200     Register       crc     = Z_ARG1;  // Current checksum, preset by caller or result from previous call, int.
2201     Register       data    = Z_ARG2;  // source byte array
2202     Register       dataLen = Z_ARG3;  // #bytes to process, int
2203 //    Register       table   = Z_ARG4;  // crc table address. Preloaded and passed in by caller.
2204     const Register t0      = Z_R10;   // work reg for kernel* emitters
2205     const Register t1      = Z_R11;   // work reg for kernel* emitters
2206     const Register t2      = Z_R12;   // work reg for kernel* emitters
2207     const Register t3      = Z_R13;   // work reg for kernel* emitters
2208 
2209     assert_different_registers(crc, data, dataLen, table);
2210 
    // The caller passes these values as ints, not as the longs the C calling convention
    // would require. Zero-extend dataLen to 64 bits; crc is used as a 32-bit int.
2213     __ z_llgfr(dataLen, dataLen);
2214 
    __ resize_frame(-(6*8), Z_R0, true); // Resize frame to provide additional spill space.
    __ z_stmg(Z_R10, Z_R13, 1*8, Z_SP);  // Spill regs 10..13 to make them available as work registers.
    __ kernel_crc32_1word(crc, data, dataLen, table, t0, t1, t2, t3, invertCRC);
    __ z_lmg(Z_R10, Z_R13, 1*8, Z_SP);   // Restore regs 10..13 from the stack.
    __ resize_frame(+(6*8), Z_R0, true); // Remove the additional spill space again.
2220 
2221     __ z_llgfr(Z_RET, crc);  // Updated crc is function result. No copying required, just zero upper 32 bits.
2222     __ z_br(Z_R14);          // Result already in Z_RET == Z_ARG1.
2223   }
2224 
2225 
2226   // Compute CRC32 function.
2227   address generate_CRC32_updateBytes(const char* name) {
2228     __ align(CodeEntryAlignment);
2229     StubCodeMark mark(this, "StubRoutines", name);
2230     unsigned int   start_off = __ offset();  // Remember stub start address (is rtn value).
2231 
2232     assert(UseCRC32Intrinsics, "should not generate this stub (%s) with CRC32 intrinsics disabled", name);
2233 
2234     BLOCK_COMMENT("CRC32_updateBytes {");
2235     Register       table   = Z_ARG4;  // crc32 table address.
2236     StubRoutines::zarch::generate_load_crc_table_addr(_masm, table);
2237 
2238     generate_CRC_updateBytes(name, table, true);
2239     BLOCK_COMMENT("} CRC32_updateBytes");
2240 
2241     return __ addr_at(start_off);
2242   }
2243 
2244 
2245   // Compute CRC32C function.
2246   address generate_CRC32C_updateBytes(const char* name) {
2247     __ align(CodeEntryAlignment);
2248     StubCodeMark mark(this, "StubRoutines", name);
2249     unsigned int   start_off = __ offset();  // Remember stub start address (is rtn value).
2250 
2251     assert(UseCRC32CIntrinsics, "should not generate this stub (%s) with CRC32C intrinsics disabled", name);
2252 
2253     BLOCK_COMMENT("CRC32C_updateBytes {");
2254     Register       table   = Z_ARG4;  // crc32c table address.
2255     StubRoutines::zarch::generate_load_crc32c_table_addr(_masm, table);
2256 
2257     generate_CRC_updateBytes(name, table, false);
2258     BLOCK_COMMENT("} CRC32C_updateBytes");
2259 
2260     return __ addr_at(start_off);
2261   }
2262 
2263 
2264   // Arguments:
2265   //   Z_ARG1    - x address
2266   //   Z_ARG2    - x length
2267   //   Z_ARG3    - y address
2268   //   Z_ARG4    - y length
2269   //   Z_ARG5    - z address
2270   //   160[Z_SP] - z length
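  //
  // Conceptual reference (grade-school multiplication over 32-bit limbs, most significant
  // limb first, as in BigInteger.multiplyToLen; illustration only, not the generated code):
  //
  //   // z must provide room for xlen + ylen limbs.
  //   void multiply_to_len_ref(const uint32_t* x, int xlen,
  //                            const uint32_t* y, int ylen, uint32_t* z) {
  //     for (int i = 0; i < xlen + ylen; i++) z[i] = 0;
  //     for (int i = xlen - 1; i >= 0; i--) {
  //       uint64_t carry = 0;
  //       for (int j = ylen - 1; j >= 0; j--) {
  //         uint64_t p = (uint64_t)x[i] * y[j] + z[i + j + 1] + carry;
  //         z[i + j + 1] = (uint32_t)p;
  //         carry = p >> 32;
  //       }
  //       z[i] = (uint32_t)carry;   // carry out of this row lands one limb higher
  //     }
  //   }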
2271   address generate_multiplyToLen() {
2272     __ align(CodeEntryAlignment);
2273     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
2274 
2275     address start = __ pc();
2276 
2277     const Register x    = Z_ARG1;
2278     const Register xlen = Z_ARG2;
2279     const Register y    = Z_ARG3;
2280     const Register ylen = Z_ARG4;
2281     const Register z    = Z_ARG5;
2282     // zlen is passed on the stack:
2283     // Address zlen(Z_SP, _z_abi(remaining_cargs));
2284 
2285     // Next registers will be saved on stack in multiply_to_len().
2286     const Register tmp1 = Z_tmp_1;
2287     const Register tmp2 = Z_tmp_2;
2288     const Register tmp3 = Z_tmp_3;
2289     const Register tmp4 = Z_tmp_4;
2290     const Register tmp5 = Z_R9;
2291 
2292     BLOCK_COMMENT("Entry:");
2293 
2294     __ z_llgfr(xlen, xlen);
2295     __ z_llgfr(ylen, ylen);
2296 
2297     __ multiply_to_len(x, xlen, y, ylen, z, tmp1, tmp2, tmp3, tmp4, tmp5);
2298 
2299     __ z_br(Z_R14);  // Return to caller.
2300 
2301     return start;
2302   }
2303 
2304   void generate_initial() {
2305     // Generates all stubs and initializes the entry points.
2306 
2307     // Entry points that exist in all platforms.
2308     // Note: This is code that could be shared among different
2309     // platforms - however the benefit seems to be smaller than the
2310     // disadvantage of having a much more complicated generator
2311     // structure. See also comment in stubRoutines.hpp.
2312     StubRoutines::_forward_exception_entry                 = generate_forward_exception();
2313 
2314     StubRoutines::_call_stub_entry                         = generate_call_stub(StubRoutines::_call_stub_return_address);
2315     StubRoutines::_catch_exception_entry                   = generate_catch_exception();
2316 
2317     // Build this early so it's available for the interpreter.
2318     StubRoutines::_throw_StackOverflowError_entry          =
2319       generate_throw_exception("StackOverflowError throw_exception",
2320                                CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError), false);
2321     StubRoutines::_throw_delayed_StackOverflowError_entry  =
2322       generate_throw_exception("delayed StackOverflowError throw_exception",
2323                                CAST_FROM_FN_PTR(address, SharedRuntime::throw_delayed_StackOverflowError), false);
2324 
2325     //----------------------------------------------------------------------
2326     // Entry points that are platform specific.
2327 
2328     if (UseCRC32Intrinsics) {
2329       StubRoutines::_crc_table_adr     = (address)StubRoutines::zarch::_crc_table;
2330       StubRoutines::_updateBytesCRC32  = generate_CRC32_updateBytes("CRC32_updateBytes");
2331     }
2332 
2333     if (UseCRC32CIntrinsics) {
2334       StubRoutines::_crc32c_table_addr = (address)StubRoutines::zarch::_crc32c_table;
2335       StubRoutines::_updateBytesCRC32C = generate_CRC32C_updateBytes("CRC32C_updateBytes");
2336     }
2337 
    // Compact string intrinsics: Translate table for the string inflate intrinsic. Used by the trot instruction.
2339     StubRoutines::zarch::_trot_table_addr = (address)StubRoutines::zarch::_trot_table;
2340 
2341     // safefetch stubs
2342     generate_safefetch("SafeFetch32", sizeof(int),      &StubRoutines::_safefetch32_entry, &StubRoutines::_safefetch32_fault_pc, &StubRoutines::_safefetch32_continuation_pc);
2343     generate_safefetch("SafeFetchN",  sizeof(intptr_t), &StubRoutines::_safefetchN_entry,  &StubRoutines::_safefetchN_fault_pc,  &StubRoutines::_safefetchN_continuation_pc);
2344   }
2345 
2346 
2347   void generate_all() {
2348     // Generates all stubs and initializes the entry points.
2349 
2350     StubRoutines::zarch::_partial_subtype_check            = generate_partial_subtype_check();
2351 
2352     // These entry points require SharedInfo::stack0 to be set up in non-core builds.
2353     StubRoutines::_throw_AbstractMethodError_entry         = generate_throw_exception("AbstractMethodError throw_exception",          CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError),  false);
2354     StubRoutines::_throw_IncompatibleClassChangeError_entry= generate_throw_exception("IncompatibleClassChangeError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_IncompatibleClassChangeError),  false);
2355     StubRoutines::_throw_NullPointerException_at_call_entry= generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call), false);
2356 
2357     // Support for verify_oop (must happen after universe_init).
2358     StubRoutines::_verify_oop_subroutine_entry             = generate_verify_oop_subroutine();
2359 
2360     // Arraycopy stubs used by compilers.
2361     generate_arraycopy_stubs();
2362 
2363     // Generate AES intrinsics code.
2364     if (UseAESIntrinsics) {
2365       StubRoutines::_aescrypt_encryptBlock = generate_AES_encryptBlock("AES_encryptBlock");
2366       StubRoutines::_aescrypt_decryptBlock = generate_AES_decryptBlock("AES_decryptBlock");
2367       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_AES_encrypt("AES_encryptBlock_chaining");
2368       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_AES_decrypt("AES_decryptBlock_chaining");
2369     }
2370 
2371     // Generate GHASH intrinsics code
2372     if (UseGHASHIntrinsics) {
2373       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
2374     }
2375 
2376     // Generate SHA1/SHA256/SHA512 intrinsics code.
2377     if (UseSHA1Intrinsics) {
2378       StubRoutines::_sha1_implCompress     = generate_SHA1_stub(false,   "SHA1_singleBlock");
2379       StubRoutines::_sha1_implCompressMB   = generate_SHA1_stub(true,    "SHA1_multiBlock");
2380     }
2381     if (UseSHA256Intrinsics) {
2382       StubRoutines::_sha256_implCompress   = generate_SHA256_stub(false, "SHA256_singleBlock");
2383       StubRoutines::_sha256_implCompressMB = generate_SHA256_stub(true,  "SHA256_multiBlock");
2384     }
2385     if (UseSHA512Intrinsics) {
2386       StubRoutines::_sha512_implCompress   = generate_SHA512_stub(false, "SHA512_singleBlock");
2387       StubRoutines::_sha512_implCompressMB = generate_SHA512_stub(true,  "SHA512_multiBlock");
2388     }
2389 
2390 #ifdef COMPILER2
2391     if (UseMultiplyToLenIntrinsic) {
2392       StubRoutines::_multiplyToLen = generate_multiplyToLen();
2393     }
2394     if (UseMontgomeryMultiplyIntrinsic) {
2395       StubRoutines::_montgomeryMultiply
2396         = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_multiply);
2397     }
2398     if (UseMontgomerySquareIntrinsic) {
2399       StubRoutines::_montgomerySquare
2400         = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square);
2401     }
2402 #endif
2403   }
2404 
2405  public:
2406   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
2407     // Replace the standard masm with a special one:
2408     _masm = new MacroAssembler(code);
2409 
2410     _stub_count = !all ? 0x100 : 0x200;
2411     if (all) {
2412       generate_all();
2413     } else {
2414       generate_initial();
2415     }
2416   }
2417 
2418  private:
2419   int _stub_count;
2420   void stub_prolog(StubCodeDesc* cdesc) {
2421 #ifdef ASSERT
2422     // Put extra information in the stub code, to make it more readable.
2423     // Write the high part of the address.
2424     // [RGV] Check if there is a dependency on the size of this prolog.
2425     __ emit_32((intptr_t)cdesc >> 32);
2426     __ emit_32((intptr_t)cdesc);
2427     __ emit_32(++_stub_count);
2428 #endif
2429     align(true);
2430   }
2431 
2432   void align(bool at_header = false) {
2433     // z/Architecture cache line size is 256 bytes.
2434     // There is no obvious benefit in aligning stub
2435     // code to cache lines. Use CodeEntryAlignment instead.
2436     const unsigned int icache_line_size      = CodeEntryAlignment;
2437     const unsigned int icache_half_line_size = MIN2<unsigned int>(32, CodeEntryAlignment);
2438 
2439     if (at_header) {
2440       while ((intptr_t)(__ pc()) % icache_line_size != 0) {
2441         __ emit_16(0);
2442       }
2443     } else {
2444       while ((intptr_t)(__ pc()) % icache_half_line_size != 0) {
2445         __ z_nop();
2446       }
2447     }
2448   }
2449 
2450 };
2451 
2452 void StubGenerator_generate(CodeBuffer* code, int phase) {
2453   StubGenerator g(code, phase);
2454 }
--- EOF ---