/*
 * Copyright (c) 2003, 2022, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#ifndef CPU_X86_STUBGENERATOR_X86_64_HPP
#define CPU_X86_STUBGENERATOR_X86_64_HPP

#include "code/codeBlob.hpp"
#include "runtime/continuation.hpp"
#include "runtime/stubCodeGenerator.hpp"

// Stub Code definitions

class StubGenerator: public StubCodeGenerator {
 private:

  // Call stubs are used to call Java from C.
  address generate_call_stub(address& return_address);
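  //
  // A hedged sketch of how the generated blob is consumed: JavaCalls::call_helper
  // casts StubRoutines::call_stub() to a function pointer of roughly this shape
  // (see stubRoutines.hpp for the authoritative typedef):
  //
  //   typedef void (*CallStub)(address   link,
  //                            intptr_t* result,
  //                            int       result_type,   // a BasicType, widened to 4 bytes
  //                            Method*   method,
  //                            address   entry_point,
  //                            intptr_t* parameters,
  //                            int       size_of_parameters,
  //                            TRAPS);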

  // Return point for a Java call if there's an exception thrown in
  // Java code.  The exception is caught and transformed into a
  // pending exception stored in JavaThread that can be tested from
  // within the VM.
  //
  // Note: Usually the parameters are removed by the callee. In case
  // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code, so we need to set
  // up rsp.
  //
  // rax: exception oop

  address generate_catch_exception();

  // Continuation point for runtime calls returning with a pending
  // exception.  The pending exception check happened in the runtime
  // or native call stub.  The pending exception in Thread is
  // converted into a Java-level exception.
  //
  // Contract with Java-level exception handlers:
  // rax: exception
  // rdx: throwing pc
  //
  // NOTE: At entry of this stub, the exception pc must be on the stack!

  address generate_forward_exception();

  // Support for intptr_t OrderAccess::fence()
  address generate_orderaccess_fence();
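  //
  // A minimal sketch of what this stub emits, assuming the usual x86
  // StoreLoad idiom (illustrative, not the literal listing):
  //
  //   __ membar(Assembler::StoreLoad);   // e.g. lock addl $0, (rsp)
  //   __ ret(0);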

  // Support for intptr_t get_previous_sp()
  //
  // This routine is used to find the previous stack pointer for the
  // caller.
  address generate_get_previous_sp();

  //----------------------------------------------------------------------------------------------------
  // Support for void verify_mxcsr()
  //
  // This routine is used with -Xcheck:jni to verify that native
  // JNI code does not return to Java code without restoring the
  // MXCSR register to our expected state.

  address generate_verify_mxcsr();
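  //
  // A hedged outline of the check (not the exact instruction sequence):
  //
  //   stmxcsr [rsp]                    // dump the live MXCSR
  //   cmp     [rsp], <expected MXCSR>  // compare against our standard value
  //   je      ok                       // unchanged: nothing to do
  //   <warn, then ldmxcsr the standard value back>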

  address generate_f2i_fixup();
  address generate_f2l_fixup();
  address generate_d2i_fixup();
  address generate_d2l_fixup();

  address generate_count_leading_zeros_lut(const char *stub_name);
  address generate_popcount_avx_lut(const char *stub_name);
  address generate_iota_indices(const char *stub_name);
  address generate_vector_reverse_bit_lut(const char *stub_name);

  address generate_vector_reverse_byte_perm_mask_long(const char *stub_name);
  address generate_vector_reverse_byte_perm_mask_int(const char *stub_name);
  address generate_vector_reverse_byte_perm_mask_short(const char *stub_name);
  address generate_vector_byte_shuffle_mask(const char *stub_name);

  address generate_fp_mask(const char *stub_name, int64_t mask);

  address generate_vector_mask(const char *stub_name, int64_t mask);

  address generate_vector_byte_perm_mask(const char *stub_name);

  address generate_vector_fp_mask(const char *stub_name, int64_t mask);

  address generate_vector_custom_i32(const char *stub_name, Assembler::AvxVectorLen len,
                                     int32_t val0, int32_t val1, int32_t val2, int32_t val3,
                                     int32_t val4 = 0, int32_t val5 = 0, int32_t val6 = 0, int32_t val7 = 0,
                                     int32_t val8 = 0, int32_t val9 = 0, int32_t val10 = 0, int32_t val11 = 0,
                                     int32_t val12 = 0, int32_t val13 = 0, int32_t val14 = 0, int32_t val15 = 0);

  // Non-destructive plausibility checks for oops
  address generate_verify_oop();

  // Verify that a register contains a clean 32-bit positive value
  // (high 32 bits are 0) so it can be used in 64-bit shifts.
  void assert_clean_int(Register Rint, Register Rtmp);
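  //
  // A hedged sketch of the check (Rtmp is clobbered):
  //
  //   movl Rtmp, Rint    // zero-extends the low 32 bits into all of Rtmp
  //   cmpq Rtmp, Rint    // equal iff bits 32..63 of Rint were already zero
  //   jne  <assert failure>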

  //  Generate overlap test for array copy stubs
  void array_overlap_test(address no_overlap_target, Label* NOLp, Address::ScaleFactor sf);

  void array_overlap_test(address no_overlap_target, Address::ScaleFactor sf) {
    assert(no_overlap_target != NULL, "must be generated");
    array_overlap_test(no_overlap_target, NULL, sf);
  }
  void array_overlap_test(Label& L_no_overlap, Address::ScaleFactor sf) {
    array_overlap_test(NULL, &L_no_overlap, sf);
  }
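
  // Conceptually (a hedged sketch, with from/to/count in the first three
  // argument registers and element size 1 << sf), the overlap test is:
  //
  //   if (to <= from || to >= from + (count << sf))
  //     goto no_overlap;   // regions are disjoint: forward copy is safe
  //   // otherwise fall through to the conjoint (backward) copy path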


  // Shuffle first three arg regs on Windows into Linux/Solaris locations.
  void setup_arg_regs(int nargs = 3);
  void restore_arg_regs();
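  //
  // On Windows the incoming arguments arrive in rcx, rdx, r8 (and r9), while
  // the stub bodies are written against the System V order rdi, rsi, rdx
  // (and rcx).  A hedged sketch of the shuffle, not the literal generated code:
  //
  //   mov rdi, rcx   // arg0
  //   mov rsi, rdx   // arg1
  //   mov rdx, r8    // arg2
  //   mov rcx, r9    // arg3, only when nargs == 4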

#ifdef ASSERT
  bool _regs_in_thread;
#endif

  // This is used in places where r10 is a scratch register, and can
  // be adapted if r9 is needed as well.
  void setup_arg_regs_using_thread();

  void restore_arg_regs_using_thread();

  // Copy big chunks forward
  void copy_bytes_forward(Register end_from, Register end_to,
                          Register qword_count, Register to,
                          Label& L_copy_bytes, Label& L_copy_8_bytes);

  // Copy big chunks backward
  void copy_bytes_backward(Register from, Register dest,
                           Register qword_count, Register to,
                           Label& L_copy_bytes, Label& L_copy_8_bytes);
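  //
  // A hedged outline of the shape of these loops (the real code also has
  // wider AVX variants):
  //
  //   L_copy_bytes:   copy 4 qwords (32 bytes) per iteration, addressed off
  //                   the end pointers with a negative qword_count index
  //   L_copy_8_bytes: drain the remaining 0..3 qwords one at a time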

  void setup_argument_regs(BasicType type);

  void restore_argument_regs(BasicType type);

#if COMPILER2_OR_JVMCI
  // The following rules apply to the AVX3-optimized arraycopy stubs
  // (see the dispatch sketch below):
  // - If the target supports the AVX3 features (BW+VL+F), the implementation uses
  //   32-byte vectors (YMMs) for both the special cases (various small block sizes)
  //   and the aligned copy loop. This is the default configuration.
  // - If the copy length is above AVX3Threshold, the implementation uses 64-byte
  //   vectors (ZMMs) for the main copy loop (and the subsequent tail), since the
  //   bulk of the cycles is spent there.
  // - If the user forces MaxVectorSize=32, REP MOVS has been observed to perform
  //   better than the vector loop for disjoint copies above 4096 bytes. For
  //   conjoint/backward copies the vector-based copy performs better.
  // - If the user sets AVX3Threshold=0, the special cases for small block sizes
  //   also operate on 64-byte vector registers (ZMMs).
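  //
  // A hedged sketch of the resulting dispatch (illustrative only):
  //
  //   if ((count << shift) > AVX3Threshold && MaxVectorSize >= 64) {
  //     // 64-byte (ZMM) main loop plus masked tail
  //   } else {
  //     // 32-byte (YMM) loop and masked special cases
  //   }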

  address generate_disjoint_copy_avx3_masked(address* entry, const char *name, int shift,
                                             bool aligned, bool is_oop, bool dest_uninitialized);

  address generate_conjoint_copy_avx3_masked(address* entry, const char *name, int shift,
                                             address nooverlap_target, bool aligned, bool is_oop,
                                             bool dest_uninitialized);

  void arraycopy_avx3_special_cases(XMMRegister xmm, KRegister mask, Register from,
                                    Register to, Register count, int shift,
                                    Register index, Register temp,
                                    bool use64byteVector, Label& L_entry, Label& L_exit);

  void arraycopy_avx3_special_cases_conjoint(XMMRegister xmm, KRegister mask, Register from,
                                             Register to, Register start_index, Register end_index,
                                             Register count, int shift, Register temp,
                                             bool use64byteVector, Label& L_entry, Label& L_exit);

  void copy32_avx(Register dst, Register src, Register index, XMMRegister xmm,
                  int shift = Address::times_1, int offset = 0);

  void copy64_avx(Register dst, Register src, Register index, XMMRegister xmm,
                  bool conjoint, int shift = Address::times_1, int offset = 0,
                  bool use64byteVector = false);

  void copy64_masked_avx(Register dst, Register src, XMMRegister xmm,
                         KRegister mask, Register length, Register index,
                         Register temp, int shift = Address::times_1, int offset = 0,
                         bool use64byteVector = false);

  void copy32_masked_avx(Register dst, Register src, XMMRegister xmm,
                         KRegister mask, Register length, Register index,
                         Register temp, int shift = Address::times_1, int offset = 0);
#endif // COMPILER2_OR_JVMCI

  address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name);

  address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
                                      address* entry, const char *name);

  address generate_disjoint_short_copy(bool aligned, address *entry, const char *name);

  address generate_fill(BasicType t, bool aligned, const char *name);

  address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
                                       address *entry, const char *name);
  address generate_disjoint_int_oop_copy(bool aligned, bool is_oop, address* entry,
                                         const char *name, bool dest_uninitialized = false);
  address generate_conjoint_int_oop_copy(bool aligned, bool is_oop, address nooverlap_target,
                                         address *entry, const char *name,
                                         bool dest_uninitialized = false);
  address generate_disjoint_long_oop_copy(bool aligned, bool is_oop, address *entry,
                                          const char *name, bool dest_uninitialized = false);
  address generate_conjoint_long_oop_copy(bool aligned, bool is_oop,
                                          address nooverlap_target, address *entry,
                                          const char *name, bool dest_uninitialized = false);
  // Helper for generating a dynamic type check.
  // Smashes no registers.
  void generate_type_check(Register sub_klass,
                           Register super_check_offset,
                           Register super_klass,
                           Label& L_success);
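  //
  // A hedged sketch of the fast path (the slow case falls back to scanning
  // the secondary supers of sub_klass):
  //
  //   if (*(Klass**)((address)sub_klass + super_check_offset) == super_klass)
  //     goto L_success;   // primary-supers or cache hit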

  // Generate checkcasting array copy stub
  address generate_checkcast_copy(const char *name, address *entry,
                                  bool dest_uninitialized = false);

  // Generate 'unsafe' array copy stub
  // Though just as safe as the other stubs, it takes an unscaled
  // size_t argument instead of an element count.
  //
  // Examines the alignment of the operands and dispatches
  // to a long, int, short, or byte copy loop.
  address generate_unsafe_copy(const char *name,
                               address byte_copy_entry, address short_copy_entry,
                               address int_copy_entry, address long_copy_entry);
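  //
  // A hedged sketch of the alignment dispatch (illustrative only):
  //
  //   bits = from | to | size;
  //   if ((bits & 7) == 0)  goto long_copy;   // count = size >> 3
  //   if ((bits & 3) == 0)  goto int_copy;    // count = size >> 2
  //   if ((bits & 1) == 0)  goto short_copy;  // count = size >> 1
  //   goto byte_copy;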

  // Perform range checks on the proposed arraycopy.
  // Kills temp, but nothing else.
  // Also, clean the sign bits of src_pos and dst_pos.
  void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
                              Register src_pos, // source position (c_rarg1)
                              Register dst,     // destination array oop (c_rarg2)
                              Register dst_pos, // destination position (c_rarg3)
                              Register length,
                              Register temp,
                              Label& L_failed);
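  //
  // A hedged sketch of what the checks amount to (unsigned 32-bit compares,
  // which also reject negative positions and lengths):
  //
  //   if ((juint)src_pos + (juint)length > (juint)src->length())  goto L_failed;
  //   if ((juint)dst_pos + (juint)length > (juint)dst->length())  goto L_failed;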

  // Generate generic array copy stubs
  address generate_generic_copy(const char *name,
                                address byte_copy_entry, address short_copy_entry,
                                address int_copy_entry, address oop_copy_entry,
                                address long_copy_entry, address checkcast_copy_entry);

  address generate_data_cache_writeback();

  address generate_data_cache_writeback_sync();

  void generate_arraycopy_stubs();


  // MD5 stubs

  // ofs and limit are used for multi-block byte arrays.
  // int com.sun.security.provider.MD5.implCompress(byte[] b, int ofs)
  address generate_md5_implCompress(bool multi_block, const char *name);


  // SHA stubs

  // ofs and limit are used for multi-block byte arrays.
  // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
  address generate_sha1_implCompress(bool multi_block, const char *name);

  // ofs and limit are used for multi-block byte arrays.
  // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
  address generate_sha256_implCompress(bool multi_block, const char *name);
  address generate_sha512_implCompress(bool multi_block, const char *name);

  // Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
  address generate_pshuffle_byte_flip_mask_sha512();

  address generate_upper_word_mask();
  address generate_shuffle_byte_flip_mask();
  address generate_pshuffle_byte_flip_mask();


  // AES intrinsic stubs

  address generate_aescrypt_encryptBlock();

  address generate_aescrypt_decryptBlock();

  address generate_cipherBlockChaining_encryptAESCrypt();

  // A version of CBC/AES Decrypt which does 4 blocks in a loop at a time
  // to hide instruction latency
  address generate_cipherBlockChaining_decryptAESCrypt_Parallel();

  address generate_electronicCodeBook_encryptAESCrypt();

  void aesecb_encrypt(Register source_addr, Register dest_addr, Register key, Register len);

  address generate_electronicCodeBook_decryptAESCrypt();

  void aesecb_decrypt(Register source_addr, Register dest_addr, Register key, Register len);

  // Vector AES Galois Counter Mode implementation
  address generate_galoisCounterMode_AESCrypt();
  void aesgcm_encrypt(Register in, Register len, Register ct, Register out, Register key,
                      Register state, Register subkeyHtbl, Register avx512_subkeyHtbl, Register counter);


  // Vector AES Counter implementation
  address generate_counterMode_VectorAESCrypt();
  void aesctr_encrypt(Register src_addr, Register dest_addr, Register key, Register counter,
                      Register len_reg, Register used, Register used_addr, Register saved_encCounter_start);

  // This is a version of CTR/AES crypt which does 6 blocks in a loop at a time
  // to hide instruction latency
  address generate_counterMode_AESCrypt_Parallel();

  address generate_cipherBlockChaining_decryptVectorAESCrypt();

  address generate_key_shuffle_mask();

  void roundDec(XMMRegister xmm_reg);
  void roundDeclast(XMMRegister xmm_reg);
  void roundEnc(XMMRegister key, int rnum);
  void lastroundEnc(XMMRegister key, int rnum);
  void roundDec(XMMRegister key, int rnum);
  void lastroundDec(XMMRegister key, int rnum);
  void gfmul_avx512(XMMRegister ghash, XMMRegister hkey);
  void generateHtbl_48_block_zmm(Register htbl, Register avx512_subkeyHtbl, Register rscratch);
  void ghash16_encrypt16_parallel(Register key, Register subkeyHtbl, XMMRegister ctr_blockx,
                                  XMMRegister aad_hashx, Register in, Register out, Register data, Register pos, bool reduction,
                                  XMMRegister addmask, bool no_ghash_input, Register rounds, Register ghash_pos,
                                  bool final_reduction, int index, XMMRegister counter_inc_mask);
  // Load key and shuffle operation
  void ev_load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask);
  void ev_load_key(XMMRegister xmmdst, Register key, int offset, Register rscratch);
  // Utility routines for loading a 128-bit key word in little-endian format;
  // the first variant takes a shuffle mask that is already loaded in an XMM register.
  void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask);
  void load_key(XMMRegister xmmdst, Register key, int offset, Register rscratch);
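  //
  // A hedged sketch of what load_key does (illustrative only):
  //
  //   movdqu xmmdst, [key + offset]      // fetch one 128-bit round-key word
  //   pshufb xmmdst, key_shuffle_mask    // byte-swap into little-endian order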

  // Utility routine for increasing the 128-bit counter (the IV in CTR mode)
  void inc_counter(Register reg, XMMRegister xmmdst, int inc_delta, Label& next_block);

  void generate_aes_stubs();


  // GHASH stubs

  void generate_ghash_stubs();

  void schoolbookAAD(int i, Register subkeyH, XMMRegister data, XMMRegister tmp0,
                     XMMRegister tmp1, XMMRegister tmp2, XMMRegister tmp3);
  void gfmul(XMMRegister tmp0, XMMRegister t);
  void generateHtbl_one_block(Register htbl, Register rscratch);
  void generateHtbl_eight_blocks(Register htbl);
  void avx_ghash(Register state, Register htbl, Register data, Register blocks);

  // Used by GHASH and AES stubs.
  address ghash_polynomial_addr();
  address ghash_shufflemask_addr();
  address ghash_long_swap_mask_addr(); // byte swap x86 long
  address ghash_byte_swap_mask_addr(); // byte swap x86 byte array

  // Single and multi-block ghash operations
  address generate_ghash_processBlocks();

  // GHASH single and multi-block operations using AVX instructions
  address generate_avx_ghash_processBlocks();


  // BASE64 stubs

  address base64_shuffle_addr();
  address base64_avx2_shuffle_addr();
  address base64_avx2_input_mask_addr();
  address base64_avx2_lut_addr();
  address base64_encoding_table_addr();

  // Code for generating Base64 encoding.
  // Intrinsic function prototype in Base64.java:
  // private void encodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL)
  address generate_base64_encodeBlock();

  // base64 AVX512vbmi tables
  address base64_vbmi_lookup_lo_addr();
  address base64_vbmi_lookup_hi_addr();
  address base64_vbmi_lookup_lo_url_addr();
  address base64_vbmi_lookup_hi_url_addr();
  address base64_vbmi_pack_vec_addr();
  address base64_vbmi_join_0_1_addr();
  address base64_vbmi_join_1_2_addr();
  address base64_vbmi_join_2_3_addr();
  address base64_decoding_table_addr();

  // Code for generating Base64 decoding.
  //
  // Based on the article (and associated code) from https://arxiv.org/abs/1910.05109.
  //
  // Intrinsic function prototype in Base64.java:
  // private int decodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL, boolean isMIME)
  address generate_base64_decodeBlock();

  address generate_updateBytesCRC32();
  address generate_updateBytesCRC32C(bool is_pclmulqdq_supported);

  address generate_updateBytesAdler32();

  address generate_multiplyToLen();

  address generate_vectorizedMismatch();

  address generate_squareToLen();

  address generate_method_entry_barrier();

  address generate_mulAdd();

  address generate_bigIntegerRightShift();
  address generate_bigIntegerLeftShift();


  // Libm trigonometric stubs

  address generate_libmSin();
  address generate_libmCos();
  address generate_libmTan();
  address generate_libmExp();
  address generate_libmPow();
  address generate_libmLog();
  address generate_libmLog10();

  // Shared constants
  static address ZERO;
  static address NEG_ZERO;
  static address ONE;
  static address ONEHALF;
  static address SIGN_MASK;
  static address TWO_POW_55;
  static address TWO_POW_M55;
  static address SHIFTER;
  static address PI32INV;
  static address PI_INV_TABLE;
  static address Ctable;
  static address SC_1;
  static address SC_2;
  static address SC_3;
  static address SC_4;
  static address PI_4;
  static address P_1;
  static address P_3;
  static address P_2;

  void generate_libm_stubs();


  address generate_cont_thaw(const char* label, Continuation::thaw_kind kind);
  address generate_cont_thaw();

  // TODO: will probably need multiple return barriers depending on return type
  address generate_cont_returnBarrier();
  address generate_cont_returnBarrier_exception();

#if INCLUDE_JFR

  // For C2: c_rarg0 is junk; this calls into the runtime to write a checkpoint.
  // It returns a jobject handle to the event writer.
  // The handle is dereferenced and the return value is the event writer oop.
  RuntimeStub* generate_jfr_write_checkpoint();

#endif // INCLUDE_JFR

  // Continuation point for throwing of implicit exceptions that are
  // not handled in the current activation. Fabricates an exception
  // oop and initiates normal exception dispatching in this
  // frame. Since we need to preserve callee-saved values (currently
  // only for C2, but done for C1 as well) we need a callee-saved oop
  // map and therefore have to make these stubs into RuntimeStubs
  // rather than BufferBlobs.  If the compiler needs all registers to
  // be preserved between the fault point and the exception handler
  // then it must assume responsibility for that in
  // AbstractCompiler::continuation_for_implicit_null_exception or
  // continuation_for_implicit_division_by_zero_exception. All other
  // implicit exceptions (e.g., NullPointerException or
  // AbstractMethodError on entry) are either at call sites or
  // otherwise assume that stack unwinding will be initiated, so
  // caller-saved registers were assumed volatile in the compiler.
  address generate_throw_exception(const char* name,
                                   address runtime_entry,
                                   Register arg1 = noreg,
                                   Register arg2 = noreg);

  // Interpreter or compiled code marshalling registers to/from an inline-type instance
  address generate_return_value_stub(address destination, const char* name, bool has_res);

  void create_control_words();

  // Initialization
  void generate_initial();
  void generate_phase1();
  void generate_all();

 public:
  StubGenerator(CodeBuffer* code, int phase) : StubCodeGenerator(code) {
    DEBUG_ONLY( _regs_in_thread = false; )
    if (phase == 0) {
      generate_initial();
    } else if (phase == 1) {
      generate_phase1(); // stubs that must be available for the interpreter
    } else {
      generate_all();
    }
  }
};

#endif // CPU_X86_STUBGENERATOR_X86_64_HPP