1 /*
2 * Copyright (c) 2003, 2022, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
24
25 #include "precompiled.hpp"
26 #include "asm/macroAssembler.hpp"
27 #include "asm/macroAssembler.inline.hpp"
28 #include "ci/ciUtilities.hpp"
29 #include "compiler/oopMap.hpp"
30 #include "gc/shared/barrierSet.hpp"
31 #include "gc/shared/barrierSetAssembler.hpp"
32 #include "gc/shared/barrierSetNMethod.hpp"
33 #include "gc/shared/gc_globals.hpp"
34 #include "interpreter/interpreter.hpp"
35 #include "memory/universe.hpp"
36 #include "nativeInst_x86.hpp"
37 #include "oops/instanceOop.hpp"
38 #include "oops/method.hpp"
39 #include "oops/objArrayKlass.hpp"
40 #include "oops/oop.inline.hpp"
41 #include "prims/methodHandles.hpp"
42 #include "runtime/arguments.hpp"
43 #include "runtime/frame.inline.hpp"
44 #include "runtime/handles.inline.hpp"
45 #include "runtime/sharedRuntime.hpp"
46 #include "runtime/stubCodeGenerator.hpp"
47 #include "runtime/stubRoutines.hpp"
48 #include "runtime/thread.inline.hpp"
49 #ifdef COMPILER2
50 #include "opto/runtime.hpp"
51 #endif
52 #if INCLUDE_JVMCI
53 #include "jvmci/jvmci_globals.hpp"
54 #endif
55 #if INCLUDE_ZGC
56 #include "gc/z/zThreadLocalData.hpp"
57 #endif
58
59 // Declaration and definition of StubGenerator (no .hpp file).
60 // For a more detailed description of the stub routine structure
61 // see the comment in stubRoutines.hpp
62
63 #define __ _masm->
64 #define TIMES_OOP (UseCompressedOops ? Address::times_4 : Address::times_8)
65 #define a__ ((Assembler*)_masm)->
66
67 #ifdef PRODUCT
68 #define BLOCK_COMMENT(str) /* nothing */
69 #else
70 #define BLOCK_COMMENT(str) __ block_comment(str)
71 #endif
72
73 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
74 const int MXCSR_MASK = 0xFFC0; // Mask out any pending exceptions
75
76 // Stub Code definitions
77
78 class StubGenerator: public StubCodeGenerator {
79 private:
80
81 #ifdef PRODUCT
82 #define inc_counter_np(counter) ((void)0)
83 #else
84 void inc_counter_np_(int& counter) {
85 // This can destroy rscratch1 if counter is far from the code cache
86 __ incrementl(ExternalAddress((address)&counter));
87 }
88 #define inc_counter_np(counter) \
89 BLOCK_COMMENT("inc_counter " #counter); \
90 inc_counter_np_(counter);
91 #endif
92
93 // Call stubs are used to call Java from C
94 //
95 // Linux Arguments:
96 // c_rarg0: call wrapper address address
97 // c_rarg1: result address
98 // c_rarg2: result type BasicType
99 // c_rarg3: method Method*
100 // c_rarg4: (interpreter) entry point address
101 // c_rarg5: parameters intptr_t*
102 // 16(rbp): parameter size (in words) int
103 // 24(rbp): thread Thread*
104 //
105 // [ return_from_Java ] <--- rsp
106 // [ argument word n ]
107 // ...
108 // -12 [ argument word 1 ]
109 // -11 [ saved r15 ] <--- rsp_after_call
110 // -10 [ saved r14 ]
111 // -9 [ saved r13 ]
112 // -8 [ saved r12 ]
113 // -7 [ saved rbx ]
114 // -6 [ call wrapper ]
115 // -5 [ result ]
116 // -4 [ result type ]
117 // -3 [ method ]
118 // -2 [ entry point ]
119 // -1 [ parameters ]
120 // 0 [ saved rbp ] <--- rbp
121 // 1 [ return address ]
122 // 2 [ parameter size ]
123 // 3 [ thread ]
124 //
125 // Windows Arguments:
126 // c_rarg0: call wrapper address address
127 // c_rarg1: result address
128 // c_rarg2: result type BasicType
129 // c_rarg3: method Method*
130 // 48(rbp): (interpreter) entry point address
131 // 56(rbp): parameters intptr_t*
132 // 64(rbp): parameter size (in words) int
133 // 72(rbp): thread Thread*
134 //
135 // [ return_from_Java ] <--- rsp
136 // [ argument word n ]
137 // ...
138 // -60 [ argument word 1 ]
139 // -59 [ saved xmm31 ] <--- rsp_after_call
140 // [ saved xmm16-xmm30 ] (EVEX enabled, else the space is blank)
141 // -27 [ saved xmm15 ]
142 // [ saved xmm7-xmm14 ]
143 // -9 [ saved xmm6 ] (each xmm register takes 2 slots)
144 // -7 [ saved r15 ]
145 // -6 [ saved r14 ]
146 // -5 [ saved r13 ]
147 // -4 [ saved r12 ]
148 // -3 [ saved rdi ]
149 // -2 [ saved rsi ]
150 // -1 [ saved rbx ]
151 // 0 [ saved rbp ] <--- rbp
152 // 1 [ return address ]
153 // 2 [ call wrapper ]
154 // 3 [ result ]
155 // 4 [ result type ]
156 // 5 [ method ]
157 // 6 [ entry point ]
158 // 7 [ parameters ]
159 // 8 [ parameter size ]
160 // 9 [ thread ]
161 //
162 // Windows reserves the caller's stack space for arguments 1-4.
163 // We spill c_rarg0-c_rarg3 to this space.
164
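// For reference, the VM invokes this stub through a function pointer whose
// signature mirrors the argument list above (the CallStub typedef in
// stubRoutines.hpp). A rough sketch -- parameter names are illustrative:
//
//   typedef void (*CallStub)(address   link,               // call wrapper
//                            intptr_t* result,
//                            BasicType result_type,
//                            Method*   method,
//                            address   entry_point,
//                            intptr_t* parameters,
//                            int       size_of_parameters,  // in words
//                            TRAPS);                        // current thread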
165 // Call stub stack layout word offsets from rbp
166 enum call_stub_layout {
167 #ifdef _WIN64
168 xmm_save_first = 6, // save from xmm6
169 xmm_save_last = 31, // to xmm31
170 xmm_save_base = -9,
171 rsp_after_call_off = xmm_save_base - 2 * (xmm_save_last - xmm_save_first), // -27
172 r15_off = -7,
173 r14_off = -6,
174 r13_off = -5,
175 r12_off = -4,
176 rdi_off = -3,
177 rsi_off = -2,
178 rbx_off = -1,
179 rbp_off = 0,
180 retaddr_off = 1,
181 call_wrapper_off = 2,
182 result_off = 3,
183 result_type_off = 4,
184 method_off = 5,
185 entry_point_off = 6,
186 parameters_off = 7,
187 parameter_size_off = 8,
188 thread_off = 9
189 #else
190 rsp_after_call_off = -12,
191 mxcsr_off = rsp_after_call_off,
192 r15_off = -11,
193 r14_off = -10,
194 r13_off = -9,
195 r12_off = -8,
196 rbx_off = -7,
197 call_wrapper_off = -6,
198 result_off = -5,
199 result_type_off = -4,
200 method_off = -3,
201 entry_point_off = -2,
202 parameters_off = -1,
203 rbp_off = 0,
204 retaddr_off = 1,
205 parameter_size_off = 2,
206 thread_off = 3
207 #endif
208 };
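// Example (reading the enum together with the layout tables above): on Linux
// the Thread* argument is at thread_off = 3, i.e. Address(rbp, 3 * wordSize)
// == 24(rbp); on Windows it is at thread_off = 9, i.e. 72(rbp).
// rsp_after_call_off names the rbp-relative slot that rsp is reset to (via lea)
// before the callee-saved registers are restored on the way out.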
209
210 #ifdef _WIN64
211 Address xmm_save(int reg) {
212 assert(reg >= xmm_save_first && reg <= xmm_save_last, "XMM register number out of range");
213 return Address(rbp, (xmm_save_base - (reg - xmm_save_first) * 2) * wordSize);
214 }
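// For example, xmm_save(xmm_save_first) == xmm_save(6) resolves to
// Address(rbp, -9 * wordSize), and each following register is two words lower
// (xmm_save(7) is -11 * wordSize), matching the layout above where every xmm
// register occupies two slots.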
215 #endif
216
217 address generate_call_stub(address& return_address) {
218 assert((int)frame::entry_frame_after_call_words == -(int)rsp_after_call_off + 1 &&
219 (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
220 "adjust this code");
221 StubCodeMark mark(this, "StubRoutines", "call_stub");
222 address start = __ pc();
223
224 // same as in generate_catch_exception()!
225 const Address rsp_after_call(rbp, rsp_after_call_off * wordSize);
226
227 const Address call_wrapper (rbp, call_wrapper_off * wordSize);
228 const Address result (rbp, result_off * wordSize);
229 const Address result_type (rbp, result_type_off * wordSize);
230 const Address method (rbp, method_off * wordSize);
231 const Address entry_point (rbp, entry_point_off * wordSize);
232 const Address parameters (rbp, parameters_off * wordSize);
233 const Address parameter_size(rbp, parameter_size_off * wordSize);
234
235 // same as in generate_catch_exception()!
236 const Address thread (rbp, thread_off * wordSize);
237
238 const Address r15_save(rbp, r15_off * wordSize);
239 const Address r14_save(rbp, r14_off * wordSize);
240 const Address r13_save(rbp, r13_off * wordSize);
241 const Address r12_save(rbp, r12_off * wordSize);
242 const Address rbx_save(rbp, rbx_off * wordSize);
243
244 // stub code
245 __ enter();
246 __ subptr(rsp, -rsp_after_call_off * wordSize);
247
248 // save register parameters
249 #ifndef _WIN64
250 __ movptr(parameters, c_rarg5); // parameters
251 __ movptr(entry_point, c_rarg4); // entry_point
252 #endif
253
254 __ movptr(method, c_rarg3); // method
255 __ movl(result_type, c_rarg2); // result type
256 __ movptr(result, c_rarg1); // result
257 __ movptr(call_wrapper, c_rarg0); // call wrapper
258
259 // save regs belonging to calling function
260 __ movptr(rbx_save, rbx);
261 __ movptr(r12_save, r12);
262 __ movptr(r13_save, r13);
263 __ movptr(r14_save, r14);
264 __ movptr(r15_save, r15);
265
266 #ifdef _WIN64
267 int last_reg = 15;
268 if (UseAVX > 2) {
269 last_reg = 31;
270 }
271 if (VM_Version::supports_evex()) {
272 for (int i = xmm_save_first; i <= last_reg; i++) {
273 __ vextractf32x4(xmm_save(i), as_XMMRegister(i), 0);
274 }
275 } else {
276 for (int i = xmm_save_first; i <= last_reg; i++) {
277 __ movdqu(xmm_save(i), as_XMMRegister(i));
278 }
279 }
280
281 const Address rdi_save(rbp, rdi_off * wordSize);
282 const Address rsi_save(rbp, rsi_off * wordSize);
283
284 __ movptr(rsi_save, rsi);
285 __ movptr(rdi_save, rdi);
286 #else
287 const Address mxcsr_save(rbp, mxcsr_off * wordSize);
288 {
289 Label skip_ldmx;
290 __ stmxcsr(mxcsr_save);
291 __ movl(rax, mxcsr_save);
292 __ andl(rax, MXCSR_MASK); // Only check control and mask bits
293 ExternalAddress mxcsr_std(StubRoutines::x86::addr_mxcsr_std());
294 __ cmp32(rax, mxcsr_std);
295 __ jcc(Assembler::equal, skip_ldmx);
296 __ ldmxcsr(mxcsr_std);
297 __ bind(skip_ldmx);
298 }
299 #endif
300
301 // Load up thread register
302 __ movptr(r15_thread, thread);
303 __ reinit_heapbase();
304
305 #ifdef ASSERT
306 // make sure we have no pending exceptions
307 {
308 Label L;
309 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
310 __ jcc(Assembler::equal, L);
311 __ stop("StubRoutines::call_stub: entered with pending exception");
312 __ bind(L);
313 }
314 #endif
315
316 // pass parameters if any
317 BLOCK_COMMENT("pass parameters if any");
318 Label parameters_done;
319 __ movl(c_rarg3, parameter_size);
320 __ testl(c_rarg3, c_rarg3);
321 __ jcc(Assembler::zero, parameters_done);
322
323 Label loop;
324 __ movptr(c_rarg2, parameters); // parameter pointer
325 __ movl(c_rarg1, c_rarg3); // parameter counter is in c_rarg1
326 __ BIND(loop);
327 __ movptr(rax, Address(c_rarg2, 0));// get parameter
328 __ addptr(c_rarg2, wordSize); // advance to next parameter
329 __ decrementl(c_rarg1); // decrement counter
330 __ push(rax); // pass parameter
331 __ jcc(Assembler::notZero, loop);
332
333 // call Java function
334 __ BIND(parameters_done);
335 __ movptr(rbx, method); // get Method*
336 __ movptr(c_rarg1, entry_point); // get entry_point
337 __ mov(r13, rsp); // set sender sp
338 BLOCK_COMMENT("call Java function");
339 __ call(c_rarg1);
340
341 BLOCK_COMMENT("call_stub_return_address:");
342 return_address = __ pc();
343
344 // store result depending on type (everything that is not
345 // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
346 __ movptr(c_rarg0, result);
347 Label is_long, is_float, is_double, exit;
348 __ movl(c_rarg1, result_type);
349 __ cmpl(c_rarg1, T_OBJECT);
350 __ jcc(Assembler::equal, is_long);
351 __ cmpl(c_rarg1, T_LONG);
352 __ jcc(Assembler::equal, is_long);
353 __ cmpl(c_rarg1, T_FLOAT);
354 __ jcc(Assembler::equal, is_float);
355 __ cmpl(c_rarg1, T_DOUBLE);
356 __ jcc(Assembler::equal, is_double);
357
358 // handle T_INT case
359 __ movl(Address(c_rarg0, 0), rax);
360
361 __ BIND(exit);
362
363 // pop parameters
364 __ lea(rsp, rsp_after_call);
365
366 #ifdef ASSERT
367 // verify that threads correspond
368 {
369 Label L1, L2, L3;
370 __ cmpptr(r15_thread, thread);
371 __ jcc(Assembler::equal, L1);
372 __ stop("StubRoutines::call_stub: r15_thread is corrupted");
373 __ bind(L1);
374 __ get_thread(rbx);
375 __ cmpptr(r15_thread, thread);
376 __ jcc(Assembler::equal, L2);
377 __ stop("StubRoutines::call_stub: r15_thread is modified by call");
378 __ bind(L2);
379 __ cmpptr(r15_thread, rbx);
380 __ jcc(Assembler::equal, L3);
381 __ stop("StubRoutines::call_stub: threads must correspond");
382 __ bind(L3);
383 }
384 #endif
385
386 // restore regs belonging to calling function
387 #ifdef _WIN64
388 // emit the restores for xmm regs
389 if (VM_Version::supports_evex()) {
390 for (int i = xmm_save_first; i <= last_reg; i++) {
391 __ vinsertf32x4(as_XMMRegister(i), as_XMMRegister(i), xmm_save(i), 0);
392 }
393 } else {
394 for (int i = xmm_save_first; i <= last_reg; i++) {
395 __ movdqu(as_XMMRegister(i), xmm_save(i));
396 }
397 }
398 #endif
399 __ movptr(r15, r15_save);
400 __ movptr(r14, r14_save);
401 __ movptr(r13, r13_save);
402 __ movptr(r12, r12_save);
403 __ movptr(rbx, rbx_save);
404
405 #ifdef _WIN64
406 __ movptr(rdi, rdi_save);
407 __ movptr(rsi, rsi_save);
408 #else
409 __ ldmxcsr(mxcsr_save);
410 #endif
411
412 // restore rsp
413 __ addptr(rsp, -rsp_after_call_off * wordSize);
414
415 // return
416 __ vzeroupper();
417 __ pop(rbp);
418 __ ret(0);
419
420 // handle return types different from T_INT
421 __ BIND(is_long);
422 __ movq(Address(c_rarg0, 0), rax);
423 __ jmp(exit);
424
425 __ BIND(is_float);
426 __ movflt(Address(c_rarg0, 0), xmm0);
427 __ jmp(exit);
428
429 __ BIND(is_double);
430 __ movdbl(Address(c_rarg0, 0), xmm0);
431 __ jmp(exit);
432
433 return start;
434 }
435
436 // Return point for a Java call if there's an exception thrown in
437 // Java code. The exception is caught and transformed into a
438 // pending exception stored in JavaThread that can be tested from
439 // within the VM.
440 //
441 // Note: Usually the parameters are removed by the callee. In case
442 // of an exception crossing an activation frame boundary, that is
443 // not the case if the callee is compiled code => need to set up the
444 // rsp.
445 //
446 // rax: exception oop
447
448 address generate_catch_exception() {
449 StubCodeMark mark(this, "StubRoutines", "catch_exception");
450 address start = __ pc();
451
452 // same as in generate_call_stub():
453 const Address rsp_after_call(rbp, rsp_after_call_off * wordSize);
454 const Address thread (rbp, thread_off * wordSize);
455
456 #ifdef ASSERT
457 // verify that threads correspond
458 {
459 Label L1, L2, L3;
460 __ cmpptr(r15_thread, thread);
461 __ jcc(Assembler::equal, L1);
462 __ stop("StubRoutines::catch_exception: r15_thread is corrupted");
463 __ bind(L1);
464 __ get_thread(rbx);
465 __ cmpptr(r15_thread, thread);
466 __ jcc(Assembler::equal, L2);
467 __ stop("StubRoutines::catch_exception: r15_thread is modified by call");
468 __ bind(L2);
469 __ cmpptr(r15_thread, rbx);
470 __ jcc(Assembler::equal, L3);
471 __ stop("StubRoutines::catch_exception: threads must correspond");
472 __ bind(L3);
473 }
474 #endif
475
476 // set pending exception
477 __ verify_oop(rax);
478
479 __ movptr(Address(r15_thread, Thread::pending_exception_offset()), rax);
480 __ lea(rscratch1, ExternalAddress((address)__FILE__));
481 __ movptr(Address(r15_thread, Thread::exception_file_offset()), rscratch1);
482 __ movl(Address(r15_thread, Thread::exception_line_offset()), (int) __LINE__);
483
484 // complete return to VM
485 assert(StubRoutines::_call_stub_return_address != NULL,
486 "_call_stub_return_address must have been generated before");
487 __ jump(RuntimeAddress(StubRoutines::_call_stub_return_address));
488
489 return start;
490 }
491
492 // Continuation point for runtime calls returning with a pending
493 // exception. The pending exception check happened in the runtime
494 // or native call stub. The pending exception in Thread is
495 // converted into a Java-level exception.
496 //
497 // Contract with Java-level exception handlers:
498 // rax: exception
499 // rdx: throwing pc
500 //
501 // NOTE: At entry of this stub, exception-pc must be on stack !!
502
503 address generate_forward_exception() {
504 StubCodeMark mark(this, "StubRoutines", "forward exception");
505 address start = __ pc();
506
507 // Upon entry, the sp points to the return address returning into
508 // Java (interpreted or compiled) code; i.e., the return address
509 // becomes the throwing pc.
510 //
511 // Arguments pushed before the runtime call are still on the stack
512 // but the exception handler will reset the stack pointer ->
513 // ignore them. A potential result in registers can be ignored as
514 // well.
515
516 #ifdef ASSERT
517 // make sure this code is only executed if there is a pending exception
518 {
519 Label L;
520 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
521 __ jcc(Assembler::notEqual, L);
522 __ stop("StubRoutines::forward exception: no pending exception (1)");
523 __ bind(L);
524 }
525 #endif
526
527 // compute exception handler into rbx
528 __ movptr(c_rarg0, Address(rsp, 0));
529 BLOCK_COMMENT("call exception_handler_for_return_address");
530 __ call_VM_leaf(CAST_FROM_FN_PTR(address,
531 SharedRuntime::exception_handler_for_return_address),
532 r15_thread, c_rarg0);
533 __ mov(rbx, rax);
534
535 // setup rax & rdx, remove return address & clear pending exception
536 __ pop(rdx);
537 __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
538 __ movptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
539
540 #ifdef ASSERT
541 // make sure exception is set
542 {
543 Label L;
544 __ testptr(rax, rax);
545 __ jcc(Assembler::notEqual, L);
546 __ stop("StubRoutines::forward exception: no pending exception (2)");
547 __ bind(L);
548 }
549 #endif
550
551 // continue at exception handler (return address removed)
552 // rax: exception
553 // rbx: exception handler
554 // rdx: throwing pc
555 __ verify_oop(rax);
556 __ jmp(rbx);
557
558 return start;
559 }
560
561 // Support for intptr_t OrderAccess::fence()
562 //
563 // Arguments :
564 //
565 // Result:
566 address generate_orderaccess_fence() {
567 StubCodeMark mark(this, "StubRoutines", "orderaccess_fence");
568 address start = __ pc();
569 __ membar(Assembler::StoreLoad);
570 __ ret(0);
571
572 return start;
573 }
574
575
576 // Support for intptr_t get_previous_sp()
577 //
578 // This routine is used to find the previous stack pointer for the
579 // caller.
580 address generate_get_previous_sp() {
581 StubCodeMark mark(this, "StubRoutines", "get_previous_sp");
582 address start = __ pc();
583
584 __ movptr(rax, rsp);
585 __ addptr(rax, 8); // return address is at the top of the stack.
586 __ ret(0);
587
588 return start;
589 }
590
591 //----------------------------------------------------------------------------------------------------
592 // Support for void verify_mxcsr()
593 //
594 // This routine is used with -Xcheck:jni to verify that native
595 // JNI code does not return to Java code without restoring the
596 // MXCSR register to our expected state.
597
598 address generate_verify_mxcsr() {
599 StubCodeMark mark(this, "StubRoutines", "verify_mxcsr");
600 address start = __ pc();
601
602 const Address mxcsr_save(rsp, 0);
603
604 if (CheckJNICalls) {
605 Label ok_ret;
606 ExternalAddress mxcsr_std(StubRoutines::x86::addr_mxcsr_std());
607 __ push(rax);
608 __ subptr(rsp, wordSize); // allocate a temp location
609 __ stmxcsr(mxcsr_save);
610 __ movl(rax, mxcsr_save);
611 __ andl(rax, MXCSR_MASK); // Only check control and mask bits
612 __ cmp32(rax, mxcsr_std);
613 __ jcc(Assembler::equal, ok_ret);
614
615 __ warn("MXCSR changed by native JNI code, use -XX:+RestoreMXCSROnJNICall");
616
617 __ ldmxcsr(mxcsr_std);
618
619 __ bind(ok_ret);
620 __ addptr(rsp, wordSize);
621 __ pop(rax);
622 }
623
624 __ ret(0);
625
626 return start;
627 }
628
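// The four *_fixup stubs below post-process float/double -> int/long
// conversions. A sketch of the intent, inferred from the code: the hardware
// cvtt* conversions produce the "integer indefinite" value for NaN and
// out-of-range inputs, so the compiled conversion code calls one of these
// stubs to rewrite the stacked result to 0 for NaN, min_jint/min_jlong for
// negative inputs, and max_jint/max_jlong for positive overflow, matching the
// Java semantics of the narrowing conversion.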
629 address generate_f2i_fixup() {
630 StubCodeMark mark(this, "StubRoutines", "f2i_fixup");
631 Address inout(rsp, 5 * wordSize); // return address + 4 saves
632
633 address start = __ pc();
634
635 Label L;
636
637 __ push(rax);
638 __ push(c_rarg3);
639 __ push(c_rarg2);
640 __ push(c_rarg1);
641
642 __ movl(rax, 0x7f800000);
643 __ xorl(c_rarg3, c_rarg3);
644 __ movl(c_rarg2, inout);
645 __ movl(c_rarg1, c_rarg2);
646 __ andl(c_rarg1, 0x7fffffff);
647 __ cmpl(rax, c_rarg1); // NaN? -> 0
648 __ jcc(Assembler::negative, L);
649 __ testl(c_rarg2, c_rarg2); // signed ? min_jint : max_jint
650 __ movl(c_rarg3, 0x80000000);
651 __ movl(rax, 0x7fffffff);
652 __ cmovl(Assembler::positive, c_rarg3, rax);
653
654 __ bind(L);
655 __ movptr(inout, c_rarg3);
656
657 __ pop(c_rarg1);
658 __ pop(c_rarg2);
659 __ pop(c_rarg3);
660 __ pop(rax);
661
662 __ ret(0);
663
664 return start;
665 }
666
667 address generate_f2l_fixup() {
668 StubCodeMark mark(this, "StubRoutines", "f2l_fixup");
669 Address inout(rsp, 5 * wordSize); // return address + 4 saves
670 address start = __ pc();
671
672 Label L;
673
674 __ push(rax);
675 __ push(c_rarg3);
676 __ push(c_rarg2);
677 __ push(c_rarg1);
678
679 __ movl(rax, 0x7f800000);
680 __ xorl(c_rarg3, c_rarg3);
681 __ movl(c_rarg2, inout);
682 __ movl(c_rarg1, c_rarg2);
683 __ andl(c_rarg1, 0x7fffffff);
684 __ cmpl(rax, c_rarg1); // NaN? -> 0
685 __ jcc(Assembler::negative, L);
686 __ testl(c_rarg2, c_rarg2); // signed ? min_jlong : max_jlong
687 __ mov64(c_rarg3, 0x8000000000000000);
688 __ mov64(rax, 0x7fffffffffffffff);
689 __ cmov(Assembler::positive, c_rarg3, rax);
690
691 __ bind(L);
692 __ movptr(inout, c_rarg3);
693
694 __ pop(c_rarg1);
695 __ pop(c_rarg2);
696 __ pop(c_rarg3);
697 __ pop(rax);
698
699 __ ret(0);
700
701 return start;
702 }
703
704 address generate_d2i_fixup() {
705 StubCodeMark mark(this, "StubRoutines", "d2i_fixup");
706 Address inout(rsp, 6 * wordSize); // return address + 5 saves
707
708 address start = __ pc();
709
710 Label L;
711
712 __ push(rax);
713 __ push(c_rarg3);
714 __ push(c_rarg2);
715 __ push(c_rarg1);
716 __ push(c_rarg0);
717
718 __ movl(rax, 0x7ff00000);
719 __ movq(c_rarg2, inout);
720 __ movl(c_rarg3, c_rarg2);
721 __ mov(c_rarg1, c_rarg2);
722 __ mov(c_rarg0, c_rarg2);
723 __ negl(c_rarg3);
724 __ shrptr(c_rarg1, 0x20);
725 __ orl(c_rarg3, c_rarg2);
726 __ andl(c_rarg1, 0x7fffffff);
727 __ xorl(c_rarg2, c_rarg2);
728 __ shrl(c_rarg3, 0x1f);
729 __ orl(c_rarg1, c_rarg3);
730 __ cmpl(rax, c_rarg1);
731 __ jcc(Assembler::negative, L); // NaN -> 0
732 __ testptr(c_rarg0, c_rarg0); // signed ? min_jint : max_jint
733 __ movl(c_rarg2, 0x80000000);
734 __ movl(rax, 0x7fffffff);
735 __ cmov(Assembler::positive, c_rarg2, rax);
736
737 __ bind(L);
738 __ movptr(inout, c_rarg2);
739
740 __ pop(c_rarg0);
741 __ pop(c_rarg1);
742 __ pop(c_rarg2);
743 __ pop(c_rarg3);
744 __ pop(rax);
745
746 __ ret(0);
747
748 return start;
749 }
750
751 address generate_d2l_fixup() {
752 StubCodeMark mark(this, "StubRoutines", "d2l_fixup");
753 Address inout(rsp, 6 * wordSize); // return address + 5 saves
754
755 address start = __ pc();
756
757 Label L;
758
759 __ push(rax);
760 __ push(c_rarg3);
761 __ push(c_rarg2);
762 __ push(c_rarg1);
763 __ push(c_rarg0);
764
765 __ movl(rax, 0x7ff00000);
766 __ movq(c_rarg2, inout);
767 __ movl(c_rarg3, c_rarg2);
768 __ mov(c_rarg1, c_rarg2);
769 __ mov(c_rarg0, c_rarg2);
770 __ negl(c_rarg3);
771 __ shrptr(c_rarg1, 0x20);
772 __ orl(c_rarg3, c_rarg2);
773 __ andl(c_rarg1, 0x7fffffff);
774 __ xorl(c_rarg2, c_rarg2);
775 __ shrl(c_rarg3, 0x1f);
776 __ orl(c_rarg1, c_rarg3);
777 __ cmpl(rax, c_rarg1);
778 __ jcc(Assembler::negative, L); // NaN -> 0
779 __ testq(c_rarg0, c_rarg0); // signed ? min_jlong : max_jlong
780 __ mov64(c_rarg2, 0x8000000000000000);
781 __ mov64(rax, 0x7fffffffffffffff);
782 __ cmovq(Assembler::positive, c_rarg2, rax);
783
784 __ bind(L);
785 __ movq(inout, c_rarg2);
786
787 __ pop(c_rarg0);
788 __ pop(c_rarg1);
789 __ pop(c_rarg2);
790 __ pop(c_rarg3);
791 __ pop(rax);
792
793 __ ret(0);
794
795 return start;
796 }
797
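// A note on the lookup table emitted below: each byte position i (0..15)
// holds popcount(i), and the 16-byte pattern is repeated four times so the
// 64-byte table can serve as a nibble lookup for population counts with
// 128/256/512-bit shuffle instructions (an assumption based on the stub's
// name and contents).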
798 address generate_popcount_avx_lut(const char *stub_name) {
799 __ align64();
800 StubCodeMark mark(this, "StubRoutines", stub_name);
801 address start = __ pc();
802 __ emit_data64(0x0302020102010100, relocInfo::none);
803 __ emit_data64(0x0403030203020201, relocInfo::none);
804 __ emit_data64(0x0302020102010100, relocInfo::none);
805 __ emit_data64(0x0403030203020201, relocInfo::none);
806 __ emit_data64(0x0302020102010100, relocInfo::none);
807 __ emit_data64(0x0403030203020201, relocInfo::none);
808 __ emit_data64(0x0302020102010100, relocInfo::none);
809 __ emit_data64(0x0403030203020201, relocInfo::none);
810 return start;
811 }
812
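// The iota table below is simply the consecutive byte values 0x00..0x3F;
// vector code can load (and widen) a prefix of it to obtain per-lane indices.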
813 address generate_iota_indices(const char *stub_name) {
814 __ align(CodeEntryAlignment);
815 StubCodeMark mark(this, "StubRoutines", stub_name);
816 address start = __ pc();
817 __ emit_data64(0x0706050403020100, relocInfo::none);
818 __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
819 __ emit_data64(0x1716151413121110, relocInfo::none);
820 __ emit_data64(0x1F1E1D1C1B1A1918, relocInfo::none);
821 __ emit_data64(0x2726252423222120, relocInfo::none);
822 __ emit_data64(0x2F2E2D2C2B2A2928, relocInfo::none);
823 __ emit_data64(0x3736353433323130, relocInfo::none);
824 __ emit_data64(0x3F3E3D3C3B3A3938, relocInfo::none);
825 return start;
826 }
827
828 address generate_vector_byte_shuffle_mask(const char *stub_name) {
829 __ align(CodeEntryAlignment);
830 StubCodeMark mark(this, "StubRoutines", stub_name);
831 address start = __ pc();
832 __ emit_data64(0x7070707070707070, relocInfo::none);
833 __ emit_data64(0x7070707070707070, relocInfo::none);
834 __ emit_data64(0xF0F0F0F0F0F0F0F0, relocInfo::none);
835 __ emit_data64(0xF0F0F0F0F0F0F0F0, relocInfo::none);
836 return start;
837 }
838
839 address generate_fp_mask(const char *stub_name, int64_t mask) {
840 __ align(CodeEntryAlignment);
841 StubCodeMark mark(this, "StubRoutines", stub_name);
842 address start = __ pc();
843
844 __ emit_data64( mask, relocInfo::none );
845 __ emit_data64( mask, relocInfo::none );
846
847 return start;
848 }
849
850 address generate_vector_mask(const char *stub_name, int64_t mask) {
851 __ align(CodeEntryAlignment);
852 StubCodeMark mark(this, "StubRoutines", stub_name);
853 address start = __ pc();
854
855 __ emit_data64(mask, relocInfo::none);
856 __ emit_data64(mask, relocInfo::none);
857 __ emit_data64(mask, relocInfo::none);
858 __ emit_data64(mask, relocInfo::none);
859 __ emit_data64(mask, relocInfo::none);
860 __ emit_data64(mask, relocInfo::none);
861 __ emit_data64(mask, relocInfo::none);
862 __ emit_data64(mask, relocInfo::none);
863
864 return start;
865 }
866
867 address generate_vector_byte_perm_mask(const char *stub_name) {
868 __ align(CodeEntryAlignment);
869 StubCodeMark mark(this, "StubRoutines", stub_name);
870 address start = __ pc();
871
872 __ emit_data64(0x0000000000000001, relocInfo::none);
873 __ emit_data64(0x0000000000000003, relocInfo::none);
874 __ emit_data64(0x0000000000000005, relocInfo::none);
875 __ emit_data64(0x0000000000000007, relocInfo::none);
876 __ emit_data64(0x0000000000000000, relocInfo::none);
877 __ emit_data64(0x0000000000000002, relocInfo::none);
878 __ emit_data64(0x0000000000000004, relocInfo::none);
879 __ emit_data64(0x0000000000000006, relocInfo::none);
880
881 return start;
882 }
883
884 address generate_vector_fp_mask(const char *stub_name, int64_t mask) {
885 __ align(CodeEntryAlignment);
886 StubCodeMark mark(this, "StubRoutines", stub_name);
887 address start = __ pc();
888
889 __ emit_data64(mask, relocInfo::none);
890 __ emit_data64(mask, relocInfo::none);
891 __ emit_data64(mask, relocInfo::none);
892 __ emit_data64(mask, relocInfo::none);
893 __ emit_data64(mask, relocInfo::none);
894 __ emit_data64(mask, relocInfo::none);
895 __ emit_data64(mask, relocInfo::none);
896 __ emit_data64(mask, relocInfo::none);
897
898 return start;
899 }
900
901 address generate_vector_custom_i32(const char *stub_name, Assembler::AvxVectorLen len,
902 int32_t val0, int32_t val1, int32_t val2, int32_t val3,
903 int32_t val4 = 0, int32_t val5 = 0, int32_t val6 = 0, int32_t val7 = 0,
904 int32_t val8 = 0, int32_t val9 = 0, int32_t val10 = 0, int32_t val11 = 0,
905 int32_t val12 = 0, int32_t val13 = 0, int32_t val14 = 0, int32_t val15 = 0) {
906 __ align(CodeEntryAlignment);
907 StubCodeMark mark(this, "StubRoutines", stub_name);
908 address start = __ pc();
909
910 assert(len != Assembler::AVX_NoVec, "vector len must be specified");
911 __ emit_data(val0, relocInfo::none, 0);
912 __ emit_data(val1, relocInfo::none, 0);
913 __ emit_data(val2, relocInfo::none, 0);
914 __ emit_data(val3, relocInfo::none, 0);
915 if (len >= Assembler::AVX_256bit) {
916 __ emit_data(val4, relocInfo::none, 0);
917 __ emit_data(val5, relocInfo::none, 0);
918 __ emit_data(val6, relocInfo::none, 0);
919 __ emit_data(val7, relocInfo::none, 0);
920 if (len >= Assembler::AVX_512bit) {
921 __ emit_data(val8, relocInfo::none, 0);
922 __ emit_data(val9, relocInfo::none, 0);
923 __ emit_data(val10, relocInfo::none, 0);
924 __ emit_data(val11, relocInfo::none, 0);
925 __ emit_data(val12, relocInfo::none, 0);
926 __ emit_data(val13, relocInfo::none, 0);
927 __ emit_data(val14, relocInfo::none, 0);
928 __ emit_data(val15, relocInfo::none, 0);
929 }
930 }
931
932 return start;
933 }
934
935 // Non-destructive plausibility checks for oops
936 //
937 // Arguments:
938 // all args on stack!
939 //
940 // Stack after saving c_rarg3:
941 // [tos + 0]: saved c_rarg3
942 // [tos + 1]: saved c_rarg2
943 // [tos + 2]: saved r12 (several TemplateTable methods use it)
944 // [tos + 3]: saved flags
945 // [tos + 4]: return address
946 // * [tos + 5]: error message (char*)
947 // * [tos + 6]: object to verify (oop)
948 // * [tos + 7]: saved rax - saved by caller and bashed
949 // * [tos + 8]: saved r10 (rscratch1) - saved by caller
950 // * = popped on exit
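// The four starred words above are pushed by the caller (the verify_oop
// macro in MacroAssembler), which is why this stub returns with
// ret(4 * wordSize) so they are popped on the way back.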
951 address generate_verify_oop() {
952 StubCodeMark mark(this, "StubRoutines", "verify_oop");
953 address start = __ pc();
954
955 Label exit, error;
956
957 __ pushf();
958 __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
959
960 __ push(r12);
961
962 // save c_rarg2 and c_rarg3
963 __ push(c_rarg2);
964 __ push(c_rarg3);
965
966 enum {
967 // After previous pushes.
968 oop_to_verify = 6 * wordSize,
969 saved_rax = 7 * wordSize,
970 saved_r10 = 8 * wordSize,
971
972 // Before the call to MacroAssembler::debug(), see below.
973 return_addr = 16 * wordSize,
974 error_msg = 17 * wordSize
975 };
976
977 // get object
978 __ movptr(rax, Address(rsp, oop_to_verify));
979
980 // make sure object is 'reasonable'
981 __ testptr(rax, rax);
982 __ jcc(Assembler::zero, exit); // if obj is NULL it is OK
983
984 #if INCLUDE_ZGC
985 if (UseZGC) {
986 // Check if metadata bits indicate a bad oop
987 __ testptr(rax, Address(r15_thread, ZThreadLocalData::address_bad_mask_offset()));
988 __ jcc(Assembler::notZero, error);
989 }
990 #endif
991
992 // Check if the oop is in the right area of memory
993 __ movptr(c_rarg2, rax);
994 __ movptr(c_rarg3, (intptr_t) Universe::verify_oop_mask());
995 __ andptr(c_rarg2, c_rarg3);
996 __ movptr(c_rarg3, (intptr_t) Universe::verify_oop_bits());
997 __ cmpptr(c_rarg2, c_rarg3);
998 __ jcc(Assembler::notZero, error);
999
1000 // make sure klass is 'reasonable', which is not zero.
1001 __ load_klass(rax, rax, rscratch1); // get klass
1002 __ testptr(rax, rax);
1003 __ jcc(Assembler::zero, error); // if klass is NULL it is broken
1004
1005 // return if everything seems ok
1006 __ bind(exit);
1007 __ movptr(rax, Address(rsp, saved_rax)); // get saved rax back
1008 __ movptr(rscratch1, Address(rsp, saved_r10)); // get saved r10 back
1009 __ pop(c_rarg3); // restore c_rarg3
1010 __ pop(c_rarg2); // restore c_rarg2
1011 __ pop(r12); // restore r12
1012 __ popf(); // restore flags
1013 __ ret(4 * wordSize); // pop caller saved stuff
1014
1015 // handle errors
1016 __ bind(error);
1017 __ movptr(rax, Address(rsp, saved_rax)); // get saved rax back
1018 __ movptr(rscratch1, Address(rsp, saved_r10)); // get saved r10 back
1019 __ pop(c_rarg3); // get saved c_rarg3 back
1020 __ pop(c_rarg2); // get saved c_rarg2 back
1021 __ pop(r12); // get saved r12 back
1022 __ popf(); // get saved flags off stack --
1023 // will be ignored
1024
1025 __ pusha(); // push registers
1026 // (rip is already
1027 // pushed)
1028 // debug(char* msg, int64_t pc, int64_t regs[])
1029 // We've popped the registers we'd saved (c_rarg3, c_rarg2, r12 and flags), and
1030 // pushed all the registers, so now the stack looks like:
1031 // [tos + 0] 16 saved registers
1032 // [tos + 16] return address
1033 // * [tos + 17] error message (char*)
1034 // * [tos + 18] object to verify (oop)
1035 // * [tos + 19] saved rax - saved by caller and bashed
1036 // * [tos + 20] saved r10 (rscratch1) - saved by caller
1037 // * = popped on exit
1038
1039 __ movptr(c_rarg0, Address(rsp, error_msg)); // pass address of error message
1040 __ movptr(c_rarg1, Address(rsp, return_addr)); // pass return address
1041 __ movq(c_rarg2, rsp); // pass address of regs on stack
1042 __ mov(r12, rsp); // remember rsp
1043 __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
1044 __ andptr(rsp, -16); // align stack as required by ABI
1045 BLOCK_COMMENT("call MacroAssembler::debug");
1046 __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)));
1047 __ hlt();
1048 return start;
1049 }
1050
1051 //
1052 // Verify that a register contains a clean 32-bit positive value
1053 // (high 32 bits are 0) so it can be used in 64-bit shifts.
1054 //
1055 // Input:
1056 // Rint - 32-bits value
1057 // Rtmp - scratch
1058 //
1059 void assert_clean_int(Register Rint, Register Rtmp) {
1060 #ifdef ASSERT
1061 Label L;
1062 assert_different_registers(Rtmp, Rint);
1063 __ movslq(Rtmp, Rint);
1064 __ cmpq(Rtmp, Rint);
1065 __ jcc(Assembler::equal, L);
1066 __ stop("high 32-bits of int value are not 0");
1067 __ bind(L);
1068 #endif
1069 }
1070
1071 // Generate overlap test for array copy stubs
1072 //
1073 // Input:
1074 // c_rarg0 - from
1075 // c_rarg1 - to
1076 // c_rarg2 - element count
1077 //
1078 // Output:
1079 // rax - &from[element count]
1080 //
1081 void array_overlap_test(address no_overlap_target, Address::ScaleFactor sf) {
1082 assert(no_overlap_target != NULL, "must be generated");
1083 array_overlap_test(no_overlap_target, NULL, sf);
1084 }
1085 void array_overlap_test(Label& L_no_overlap, Address::ScaleFactor sf) {
1086 array_overlap_test(NULL, &L_no_overlap, sf);
1087 }
1088 void array_overlap_test(address no_overlap_target, Label* NOLp, Address::ScaleFactor sf) {
1089 const Register from = c_rarg0;
1090 const Register to = c_rarg1;
1091 const Register count = c_rarg2;
1092 const Register end_from = rax;
1093
1094 __ cmpptr(to, from);
1095 __ lea(end_from, Address(from, count, sf, 0));
1096 if (NOLp == NULL) {
1097 ExternalAddress no_overlap(no_overlap_target);
1098 __ jump_cc(Assembler::belowEqual, no_overlap);
1099 __ cmpptr(to, end_from);
1100 __ jump_cc(Assembler::aboveEqual, no_overlap);
1101 } else {
1102 __ jcc(Assembler::belowEqual, (*NOLp));
1103 __ cmpptr(to, end_from);
1104 __ jcc(Assembler::aboveEqual, (*NOLp));
1105 }
1106 }
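// In other words: control transfers to the no-overlap target when
// to <= from or to >= from + count * scale; only when the destination starts
// strictly inside the source range does execution fall through to the
// conjoint (backward-copying) code that follows the test.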
1107
1108 // Shuffle first three arg regs on Windows into Linux/Solaris locations.
1109 //
1110 // Outputs:
1111 //    rdi <- rcx (c_rarg0)
1112 //    rsi <- rdx (c_rarg1)
1113 //    rdx <- r8  (c_rarg2)
1114 //    rcx <- r9  (c_rarg3, only when four args are shuffled)
1115 //
1116 // Registers r9 and r10 are used to save rdi and rsi, which are non-volatile
1117 // on Windows. r9 and r10 should not be used by the caller.
1118 //
1119 DEBUG_ONLY(bool regs_in_thread;)
1120
1121 void setup_arg_regs(int nargs = 3) {
1122 const Register saved_rdi = r9;
1123 const Register saved_rsi = r10;
1124 assert(nargs == 3 || nargs == 4, "else fix");
1125 #ifdef _WIN64
1126 assert(c_rarg0 == rcx && c_rarg1 == rdx && c_rarg2 == r8 && c_rarg3 == r9,
1127 "unexpected argument registers");
1128 if (nargs >= 4)
1129 __ mov(rax, r9); // r9 is also saved_rdi
1130 __ movptr(saved_rdi, rdi);
1131 __ movptr(saved_rsi, rsi);
1132 __ mov(rdi, rcx); // c_rarg0
1133 __ mov(rsi, rdx); // c_rarg1
1134 __ mov(rdx, r8); // c_rarg2
1135 if (nargs >= 4)
1136 __ mov(rcx, rax); // c_rarg3 (via rax)
1137 #else
1138 assert(c_rarg0 == rdi && c_rarg1 == rsi && c_rarg2 == rdx && c_rarg3 == rcx,
1139 "unexpected argument registers");
1140 #endif
1141 DEBUG_ONLY(regs_in_thread = false;)
1142 }
1143
1144 void restore_arg_regs() {
1145 assert(!regs_in_thread, "wrong call to restore_arg_regs");
1146 const Register saved_rdi = r9;
1147 const Register saved_rsi = r10;
1148 #ifdef _WIN64
1149 __ movptr(rdi, saved_rdi);
1150 __ movptr(rsi, saved_rsi);
1151 #endif
1152 }
1153
1154 // This is used in places where r10 is a scratch register, and can
1155 // be adapted if r9 is needed also.
1156 void setup_arg_regs_using_thread() {
1157 const Register saved_r15 = r9;
1158 #ifdef _WIN64
1159 __ mov(saved_r15, r15); // r15 is callee saved and needs to be restored
1160 __ get_thread(r15_thread);
1161 assert(c_rarg0 == rcx && c_rarg1 == rdx && c_rarg2 == r8 && c_rarg3 == r9,
1162 "unexpected argument registers");
1163 __ movptr(Address(r15_thread, in_bytes(JavaThread::windows_saved_rdi_offset())), rdi);
1164 __ movptr(Address(r15_thread, in_bytes(JavaThread::windows_saved_rsi_offset())), rsi);
1165
1166 __ mov(rdi, rcx); // c_rarg0
1167 __ mov(rsi, rdx); // c_rarg1
1168 __ mov(rdx, r8); // c_rarg2
1169 #else
1170 assert(c_rarg0 == rdi && c_rarg1 == rsi && c_rarg2 == rdx && c_rarg3 == rcx,
1171 "unexpected argument registers");
1172 #endif
1173 DEBUG_ONLY(regs_in_thread = true;)
1174 }
1175
1176 void restore_arg_regs_using_thread() {
1177 assert(regs_in_thread, "wrong call to restore_arg_regs");
1178 const Register saved_r15 = r9;
1179 #ifdef _WIN64
1180 __ get_thread(r15_thread);
1181 __ movptr(rsi, Address(r15_thread, in_bytes(JavaThread::windows_saved_rsi_offset())));
1182 __ movptr(rdi, Address(r15_thread, in_bytes(JavaThread::windows_saved_rdi_offset())));
1183 __ mov(r15, saved_r15); // r15 is callee saved and needs to be restored
1184 #endif
1185 }
1186
1187 // Copy big chunks forward
1188 //
1189 // Inputs:
1190 // end_from - source array end address
1191 // end_to - destination array end address
1192 // qword_count - 64-bit element count, negative
1193 // to - scratch
1194 // L_copy_bytes - entry label
1195 // L_copy_8_bytes - exit label
1196 //
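// Note on the indexing scheme: qword_count is negative on entry, so
// Address(end_from, qword_count, Address::times_8, disp) walks forward
// through the arrays as qword_count is incremented toward zero; callers jump
// to L_copy_bytes to enter the loop and handle any remaining qwords at
// L_copy_8_bytes.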
1197 void copy_bytes_forward(Register end_from, Register end_to,
1198 Register qword_count, Register to,
1199 Label& L_copy_bytes, Label& L_copy_8_bytes) {
1200 DEBUG_ONLY(__ stop("enter at entry label, not here"));
1201 Label L_loop;
1202 __ align(OptoLoopAlignment);
1203 if (UseUnalignedLoadStores) {
1204 Label L_end;
1205 __ BIND(L_loop);
1206 if (UseAVX >= 2) {
1207 __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
1208 __ vmovdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
1209 __ vmovdqu(xmm1, Address(end_from, qword_count, Address::times_8, -24));
1210 __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm1);
1211 } else {
1212 __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
1213 __ movdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
1214 __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, -40));
1215 __ movdqu(Address(end_to, qword_count, Address::times_8, -40), xmm1);
1216 __ movdqu(xmm2, Address(end_from, qword_count, Address::times_8, -24));
1217 __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm2);
1218 __ movdqu(xmm3, Address(end_from, qword_count, Address::times_8, - 8));
1219 __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm3);
1220 }
1221
1222 __ BIND(L_copy_bytes);
1223 __ addptr(qword_count, 8);
1224 __ jcc(Assembler::lessEqual, L_loop);
1225 __ subptr(qword_count, 4); // sub(8) and add(4)
1226 __ jccb(Assembler::greater, L_end);
1227 // Copy trailing 32 bytes
1228 if (UseAVX >= 2) {
1229 __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
1230 __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0);
1231 } else {
1232 __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
1233 __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0);
1234 __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, - 8));
1235 __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm1);
1236 }
1237 __ addptr(qword_count, 4);
1238 __ BIND(L_end);
1239 } else {
1240 // Copy 32 bytes per iteration
1241 __ BIND(L_loop);
1242 __ movq(to, Address(end_from, qword_count, Address::times_8, -24));
1243 __ movq(Address(end_to, qword_count, Address::times_8, -24), to);
1244 __ movq(to, Address(end_from, qword_count, Address::times_8, -16));
1245 __ movq(Address(end_to, qword_count, Address::times_8, -16), to);
1246 __ movq(to, Address(end_from, qword_count, Address::times_8, - 8));
1247 __ movq(Address(end_to, qword_count, Address::times_8, - 8), to);
1248 __ movq(to, Address(end_from, qword_count, Address::times_8, - 0));
1249 __ movq(Address(end_to, qword_count, Address::times_8, - 0), to);
1250
1251 __ BIND(L_copy_bytes);
1252 __ addptr(qword_count, 4);
1253 __ jcc(Assembler::lessEqual, L_loop);
1254 }
1255 __ subptr(qword_count, 4);
1256 __ jcc(Assembler::less, L_copy_8_bytes); // Copy trailing qwords
1257 }
1258
1259 // Copy big chunks backward
1260 //
1261 // Inputs:
1262 // from - source array address
1263 // dest - destination array address
1264 // qword_count - 64-bit element count
1265 // to - scratch
1266 // L_copy_bytes - entry label
1267 // L_copy_8_bytes - exit label
1268 //
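// Here qword_count is positive and is decremented toward zero, so
// Address(from, qword_count, Address::times_8, disp) walks backward through
// the arrays; as above, enter at L_copy_bytes and finish trailing qwords at
// L_copy_8_bytes.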
1269 void copy_bytes_backward(Register from, Register dest,
1270 Register qword_count, Register to,
1271 Label& L_copy_bytes, Label& L_copy_8_bytes) {
1272 DEBUG_ONLY(__ stop("enter at entry label, not here"));
1273 Label L_loop;
1274 __ align(OptoLoopAlignment);
1275 if (UseUnalignedLoadStores) {
1276 Label L_end;
1277 __ BIND(L_loop);
1278 if (UseAVX >= 2) {
1279 __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 32));
1280 __ vmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0);
1281 __ vmovdqu(xmm1, Address(from, qword_count, Address::times_8, 0));
1282 __ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm1);
1283 } else {
1284 __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 48));
1285 __ movdqu(Address(dest, qword_count, Address::times_8, 48), xmm0);
1286 __ movdqu(xmm1, Address(from, qword_count, Address::times_8, 32));
1287 __ movdqu(Address(dest, qword_count, Address::times_8, 32), xmm1);
1288 __ movdqu(xmm2, Address(from, qword_count, Address::times_8, 16));
1289 __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm2);
1290 __ movdqu(xmm3, Address(from, qword_count, Address::times_8, 0));
1291 __ movdqu(Address(dest, qword_count, Address::times_8, 0), xmm3);
1292 }
1293
1294 __ BIND(L_copy_bytes);
1295 __ subptr(qword_count, 8);
1296 __ jcc(Assembler::greaterEqual, L_loop);
1297
1298 __ addptr(qword_count, 4); // add(8) and sub(4)
1299 __ jccb(Assembler::less, L_end);
1300 // Copy trailing 32 bytes
1301 if (UseAVX >= 2) {
1302 __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 0));
1303 __ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm0);
1304 } else {
1305 __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 16));
1306 __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm0);
1307 __ movdqu(xmm1, Address(from, qword_count, Address::times_8, 0));
1308 __ movdqu(Address(dest, qword_count, Address::times_8, 0), xmm1);
1309 }
1310 __ subptr(qword_count, 4);
1311 __ BIND(L_end);
1312 } else {
1313 // Copy 32 bytes per iteration
1314 __ BIND(L_loop);
1315 __ movq(to, Address(from, qword_count, Address::times_8, 24));
1316 __ movq(Address(dest, qword_count, Address::times_8, 24), to);
1317 __ movq(to, Address(from, qword_count, Address::times_8, 16));
1318 __ movq(Address(dest, qword_count, Address::times_8, 16), to);
1319 __ movq(to, Address(from, qword_count, Address::times_8, 8));
1320 __ movq(Address(dest, qword_count, Address::times_8, 8), to);
1321 __ movq(to, Address(from, qword_count, Address::times_8, 0));
1322 __ movq(Address(dest, qword_count, Address::times_8, 0), to);
1323
1324 __ BIND(L_copy_bytes);
1325 __ subptr(qword_count, 4);
1326 __ jcc(Assembler::greaterEqual, L_loop);
1327 }
1328 __ addptr(qword_count, 4);
1329 __ jcc(Assembler::greater, L_copy_8_bytes); // Copy trailing qwords
1330 }
1331
1332 #ifndef PRODUCT
1333 int& get_profile_ctr(int shift) {
1334 if ( 0 == shift)
1335 return SharedRuntime::_jbyte_array_copy_ctr;
1336 else if(1 == shift)
1337 return SharedRuntime::_jshort_array_copy_ctr;
1338 else if(2 == shift)
1339 return SharedRuntime::_jint_array_copy_ctr;
1340 else
1341 return SharedRuntime::_jlong_array_copy_ctr;
1342 }
1343 #endif
1344
1345 void setup_argument_regs(BasicType type) {
1346 if (type == T_BYTE || type == T_SHORT) {
1347 setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1348 // r9 and r10 may be used to save non-volatile registers
1349 } else {
1350 setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
1351 // r9 is used to save r15_thread
1352 }
1353 }
1354
1355 void restore_argument_regs(BasicType type) {
1356 if (type == T_BYTE || type == T_SHORT) {
1357 restore_arg_regs();
1358 } else {
1359 restore_arg_regs_using_thread();
1360 }
1361 }
1362
1363 #if COMPILER2_OR_JVMCI
1364 // Note: The following rules apply to the AVX3 optimized arraycopy stubs:
1365 // - If the target supports AVX3 features (BW+VL+F), the implementation uses 32 byte vectors (YMMs)
1366 //   for both the special cases (various small block sizes) and the aligned copy loop. This is the
1367 //   default configuration.
1368 // - If the copy length is above AVX3Threshold, the implementation uses 64 byte vectors (ZMMs)
1369 //   for the main copy loop (and subsequent tail), since the bulk of the cycles will be consumed in it.
1370 // - If the user forces MaxVectorSize=32, then above 4096 bytes REP MOVS shows
1371 //   better performance for disjoint copies. For conjoint/backward copies, vector based
1372 //   copy performs better.
1373 // - If the user sets AVX3Threshold=0, the special cases for small block sizes operate over
1374 //   64 byte vector registers (ZMMs).
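// Concretely, the loop_size[]/threshold[] tables used by the stubs below
// encode this policy per element size (shift 0..3): the main loop copies
// 192/96/48/24 elements per iteration (192 bytes in every case), and the
// 4096/2048/1024/512 element thresholds (4096 bytes in every case) select
// the REP MOVS or 64-byte-vector path when AVX3Threshold is non-zero.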
1375
1376 // Inputs:
1377 // c_rarg0 - source array address
1378 // c_rarg1 - destination array address
1379 // c_rarg2 - element count, treated as ssize_t, can be zero
1380 //
1381 //
1382 // Side Effects:
1383 // disjoint_copy_avx3_masked is set to the no-overlap entry point
1384 // used by generate_conjoint_[byte/int/short/long]_copy().
1385 //
1386
1387 address generate_disjoint_copy_avx3_masked(address* entry, const char *name, int shift,
1388 bool aligned, bool is_oop, bool dest_uninitialized) {
1389 __ align(CodeEntryAlignment);
1390 StubCodeMark mark(this, "StubRoutines", name);
1391 address start = __ pc();
1392 int avx3threshold = VM_Version::avx3_threshold();
1393 bool use64byteVector = (MaxVectorSize > 32) && (avx3threshold == 0);
1394 Label L_main_loop, L_main_loop_64bytes, L_tail, L_tail64, L_exit, L_entry;
1395 Label L_repmovs, L_main_pre_loop, L_main_pre_loop_64bytes, L_pre_main_post_64;
1396 const Register from = rdi; // source array address
1397 const Register to = rsi; // destination array address
1398 const Register count = rdx; // elements count
1399 const Register temp1 = r8;
1400 const Register temp2 = r11;
1401 const Register temp3 = rax;
1402 const Register temp4 = rcx;
1403 // End pointers are inclusive, and if count is not zero they point
1404 // to the last unit copied: end_to[0] := end_from[0]
1405
1406 __ enter(); // required for proper stackwalking of RuntimeStub frame
1407 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.
1408
1409 if (entry != NULL) {
1410 *entry = __ pc();
1411 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1412 BLOCK_COMMENT("Entry:");
1413 }
1414
1415 BasicType type_vec[] = { T_BYTE, T_SHORT, T_INT, T_LONG};
1416 BasicType type = is_oop ? T_OBJECT : type_vec[shift];
1417
1418 setup_argument_regs(type);
1419
1420 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1421 if (dest_uninitialized) {
1422 decorators |= IS_DEST_UNINITIALIZED;
1423 }
1424 if (aligned) {
1425 decorators |= ARRAYCOPY_ALIGNED;
1426 }
1427 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1428 bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
1429
1430 {
1431 // Type(shift) byte(0), short(1), int(2), long(3)
1432 int loop_size[] = { 192, 96, 48, 24};
1433 int threshold[] = { 4096, 2048, 1024, 512};
1434
1435 // UnsafeCopyMemory page error: continue after ucm
1436 UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
1437 // 'from', 'to' and 'count' are now valid
1438
1439 // temp1 holds remaining count and temp4 holds running count used to compute
1440 // next address offset for start of to/from addresses (temp4 * scale).
1441 __ mov64(temp4, 0);
1442 __ movq(temp1, count);
1443
1444 // Zero length check.
1445 __ BIND(L_tail);
1446 __ cmpq(temp1, 0);
1447 __ jcc(Assembler::lessEqual, L_exit);
1448
1449 // Special cases using 32 byte [masked] vector copy operations.
1450 __ arraycopy_avx3_special_cases(xmm1, k2, from, to, temp1, shift,
1451 temp4, temp3, use64byteVector, L_entry, L_exit);
1452
1453 // PRE-MAIN-POST loop for aligned copy.
1454 __ BIND(L_entry);
1455
1456 if (avx3threshold != 0) {
1457 __ cmpq(count, threshold[shift]);
1458 if (MaxVectorSize == 64) {
1459 // Copy using 64 byte vectors.
1460 __ jcc(Assembler::greaterEqual, L_pre_main_post_64);
1461 } else {
1462 assert(MaxVectorSize < 64, "vector size should be < 64 bytes");
1463 // REP MOVS offers a faster copy path.
1464 __ jcc(Assembler::greaterEqual, L_repmovs);
1465 }
1466 }
1467
1468 if ((MaxVectorSize < 64) || (avx3threshold != 0)) {
1469 // Partial copy to make dst address 32 byte aligned.
1470 __ movq(temp2, to);
1471 __ andq(temp2, 31);
1472 __ jcc(Assembler::equal, L_main_pre_loop);
1473
1474 __ negptr(temp2);
1475 __ addq(temp2, 32);
1476 if (shift) {
1477 __ shrq(temp2, shift);
1478 }
1479 __ movq(temp3, temp2);
1480 __ copy32_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift);
1481 __ movq(temp4, temp2);
1482 __ movq(temp1, count);
1483 __ subq(temp1, temp2);
1484
1485 __ cmpq(temp1, loop_size[shift]);
1486 __ jcc(Assembler::less, L_tail);
1487
1488 __ BIND(L_main_pre_loop);
1489 __ subq(temp1, loop_size[shift]);
1490
1491 // Main loop with aligned copy block size of 192 bytes at 32 byte granularity.
1492 __ align32();
1493 __ BIND(L_main_loop);
1494 __ copy64_avx(to, from, temp4, xmm1, false, shift, 0);
1495 __ copy64_avx(to, from, temp4, xmm1, false, shift, 64);
1496 __ copy64_avx(to, from, temp4, xmm1, false, shift, 128);
1497 __ addptr(temp4, loop_size[shift]);
1498 __ subq(temp1, loop_size[shift]);
1499 __ jcc(Assembler::greater, L_main_loop);
1500
1501 __ addq(temp1, loop_size[shift]);
1502
1503 // Tail loop.
1504 __ jmp(L_tail);
1505
1506 __ BIND(L_repmovs);
1507 __ movq(temp2, temp1);
1508 // Swap to(RSI) and from(RDI) addresses to comply with REP MOVs semantics.
1509 __ movq(temp3, to);
1510 __ movq(to, from);
1511 __ movq(from, temp3);
1512 // Save to/from for restoration post rep_mov.
1513 __ movq(temp1, to);
1514 __ movq(temp3, from);
1515 if(shift < 3) {
1516 __ shrq(temp2, 3-shift); // quad word count
1517 }
1518 __ movq(temp4, temp2); // move quad word count into temp4(RCX).
1519 __ rep_mov();
1520 __ shlq(temp2, 3); // convert quad words into byte count.
1521 if(shift) {
1522 __ shrq(temp2, shift); // type specific count.
1523 }
1524 // Restore original addresses in to/from.
1525 __ movq(to, temp3);
1526 __ movq(from, temp1);
1527 __ movq(temp4, temp2);
1528 __ movq(temp1, count);
1529 __ subq(temp1, temp2); // trailing part (less than a quad word size).
1530 __ jmp(L_tail);
1531 }
1532
1533 if (MaxVectorSize > 32) {
1534 __ BIND(L_pre_main_post_64);
1535 // Partial copy to make dst address 64 byte aligned.
1536 __ movq(temp2, to);
1537 __ andq(temp2, 63);
1538 __ jcc(Assembler::equal, L_main_pre_loop_64bytes);
1539
1540 __ negptr(temp2);
1541 __ addq(temp2, 64);
1542 if (shift) {
1543 __ shrq(temp2, shift);
1544 }
1545 __ movq(temp3, temp2);
1546 __ copy64_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift, 0 , true);
1547 __ movq(temp4, temp2);
1548 __ movq(temp1, count);
1549 __ subq(temp1, temp2);
1550
1551 __ cmpq(temp1, loop_size[shift]);
1552 __ jcc(Assembler::less, L_tail64);
1553
1554 __ BIND(L_main_pre_loop_64bytes);
1555 __ subq(temp1, loop_size[shift]);
1556
1557 // Main loop with aligned copy block size of 192 bytes at
1558 // 64 byte copy granularity.
1559 __ align32();
1560 __ BIND(L_main_loop_64bytes);
1561 __ copy64_avx(to, from, temp4, xmm1, false, shift, 0 , true);
1562 __ copy64_avx(to, from, temp4, xmm1, false, shift, 64, true);
1563 __ copy64_avx(to, from, temp4, xmm1, false, shift, 128, true);
1564 __ addptr(temp4, loop_size[shift]);
1565 __ subq(temp1, loop_size[shift]);
1566 __ jcc(Assembler::greater, L_main_loop_64bytes);
1567
1568 __ addq(temp1, loop_size[shift]);
1569 // Zero length check.
1570 __ jcc(Assembler::lessEqual, L_exit);
1571
1572 __ BIND(L_tail64);
1573
1574 // Tail handling using 64 byte [masked] vector copy operations.
1575 use64byteVector = true;
1576 __ arraycopy_avx3_special_cases(xmm1, k2, from, to, temp1, shift,
1577 temp4, temp3, use64byteVector, L_entry, L_exit);
1578 }
1579 __ BIND(L_exit);
1580 }
1581
1582 address ucme_exit_pc = __ pc();
1583 // When called from generic_arraycopy, r11 contains specific values used
1584 // during the arraycopy epilogue; re-initialize r11 here.
1585 if (is_oop) {
1586 __ movq(r11, shift == 3 ? count : to);
1587 }
1588 bs->arraycopy_epilogue(_masm, decorators, type, from, to, count);
1589 restore_argument_regs(type);
1590 inc_counter_np(get_profile_ctr(shift)); // Update counter after rscratch1 is free
1591 __ xorptr(rax, rax); // return 0
1592 __ vzeroupper();
1593 __ leave(); // required for proper stackwalking of RuntimeStub frame
1594 __ ret(0);
1595 return start;
1596 }
1597
1598 // Inputs:
1599 // c_rarg0 - source array address
1600 // c_rarg1 - destination array address
1601 // c_rarg2 - element count, treated as ssize_t, can be zero
1602 //
1603 //
1604 address generate_conjoint_copy_avx3_masked(address* entry, const char *name, int shift,
1605 address nooverlap_target, bool aligned, bool is_oop,
1606 bool dest_uninitialized) {
1607 __ align(CodeEntryAlignment);
1608 StubCodeMark mark(this, "StubRoutines", name);
1609 address start = __ pc();
1610
1611 int avx3threshold = VM_Version::avx3_threshold();
1612 bool use64byteVector = (MaxVectorSize > 32) && (avx3threshold == 0);
1613
1614 Label L_main_pre_loop, L_main_pre_loop_64bytes, L_pre_main_post_64;
1615 Label L_main_loop, L_main_loop_64bytes, L_tail, L_tail64, L_exit, L_entry;
1616 const Register from = rdi; // source array address
1617 const Register to = rsi; // destination array address
1618 const Register count = rdx; // elements count
1619 const Register temp1 = r8;
1620 const Register temp2 = rcx;
1621 const Register temp3 = r11;
1622 const Register temp4 = rax;
1623 // End pointers are inclusive, and if count is not zero they point
1624 // to the last unit copied: end_to[0] := end_from[0]
1625
1626 __ enter(); // required for proper stackwalking of RuntimeStub frame
1627 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.
1628
1629 if (entry != NULL) {
1630 *entry = __ pc();
1631 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1632 BLOCK_COMMENT("Entry:");
1633 }
1634
1635 array_overlap_test(nooverlap_target, (Address::ScaleFactor)(shift));
1636
1637 BasicType type_vec[] = { T_BYTE, T_SHORT, T_INT, T_LONG};
1638 BasicType type = is_oop ? T_OBJECT : type_vec[shift];
1639
1640 setup_argument_regs(type);
1641
1642 DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1643 if (dest_uninitialized) {
1644 decorators |= IS_DEST_UNINITIALIZED;
1645 }
1646 if (aligned) {
1647 decorators |= ARRAYCOPY_ALIGNED;
1648 }
1649 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1650 bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
1651 {
1652 // Type(shift) byte(0), short(1), int(2), long(3)
1653 int loop_size[] = { 192, 96, 48, 24};
1654 int threshold[] = { 4096, 2048, 1024, 512};
1655
1656 // UnsafeCopyMemory page error: continue after ucm
1657 UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
1658 // 'from', 'to' and 'count' are now valid
1659
1660 // temp1 holds remaining count.
1661 __ movq(temp1, count);
1662
1663 // Zero length check.
1664 __ BIND(L_tail);
1665 __ cmpq(temp1, 0);
1666 __ jcc(Assembler::lessEqual, L_exit);
1667
1668 __ mov64(temp2, 0);
1669 __ movq(temp3, temp1);
1670 // Special cases using 32 byte [masked] vector copy operations.
1671 __ arraycopy_avx3_special_cases_conjoint(xmm1, k2, from, to, temp2, temp3, temp1, shift,
1672 temp4, use64byteVector, L_entry, L_exit);
1673
1674 // PRE-MAIN-POST loop for aligned copy.
1675 __ BIND(L_entry);
1676
1677 if ((MaxVectorSize > 32) && (avx3threshold != 0)) {
1678 __ cmpq(temp1, threshold[shift]);
1679 __ jcc(Assembler::greaterEqual, L_pre_main_post_64);
1680 }
1681
1682 if ((MaxVectorSize < 64) || (avx3threshold != 0)) {
1683 // Partial copy to make dst address 32 byte aligned.
1684 __ leaq(temp2, Address(to, temp1, (Address::ScaleFactor)(shift), 0));
1685 __ andq(temp2, 31);
1686 __ jcc(Assembler::equal, L_main_pre_loop);
1687
1688 if (shift) {
1689 __ shrq(temp2, shift);
1690 }
1691 __ subq(temp1, temp2);
1692 __ copy32_masked_avx(to, from, xmm1, k2, temp2, temp1, temp3, shift);
1693
1694 __ cmpq(temp1, loop_size[shift]);
1695 __ jcc(Assembler::less, L_tail);
1696
1697 __ BIND(L_main_pre_loop);
1698
1699 // Main loop with aligned copy block size of 192 bytes at 32 byte granularity.
1700 __ align32();
1701 __ BIND(L_main_loop);
1702 __ copy64_avx(to, from, temp1, xmm1, true, shift, -64);
1703 __ copy64_avx(to, from, temp1, xmm1, true, shift, -128);
1704 __ copy64_avx(to, from, temp1, xmm1, true, shift, -192);
1705 __ subptr(temp1, loop_size[shift]);
1706 __ cmpq(temp1, loop_size[shift]);
1707 __ jcc(Assembler::greater, L_main_loop);
1708
1709 // Tail loop.
1710 __ jmp(L_tail);
1711 }
1712
1713 if (MaxVectorSize > 32) {
1714 __ BIND(L_pre_main_post_64);
1715 // Partial copy to make dst address 64 byte aligned.
1716 __ leaq(temp2, Address(to, temp1, (Address::ScaleFactor)(shift), 0));
1717 __ andq(temp2, 63);
1718 __ jcc(Assembler::equal, L_main_pre_loop_64bytes);
1719
1720 if (shift) {
1721 __ shrq(temp2, shift);
1722 }
1723 __ subq(temp1, temp2);
1724 __ copy64_masked_avx(to, from, xmm1, k2, temp2, temp1, temp3, shift, 0 , true);
1725
1726 __ cmpq(temp1, loop_size[shift]);
1727 __ jcc(Assembler::less, L_tail64);
1728
1729 __ BIND(L_main_pre_loop_64bytes);
1730
1731 // Main loop with aligned copy block size of 192 bytes at
1732 // 64 byte copy granularity.
1733 __ align32();
1734 __ BIND(L_main_loop_64bytes);
1735 __ copy64_avx(to, from, temp1, xmm1, true, shift, -64 , true);
1736 __ copy64_avx(to, from, temp1, xmm1, true, shift, -128, true);
1737 __ copy64_avx(to, from, temp1, xmm1, true, shift, -192, true);
1738 __ subq(temp1, loop_size[shift]);
1739 __ cmpq(temp1, loop_size[shift]);
1740 __ jcc(Assembler::greater, L_main_loop_64bytes);
1741
1742 // Zero length check.
1743 __ cmpq(temp1, 0);
1744 __ jcc(Assembler::lessEqual, L_exit);
1745
1746 __ BIND(L_tail64);
1747
1748 // Tail handling using 64 byte [masked] vector copy operations.
1749 use64byteVector = true;
1750 __ mov64(temp2, 0);
1751 __ movq(temp3, temp1);
1752 __ arraycopy_avx3_special_cases_conjoint(xmm1, k2, from, to, temp2, temp3, temp1, shift,
1753 temp4, use64byteVector, L_entry, L_exit);
1754 }
1755 __ BIND(L_exit);
1756 }
1757 address ucme_exit_pc = __ pc();
// When called from generic_arraycopy, r11 contains specific values used
// during the arraycopy epilogue, so re-initialize r11 here.
if (is_oop) {
1761 __ movq(r11, count);
1762 }
1763 bs->arraycopy_epilogue(_masm, decorators, type, from, to, count);
1764 restore_argument_regs(type);
1765 inc_counter_np(get_profile_ctr(shift)); // Update counter after rscratch1 is free
1766 __ xorptr(rax, rax); // return 0
1767 __ vzeroupper();
1768 __ leave(); // required for proper stackwalking of RuntimeStub frame
1769 __ ret(0);
1770 return start;
1771 }
1772 #endif // COMPILER2_OR_JVMCI
1773
1774
1775 // Arguments:
1776 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1777 // ignored
1778 // name - stub name string
1779 //
1780 // Inputs:
1781 // c_rarg0 - source array address
1782 // c_rarg1 - destination array address
1783 // c_rarg2 - element count, treated as ssize_t, can be zero
1784 //
1785 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1786 // we let the hardware handle it. The one to eight bytes within words,
1787 // dwords or qwords that span cache line boundaries will still be loaded
1788 // and stored atomically.
1789 //
1790 // Side Effects:
1791 // disjoint_byte_copy_entry is set to the no-overlap entry point
1792 // used by generate_conjoint_byte_copy().
1793 //
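// The copy below is, roughly, the following C sketch (illustrative only,
// not the emitted code):
//
//   size_t qwords = count >> 3;
//   // bulk + trailing qwords, low to high addresses
//   for (size_t i = 0; i < qwords; i++)
//     ((uint64_t*) to)[i] = ((const uint64_t*) from)[i];
//   const char* f = from + qwords * 8;
//   char*       t = to   + qwords * 8;
//   if (count & 4) { memcpy(t, f, 4); t += 4; f += 4; }
//   if (count & 2) { memcpy(t, f, 2); t += 2; f += 2; }
//   if (count & 1) { *t = *f; }
//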
1794 address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1795 #if COMPILER2_OR_JVMCI
1796 if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
1797 return generate_disjoint_copy_avx3_masked(entry, "jbyte_disjoint_arraycopy_avx3", 0,
1798 aligned, false, false);
1799 }
1800 #endif
1801 __ align(CodeEntryAlignment);
1802 StubCodeMark mark(this, "StubRoutines", name);
1803 address start = __ pc();
1804
1805 Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
1806 Label L_copy_byte, L_exit;
1807 const Register from = rdi; // source array address
1808 const Register to = rsi; // destination array address
1809 const Register count = rdx; // elements count
1810 const Register byte_count = rcx;
1811 const Register qword_count = count;
1812 const Register end_from = from; // source array end address
1813 const Register end_to = to; // destination array end address
1814 // End pointers are inclusive, and if count is not zero they point
1815 // to the last unit copied: end_to[0] := end_from[0]
1816
1817 __ enter(); // required for proper stackwalking of RuntimeStub frame
1818 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.
1819
1820 if (entry != NULL) {
1821 *entry = __ pc();
1822 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1823 BLOCK_COMMENT("Entry:");
1824 }
1825
1826 setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1827 // r9 and r10 may be used to save non-volatile registers
1828
1829 {
1830 // UnsafeCopyMemory page error: continue after ucm
1831 UnsafeCopyMemoryMark ucmm(this, !aligned, true);
1832 // 'from', 'to' and 'count' are now valid
1833 __ movptr(byte_count, count);
1834 __ shrptr(count, 3); // count => qword_count
1835
1836 // Copy from low to high addresses. Use 'to' as scratch.
1837 __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
1838 __ lea(end_to, Address(to, qword_count, Address::times_8, -8));
1839 __ negptr(qword_count); // make the count negative
1840 __ jmp(L_copy_bytes);
1841
1842 // Copy trailing qwords
1843 __ BIND(L_copy_8_bytes);
1844 __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
1845 __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
1846 __ increment(qword_count);
1847 __ jcc(Assembler::notZero, L_copy_8_bytes);
1848
1849 // Check for and copy trailing dword
1850 __ BIND(L_copy_4_bytes);
1851 __ testl(byte_count, 4);
1852 __ jccb(Assembler::zero, L_copy_2_bytes);
1853 __ movl(rax, Address(end_from, 8));
1854 __ movl(Address(end_to, 8), rax);
1855
1856 __ addptr(end_from, 4);
1857 __ addptr(end_to, 4);
1858
1859 // Check for and copy trailing word
1860 __ BIND(L_copy_2_bytes);
1861 __ testl(byte_count, 2);
1862 __ jccb(Assembler::zero, L_copy_byte);
1863 __ movw(rax, Address(end_from, 8));
1864 __ movw(Address(end_to, 8), rax);
1865
1866 __ addptr(end_from, 2);
1867 __ addptr(end_to, 2);
1868
1869 // Check for and copy trailing byte
1870 __ BIND(L_copy_byte);
1871 __ testl(byte_count, 1);
1872 __ jccb(Assembler::zero, L_exit);
1873 __ movb(rax, Address(end_from, 8));
1874 __ movb(Address(end_to, 8), rax);
1875 }
1876 __ BIND(L_exit);
1877 address ucme_exit_pc = __ pc();
1878 restore_arg_regs();
1879 inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
1880 __ xorptr(rax, rax); // return 0
1881 __ vzeroupper();
1882 __ leave(); // required for proper stackwalking of RuntimeStub frame
1883 __ ret(0);
1884
1885 {
1886 UnsafeCopyMemoryMark ucmm(this, !aligned, false, ucme_exit_pc);
// Copy in multi-byte chunks
1888 copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1889 __ jmp(L_copy_4_bytes);
1890 }
1891 return start;
1892 }
1893
1894 // Arguments:
1895 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1896 // ignored
1897 // name - stub name string
1898 //
1899 // Inputs:
1900 // c_rarg0 - source array address
1901 // c_rarg1 - destination array address
1902 // c_rarg2 - element count, treated as ssize_t, can be zero
1903 //
1904 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1905 // we let the hardware handle it. The one to eight bytes within words,
1906 // dwords or qwords that span cache line boundaries will still be loaded
1907 // and stored atomically.
1908 //
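// Because the regions may overlap, the copy runs from high to low addresses:
// the trailing byte, word and dword at the top of the region are moved first,
// then the whole qwords are copied backwards (see copy_bytes_backward).
// Illustrative sketch (not the emitted code):
//
//   if (count & 1) to[count - 1] = from[count - 1];            // trailing byte
//   if (count & 2) copy 2 bytes ending at offset (count & ~1); // trailing word
//   if (count & 4) copy 4 bytes at offset (count >> 3) * 8;    // trailing dword
//   copy (count >> 3) qwords, highest addresses first;
//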
1909 address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1910 address* entry, const char *name) {
1911 #if COMPILER2_OR_JVMCI
1912 if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
1913 return generate_conjoint_copy_avx3_masked(entry, "jbyte_conjoint_arraycopy_avx3", 0,
1914 nooverlap_target, aligned, false, false);
1915 }
1916 #endif
1917 __ align(CodeEntryAlignment);
1918 StubCodeMark mark(this, "StubRoutines", name);
1919 address start = __ pc();
1920
1921 Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
1922 const Register from = rdi; // source array address
1923 const Register to = rsi; // destination array address
1924 const Register count = rdx; // elements count
1925 const Register byte_count = rcx;
1926 const Register qword_count = count;
1927
1928 __ enter(); // required for proper stackwalking of RuntimeStub frame
1929 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.
1930
1931 if (entry != NULL) {
1932 *entry = __ pc();
1933 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1934 BLOCK_COMMENT("Entry:");
1935 }
1936
1937 array_overlap_test(nooverlap_target, Address::times_1);
1938 setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1939 // r9 and r10 may be used to save non-volatile registers
1940
1941 {
1942 // UnsafeCopyMemory page error: continue after ucm
1943 UnsafeCopyMemoryMark ucmm(this, !aligned, true);
1944 // 'from', 'to' and 'count' are now valid
1945 __ movptr(byte_count, count);
1946 __ shrptr(count, 3); // count => qword_count
1947
1948 // Copy from high to low addresses.
1949
1950 // Check for and copy trailing byte
1951 __ testl(byte_count, 1);
1952 __ jcc(Assembler::zero, L_copy_2_bytes);
1953 __ movb(rax, Address(from, byte_count, Address::times_1, -1));
1954 __ movb(Address(to, byte_count, Address::times_1, -1), rax);
1955 __ decrement(byte_count); // Adjust for possible trailing word
1956
1957 // Check for and copy trailing word
1958 __ BIND(L_copy_2_bytes);
1959 __ testl(byte_count, 2);
1960 __ jcc(Assembler::zero, L_copy_4_bytes);
1961 __ movw(rax, Address(from, byte_count, Address::times_1, -2));
1962 __ movw(Address(to, byte_count, Address::times_1, -2), rax);
1963
1964 // Check for and copy trailing dword
1965 __ BIND(L_copy_4_bytes);
1966 __ testl(byte_count, 4);
1967 __ jcc(Assembler::zero, L_copy_bytes);
1968 __ movl(rax, Address(from, qword_count, Address::times_8));
1969 __ movl(Address(to, qword_count, Address::times_8), rax);
1970 __ jmp(L_copy_bytes);
1971
1972 // Copy trailing qwords
1973 __ BIND(L_copy_8_bytes);
1974 __ movq(rax, Address(from, qword_count, Address::times_8, -8));
1975 __ movq(Address(to, qword_count, Address::times_8, -8), rax);
1976 __ decrement(qword_count);
1977 __ jcc(Assembler::notZero, L_copy_8_bytes);
1978 }
1979 restore_arg_regs();
1980 inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
1981 __ xorptr(rax, rax); // return 0
1982 __ vzeroupper();
1983 __ leave(); // required for proper stackwalking of RuntimeStub frame
1984 __ ret(0);
1985
1986 {
1987 // UnsafeCopyMemory page error: continue after ucm
1988 UnsafeCopyMemoryMark ucmm(this, !aligned, true);
// Copy in multi-byte chunks
1990 copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1991 }
1992 restore_arg_regs();
1993 inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
1994 __ xorptr(rax, rax); // return 0
1995 __ vzeroupper();
1996 __ leave(); // required for proper stackwalking of RuntimeStub frame
1997 __ ret(0);
1998
1999 return start;
2000 }
2001
2002 // Arguments:
2003 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
2004 // ignored
2005 // name - stub name string
2006 //
2007 // Inputs:
2008 // c_rarg0 - source array address
2009 // c_rarg1 - destination array address
2010 // c_rarg2 - element count, treated as ssize_t, can be zero
2011 //
2012 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
2013 // let the hardware handle it. The two or four words within dwords
2014 // or qwords that span cache line boundaries will still be loaded
2015 // and stored atomically.
2016 //
2017 // Side Effects:
2018 // disjoint_short_copy_entry is set to the no-overlap entry point
2019 // used by generate_conjoint_short_copy().
2020 //
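// The element count is split into whole qwords plus a sub-qword tail;
// roughly (illustrative sketch, not the emitted code):
//
//   size_t qwords = count >> 2;             // 4 jshorts per qword
//   copy qwords forward, 8 bytes at a time;
//   if (count & 2) copy one trailing dword (2 jshorts);
//   if (count & 1) copy one trailing jshort;
//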
2021 address generate_disjoint_short_copy(bool aligned, address *entry, const char *name) {
2022 #if COMPILER2_OR_JVMCI
2023 if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
2024 return generate_disjoint_copy_avx3_masked(entry, "jshort_disjoint_arraycopy_avx3", 1,
2025 aligned, false, false);
2026 }
2027 #endif
2028
2029 __ align(CodeEntryAlignment);
2030 StubCodeMark mark(this, "StubRoutines", name);
2031 address start = __ pc();
2032
2033 Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes,L_copy_2_bytes,L_exit;
2034 const Register from = rdi; // source array address
2035 const Register to = rsi; // destination array address
2036 const Register count = rdx; // elements count
2037 const Register word_count = rcx;
2038 const Register qword_count = count;
2039 const Register end_from = from; // source array end address
2040 const Register end_to = to; // destination array end address
2041 // End pointers are inclusive, and if count is not zero they point
2042 // to the last unit copied: end_to[0] := end_from[0]
2043
2044 __ enter(); // required for proper stackwalking of RuntimeStub frame
2045 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.
2046
2047 if (entry != NULL) {
2048 *entry = __ pc();
2049 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2050 BLOCK_COMMENT("Entry:");
2051 }
2052
2053 setup_arg_regs(); // from => rdi, to => rsi, count => rdx
2054 // r9 and r10 may be used to save non-volatile registers
2055
2056 {
2057 // UnsafeCopyMemory page error: continue after ucm
2058 UnsafeCopyMemoryMark ucmm(this, !aligned, true);
2059 // 'from', 'to' and 'count' are now valid
2060 __ movptr(word_count, count);
2061 __ shrptr(count, 2); // count => qword_count
2062
2063 // Copy from low to high addresses. Use 'to' as scratch.
2064 __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
2065 __ lea(end_to, Address(to, qword_count, Address::times_8, -8));
2066 __ negptr(qword_count);
2067 __ jmp(L_copy_bytes);
2068
2069 // Copy trailing qwords
2070 __ BIND(L_copy_8_bytes);
2071 __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
2072 __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
2073 __ increment(qword_count);
2074 __ jcc(Assembler::notZero, L_copy_8_bytes);
2075
2076 // Original 'dest' is trashed, so we can't use it as a
2077 // base register for a possible trailing word copy
2078
2079 // Check for and copy trailing dword
2080 __ BIND(L_copy_4_bytes);
2081 __ testl(word_count, 2);
2082 __ jccb(Assembler::zero, L_copy_2_bytes);
2083 __ movl(rax, Address(end_from, 8));
2084 __ movl(Address(end_to, 8), rax);
2085
2086 __ addptr(end_from, 4);
2087 __ addptr(end_to, 4);
2088
2089 // Check for and copy trailing word
2090 __ BIND(L_copy_2_bytes);
2091 __ testl(word_count, 1);
2092 __ jccb(Assembler::zero, L_exit);
2093 __ movw(rax, Address(end_from, 8));
2094 __ movw(Address(end_to, 8), rax);
2095 }
2096 __ BIND(L_exit);
2097 address ucme_exit_pc = __ pc();
2098 restore_arg_regs();
2099 inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
2100 __ xorptr(rax, rax); // return 0
2101 __ vzeroupper();
2102 __ leave(); // required for proper stackwalking of RuntimeStub frame
2103 __ ret(0);
2104
2105 {
2106 UnsafeCopyMemoryMark ucmm(this, !aligned, false, ucme_exit_pc);
// Copy in multi-byte chunks
2108 copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2109 __ jmp(L_copy_4_bytes);
2110 }
2111
2112 return start;
2113 }
2114
2115 address generate_fill(BasicType t, bool aligned, const char *name) {
2116 __ align(CodeEntryAlignment);
2117 StubCodeMark mark(this, "StubRoutines", name);
2118 address start = __ pc();
2119
2120 BLOCK_COMMENT("Entry:");
2121
2122 const Register to = c_rarg0; // destination array address
2123 const Register value = c_rarg1; // value
2124 const Register count = c_rarg2; // elements count
2125 __ mov(r11, count);
2126
2127 __ enter(); // required for proper stackwalking of RuntimeStub frame
2128
2129 __ generate_fill(t, aligned, to, value, r11, rax, xmm0);
2130
2131 __ vzeroupper();
2132 __ leave(); // required for proper stackwalking of RuntimeStub frame
2133 __ ret(0);
2134 return start;
2135 }
2136
2137 // Arguments:
2138 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
2139 // ignored
2140 // name - stub name string
2141 //
2142 // Inputs:
2143 // c_rarg0 - source array address
2144 // c_rarg1 - destination array address
2145 // c_rarg2 - element count, treated as ssize_t, can be zero
2146 //
2147 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
2148 // let the hardware handle it. The two or four words within dwords
2149 // or qwords that span cache line boundaries will still be loaded
2150 // and stored atomically.
2151 //
2152 address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
2153 address *entry, const char *name) {
2154 #if COMPILER2_OR_JVMCI
2155 if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
2156 return generate_conjoint_copy_avx3_masked(entry, "jshort_conjoint_arraycopy_avx3", 1,
2157 nooverlap_target, aligned, false, false);
2158 }
2159 #endif
2160 __ align(CodeEntryAlignment);
2161 StubCodeMark mark(this, "StubRoutines", name);
2162 address start = __ pc();
2163
2164 Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes;
2165 const Register from = rdi; // source array address
2166 const Register to = rsi; // destination array address
2167 const Register count = rdx; // elements count
2168 const Register word_count = rcx;
2169 const Register qword_count = count;
2170
2171 __ enter(); // required for proper stackwalking of RuntimeStub frame
2172 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.
2173
2174 if (entry != NULL) {
2175 *entry = __ pc();
2176 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2177 BLOCK_COMMENT("Entry:");
2178 }
2179
2180 array_overlap_test(nooverlap_target, Address::times_2);
2181 setup_arg_regs(); // from => rdi, to => rsi, count => rdx
2182 // r9 and r10 may be used to save non-volatile registers
2183
2184 {
2185 // UnsafeCopyMemory page error: continue after ucm
2186 UnsafeCopyMemoryMark ucmm(this, !aligned, true);
2187 // 'from', 'to' and 'count' are now valid
2188 __ movptr(word_count, count);
2189 __ shrptr(count, 2); // count => qword_count
2190
2191 // Copy from high to low addresses. Use 'to' as scratch.
2192
2193 // Check for and copy trailing word
2194 __ testl(word_count, 1);
2195 __ jccb(Assembler::zero, L_copy_4_bytes);
2196 __ movw(rax, Address(from, word_count, Address::times_2, -2));
2197 __ movw(Address(to, word_count, Address::times_2, -2), rax);
2198
2199 // Check for and copy trailing dword
2200 __ BIND(L_copy_4_bytes);
2201 __ testl(word_count, 2);
2202 __ jcc(Assembler::zero, L_copy_bytes);
2203 __ movl(rax, Address(from, qword_count, Address::times_8));
2204 __ movl(Address(to, qword_count, Address::times_8), rax);
2205 __ jmp(L_copy_bytes);
2206
2207 // Copy trailing qwords
2208 __ BIND(L_copy_8_bytes);
2209 __ movq(rax, Address(from, qword_count, Address::times_8, -8));
2210 __ movq(Address(to, qword_count, Address::times_8, -8), rax);
2211 __ decrement(qword_count);
2212 __ jcc(Assembler::notZero, L_copy_8_bytes);
2213 }
2214 restore_arg_regs();
2215 inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
2216 __ xorptr(rax, rax); // return 0
2217 __ vzeroupper();
2218 __ leave(); // required for proper stackwalking of RuntimeStub frame
2219 __ ret(0);
2220
2221 {
2222 // UnsafeCopyMemory page error: continue after ucm
2223 UnsafeCopyMemoryMark ucmm(this, !aligned, true);
// Copy in multi-byte chunks
2225 copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2226 }
2227 restore_arg_regs();
2228 inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
2229 __ xorptr(rax, rax); // return 0
2230 __ vzeroupper();
2231 __ leave(); // required for proper stackwalking of RuntimeStub frame
2232 __ ret(0);
2233
2234 return start;
2235 }
2236
2237 // Arguments:
2238 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
2239 // ignored
2240 // is_oop - true => oop array, so generate store check code
2241 // name - stub name string
2242 //
2243 // Inputs:
2244 // c_rarg0 - source array address
2245 // c_rarg1 - destination array address
2246 // c_rarg2 - element count, treated as ssize_t, can be zero
2247 //
2248 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
2249 // the hardware handle it. The two dwords within qwords that span
2250 // cache line boundaries will still be loaded and stored atomically.
2251 //
2252 // Side Effects:
2253 // disjoint_int_copy_entry is set to the no-overlap entry point
2254 // used by generate_conjoint_int_oop_copy().
2255 //
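// The element count is split into whole qwords plus an optional trailing
// dword; roughly (illustrative sketch, not the emitted code):
//
//   size_t qwords = count >> 1;             // 2 jints per qword
//   copy qwords forward, 8 bytes at a time;
//   if (count & 1) copy one trailing jint;
//
// For oop arrays the barrier prologue/epilogue brackets the copy, and the
// epilogue receives the original element count (dword_count).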
2256 address generate_disjoint_int_oop_copy(bool aligned, bool is_oop, address* entry,
2257 const char *name, bool dest_uninitialized = false) {
2258 #if COMPILER2_OR_JVMCI
2259 if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
2260 return generate_disjoint_copy_avx3_masked(entry, "jint_disjoint_arraycopy_avx3", 2,
2261 aligned, is_oop, dest_uninitialized);
2262 }
2263 #endif
2264
2265 __ align(CodeEntryAlignment);
2266 StubCodeMark mark(this, "StubRoutines", name);
2267 address start = __ pc();
2268
2269 Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_exit;
2270 const Register from = rdi; // source array address
2271 const Register to = rsi; // destination array address
2272 const Register count = rdx; // elements count
2273 const Register dword_count = rcx;
2274 const Register qword_count = count;
2275 const Register end_from = from; // source array end address
2276 const Register end_to = to; // destination array end address
2277 // End pointers are inclusive, and if count is not zero they point
2278 // to the last unit copied: end_to[0] := end_from[0]
2279
2280 __ enter(); // required for proper stackwalking of RuntimeStub frame
2281 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.
2282
2283 if (entry != NULL) {
2284 *entry = __ pc();
2285 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2286 BLOCK_COMMENT("Entry:");
2287 }
2288
2289 setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
2290 // r9 is used to save r15_thread
2291
2292 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
2293 if (dest_uninitialized) {
2294 decorators |= IS_DEST_UNINITIALIZED;
2295 }
2296 if (aligned) {
2297 decorators |= ARRAYCOPY_ALIGNED;
2298 }
2299
2300 BasicType type = is_oop ? T_OBJECT : T_INT;
2301 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2302 bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
2303
2304 {
2305 // UnsafeCopyMemory page error: continue after ucm
2306 UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
2307 // 'from', 'to' and 'count' are now valid
2308 __ movptr(dword_count, count);
2309 __ shrptr(count, 1); // count => qword_count
2310
2311 // Copy from low to high addresses. Use 'to' as scratch.
2312 __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
2313 __ lea(end_to, Address(to, qword_count, Address::times_8, -8));
2314 __ negptr(qword_count);
2315 __ jmp(L_copy_bytes);
2316
2317 // Copy trailing qwords
2318 __ BIND(L_copy_8_bytes);
2319 __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
2320 __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
2321 __ increment(qword_count);
2322 __ jcc(Assembler::notZero, L_copy_8_bytes);
2323
2324 // Check for and copy trailing dword
2325 __ BIND(L_copy_4_bytes);
2326 __ testl(dword_count, 1); // Only byte test since the value is 0 or 1
2327 __ jccb(Assembler::zero, L_exit);
2328 __ movl(rax, Address(end_from, 8));
2329 __ movl(Address(end_to, 8), rax);
2330 }
2331 __ BIND(L_exit);
2332 address ucme_exit_pc = __ pc();
2333 bs->arraycopy_epilogue(_masm, decorators, type, from, to, dword_count);
2334 restore_arg_regs_using_thread();
2335 inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
2336 __ vzeroupper();
2337 __ xorptr(rax, rax); // return 0
2338 __ leave(); // required for proper stackwalking of RuntimeStub frame
2339 __ ret(0);
2340
2341 {
2342 UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, false, ucme_exit_pc);
// Copy in multi-byte chunks
2344 copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2345 __ jmp(L_copy_4_bytes);
2346 }
2347
2348 return start;
2349 }
2350
2351 // Arguments:
2352 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
2353 // ignored
2354 // is_oop - true => oop array, so generate store check code
2355 // name - stub name string
2356 //
2357 // Inputs:
2358 // c_rarg0 - source array address
2359 // c_rarg1 - destination array address
2360 // c_rarg2 - element count, treated as ssize_t, can be zero
2361 //
2362 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
2363 // the hardware handle it. The two dwords within qwords that span
2364 // cache line boundaries will still be loaded and stored atomically.
2365 //
2366 address generate_conjoint_int_oop_copy(bool aligned, bool is_oop, address nooverlap_target,
2367 address *entry, const char *name,
2368 bool dest_uninitialized = false) {
2369 #if COMPILER2_OR_JVMCI
2370 if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
2371 return generate_conjoint_copy_avx3_masked(entry, "jint_conjoint_arraycopy_avx3", 2,
2372 nooverlap_target, aligned, is_oop, dest_uninitialized);
2373 }
2374 #endif
2375 __ align(CodeEntryAlignment);
2376 StubCodeMark mark(this, "StubRoutines", name);
2377 address start = __ pc();
2378
2379 Label L_copy_bytes, L_copy_8_bytes, L_exit;
2380 const Register from = rdi; // source array address
2381 const Register to = rsi; // destination array address
2382 const Register count = rdx; // elements count
2383 const Register dword_count = rcx;
2384 const Register qword_count = count;
2385
2386 __ enter(); // required for proper stackwalking of RuntimeStub frame
2387 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.
2388
2389 if (entry != NULL) {
2390 *entry = __ pc();
2391 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2392 BLOCK_COMMENT("Entry:");
2393 }
2394
2395 array_overlap_test(nooverlap_target, Address::times_4);
2396 setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
2397 // r9 is used to save r15_thread
2398
2399 DecoratorSet decorators = IN_HEAP | IS_ARRAY;
2400 if (dest_uninitialized) {
2401 decorators |= IS_DEST_UNINITIALIZED;
2402 }
2403 if (aligned) {
2404 decorators |= ARRAYCOPY_ALIGNED;
2405 }
2406
2407 BasicType type = is_oop ? T_OBJECT : T_INT;
2408 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2409 // no registers are destroyed by this call
2410 bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
2411
2412 assert_clean_int(count, rax); // Make sure 'count' is clean int.
2413 {
2414 // UnsafeCopyMemory page error: continue after ucm
2415 UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
2416 // 'from', 'to' and 'count' are now valid
2417 __ movptr(dword_count, count);
2418 __ shrptr(count, 1); // count => qword_count
2419
2420 // Copy from high to low addresses. Use 'to' as scratch.
2421
2422 // Check for and copy trailing dword
2423 __ testl(dword_count, 1);
2424 __ jcc(Assembler::zero, L_copy_bytes);
2425 __ movl(rax, Address(from, dword_count, Address::times_4, -4));
2426 __ movl(Address(to, dword_count, Address::times_4, -4), rax);
2427 __ jmp(L_copy_bytes);
2428
2429 // Copy trailing qwords
2430 __ BIND(L_copy_8_bytes);
2431 __ movq(rax, Address(from, qword_count, Address::times_8, -8));
2432 __ movq(Address(to, qword_count, Address::times_8, -8), rax);
2433 __ decrement(qword_count);
2434 __ jcc(Assembler::notZero, L_copy_8_bytes);
2435 }
2436 if (is_oop) {
2437 __ jmp(L_exit);
2438 }
2439 restore_arg_regs_using_thread();
2440 inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
2441 __ xorptr(rax, rax); // return 0
2442 __ vzeroupper();
2443 __ leave(); // required for proper stackwalking of RuntimeStub frame
2444 __ ret(0);
2445
2446 {
2447 // UnsafeCopyMemory page error: continue after ucm
2448 UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
// Copy in multi-byte chunks
2450 copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2451 }
2452
2453 __ BIND(L_exit);
2454 bs->arraycopy_epilogue(_masm, decorators, type, from, to, dword_count);
2455 restore_arg_regs_using_thread();
2456 inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
2457 __ xorptr(rax, rax); // return 0
2458 __ vzeroupper();
2459 __ leave(); // required for proper stackwalking of RuntimeStub frame
2460 __ ret(0);
2461
2462 return start;
2463 }
2464
2465 // Arguments:
2466 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
2467 // ignored
2468 // is_oop - true => oop array, so generate store check code
2469 // name - stub name string
2470 //
2471 // Inputs:
2472 // c_rarg0 - source array address
2473 // c_rarg1 - destination array address
2474 // c_rarg2 - element count, treated as ssize_t, can be zero
2475 //
2476 // Side Effects:
2477 // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
2478 // no-overlap entry point used by generate_conjoint_long_oop_copy().
2479 //
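// Because the element size already matches the 8-byte copy granularity,
// there is no sub-qword tail: the stub simply copies qword_count qwords from
// low to high addresses, bracketed by the barrier set's arraycopy
// prologue/epilogue.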
2480 address generate_disjoint_long_oop_copy(bool aligned, bool is_oop, address *entry,
2481 const char *name, bool dest_uninitialized = false) {
2482 #if COMPILER2_OR_JVMCI
2483 if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
2484 return generate_disjoint_copy_avx3_masked(entry, "jlong_disjoint_arraycopy_avx3", 3,
2485 aligned, is_oop, dest_uninitialized);
2486 }
2487 #endif
2488 __ align(CodeEntryAlignment);
2489 StubCodeMark mark(this, "StubRoutines", name);
2490 address start = __ pc();
2491
2492 Label L_copy_bytes, L_copy_8_bytes, L_exit;
2493 const Register from = rdi; // source array address
2494 const Register to = rsi; // destination array address
2495 const Register qword_count = rdx; // elements count
2496 const Register end_from = from; // source array end address
2497 const Register end_to = rcx; // destination array end address
2498 const Register saved_count = r11;
2499 // End pointers are inclusive, and if count is not zero they point
2500 // to the last unit copied: end_to[0] := end_from[0]
2501
2502 __ enter(); // required for proper stackwalking of RuntimeStub frame
2503 // Save no-overlap entry point for generate_conjoint_long_oop_copy()
2504 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.
2505
2506 if (entry != NULL) {
2507 *entry = __ pc();
2508 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2509 BLOCK_COMMENT("Entry:");
2510 }
2511
2512 setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
2513 // r9 is used to save r15_thread
2514 // 'from', 'to' and 'qword_count' are now valid
2515
2516 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
2517 if (dest_uninitialized) {
2518 decorators |= IS_DEST_UNINITIALIZED;
2519 }
2520 if (aligned) {
2521 decorators |= ARRAYCOPY_ALIGNED;
2522 }
2523
2524 BasicType type = is_oop ? T_OBJECT : T_LONG;
2525 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2526 bs->arraycopy_prologue(_masm, decorators, type, from, to, qword_count);
2527 {
2528 // UnsafeCopyMemory page error: continue after ucm
2529 UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
2530
2531 // Copy from low to high addresses. Use 'to' as scratch.
2532 __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
2533 __ lea(end_to, Address(to, qword_count, Address::times_8, -8));
2534 __ negptr(qword_count);
2535 __ jmp(L_copy_bytes);
2536
2537 // Copy trailing qwords
2538 __ BIND(L_copy_8_bytes);
2539 __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
2540 __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
2541 __ increment(qword_count);
2542 __ jcc(Assembler::notZero, L_copy_8_bytes);
2543 }
2544 if (is_oop) {
2545 __ jmp(L_exit);
2546 } else {
2547 restore_arg_regs_using_thread();
2548 inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2549 __ xorptr(rax, rax); // return 0
2550 __ vzeroupper();
2551 __ leave(); // required for proper stackwalking of RuntimeStub frame
2552 __ ret(0);
2553 }
2554
2555 {
2556 // UnsafeCopyMemory page error: continue after ucm
2557 UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
// Copy in multi-byte chunks
2559 copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2560 }
2561
2562 __ BIND(L_exit);
2563 bs->arraycopy_epilogue(_masm, decorators, type, from, to, qword_count);
2564 restore_arg_regs_using_thread();
2565 if (is_oop) {
2566 inc_counter_np(SharedRuntime::_oop_array_copy_ctr); // Update counter after rscratch1 is free
2567 } else {
2568 inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2569 }
2570 __ vzeroupper();
2571 __ xorptr(rax, rax); // return 0
2572 __ leave(); // required for proper stackwalking of RuntimeStub frame
2573 __ ret(0);
2574
2575 return start;
2576 }
2577
2578 // Arguments:
2579 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
2580 // ignored
2581 // is_oop - true => oop array, so generate store check code
2582 // name - stub name string
2583 //
2584 // Inputs:
2585 // c_rarg0 - source array address
2586 // c_rarg1 - destination array address
2587 // c_rarg2 - element count, treated as ssize_t, can be zero
2588 //
2589 address generate_conjoint_long_oop_copy(bool aligned, bool is_oop,
2590 address nooverlap_target, address *entry,
2591 const char *name, bool dest_uninitialized = false) {
2592 #if COMPILER2_OR_JVMCI
2593 if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
2594 return generate_conjoint_copy_avx3_masked(entry, "jlong_conjoint_arraycopy_avx3", 3,
2595 nooverlap_target, aligned, is_oop, dest_uninitialized);
2596 }
2597 #endif
2598 __ align(CodeEntryAlignment);
2599 StubCodeMark mark(this, "StubRoutines", name);
2600 address start = __ pc();
2601
2602 Label L_copy_bytes, L_copy_8_bytes, L_exit;
2603 const Register from = rdi; // source array address
2604 const Register to = rsi; // destination array address
2605 const Register qword_count = rdx; // elements count
2606 const Register saved_count = rcx;
2607
2608 __ enter(); // required for proper stackwalking of RuntimeStub frame
2609 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int.
2610
2611 if (entry != NULL) {
2612 *entry = __ pc();
2613 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2614 BLOCK_COMMENT("Entry:");
2615 }
2616
2617 array_overlap_test(nooverlap_target, Address::times_8);
2618 setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
2619 // r9 is used to save r15_thread
2620 // 'from', 'to' and 'qword_count' are now valid
2621
2622 DecoratorSet decorators = IN_HEAP | IS_ARRAY;
2623 if (dest_uninitialized) {
2624 decorators |= IS_DEST_UNINITIALIZED;
2625 }
2626 if (aligned) {
2627 decorators |= ARRAYCOPY_ALIGNED;
2628 }
2629
2630 BasicType type = is_oop ? T_OBJECT : T_LONG;
2631 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2632 bs->arraycopy_prologue(_masm, decorators, type, from, to, qword_count);
2633 {
2634 // UnsafeCopyMemory page error: continue after ucm
2635 UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
2636
2637 __ jmp(L_copy_bytes);
2638
2639 // Copy trailing qwords
2640 __ BIND(L_copy_8_bytes);
2641 __ movq(rax, Address(from, qword_count, Address::times_8, -8));
2642 __ movq(Address(to, qword_count, Address::times_8, -8), rax);
2643 __ decrement(qword_count);
2644 __ jcc(Assembler::notZero, L_copy_8_bytes);
2645 }
2646 if (is_oop) {
2647 __ jmp(L_exit);
2648 } else {
2649 restore_arg_regs_using_thread();
2650 inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2651 __ xorptr(rax, rax); // return 0
2652 __ vzeroupper();
2653 __ leave(); // required for proper stackwalking of RuntimeStub frame
2654 __ ret(0);
2655 }
2656 {
2657 // UnsafeCopyMemory page error: continue after ucm
2658 UnsafeCopyMemoryMark ucmm(this, !is_oop && !aligned, true);
2659
// Copy in multi-byte chunks
2661 copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2662 }
2663 __ BIND(L_exit);
2664 bs->arraycopy_epilogue(_masm, decorators, type, from, to, qword_count);
2665 restore_arg_regs_using_thread();
2666 if (is_oop) {
2667 inc_counter_np(SharedRuntime::_oop_array_copy_ctr); // Update counter after rscratch1 is free
2668 } else {
2669 inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2670 }
2671 __ vzeroupper();
2672 __ xorptr(rax, rax); // return 0
2673 __ leave(); // required for proper stackwalking of RuntimeStub frame
2674 __ ret(0);
2675
2676 return start;
2677 }
2678
2679
2680 // Helper for generating a dynamic type check.
2681 // Smashes no registers.
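// The fast path compares the word at sub_klass + super_check_offset against
// super_klass; if that misses, the slow path scans the secondary supers
// array. On failure the generated code falls through to L_miss.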
2682 void generate_type_check(Register sub_klass,
2683 Register super_check_offset,
2684 Register super_klass,
2685 Label& L_success) {
2686 assert_different_registers(sub_klass, super_check_offset, super_klass);
2687
2688 BLOCK_COMMENT("type_check:");
2689
2690 Label L_miss;
2691
2692 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, NULL,
2693 super_check_offset);
2694 __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
2695
2696 // Fall through on failure!
2697 __ BIND(L_miss);
2698 }
2699
2700 //
2701 // Generate checkcasting array copy stub
2702 //
2703 // Input:
2704 // c_rarg0 - source array address
2705 // c_rarg1 - destination array address
2706 // c_rarg2 - element count, treated as ssize_t, can be zero
2707 // c_rarg3 - size_t ckoff (super_check_offset)
2708 // not Win64
2709 // c_rarg4 - oop ckval (super_klass)
2710 // Win64
2711 // rsp+40 - oop ckval (super_klass)
2712 //
2713 // Output:
2714 // rax == 0 - success
2715 // rax == -1^K - failure, where K is partial transfer count
2716 //
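// The encoding lets the caller recover the partial transfer count: if K
// elements were copied before a failing element, the stub returns ~K
// (i.e. -1 ^ K), so the caller computes K = ~rax; e.g. K == 3 yields
// rax == -4.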
2717 address generate_checkcast_copy(const char *name, address *entry,
2718 bool dest_uninitialized = false) {
2719
2720 Label L_load_element, L_store_element, L_do_card_marks, L_done;
2721
2722 // Input registers (after setup_arg_regs)
2723 const Register from = rdi; // source array address
2724 const Register to = rsi; // destination array address
2725 const Register length = rdx; // elements count
2726 const Register ckoff = rcx; // super_check_offset
2727 const Register ckval = r8; // super_klass
2728
2729 // Registers used as temps (r13, r14 are save-on-entry)
2730 const Register end_from = from; // source array end address
2731 const Register end_to = r13; // destination array end address
2732 const Register count = rdx; // -(count_remaining)
2733 const Register r14_length = r14; // saved copy of length
2734 // End pointers are inclusive, and if length is not zero they point
2735 // to the last unit copied: end_to[0] := end_from[0]
2736
2737 const Register rax_oop = rax; // actual oop copied
2738 const Register r11_klass = r11; // oop._klass
2739
2740 //---------------------------------------------------------------
2741 // Assembler stub will be used for this call to arraycopy
2742 // if the two arrays are subtypes of Object[] but the
2743 // destination array type is not equal to or a supertype
2744 // of the source type. Each element must be separately
2745 // checked.
2746
2747 __ align(CodeEntryAlignment);
2748 StubCodeMark mark(this, "StubRoutines", name);
2749 address start = __ pc();
2750
2751 __ enter(); // required for proper stackwalking of RuntimeStub frame
2752
2753 #ifdef ASSERT
2754 // caller guarantees that the arrays really are different
2755 // otherwise, we would have to make conjoint checks
2756 { Label L;
2757 array_overlap_test(L, TIMES_OOP);
2758 __ stop("checkcast_copy within a single array");
2759 __ bind(L);
2760 }
2761 #endif //ASSERT
2762
2763 setup_arg_regs(4); // from => rdi, to => rsi, length => rdx
2764 // ckoff => rcx, ckval => r8
2765 // r9 and r10 may be used to save non-volatile registers
2766 #ifdef _WIN64
2767 // last argument (#4) is on stack on Win64
2768 __ movptr(ckval, Address(rsp, 6 * wordSize));
2769 #endif
2770
2771 // Caller of this entry point must set up the argument registers.
2772 if (entry != NULL) {
2773 *entry = __ pc();
2774 BLOCK_COMMENT("Entry:");
2775 }
2776
2777 // allocate spill slots for r13, r14
2778 enum {
2779 saved_r13_offset,
2780 saved_r14_offset,
2781 saved_r10_offset,
2782 saved_rbp_offset
2783 };
2784 __ subptr(rsp, saved_rbp_offset * wordSize);
2785 __ movptr(Address(rsp, saved_r13_offset * wordSize), r13);
2786 __ movptr(Address(rsp, saved_r14_offset * wordSize), r14);
2787 __ movptr(Address(rsp, saved_r10_offset * wordSize), r10);
2788
2789 #ifdef ASSERT
2790 Label L2;
2791 __ get_thread(r14);
2792 __ cmpptr(r15_thread, r14);
2793 __ jcc(Assembler::equal, L2);
2794 __ stop("StubRoutines::call_stub: r15_thread is modified by call");
2795 __ bind(L2);
2796 #endif // ASSERT
2797
2798 // check that int operands are properly extended to size_t
2799 assert_clean_int(length, rax);
2800 assert_clean_int(ckoff, rax);
2801
2802 #ifdef ASSERT
2803 BLOCK_COMMENT("assert consistent ckoff/ckval");
2804 // The ckoff and ckval must be mutually consistent,
2805 // even though caller generates both.
2806 { Label L;
2807 int sco_offset = in_bytes(Klass::super_check_offset_offset());
2808 __ cmpl(ckoff, Address(ckval, sco_offset));
2809 __ jcc(Assembler::equal, L);
2810 __ stop("super_check_offset inconsistent");
2811 __ bind(L);
2812 }
2813 #endif //ASSERT
2814
2815 // Loop-invariant addresses. They are exclusive end pointers.
2816 Address end_from_addr(from, length, TIMES_OOP, 0);
2817 Address end_to_addr(to, length, TIMES_OOP, 0);
2818 // Loop-variant addresses. They assume post-incremented count < 0.
2819 Address from_element_addr(end_from, count, TIMES_OOP, 0);
2820 Address to_element_addr(end_to, count, TIMES_OOP, 0);
2821
2822 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
2823 if (dest_uninitialized) {
2824 decorators |= IS_DEST_UNINITIALIZED;
2825 }
2826
2827 BasicType type = T_OBJECT;
2828 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2829 bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
2830
2831 // Copy from low to high addresses, indexed from the end of each array.
2832 __ lea(end_from, end_from_addr);
2833 __ lea(end_to, end_to_addr);
2834 __ movptr(r14_length, length); // save a copy of the length
2835 assert(length == count, ""); // else fix next line:
2836 __ negptr(count); // negate and test the length
2837 __ jcc(Assembler::notZero, L_load_element);
2838
2839 // Empty array: Nothing to do.
2840 __ xorptr(rax, rax); // return 0 on (trivial) success
2841 __ jmp(L_done);
2842
2843 // ======== begin loop ========
2844 // (Loop is rotated; its entry is L_load_element.)
2845 // Loop control:
2846 // for (count = -count; count != 0; count++)
// Base pointers src, dst are biased by 8*(count-1), to last element.
2848 __ align(OptoLoopAlignment);
2849
2850 __ BIND(L_store_element);
2851 __ store_heap_oop(to_element_addr, rax_oop, noreg, noreg, noreg, AS_RAW); // store the oop
2852 __ increment(count); // increment the count toward zero
2853 __ jcc(Assembler::zero, L_do_card_marks);
2854
2855 // ======== loop entry is here ========
2856 __ BIND(L_load_element);
2857 __ load_heap_oop(rax_oop, from_element_addr, noreg, noreg, AS_RAW); // load the oop
2858 __ testptr(rax_oop, rax_oop);
2859 __ jcc(Assembler::zero, L_store_element);
2860
2861 __ load_klass(r11_klass, rax_oop, rscratch1);// query the object klass
2862 generate_type_check(r11_klass, ckoff, ckval, L_store_element);
2863 // ======== end loop ========
2864
2865 // It was a real error; we must depend on the caller to finish the job.
2866 // Register rdx = -1 * number of *remaining* oops, r14 = *total* oops.
2867 // Emit GC store barriers for the oops we have copied (r14 + rdx),
2868 // and report their number to the caller.
2869 assert_different_registers(rax, r14_length, count, to, end_to, rcx, rscratch1);
2870 Label L_post_barrier;
2871 __ addptr(r14_length, count); // K = (original - remaining) oops
2872 __ movptr(rax, r14_length); // save the value
2873 __ notptr(rax); // report (-1^K) to caller (does not affect flags)
2874 __ jccb(Assembler::notZero, L_post_barrier);
2875 __ jmp(L_done); // K == 0, nothing was copied, skip post barrier
2876
2877 // Come here on success only.
2878 __ BIND(L_do_card_marks);
2879 __ xorptr(rax, rax); // return 0 on success
2880
2881 __ BIND(L_post_barrier);
2882 bs->arraycopy_epilogue(_masm, decorators, type, from, to, r14_length);
2883
2884 // Common exit point (success or failure).
2885 __ BIND(L_done);
2886 __ movptr(r13, Address(rsp, saved_r13_offset * wordSize));
2887 __ movptr(r14, Address(rsp, saved_r14_offset * wordSize));
2888 __ movptr(r10, Address(rsp, saved_r10_offset * wordSize));
2889 restore_arg_regs();
2890 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); // Update counter after rscratch1 is free
2891 __ leave(); // required for proper stackwalking of RuntimeStub frame
2892 __ ret(0);
2893
2894 return start;
2895 }
2896
2897 //
2898 // Generate 'unsafe' array copy stub
2899 // Though just as safe as the other stubs, it takes an unscaled
2900 // size_t argument instead of an element count.
2901 //
2902 // Input:
2903 // c_rarg0 - source array address
2904 // c_rarg1 - destination array address
2905 // c_rarg2 - byte count, treated as ssize_t, can be zero
2906 //
2907 // Examines the alignment of the operands and dispatches
2908 // to a long, int, short, or byte copy loop.
2909 //
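// Roughly (illustrative sketch, not the emitted code):
//
//   uintptr_t bits = (uintptr_t) from | (uintptr_t) to | size;
//   if      ((bits & 7) == 0) goto long_copy_entry;   // count = size >> 3
//   else if ((bits & 3) == 0) goto int_copy_entry;    // count = size >> 2
//   else if ((bits & 1) == 0) goto short_copy_entry;  // count = size >> 1
//   else                      goto byte_copy_entry;   // count = size
//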
2910 address generate_unsafe_copy(const char *name,
2911 address byte_copy_entry, address short_copy_entry,
2912 address int_copy_entry, address long_copy_entry) {
2913
2914 Label L_long_aligned, L_int_aligned, L_short_aligned;
2915
2916 // Input registers (before setup_arg_regs)
2917 const Register from = c_rarg0; // source array address
2918 const Register to = c_rarg1; // destination array address
2919 const Register size = c_rarg2; // byte count (size_t)
2920
2921 // Register used as a temp
2922 const Register bits = rax; // test copy of low bits
2923
2924 __ align(CodeEntryAlignment);
2925 StubCodeMark mark(this, "StubRoutines", name);
2926 address start = __ pc();
2927
2928 __ enter(); // required for proper stackwalking of RuntimeStub frame
2929
2930 // bump this on entry, not on exit:
2931 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
2932
2933 __ mov(bits, from);
2934 __ orptr(bits, to);
2935 __ orptr(bits, size);
2936
2937 __ testb(bits, BytesPerLong-1);
2938 __ jccb(Assembler::zero, L_long_aligned);
2939
2940 __ testb(bits, BytesPerInt-1);
2941 __ jccb(Assembler::zero, L_int_aligned);
2942
2943 __ testb(bits, BytesPerShort-1);
2944 __ jump_cc(Assembler::notZero, RuntimeAddress(byte_copy_entry));
2945
2946 __ BIND(L_short_aligned);
2947 __ shrptr(size, LogBytesPerShort); // size => short_count
2948 __ jump(RuntimeAddress(short_copy_entry));
2949
2950 __ BIND(L_int_aligned);
2951 __ shrptr(size, LogBytesPerInt); // size => int_count
2952 __ jump(RuntimeAddress(int_copy_entry));
2953
2954 __ BIND(L_long_aligned);
2955 __ shrptr(size, LogBytesPerLong); // size => qword_count
2956 __ jump(RuntimeAddress(long_copy_entry));
2957
2958 return start;
2959 }
2960
2961 // Perform range checks on the proposed arraycopy.
2962 // Kills temp, but nothing else.
2963 // Also, clean the sign bits of src_pos and dst_pos.
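// The two checks below use an unsigned compare (Assembler::above); roughly:
//
//   if ((uint32_t) (src_pos + length) > (uint32_t) src->length()) FAIL;
//   if ((uint32_t) (dst_pos + length) > (uint32_t) dst->length()) FAIL;
//
// so a sum that overflows 32 bits is also rejected.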
2964 void arraycopy_range_checks(Register src, // source array oop (c_rarg0)
2965 Register src_pos, // source position (c_rarg1)
Register dst, // destination array oop (c_rarg2)
2967 Register dst_pos, // destination position (c_rarg3)
2968 Register length,
2969 Register temp,
2970 Label& L_failed) {
2971 BLOCK_COMMENT("arraycopy_range_checks:");
2972
2973 // if (src_pos + length > arrayOop(src)->length()) FAIL;
2974 __ movl(temp, length);
2975 __ addl(temp, src_pos); // src_pos + length
2976 __ cmpl(temp, Address(src, arrayOopDesc::length_offset_in_bytes()));
2977 __ jcc(Assembler::above, L_failed);
2978
2979 // if (dst_pos + length > arrayOop(dst)->length()) FAIL;
2980 __ movl(temp, length);
2981 __ addl(temp, dst_pos); // dst_pos + length
2982 __ cmpl(temp, Address(dst, arrayOopDesc::length_offset_in_bytes()));
2983 __ jcc(Assembler::above, L_failed);
2984
2985 // Have to clean up high 32-bits of 'src_pos' and 'dst_pos'.
2986 // Move with sign extension can be used since they are positive.
2987 __ movslq(src_pos, src_pos);
2988 __ movslq(dst_pos, dst_pos);
2989
2990 BLOCK_COMMENT("arraycopy_range_checks done");
2991 }
2992
2993 //
2994 // Generate generic array copy stubs
2995 //
2996 // Input:
2997 // c_rarg0 - src oop
2998 // c_rarg1 - src_pos (32-bits)
2999 // c_rarg2 - dst oop
3000 // c_rarg3 - dst_pos (32-bits)
3001 // not Win64
3002 // c_rarg4 - element count (32-bits)
3003 // Win64
3004 // rsp+40 - element count (32-bits)
3005 //
3006 // Output:
3007 // rax == 0 - success
3008 // rax == -1^K - failure, where K is partial transfer count
3009 //
3010 address generate_generic_copy(const char *name,
3011 address byte_copy_entry, address short_copy_entry,
3012 address int_copy_entry, address oop_copy_entry,
3013 address long_copy_entry, address checkcast_copy_entry) {
3014
3015 Label L_failed, L_failed_0, L_objArray;
3016 Label L_copy_shorts, L_copy_ints, L_copy_longs;
3017
3018 // Input registers
3019 const Register src = c_rarg0; // source array oop
3020 const Register src_pos = c_rarg1; // source position
3021 const Register dst = c_rarg2; // destination array oop
3022 const Register dst_pos = c_rarg3; // destination position
3023 #ifndef _WIN64
3024 const Register length = c_rarg4;
3025 const Register rklass_tmp = r9; // load_klass
3026 #else
3027 const Address length(rsp, 7 * wordSize); // elements count is on stack on Win64
3028 const Register rklass_tmp = rdi; // load_klass
3029 #endif
3030
3031 { int modulus = CodeEntryAlignment;
3032 int target = modulus - 5; // 5 = sizeof jmp(L_failed)
3033 int advance = target - (__ offset() % modulus);
3034 if (advance < 0) advance += modulus;
3035 if (advance > 0) __ nop(advance);
3036 }
3037 StubCodeMark mark(this, "StubRoutines", name);
3038
3039 // Short-hop target to L_failed. Makes for denser prologue code.
3040 __ BIND(L_failed_0);
3041 __ jmp(L_failed);
3042 assert(__ offset() % CodeEntryAlignment == 0, "no further alignment needed");
3043
3044 __ align(CodeEntryAlignment);
3045 address start = __ pc();
3046
3047 __ enter(); // required for proper stackwalking of RuntimeStub frame
3048
3049 #ifdef _WIN64
3050 __ push(rklass_tmp); // rdi is callee-save on Windows
3051 #endif
3052
3053 // bump this on entry, not on exit:
3054 inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
3055
3056 //-----------------------------------------------------------------------
3057 // Assembler stub will be used for this call to arraycopy
3058 // if the following conditions are met:
3059 //
3060 // (1) src and dst must not be null.
3061 // (2) src_pos must not be negative.
3062 // (3) dst_pos must not be negative.
3063 // (4) length must not be negative.
3064 // (5) src klass and dst klass should be the same and not NULL.
3065 // (6) src and dst should be arrays.
3066 // (7) src_pos + length must not exceed length of src.
3067 // (8) dst_pos + length must not exceed length of dst.
3068 //
3069
3070 // if (src == NULL) return -1;
3071 __ testptr(src, src); // src oop
3072 size_t j1off = __ offset();
3073 __ jccb(Assembler::zero, L_failed_0);
3074
3075 // if (src_pos < 0) return -1;
3076 __ testl(src_pos, src_pos); // src_pos (32-bits)
3077 __ jccb(Assembler::negative, L_failed_0);
3078
3079 // if (dst == NULL) return -1;
3080 __ testptr(dst, dst); // dst oop
3081 __ jccb(Assembler::zero, L_failed_0);
3082
3083 // if (dst_pos < 0) return -1;
3084 __ testl(dst_pos, dst_pos); // dst_pos (32-bits)
3085 size_t j4off = __ offset();
3086 __ jccb(Assembler::negative, L_failed_0);
3087
3088 // The first four tests are very dense code,
3089 // but not quite dense enough to put four
3090 // jumps in a 16-byte instruction fetch buffer.
// That's good, because some branch predictors
3092 // do not like jumps so close together.
3093 // Make sure of this.
3094 guarantee(((j1off ^ j4off) & ~15) != 0, "I$ line of 1st & 4th jumps");
3095
3096 // registers used as temp
3097 const Register r11_length = r11; // elements count to copy
3098 const Register r10_src_klass = r10; // array klass
3099
3100 // if (length < 0) return -1;
3101 __ movl(r11_length, length); // length (elements count, 32-bits value)
3102 __ testl(r11_length, r11_length);
3103 __ jccb(Assembler::negative, L_failed_0);
3104
3105 __ load_klass(r10_src_klass, src, rklass_tmp);
3106 #ifdef ASSERT
3107 // assert(src->klass() != NULL);
3108 {
3109 BLOCK_COMMENT("assert klasses not null {");
3110 Label L1, L2;
3111 __ testptr(r10_src_klass, r10_src_klass);
3112 __ jcc(Assembler::notZero, L2); // it is broken if klass is NULL
3113 __ bind(L1);
3114 __ stop("broken null klass");
3115 __ bind(L2);
3116 __ load_klass(rax, dst, rklass_tmp);
3117 __ cmpq(rax, 0);
3118 __ jcc(Assembler::equal, L1); // this would be broken also
3119 BLOCK_COMMENT("} assert klasses not null done");
3120 }
3121 #endif
3122
3123 // Load layout helper (32-bits)
3124 //
3125 // |array_tag| | header_size | element_type | |log2_element_size|
3126 // 32 30 24 16 8 2 0
3127 //
3128 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
3129 //
3130
3131 const int lh_offset = in_bytes(Klass::layout_helper_offset());
3132
3133 // Handle objArrays completely differently...
3134 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
3135 __ cmpl(Address(r10_src_klass, lh_offset), objArray_lh);
3136 __ jcc(Assembler::equal, L_objArray);
3137
3138 // if (src->klass() != dst->klass()) return -1;
3139 __ load_klass(rax, dst, rklass_tmp);
3140 __ cmpq(r10_src_klass, rax);
3141 __ jcc(Assembler::notEqual, L_failed);
3142
3143 const Register rax_lh = rax; // layout helper
3144 __ movl(rax_lh, Address(r10_src_klass, lh_offset));
3145
3146 // if (!src->is_Array()) return -1;
3147 __ cmpl(rax_lh, Klass::_lh_neutral_value);
3148 __ jcc(Assembler::greaterEqual, L_failed);
3149
3150 // At this point, it is known to be a typeArray (array_tag 0x3).
3151 #ifdef ASSERT
3152 {
3153 BLOCK_COMMENT("assert primitive array {");
3154 Label L;
3155 __ cmpl(rax_lh, (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift));
3156 __ jcc(Assembler::greaterEqual, L);
3157 __ stop("must be a primitive array");
3158 __ bind(L);
3159 BLOCK_COMMENT("} assert primitive array done");
3160 }
3161 #endif
3162
3163 arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
3164 r10, L_failed);
3165
3166 // TypeArrayKlass
3167 //
3168 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
3169 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
3170 //
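// Rough C sketch of the layout helper decoding below (shifts and masks per
// the Klass::_lh_* constants):
//
//   int lh    = src_klass->layout_helper();
//   int hsize = (lh >> _lh_header_size_shift) & _lh_header_size_mask;  // array_header_in_bytes()
//   int l2es  =  lh & _lh_log2_element_size_mask;                      // log2elemsize
//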
3171
3172 const Register r10_offset = r10; // array offset
3173 const Register rax_elsize = rax_lh; // element size
3174
3175 __ movl(r10_offset, rax_lh);
3176 __ shrl(r10_offset, Klass::_lh_header_size_shift);
3177 __ andptr(r10_offset, Klass::_lh_header_size_mask); // array_offset
3178 __ addptr(src, r10_offset); // src array offset
3179 __ addptr(dst, r10_offset); // dst array offset
3180 BLOCK_COMMENT("choose copy loop based on element size");
3181 __ andl(rax_lh, Klass::_lh_log2_element_size_mask); // rax_lh -> rax_elsize
3182
3183 #ifdef _WIN64
3184 __ pop(rklass_tmp); // Restore callee-save rdi
3185 #endif
3186
3187     // The following registers must be set before jumping to the corresponding copy stub.
3188 const Register from = c_rarg0; // source array address
3189 const Register to = c_rarg1; // destination array address
3190 const Register count = c_rarg2; // elements count
3191
3192     // The 'from', 'to' and 'count' registers must be set in this order,
3193     // since they alias the incoming 'src', 'src_pos' and 'dst' registers.
3194
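    // The element-size dispatch below is, in effect (comment-only sketch):
    //   switch (rax_elsize) {            // log2 of the element size
    //     case 0:  tail-call byte_copy_entry;
    //     case 1:  tail-call short_copy_entry;
    //     case 2:  tail-call int_copy_entry;
    //     default: tail-call long_copy_entry;   // asserted below to be log2(8) == 3
    //   }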
3195 __ cmpl(rax_elsize, 0);
3196 __ jccb(Assembler::notEqual, L_copy_shorts);
3197 __ lea(from, Address(src, src_pos, Address::times_1, 0));// src_addr
3198 __ lea(to, Address(dst, dst_pos, Address::times_1, 0));// dst_addr
3199 __ movl2ptr(count, r11_length); // length
3200 __ jump(RuntimeAddress(byte_copy_entry));
3201
3202 __ BIND(L_copy_shorts);
3203 __ cmpl(rax_elsize, LogBytesPerShort);
3204 __ jccb(Assembler::notEqual, L_copy_ints);
3205 __ lea(from, Address(src, src_pos, Address::times_2, 0));// src_addr
3206 __ lea(to, Address(dst, dst_pos, Address::times_2, 0));// dst_addr
3207 __ movl2ptr(count, r11_length); // length
3208 __ jump(RuntimeAddress(short_copy_entry));
3209
3210 __ BIND(L_copy_ints);
3211 __ cmpl(rax_elsize, LogBytesPerInt);
3212 __ jccb(Assembler::notEqual, L_copy_longs);
3213 __ lea(from, Address(src, src_pos, Address::times_4, 0));// src_addr
3214 __ lea(to, Address(dst, dst_pos, Address::times_4, 0));// dst_addr
3215 __ movl2ptr(count, r11_length); // length
3216 __ jump(RuntimeAddress(int_copy_entry));
3217
3218 __ BIND(L_copy_longs);
3219 #ifdef ASSERT
3220 {
3221 BLOCK_COMMENT("assert long copy {");
3222 Label L;
3223 __ cmpl(rax_elsize, LogBytesPerLong);
3224 __ jcc(Assembler::equal, L);
3225 __ stop("must be long copy, but elsize is wrong");
3226 __ bind(L);
3227 BLOCK_COMMENT("} assert long copy done");
3228 }
3229 #endif
3230 __ lea(from, Address(src, src_pos, Address::times_8, 0));// src_addr
3231 __ lea(to, Address(dst, dst_pos, Address::times_8, 0));// dst_addr
3232 __ movl2ptr(count, r11_length); // length
3233 __ jump(RuntimeAddress(long_copy_entry));
3234
3235 // ObjArrayKlass
3236 __ BIND(L_objArray);
3237 // live at this point: r10_src_klass, r11_length, src[_pos], dst[_pos]
3238
3239 Label L_plain_copy, L_checkcast_copy;
3240 // test array classes for subtyping
3241 __ load_klass(rax, dst, rklass_tmp);
3242 __ cmpq(r10_src_klass, rax); // usual case is exact equality
3243 __ jcc(Assembler::notEqual, L_checkcast_copy);
3244
3245 // Identically typed arrays can be copied without element-wise checks.
3246 arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
3247 r10, L_failed);
3248
3249 __ lea(from, Address(src, src_pos, TIMES_OOP,
3250 arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // src_addr
3251 __ lea(to, Address(dst, dst_pos, TIMES_OOP,
3252 arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // dst_addr
3253 __ movl2ptr(count, r11_length); // length
3254 __ BIND(L_plain_copy);
3255 #ifdef _WIN64
3256 __ pop(rklass_tmp); // Restore callee-save rdi
3257 #endif
3258 __ jump(RuntimeAddress(oop_copy_entry));
3259
3260 __ BIND(L_checkcast_copy);
3261 // live at this point: r10_src_klass, r11_length, rax (dst_klass)
3262 {
3263 // Before looking at dst.length, make sure dst is also an objArray.
3264 __ cmpl(Address(rax, lh_offset), objArray_lh);
3265 __ jcc(Assembler::notEqual, L_failed);
3266
3267 // It is safe to examine both src.length and dst.length.
3268 arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
3269 rax, L_failed);
3270
3271 const Register r11_dst_klass = r11;
3272 __ load_klass(r11_dst_klass, dst, rklass_tmp); // reload
3273
3274 // Marshal the base address arguments now, freeing registers.
3275 __ lea(from, Address(src, src_pos, TIMES_OOP,
3276 arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
3277 __ lea(to, Address(dst, dst_pos, TIMES_OOP,
3278 arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
3279 __ movl(count, length); // length (reloaded)
3280 Register sco_temp = c_rarg3; // this register is free now
3281 assert_different_registers(from, to, count, sco_temp,
3282 r11_dst_klass, r10_src_klass);
3283 assert_clean_int(count, sco_temp);
3284
3285 // Generate the type check.
3286 const int sco_offset = in_bytes(Klass::super_check_offset_offset());
3287 __ movl(sco_temp, Address(r11_dst_klass, sco_offset));
3288 assert_clean_int(sco_temp, rax);
3289 generate_type_check(r10_src_klass, sco_temp, r11_dst_klass, L_plain_copy);
3290
3291 // Fetch destination element klass from the ObjArrayKlass header.
3292 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
3293 __ movptr(r11_dst_klass, Address(r11_dst_klass, ek_offset));
3294 __ movl( sco_temp, Address(r11_dst_klass, sco_offset));
3295 assert_clean_int(sco_temp, rax);
3296
3297 #ifdef _WIN64
3298 __ pop(rklass_tmp); // Restore callee-save rdi
3299 #endif
3300
3301 // the checkcast_copy loop needs two extra arguments:
3302 assert(c_rarg3 == sco_temp, "#3 already in place");
3303 // Set up arguments for checkcast_copy_entry.
3304 setup_arg_regs(4);
3305 __ movptr(r8, r11_dst_klass); // dst.klass.element_klass, r8 is c_rarg4 on Linux/Solaris
3306 __ jump(RuntimeAddress(checkcast_copy_entry));
3307 }
3308
3309 __ BIND(L_failed);
3310 #ifdef _WIN64
3311 __ pop(rklass_tmp); // Restore callee-save rdi
3312 #endif
3313 __ xorptr(rax, rax);
3314 __ notptr(rax); // return -1
3315 __ leave(); // required for proper stackwalking of RuntimeStub frame
3316 __ ret(0);
3317
3318 return start;
3319 }
3320
3321 address generate_data_cache_writeback() {
3322 const Register src = c_rarg0; // source address
3323
3324 __ align(CodeEntryAlignment);
3325
3326 StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback");
3327
3328 address start = __ pc();
3329 __ enter();
3330 __ cache_wb(Address(src, 0));
3331 __ leave();
3332 __ ret(0);
3333
3334 return start;
3335 }
3336
3337 address generate_data_cache_writeback_sync() {
3338 const Register is_pre = c_rarg0; // pre or post sync
3339
3340 __ align(CodeEntryAlignment);
3341
3342 StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync");
3343
3344 // pre wbsync is a no-op
3345 // post wbsync translates to an sfence
3346
3347 Label skip;
3348 address start = __ pc();
3349 __ enter();
3350 __ cmpl(is_pre, 0);
3351 __ jcc(Assembler::notEqual, skip);
3352 __ cache_wbsync(false);
3353 __ bind(skip);
3354 __ leave();
3355 __ ret(0);
3356
3357 return start;
3358 }
3359
3360 void generate_arraycopy_stubs() {
3361 address entry;
3362 address entry_jbyte_arraycopy;
3363 address entry_jshort_arraycopy;
3364 address entry_jint_arraycopy;
3365 address entry_oop_arraycopy;
3366 address entry_jlong_arraycopy;
3367 address entry_checkcast_arraycopy;
3368
3369 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry,
3370 "jbyte_disjoint_arraycopy");
3371 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry, &entry_jbyte_arraycopy,
3372 "jbyte_arraycopy");
3373
3374 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry,
3375 "jshort_disjoint_arraycopy");
3376 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry, &entry_jshort_arraycopy,
3377 "jshort_arraycopy");
3378
3379 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_oop_copy(false, false, &entry,
3380 "jint_disjoint_arraycopy");
3381 StubRoutines::_jint_arraycopy = generate_conjoint_int_oop_copy(false, false, entry,
3382 &entry_jint_arraycopy, "jint_arraycopy");
3383
3384 StubRoutines::_jlong_disjoint_arraycopy = generate_disjoint_long_oop_copy(false, false, &entry,
3385 "jlong_disjoint_arraycopy");
3386 StubRoutines::_jlong_arraycopy = generate_conjoint_long_oop_copy(false, false, entry,
3387 &entry_jlong_arraycopy, "jlong_arraycopy");
3388
3389
3390 if (UseCompressedOops) {
3391 StubRoutines::_oop_disjoint_arraycopy = generate_disjoint_int_oop_copy(false, true, &entry,
3392 "oop_disjoint_arraycopy");
3393 StubRoutines::_oop_arraycopy = generate_conjoint_int_oop_copy(false, true, entry,
3394 &entry_oop_arraycopy, "oop_arraycopy");
3395 StubRoutines::_oop_disjoint_arraycopy_uninit = generate_disjoint_int_oop_copy(false, true, &entry,
3396 "oop_disjoint_arraycopy_uninit",
3397 /*dest_uninitialized*/true);
3398 StubRoutines::_oop_arraycopy_uninit = generate_conjoint_int_oop_copy(false, true, entry,
3399 NULL, "oop_arraycopy_uninit",
3400 /*dest_uninitialized*/true);
3401 } else {
3402 StubRoutines::_oop_disjoint_arraycopy = generate_disjoint_long_oop_copy(false, true, &entry,
3403 "oop_disjoint_arraycopy");
3404 StubRoutines::_oop_arraycopy = generate_conjoint_long_oop_copy(false, true, entry,
3405 &entry_oop_arraycopy, "oop_arraycopy");
3406 StubRoutines::_oop_disjoint_arraycopy_uninit = generate_disjoint_long_oop_copy(false, true, &entry,
3407 "oop_disjoint_arraycopy_uninit",
3408 /*dest_uninitialized*/true);
3409 StubRoutines::_oop_arraycopy_uninit = generate_conjoint_long_oop_copy(false, true, entry,
3410 NULL, "oop_arraycopy_uninit",
3411 /*dest_uninitialized*/true);
3412 }
3413
3414 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
3415 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
3416 /*dest_uninitialized*/true);
3417
3418 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy",
3419 entry_jbyte_arraycopy,
3420 entry_jshort_arraycopy,
3421 entry_jint_arraycopy,
3422 entry_jlong_arraycopy);
3423 StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy",
3424 entry_jbyte_arraycopy,
3425 entry_jshort_arraycopy,
3426 entry_jint_arraycopy,
3427 entry_oop_arraycopy,
3428 entry_jlong_arraycopy,
3429 entry_checkcast_arraycopy);
3430
3431 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
3432 StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
3433 StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
3434 StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
3435 StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
3436 StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
3437
3438 // We don't generate specialized code for HeapWord-aligned source
3439 // arrays, so just use the code we've already generated
3440 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = StubRoutines::_jbyte_disjoint_arraycopy;
3441 StubRoutines::_arrayof_jbyte_arraycopy = StubRoutines::_jbyte_arraycopy;
3442
3443 StubRoutines::_arrayof_jshort_disjoint_arraycopy = StubRoutines::_jshort_disjoint_arraycopy;
3444 StubRoutines::_arrayof_jshort_arraycopy = StubRoutines::_jshort_arraycopy;
3445
3446 StubRoutines::_arrayof_jint_disjoint_arraycopy = StubRoutines::_jint_disjoint_arraycopy;
3447 StubRoutines::_arrayof_jint_arraycopy = StubRoutines::_jint_arraycopy;
3448
3449 StubRoutines::_arrayof_jlong_disjoint_arraycopy = StubRoutines::_jlong_disjoint_arraycopy;
3450 StubRoutines::_arrayof_jlong_arraycopy = StubRoutines::_jlong_arraycopy;
3451
3452 StubRoutines::_arrayof_oop_disjoint_arraycopy = StubRoutines::_oop_disjoint_arraycopy;
3453 StubRoutines::_arrayof_oop_arraycopy = StubRoutines::_oop_arraycopy;
3454
3455 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = StubRoutines::_oop_disjoint_arraycopy_uninit;
3456 StubRoutines::_arrayof_oop_arraycopy_uninit = StubRoutines::_oop_arraycopy_uninit;
3457 }
3458
3459 // AES intrinsic stubs
3460 enum {AESBlockSize = 16};
3461
3462 address generate_key_shuffle_mask() {
3463 __ align(16);
3464 StubCodeMark mark(this, "StubRoutines", "key_shuffle_mask");
3465 address start = __ pc();
3466 __ emit_data64( 0x0405060700010203, relocInfo::none );
3467 __ emit_data64( 0x0c0d0e0f08090a0b, relocInfo::none );
3468 return start;
3469 }
3470
3471 address generate_counter_shuffle_mask() {
3472 __ align(16);
3473 StubCodeMark mark(this, "StubRoutines", "counter_shuffle_mask");
3474 address start = __ pc();
3475 __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
3476 __ emit_data64(0x0001020304050607, relocInfo::none);
3477 return start;
3478 }
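  // Note (illustrative reading of the two constants above): both are (v)pshufb control
  // vectors. key_shuffle_mask reverses the bytes within each 32-bit word, converting
  // the big-endian ints of the Java-expanded key schedule into the byte order the AES
  // instructions expect; counter_shuffle_mask reverses all 16 bytes, converting the
  // big-endian CTR counter to little-endian form and back.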
3479
3480   // Utility routine for loading a 128-bit key word in little-endian format.
3481   // The caller can optionally pass a shuffle mask that is already loaded into an XMM register.
3482 void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
3483 __ movdqu(xmmdst, Address(key, offset));
3484 if (xmm_shuf_mask != NULL) {
3485 __ pshufb(xmmdst, xmm_shuf_mask);
3486 } else {
3487 __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3488 }
3489 }
3490
3491   // Utility routine for incrementing the 128-bit counter (the IV in CTR mode)
3492 void inc_counter(Register reg, XMMRegister xmmdst, int inc_delta, Label& next_block) {
3493 __ pextrq(reg, xmmdst, 0x0);
3494 __ addq(reg, inc_delta);
3495 __ pinsrq(xmmdst, reg, 0x0);
3496 __ jcc(Assembler::carryClear, next_block); // jump if no carry
3497 __ pextrq(reg, xmmdst, 0x01); // Carry
3498 __ addq(reg, 0x01);
3499 __ pinsrq(xmmdst, reg, 0x01); //Carry end
3500 __ BIND(next_block); // next instruction
3501 }
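  // Roughly equivalent C sketch of inc_counter above (illustrative, not generated code):
  //   ctr.lo += inc_delta;
  //   if (ctr.lo < inc_delta) {   // the addition carried out of the low qword
  //     ctr.hi += 1;
  //   }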
3502
3503 // Arguments:
3504 //
3505 // Inputs:
3506 // c_rarg0 - source byte array address
3507 // c_rarg1 - destination byte array address
3508 // c_rarg2 - K (key) in little endian int array
3509 //
3510 address generate_aescrypt_encryptBlock() {
3511 assert(UseAES, "need AES instructions and misaligned SSE support");
3512 __ align(CodeEntryAlignment);
3513 StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
3514 Label L_doLast;
3515 address start = __ pc();
3516
3517 const Register from = c_rarg0; // source array address
3518 const Register to = c_rarg1; // destination array address
3519 const Register key = c_rarg2; // key array address
3520 const Register keylen = rax;
3521
3522 const XMMRegister xmm_result = xmm0;
3523 const XMMRegister xmm_key_shuf_mask = xmm1;
3524 // On win64 xmm6-xmm15 must be preserved so don't use them.
3525 const XMMRegister xmm_temp1 = xmm2;
3526 const XMMRegister xmm_temp2 = xmm3;
3527 const XMMRegister xmm_temp3 = xmm4;
3528 const XMMRegister xmm_temp4 = xmm5;
3529
3530 __ enter(); // required for proper stackwalking of RuntimeStub frame
3531
3532 // keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
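    // (An expanded key of 44/52/60 ints corresponds to AES-128/192/256, i.e. 10/12/14
    // rounds; the cmpl(keylen, 44) and cmpl(keylen, 52) branches below select the
    // matching number of aesenc rounds.)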
3533 __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3534
3535 __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3536 __ movdqu(xmm_result, Address(from, 0)); // get 16 bytes of input
3537
3538 // For encryption, the java expanded key ordering is just what we need
3539 // we don't know if the key is aligned, hence not using load-execute form
3540
3541 load_key(xmm_temp1, key, 0x00, xmm_key_shuf_mask);
3542 __ pxor(xmm_result, xmm_temp1);
3543
3544 load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
3545 load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
3546 load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
3547 load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);
3548
3549 __ aesenc(xmm_result, xmm_temp1);
3550 __ aesenc(xmm_result, xmm_temp2);
3551 __ aesenc(xmm_result, xmm_temp3);
3552 __ aesenc(xmm_result, xmm_temp4);
3553
3554 load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
3555 load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
3556 load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
3557 load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);
3558
3559 __ aesenc(xmm_result, xmm_temp1);
3560 __ aesenc(xmm_result, xmm_temp2);
3561 __ aesenc(xmm_result, xmm_temp3);
3562 __ aesenc(xmm_result, xmm_temp4);
3563
3564 load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
3565 load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
3566
3567 __ cmpl(keylen, 44);
3568 __ jccb(Assembler::equal, L_doLast);
3569
3570 __ aesenc(xmm_result, xmm_temp1);
3571 __ aesenc(xmm_result, xmm_temp2);
3572
3573 load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
3574 load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);
3575
3576 __ cmpl(keylen, 52);
3577 __ jccb(Assembler::equal, L_doLast);
3578
3579 __ aesenc(xmm_result, xmm_temp1);
3580 __ aesenc(xmm_result, xmm_temp2);
3581
3582 load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
3583 load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);
3584
3585 __ BIND(L_doLast);
3586 __ aesenc(xmm_result, xmm_temp1);
3587 __ aesenclast(xmm_result, xmm_temp2);
3588 __ movdqu(Address(to, 0), xmm_result); // store the result
3589 __ xorptr(rax, rax); // return 0
3590 __ leave(); // required for proper stackwalking of RuntimeStub frame
3591 __ ret(0);
3592
3593 return start;
3594 }
3595
3596
3597 // Arguments:
3598 //
3599 // Inputs:
3600 // c_rarg0 - source byte array address
3601 // c_rarg1 - destination byte array address
3602 // c_rarg2 - K (key) in little endian int array
3603 //
3604 address generate_aescrypt_decryptBlock() {
3605 assert(UseAES, "need AES instructions and misaligned SSE support");
3606 __ align(CodeEntryAlignment);
3607 StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
3608 Label L_doLast;
3609 address start = __ pc();
3610
3611 const Register from = c_rarg0; // source array address
3612 const Register to = c_rarg1; // destination array address
3613 const Register key = c_rarg2; // key array address
3614 const Register keylen = rax;
3615
3616 const XMMRegister xmm_result = xmm0;
3617 const XMMRegister xmm_key_shuf_mask = xmm1;
3618 // On win64 xmm6-xmm15 must be preserved so don't use them.
3619 const XMMRegister xmm_temp1 = xmm2;
3620 const XMMRegister xmm_temp2 = xmm3;
3621 const XMMRegister xmm_temp3 = xmm4;
3622 const XMMRegister xmm_temp4 = xmm5;
3623
3624 __ enter(); // required for proper stackwalking of RuntimeStub frame
3625
3626 // keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
3627 __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3628
3629 __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3630 __ movdqu(xmm_result, Address(from, 0));
3631
3632     // For decryption, the Java expanded key ordering is rotated one position from what we want,
3633     // so we start from 0x10 here and hit 0x00 last.
3634 // we don't know if the key is aligned, hence not using load-execute form
3635 load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
3636 load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
3637 load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
3638 load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);
3639
3640 __ pxor (xmm_result, xmm_temp1);
3641 __ aesdec(xmm_result, xmm_temp2);
3642 __ aesdec(xmm_result, xmm_temp3);
3643 __ aesdec(xmm_result, xmm_temp4);
3644
3645 load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
3646 load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
3647 load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
3648 load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);
3649
3650 __ aesdec(xmm_result, xmm_temp1);
3651 __ aesdec(xmm_result, xmm_temp2);
3652 __ aesdec(xmm_result, xmm_temp3);
3653 __ aesdec(xmm_result, xmm_temp4);
3654
3655 load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
3656 load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
3657 load_key(xmm_temp3, key, 0x00, xmm_key_shuf_mask);
3658
3659 __ cmpl(keylen, 44);
3660 __ jccb(Assembler::equal, L_doLast);
3661
3662 __ aesdec(xmm_result, xmm_temp1);
3663 __ aesdec(xmm_result, xmm_temp2);
3664
3665 load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
3666 load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);
3667
3668 __ cmpl(keylen, 52);
3669 __ jccb(Assembler::equal, L_doLast);
3670
3671 __ aesdec(xmm_result, xmm_temp1);
3672 __ aesdec(xmm_result, xmm_temp2);
3673
3674 load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
3675 load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);
3676
3677 __ BIND(L_doLast);
3678 __ aesdec(xmm_result, xmm_temp1);
3679 __ aesdec(xmm_result, xmm_temp2);
3680
3681 // for decryption the aesdeclast operation is always on key+0x00
3682 __ aesdeclast(xmm_result, xmm_temp3);
3683 __ movdqu(Address(to, 0), xmm_result); // store the result
3684 __ xorptr(rax, rax); // return 0
3685 __ leave(); // required for proper stackwalking of RuntimeStub frame
3686 __ ret(0);
3687
3688 return start;
3689 }
3690
3691
3692 // Arguments:
3693 //
3694 // Inputs:
3695 // c_rarg0 - source byte array address
3696 // c_rarg1 - destination byte array address
3697 // c_rarg2 - K (key) in little endian int array
3698 // c_rarg3 - r vector byte array address
3699 // c_rarg4 - input length
3700 //
3701 // Output:
3702 // rax - input length
3703 //
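  // CBC encryption, illustratively: C[i] = E_K(P[i] ^ C[i-1]) with C[-1] taken from
  // the r vector. Each block depends on the previous ciphertext block, so the loops
  // below are inherently serial and handle one 16-byte block per iteration.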
3704 address generate_cipherBlockChaining_encryptAESCrypt() {
3705 assert(UseAES, "need AES instructions and misaligned SSE support");
3706 __ align(CodeEntryAlignment);
3707 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
3708 address start = __ pc();
3709
3710 Label L_exit, L_key_192_256, L_key_256, L_loopTop_128, L_loopTop_192, L_loopTop_256;
3711 const Register from = c_rarg0; // source array address
3712 const Register to = c_rarg1; // destination array address
3713 const Register key = c_rarg2; // key array address
3714 const Register rvec = c_rarg3; // r byte array initialized from initvector array address
3715 // and left with the results of the last encryption block
3716 #ifndef _WIN64
3717 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16)
3718 #else
3719 const Address len_mem(rbp, 6 * wordSize); // length is on stack on Win64
3720 const Register len_reg = r11; // pick the volatile windows register
3721 #endif
3722 const Register pos = rax;
3723
3724 // xmm register assignments for the loops below
3725 const XMMRegister xmm_result = xmm0;
3726 const XMMRegister xmm_temp = xmm1;
3727 // keys 0-10 preloaded into xmm2-xmm12
3728 const int XMM_REG_NUM_KEY_FIRST = 2;
3729 const int XMM_REG_NUM_KEY_LAST = 15;
3730 const XMMRegister xmm_key0 = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
3731 const XMMRegister xmm_key10 = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+10);
3732 const XMMRegister xmm_key11 = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+11);
3733 const XMMRegister xmm_key12 = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+12);
3734 const XMMRegister xmm_key13 = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+13);
3735
3736 __ enter(); // required for proper stackwalking of RuntimeStub frame
3737
3738 #ifdef _WIN64
3739 // on win64, fill len_reg from stack position
3740 __ movl(len_reg, len_mem);
3741 #else
3742 __ push(len_reg); // Save
3743 #endif
3744
3745 const XMMRegister xmm_key_shuf_mask = xmm_temp; // used temporarily to swap key bytes up front
3746 __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3747 // load up xmm regs xmm2 thru xmm12 with key 0x00 - 0xa0
3748 for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x00; rnum <= XMM_REG_NUM_KEY_FIRST+10; rnum++) {
3749 load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
3750 offset += 0x10;
3751 }
3752 __ movdqu(xmm_result, Address(rvec, 0x00)); // initialize xmm_result with r vec
3753
3754     // Now split into different paths depending on the key length (length in ints of the AESCrypt.KLE array: 44 = 128-bit, 52 = 192-bit, 60 = 256-bit).
3755 __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3756 __ cmpl(rax, 44);
3757 __ jcc(Assembler::notEqual, L_key_192_256);
3758
3759 // 128 bit code follows here
3760 __ movptr(pos, 0);
3761 __ align(OptoLoopAlignment);
3762
3763 __ BIND(L_loopTop_128);
3764 __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input
3765 __ pxor (xmm_result, xmm_temp); // xor with the current r vector
3766 __ pxor (xmm_result, xmm_key0); // do the aes rounds
3767 for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_FIRST + 9; rnum++) {
3768 __ aesenc(xmm_result, as_XMMRegister(rnum));
3769 }
3770 __ aesenclast(xmm_result, xmm_key10);
3771 __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output
3772 // no need to store r to memory until we exit
3773 __ addptr(pos, AESBlockSize);
3774 __ subptr(len_reg, AESBlockSize);
3775 __ jcc(Assembler::notEqual, L_loopTop_128);
3776
3777 __ BIND(L_exit);
3778 __ movdqu(Address(rvec, 0), xmm_result); // final value of r stored in rvec of CipherBlockChaining object
3779
3780 #ifdef _WIN64
3781 __ movl(rax, len_mem);
3782 #else
3783 __ pop(rax); // return length
3784 #endif
3785 __ leave(); // required for proper stackwalking of RuntimeStub frame
3786 __ ret(0);
3787
3788 __ BIND(L_key_192_256);
3789 // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
3790 load_key(xmm_key11, key, 0xb0, xmm_key_shuf_mask);
3791 load_key(xmm_key12, key, 0xc0, xmm_key_shuf_mask);
3792 __ cmpl(rax, 52);
3793 __ jcc(Assembler::notEqual, L_key_256);
3794
3795 // 192-bit code follows here (could be changed to use more xmm registers)
3796 __ movptr(pos, 0);
3797 __ align(OptoLoopAlignment);
3798
3799 __ BIND(L_loopTop_192);
3800 __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input
3801 __ pxor (xmm_result, xmm_temp); // xor with the current r vector
3802 __ pxor (xmm_result, xmm_key0); // do the aes rounds
3803 for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_FIRST + 11; rnum++) {
3804 __ aesenc(xmm_result, as_XMMRegister(rnum));
3805 }
3806 __ aesenclast(xmm_result, xmm_key12);
3807 __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output
3808 // no need to store r to memory until we exit
3809 __ addptr(pos, AESBlockSize);
3810 __ subptr(len_reg, AESBlockSize);
3811 __ jcc(Assembler::notEqual, L_loopTop_192);
3812 __ jmp(L_exit);
3813
3814 __ BIND(L_key_256);
3815 // 256-bit code follows here (could be changed to use more xmm registers)
3816 load_key(xmm_key13, key, 0xd0, xmm_key_shuf_mask);
3817 __ movptr(pos, 0);
3818 __ align(OptoLoopAlignment);
3819
3820 __ BIND(L_loopTop_256);
3821 __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input
3822 __ pxor (xmm_result, xmm_temp); // xor with the current r vector
3823 __ pxor (xmm_result, xmm_key0); // do the aes rounds
3824 for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_FIRST + 13; rnum++) {
3825 __ aesenc(xmm_result, as_XMMRegister(rnum));
3826 }
3827 load_key(xmm_temp, key, 0xe0);
3828 __ aesenclast(xmm_result, xmm_temp);
3829 __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output
3830 // no need to store r to memory until we exit
3831 __ addptr(pos, AESBlockSize);
3832 __ subptr(len_reg, AESBlockSize);
3833 __ jcc(Assembler::notEqual, L_loopTop_256);
3834 __ jmp(L_exit);
3835
3836 return start;
3837 }
3838
3839 // This is a version of CBC/AES Decrypt which does 4 blocks in a loop at a time
3840 // to hide instruction latency
3841 //
3842 // Arguments:
3843 //
3844 // Inputs:
3845 // c_rarg0 - source byte array address
3846 // c_rarg1 - destination byte array address
3847 // c_rarg2 - K (key) in little endian int array
3848 // c_rarg3 - r vector byte array address
3849 // c_rarg4 - input length
3850 //
3851 // Output:
3852 // rax - input length
3853 //
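  // CBC decryption, illustratively: P[i] = D_K(C[i]) ^ C[i-1] with C[-1] taken from
  // the r vector. Every term on the right-hand side is already-known ciphertext, so
  // unlike encryption the blocks are independent; the PARALLEL_FACTOR == 4 loops
  // below run four blocks through the AES rounds at once and xor afterwards.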
3854 address generate_cipherBlockChaining_decryptAESCrypt_Parallel() {
3855 assert(UseAES, "need AES instructions and misaligned SSE support");
3856 __ align(CodeEntryAlignment);
3857 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
3858 address start = __ pc();
3859
3860 const Register from = c_rarg0; // source array address
3861 const Register to = c_rarg1; // destination array address
3862 const Register key = c_rarg2; // key array address
3863 const Register rvec = c_rarg3; // r byte array initialized from initvector array address
3864 // and left with the results of the last encryption block
3865 #ifndef _WIN64
3866 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16)
3867 #else
3868 const Address len_mem(rbp, 6 * wordSize); // length is on stack on Win64
3869 const Register len_reg = r11; // pick the volatile windows register
3870 #endif
3871 const Register pos = rax;
3872
3873 const int PARALLEL_FACTOR = 4;
3874 const int ROUNDS[3] = { 10, 12, 14 }; // aes rounds for key128, key192, key256
3875
3876 Label L_exit;
3877 Label L_singleBlock_loopTopHead[3]; // 128, 192, 256
3878 Label L_singleBlock_loopTopHead2[3]; // 128, 192, 256
3879 Label L_singleBlock_loopTop[3]; // 128, 192, 256
3880 Label L_multiBlock_loopTopHead[3]; // 128, 192, 256
3881 Label L_multiBlock_loopTop[3]; // 128, 192, 256
3882
3883 // keys 0-10 preloaded into xmm5-xmm15
3884 const int XMM_REG_NUM_KEY_FIRST = 5;
3885 const int XMM_REG_NUM_KEY_LAST = 15;
3886 const XMMRegister xmm_key_first = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
3887 const XMMRegister xmm_key_last = as_XMMRegister(XMM_REG_NUM_KEY_LAST);
3888
3889 __ enter(); // required for proper stackwalking of RuntimeStub frame
3890
3891 #ifdef _WIN64
3892 // on win64, fill len_reg from stack position
3893 __ movl(len_reg, len_mem);
3894 #else
3895 __ push(len_reg); // Save
3896 #endif
3897 __ push(rbx);
3898     // The Java expanded key ordering is rotated one position from what we want,
3899     // so we start from 0x10 here and hit 0x00 last.
3900 const XMMRegister xmm_key_shuf_mask = xmm1; // used temporarily to swap key bytes up front
3901 __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3902 // load up xmm regs 5 thru 15 with key 0x10 - 0xa0 - 0x00
3903 for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x10; rnum < XMM_REG_NUM_KEY_LAST; rnum++) {
3904 load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
3905 offset += 0x10;
3906 }
3907 load_key(xmm_key_last, key, 0x00, xmm_key_shuf_mask);
3908
3909 const XMMRegister xmm_prev_block_cipher = xmm1; // holds cipher of previous block
3910
3911 // registers holding the four results in the parallelized loop
3912 const XMMRegister xmm_result0 = xmm0;
3913 const XMMRegister xmm_result1 = xmm2;
3914 const XMMRegister xmm_result2 = xmm3;
3915 const XMMRegister xmm_result3 = xmm4;
3916
3917 __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00)); // initialize with initial rvec
3918
3919 __ xorptr(pos, pos);
3920
3921     // Now split into different paths depending on the key length (length in ints of the AESCrypt.KLE array: 52 = 192-bit, 60 = 256-bit, otherwise 128-bit).
3922 __ movl(rbx, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3923 __ cmpl(rbx, 52);
3924 __ jcc(Assembler::equal, L_multiBlock_loopTopHead[1]);
3925 __ cmpl(rbx, 60);
3926 __ jcc(Assembler::equal, L_multiBlock_loopTopHead[2]);
3927
3928 #define DoFour(opc, src_reg) \
3929 __ opc(xmm_result0, src_reg); \
3930 __ opc(xmm_result1, src_reg); \
3931 __ opc(xmm_result2, src_reg); \
3932 __ opc(xmm_result3, src_reg); \
3933
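    // DoFour(op, key) applies one AES round instruction to all four in-flight blocks
    // with the same round key; issuing the four independent instructions back to back
    // is what hides the aesdec latency in the multi-block loop below.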
3934 for (int k = 0; k < 3; ++k) {
3935 __ BIND(L_multiBlock_loopTopHead[k]);
3936 if (k != 0) {
3937 __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least 4 blocks left
3938 __ jcc(Assembler::less, L_singleBlock_loopTopHead2[k]);
3939 }
3940 if (k == 1) {
3941 __ subptr(rsp, 6 * wordSize);
3942 __ movdqu(Address(rsp, 0), xmm15); //save last_key from xmm15
3943 load_key(xmm15, key, 0xb0); // 0xb0; 192-bit key goes up to 0xc0
3944 __ movdqu(Address(rsp, 2 * wordSize), xmm15);
3945 load_key(xmm1, key, 0xc0); // 0xc0;
3946 __ movdqu(Address(rsp, 4 * wordSize), xmm1);
3947 } else if (k == 2) {
3948 __ subptr(rsp, 10 * wordSize);
3949 __ movdqu(Address(rsp, 0), xmm15); //save last_key from xmm15
3950 load_key(xmm15, key, 0xd0); // 0xd0; 256-bit key goes up to 0xe0
3951 __ movdqu(Address(rsp, 6 * wordSize), xmm15);
3952 load_key(xmm1, key, 0xe0); // 0xe0;
3953 __ movdqu(Address(rsp, 8 * wordSize), xmm1);
3954 load_key(xmm15, key, 0xb0); // 0xb0;
3955 __ movdqu(Address(rsp, 2 * wordSize), xmm15);
3956 load_key(xmm1, key, 0xc0); // 0xc0;
3957 __ movdqu(Address(rsp, 4 * wordSize), xmm1);
3958 }
3959 __ align(OptoLoopAlignment);
3960 __ BIND(L_multiBlock_loopTop[k]);
3961 __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least 4 blocks left
3962 __ jcc(Assembler::less, L_singleBlock_loopTopHead[k]);
3963
3964 if (k != 0) {
3965 __ movdqu(xmm15, Address(rsp, 2 * wordSize));
3966 __ movdqu(xmm1, Address(rsp, 4 * wordSize));
3967 }
3968
3969 __ movdqu(xmm_result0, Address(from, pos, Address::times_1, 0 * AESBlockSize)); // get next 4 blocks into xmmresult registers
3970 __ movdqu(xmm_result1, Address(from, pos, Address::times_1, 1 * AESBlockSize));
3971 __ movdqu(xmm_result2, Address(from, pos, Address::times_1, 2 * AESBlockSize));
3972 __ movdqu(xmm_result3, Address(from, pos, Address::times_1, 3 * AESBlockSize));
3973
3974 DoFour(pxor, xmm_key_first);
3975 if (k == 0) {
3976 for (int rnum = 1; rnum < ROUNDS[k]; rnum++) {
3977 DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
3978 }
3979 DoFour(aesdeclast, xmm_key_last);
3980 } else if (k == 1) {
3981 for (int rnum = 1; rnum <= ROUNDS[k]-2; rnum++) {
3982 DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
3983 }
3984 __ movdqu(xmm_key_last, Address(rsp, 0)); // xmm15 needs to be loaded again.
3985 DoFour(aesdec, xmm1); // key : 0xc0
3986 __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00)); // xmm1 needs to be loaded again
3987 DoFour(aesdeclast, xmm_key_last);
3988 } else if (k == 2) {
3989 for (int rnum = 1; rnum <= ROUNDS[k] - 4; rnum++) {
3990 DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
3991 }
3992 DoFour(aesdec, xmm1); // key : 0xc0
3993 __ movdqu(xmm15, Address(rsp, 6 * wordSize));
3994 __ movdqu(xmm1, Address(rsp, 8 * wordSize));
3995 DoFour(aesdec, xmm15); // key : 0xd0
3996 __ movdqu(xmm_key_last, Address(rsp, 0)); // xmm15 needs to be loaded again.
3997 DoFour(aesdec, xmm1); // key : 0xe0
3998 __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00)); // xmm1 needs to be loaded again
3999 DoFour(aesdeclast, xmm_key_last);
4000 }
4001
4002 // for each result, xor with the r vector of previous cipher block
4003 __ pxor(xmm_result0, xmm_prev_block_cipher);
4004 __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 0 * AESBlockSize));
4005 __ pxor(xmm_result1, xmm_prev_block_cipher);
4006 __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 1 * AESBlockSize));
4007 __ pxor(xmm_result2, xmm_prev_block_cipher);
4008 __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 2 * AESBlockSize));
4009 __ pxor(xmm_result3, xmm_prev_block_cipher);
4010 __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 3 * AESBlockSize)); // this will carry over to next set of blocks
4011 if (k != 0) {
4012 __ movdqu(Address(rvec, 0x00), xmm_prev_block_cipher);
4013 }
4014
4015 __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0); // store 4 results into the next 64 bytes of output
4016 __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1);
4017 __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2);
4018 __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3);
4019
4020 __ addptr(pos, PARALLEL_FACTOR * AESBlockSize);
4021 __ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize);
4022 __ jmp(L_multiBlock_loopTop[k]);
4023
4024 // registers used in the non-parallelized loops
4025 // xmm register assignments for the loops below
4026 const XMMRegister xmm_result = xmm0;
4027 const XMMRegister xmm_prev_block_cipher_save = xmm2;
4028 const XMMRegister xmm_key11 = xmm3;
4029 const XMMRegister xmm_key12 = xmm4;
4030 const XMMRegister key_tmp = xmm4;
4031
4032 __ BIND(L_singleBlock_loopTopHead[k]);
4033 if (k == 1) {
4034 __ addptr(rsp, 6 * wordSize);
4035 } else if (k == 2) {
4036 __ addptr(rsp, 10 * wordSize);
4037 }
4038 __ cmpptr(len_reg, 0); // any blocks left??
4039 __ jcc(Assembler::equal, L_exit);
4040 __ BIND(L_singleBlock_loopTopHead2[k]);
4041 if (k == 1) {
4042 load_key(xmm_key11, key, 0xb0); // 0xb0; 192-bit key goes up to 0xc0
4043 load_key(xmm_key12, key, 0xc0); // 0xc0; 192-bit key goes up to 0xc0
4044 }
4045 if (k == 2) {
4046 load_key(xmm_key11, key, 0xb0); // 0xb0; 256-bit key goes up to 0xe0
4047 }
4048 __ align(OptoLoopAlignment);
4049 __ BIND(L_singleBlock_loopTop[k]);
4050 __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input
4051 __ movdqa(xmm_prev_block_cipher_save, xmm_result); // save for next r vector
4052 __ pxor(xmm_result, xmm_key_first); // do the aes dec rounds
4053 for (int rnum = 1; rnum <= 9 ; rnum++) {
4054 __ aesdec(xmm_result, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
4055 }
4056 if (k == 1) {
4057 __ aesdec(xmm_result, xmm_key11);
4058 __ aesdec(xmm_result, xmm_key12);
4059 }
4060 if (k == 2) {
4061 __ aesdec(xmm_result, xmm_key11);
4062 load_key(key_tmp, key, 0xc0);
4063 __ aesdec(xmm_result, key_tmp);
4064 load_key(key_tmp, key, 0xd0);
4065 __ aesdec(xmm_result, key_tmp);
4066 load_key(key_tmp, key, 0xe0);
4067 __ aesdec(xmm_result, key_tmp);
4068 }
4069
4070 __ aesdeclast(xmm_result, xmm_key_last); // xmm15 always came from key+0
4071 __ pxor(xmm_result, xmm_prev_block_cipher); // xor with the current r vector
4072 __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output
4073 // no need to store r to memory until we exit
4074 __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save); // set up next r vector with cipher input from this block
4075 __ addptr(pos, AESBlockSize);
4076 __ subptr(len_reg, AESBlockSize);
4077 __ jcc(Assembler::notEqual, L_singleBlock_loopTop[k]);
4078 if (k != 2) {
4079 __ jmp(L_exit);
4080 }
4081 } //for 128/192/256
4082
4083 __ BIND(L_exit);
4084 __ movdqu(Address(rvec, 0), xmm_prev_block_cipher); // final value of r stored in rvec of CipherBlockChaining object
4085 __ pop(rbx);
4086 #ifdef _WIN64
4087 __ movl(rax, len_mem);
4088 #else
4089 __ pop(rax); // return length
4090 #endif
4091 __ leave(); // required for proper stackwalking of RuntimeStub frame
4092 __ ret(0);
4093 return start;
4094 }
4095
4096 address generate_electronicCodeBook_encryptAESCrypt() {
4097 __ align(CodeEntryAlignment);
4098 StubCodeMark mark(this, "StubRoutines", "electronicCodeBook_encryptAESCrypt");
4099 address start = __ pc();
4100 const Register from = c_rarg0; // source array address
4101 const Register to = c_rarg1; // destination array address
4102 const Register key = c_rarg2; // key array address
4103 const Register len = c_rarg3; // src len (must be multiple of blocksize 16)
4104 __ enter(); // required for proper stackwalking of RuntimeStub frame
4105 __ aesecb_encrypt(from, to, key, len);
4106 __ vzeroupper();
4107 __ leave(); // required for proper stackwalking of RuntimeStub frame
4108 __ ret(0);
4109 return start;
4110 }
4111
4112 address generate_electronicCodeBook_decryptAESCrypt() {
4113 __ align(CodeEntryAlignment);
4114 StubCodeMark mark(this, "StubRoutines", "electronicCodeBook_decryptAESCrypt");
4115 address start = __ pc();
4116 const Register from = c_rarg0; // source array address
4117 const Register to = c_rarg1; // destination array address
4118 const Register key = c_rarg2; // key array address
4119 const Register len = c_rarg3; // src len (must be multiple of blocksize 16)
4120 __ enter(); // required for proper stackwalking of RuntimeStub frame
4121 __ aesecb_decrypt(from, to, key, len);
4122 __ vzeroupper();
4123 __ leave(); // required for proper stackwalking of RuntimeStub frame
4124 __ ret(0);
4125 return start;
4126 }
4127
4128   // ofs and limit are used for the multi-block byte array case.
4129 // int com.sun.security.provider.MD5.implCompress(byte[] b, int ofs)
4130 address generate_md5_implCompress(bool multi_block, const char *name) {
4131 __ align(CodeEntryAlignment);
4132 StubCodeMark mark(this, "StubRoutines", name);
4133 address start = __ pc();
4134
4135 const Register buf_param = r15;
4136 const Address state_param(rsp, 0 * wordSize);
4137 const Address ofs_param (rsp, 1 * wordSize );
4138 const Address limit_param(rsp, 1 * wordSize + 4);
4139
4140 __ enter();
4141 __ push(rbx);
4142 __ push(rdi);
4143 __ push(rsi);
4144 __ push(r15);
4145 __ subptr(rsp, 2 * wordSize);
4146
4147 __ movptr(buf_param, c_rarg0);
4148 __ movptr(state_param, c_rarg1);
4149 if (multi_block) {
4150 __ movl(ofs_param, c_rarg2);
4151 __ movl(limit_param, c_rarg3);
4152 }
4153 __ fast_md5(buf_param, state_param, ofs_param, limit_param, multi_block);
4154
4155 __ addptr(rsp, 2 * wordSize);
4156 __ pop(r15);
4157 __ pop(rsi);
4158 __ pop(rdi);
4159 __ pop(rbx);
4160 __ leave();
4161 __ ret(0);
4162 return start;
4163 }
4164
4165 address generate_upper_word_mask() {
4166 __ align64();
4167 StubCodeMark mark(this, "StubRoutines", "upper_word_mask");
4168 address start = __ pc();
4169 __ emit_data64(0x0000000000000000, relocInfo::none);
4170 __ emit_data64(0xFFFFFFFF00000000, relocInfo::none);
4171 return start;
4172 }
4173
4174 address generate_shuffle_byte_flip_mask() {
4175 __ align64();
4176 StubCodeMark mark(this, "StubRoutines", "shuffle_byte_flip_mask");
4177 address start = __ pc();
4178 __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
4179 __ emit_data64(0x0001020304050607, relocInfo::none);
4180 return start;
4181 }
4182
4183   // ofs and limit are used for the multi-block byte array case.
4184 // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
4185 address generate_sha1_implCompress(bool multi_block, const char *name) {
4186 __ align(CodeEntryAlignment);
4187 StubCodeMark mark(this, "StubRoutines", name);
4188 address start = __ pc();
4189
4190 Register buf = c_rarg0;
4191 Register state = c_rarg1;
4192 Register ofs = c_rarg2;
4193 Register limit = c_rarg3;
4194
4195 const XMMRegister abcd = xmm0;
4196 const XMMRegister e0 = xmm1;
4197 const XMMRegister e1 = xmm2;
4198 const XMMRegister msg0 = xmm3;
4199
4200 const XMMRegister msg1 = xmm4;
4201 const XMMRegister msg2 = xmm5;
4202 const XMMRegister msg3 = xmm6;
4203 const XMMRegister shuf_mask = xmm7;
4204
4205 __ enter();
4206
4207 __ subptr(rsp, 4 * wordSize);
4208
4209 __ fast_sha1(abcd, e0, e1, msg0, msg1, msg2, msg3, shuf_mask,
4210 buf, state, ofs, limit, rsp, multi_block);
4211
4212 __ addptr(rsp, 4 * wordSize);
4213
4214 __ leave();
4215 __ ret(0);
4216 return start;
4217 }
4218
4219 address generate_pshuffle_byte_flip_mask() {
4220 __ align64();
4221 StubCodeMark mark(this, "StubRoutines", "pshuffle_byte_flip_mask");
4222 address start = __ pc();
4223 __ emit_data64(0x0405060700010203, relocInfo::none);
4224 __ emit_data64(0x0c0d0e0f08090a0b, relocInfo::none);
4225
4226 if (VM_Version::supports_avx2()) {
4227 __ emit_data64(0x0405060700010203, relocInfo::none); // second copy
4228 __ emit_data64(0x0c0d0e0f08090a0b, relocInfo::none);
4229 // _SHUF_00BA
4230 __ emit_data64(0x0b0a090803020100, relocInfo::none);
4231 __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
4232 __ emit_data64(0x0b0a090803020100, relocInfo::none);
4233 __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
4234 // _SHUF_DC00
4235 __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
4236 __ emit_data64(0x0b0a090803020100, relocInfo::none);
4237 __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
4238 __ emit_data64(0x0b0a090803020100, relocInfo::none);
4239 }
4240
4241 return start;
4242 }
4243
4244   // Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
4245 address generate_pshuffle_byte_flip_mask_sha512() {
4246 __ align32();
4247 StubCodeMark mark(this, "StubRoutines", "pshuffle_byte_flip_mask_sha512");
4248 address start = __ pc();
4249 if (VM_Version::supports_avx2()) {
4250 __ emit_data64(0x0001020304050607, relocInfo::none); // PSHUFFLE_BYTE_FLIP_MASK
4251 __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
4252 __ emit_data64(0x1011121314151617, relocInfo::none);
4253 __ emit_data64(0x18191a1b1c1d1e1f, relocInfo::none);
4254 __ emit_data64(0x0000000000000000, relocInfo::none); //MASK_YMM_LO
4255 __ emit_data64(0x0000000000000000, relocInfo::none);
4256 __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
4257 __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
4258 }
4259
4260 return start;
4261 }
4262
4263   // ofs and limit are used for the multi-block byte array case.
4264 // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
4265 address generate_sha256_implCompress(bool multi_block, const char *name) {
4266 assert(VM_Version::supports_sha() || VM_Version::supports_avx2(), "");
4267 __ align(CodeEntryAlignment);
4268 StubCodeMark mark(this, "StubRoutines", name);
4269 address start = __ pc();
4270
4271 Register buf = c_rarg0;
4272 Register state = c_rarg1;
4273 Register ofs = c_rarg2;
4274 Register limit = c_rarg3;
4275
4276 const XMMRegister msg = xmm0;
4277 const XMMRegister state0 = xmm1;
4278 const XMMRegister state1 = xmm2;
4279 const XMMRegister msgtmp0 = xmm3;
4280
4281 const XMMRegister msgtmp1 = xmm4;
4282 const XMMRegister msgtmp2 = xmm5;
4283 const XMMRegister msgtmp3 = xmm6;
4284 const XMMRegister msgtmp4 = xmm7;
4285
4286 const XMMRegister shuf_mask = xmm8;
4287
4288 __ enter();
4289
4290 __ subptr(rsp, 4 * wordSize);
4291
4292 if (VM_Version::supports_sha()) {
4293 __ fast_sha256(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
4294 buf, state, ofs, limit, rsp, multi_block, shuf_mask);
4295 } else if (VM_Version::supports_avx2()) {
4296 __ sha256_AVX2(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
4297 buf, state, ofs, limit, rsp, multi_block, shuf_mask);
4298 }
4299 __ addptr(rsp, 4 * wordSize);
4300 __ vzeroupper();
4301 __ leave();
4302 __ ret(0);
4303 return start;
4304 }
4305
4306 address generate_sha512_implCompress(bool multi_block, const char *name) {
4307 assert(VM_Version::supports_avx2(), "");
4308 assert(VM_Version::supports_bmi2(), "");
4309 __ align(CodeEntryAlignment);
4310 StubCodeMark mark(this, "StubRoutines", name);
4311 address start = __ pc();
4312
4313 Register buf = c_rarg0;
4314 Register state = c_rarg1;
4315 Register ofs = c_rarg2;
4316 Register limit = c_rarg3;
4317
4318 const XMMRegister msg = xmm0;
4319 const XMMRegister state0 = xmm1;
4320 const XMMRegister state1 = xmm2;
4321 const XMMRegister msgtmp0 = xmm3;
4322 const XMMRegister msgtmp1 = xmm4;
4323 const XMMRegister msgtmp2 = xmm5;
4324 const XMMRegister msgtmp3 = xmm6;
4325 const XMMRegister msgtmp4 = xmm7;
4326
4327 const XMMRegister shuf_mask = xmm8;
4328
4329 __ enter();
4330
4331 __ sha512_AVX2(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
4332 buf, state, ofs, limit, rsp, multi_block, shuf_mask);
4333
4334 __ vzeroupper();
4335 __ leave();
4336 __ ret(0);
4337 return start;
4338 }
4339
4340 address ghash_polynomial512_addr() {
4341 __ align(CodeEntryAlignment);
4342 StubCodeMark mark(this, "StubRoutines", "_ghash_poly512_addr");
4343 address start = __ pc();
4344 __ emit_data64(0x00000001C2000000, relocInfo::none); // POLY for reduction
4345 __ emit_data64(0xC200000000000000, relocInfo::none);
4346 __ emit_data64(0x00000001C2000000, relocInfo::none);
4347 __ emit_data64(0xC200000000000000, relocInfo::none);
4348 __ emit_data64(0x00000001C2000000, relocInfo::none);
4349 __ emit_data64(0xC200000000000000, relocInfo::none);
4350 __ emit_data64(0x00000001C2000000, relocInfo::none);
4351 __ emit_data64(0xC200000000000000, relocInfo::none);
4352 __ emit_data64(0x0000000000000001, relocInfo::none); // POLY
4353 __ emit_data64(0xC200000000000000, relocInfo::none);
4354 __ emit_data64(0x0000000000000001, relocInfo::none); // TWOONE
4355 __ emit_data64(0x0000000100000000, relocInfo::none);
4356 return start;
4357 }
4358
4359 // Vector AES Galois Counter Mode implementation. Parameters:
4360 // Windows regs | Linux regs
4361   // in = c_rarg0 (rcx) | c_rarg0 (rdi)
4362   // len = c_rarg1 (rdx) | c_rarg1 (rsi)
4363 // ct = c_rarg2 (r8) | c_rarg2 (rdx)
4364 // out = c_rarg3 (r9) | c_rarg3 (rcx)
4365 // key = r10 | c_rarg4 (r8)
4366 // state = r13 | c_rarg5 (r9)
4367 // subkeyHtbl = r14 | r11
4368 // counter = rsi | r12
4369 // return - number of processed bytes
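  // GCM, illustratively: CTR-mode AES encryption combined with a GHASH authentication
  // tag computed over the ciphertext; 'state' carries the running GHASH accumulator
  // and 'subkeyHtbl' the precomputed powers of the hash subkey H used by the
  // carry-less multiplications.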
4370 address generate_galoisCounterMode_AESCrypt() {
4371 __ align(CodeEntryAlignment);
4372 StubCodeMark mark(this, "StubRoutines", "galoisCounterMode_AESCrypt");
4373 address start = __ pc();
4374 const Register in = c_rarg0;
4375 const Register len = c_rarg1;
4376 const Register ct = c_rarg2;
4377 const Register out = c_rarg3;
4378 // and updated with the incremented counter in the end
4379 #ifndef _WIN64
4380 const Register key = c_rarg4;
4381 const Register state = c_rarg5;
4382 const Address subkeyH_mem(rbp, 2 * wordSize);
4383 const Register subkeyHtbl = r11;
4384 const Register avx512_subkeyHtbl = r13;
4385 const Address counter_mem(rbp, 3 * wordSize);
4386 const Register counter = r12;
4387 #else
4388 const Address key_mem(rbp, 6 * wordSize);
4389 const Register key = r10;
4390 const Address state_mem(rbp, 7 * wordSize);
4391 const Register state = r13;
4392 const Address subkeyH_mem(rbp, 8 * wordSize);
4393 const Register subkeyHtbl = r14;
4394 const Register avx512_subkeyHtbl = r12;
4395 const Address counter_mem(rbp, 9 * wordSize);
4396 const Register counter = rsi;
4397 #endif
4398 __ enter();
4399 // Save state before entering routine
4400 __ push(r12);
4401 __ push(r13);
4402 __ push(r14);
4403 __ push(r15);
4404 __ push(rbx);
4405 #ifdef _WIN64
4406 // on win64, fill len_reg from stack position
4407 __ push(rsi);
4408 __ movptr(key, key_mem);
4409 __ movptr(state, state_mem);
4410 #endif
4411 __ movptr(subkeyHtbl, subkeyH_mem);
4412 __ movptr(counter, counter_mem);
4413 // Save rbp and rsp
4414 __ push(rbp);
4415 __ movq(rbp, rsp);
4416 // Align stack
4417 __ andq(rsp, -64);
4418 __ subptr(rsp, 96 * longSize); // Create space on the stack for htbl entries
4419 __ movptr(avx512_subkeyHtbl, rsp);
4420
4421 __ aesgcm_encrypt(in, len, ct, out, key, state, subkeyHtbl, avx512_subkeyHtbl, counter);
4422 __ vzeroupper();
4423
4424 __ movq(rsp, rbp);
4425 __ pop(rbp);
4426
4427 // Restore state before leaving routine
4428 #ifdef _WIN64
4429 __ pop(rsi);
4430 #endif
4431 __ pop(rbx);
4432 __ pop(r15);
4433 __ pop(r14);
4434 __ pop(r13);
4435 __ pop(r12);
4436
4437 __ leave(); // required for proper stackwalking of RuntimeStub frame
4438 __ ret(0);
4439 return start;
4440 }
4441
4442   // This mask is used for incrementing the counter value (linc0, linc4, etc.)
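  // (Illustrative layout: the first 64 bytes are a byte-swap mask; 'linc0' seeds four
  // 128-bit counter lanes with +0..+3, and linc4/linc8/linc16/linc32 add the same
  // amount to every lane so a whole vector of counters can be advanced in one add.)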
4443 address counter_mask_addr() {
4444 __ align64();
4445 StubCodeMark mark(this, "StubRoutines", "counter_mask_addr");
4446 address start = __ pc();
4447 __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);//lbswapmask
4448 __ emit_data64(0x0001020304050607, relocInfo::none);
4449 __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
4450 __ emit_data64(0x0001020304050607, relocInfo::none);
4451 __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
4452 __ emit_data64(0x0001020304050607, relocInfo::none);
4453 __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
4454 __ emit_data64(0x0001020304050607, relocInfo::none);
4455 __ emit_data64(0x0000000000000000, relocInfo::none);//linc0 = counter_mask_addr+64
4456 __ emit_data64(0x0000000000000000, relocInfo::none);
4457 __ emit_data64(0x0000000000000001, relocInfo::none);//counter_mask_addr() + 80
4458 __ emit_data64(0x0000000000000000, relocInfo::none);
4459 __ emit_data64(0x0000000000000002, relocInfo::none);
4460 __ emit_data64(0x0000000000000000, relocInfo::none);
4461 __ emit_data64(0x0000000000000003, relocInfo::none);
4462 __ emit_data64(0x0000000000000000, relocInfo::none);
4463 __ emit_data64(0x0000000000000004, relocInfo::none);//linc4 = counter_mask_addr() + 128
4464 __ emit_data64(0x0000000000000000, relocInfo::none);
4465 __ emit_data64(0x0000000000000004, relocInfo::none);
4466 __ emit_data64(0x0000000000000000, relocInfo::none);
4467 __ emit_data64(0x0000000000000004, relocInfo::none);
4468 __ emit_data64(0x0000000000000000, relocInfo::none);
4469 __ emit_data64(0x0000000000000004, relocInfo::none);
4470 __ emit_data64(0x0000000000000000, relocInfo::none);
4471 __ emit_data64(0x0000000000000008, relocInfo::none);//linc8 = counter_mask_addr() + 192
4472 __ emit_data64(0x0000000000000000, relocInfo::none);
4473 __ emit_data64(0x0000000000000008, relocInfo::none);
4474 __ emit_data64(0x0000000000000000, relocInfo::none);
4475 __ emit_data64(0x0000000000000008, relocInfo::none);
4476 __ emit_data64(0x0000000000000000, relocInfo::none);
4477 __ emit_data64(0x0000000000000008, relocInfo::none);
4478 __ emit_data64(0x0000000000000000, relocInfo::none);
4479 __ emit_data64(0x0000000000000020, relocInfo::none);//linc32 = counter_mask_addr() + 256
4480 __ emit_data64(0x0000000000000000, relocInfo::none);
4481 __ emit_data64(0x0000000000000020, relocInfo::none);
4482 __ emit_data64(0x0000000000000000, relocInfo::none);
4483 __ emit_data64(0x0000000000000020, relocInfo::none);
4484 __ emit_data64(0x0000000000000000, relocInfo::none);
4485 __ emit_data64(0x0000000000000020, relocInfo::none);
4486 __ emit_data64(0x0000000000000000, relocInfo::none);
4487 __ emit_data64(0x0000000000000010, relocInfo::none);//linc16 = counter_mask_addr() + 320
4488 __ emit_data64(0x0000000000000000, relocInfo::none);
4489 __ emit_data64(0x0000000000000010, relocInfo::none);
4490 __ emit_data64(0x0000000000000000, relocInfo::none);
4491 __ emit_data64(0x0000000000000010, relocInfo::none);
4492 __ emit_data64(0x0000000000000000, relocInfo::none);
4493 __ emit_data64(0x0000000000000010, relocInfo::none);
4494 __ emit_data64(0x0000000000000000, relocInfo::none);
4495 return start;
4496 }
4497
4498 // Vector AES Counter implementation
4499 address generate_counterMode_VectorAESCrypt() {
4500 __ align(CodeEntryAlignment);
4501 StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
4502 address start = __ pc();
4503 const Register from = c_rarg0; // source array address
4504 const Register to = c_rarg1; // destination array address
4505 const Register key = c_rarg2; // key array address r8
4506 const Register counter = c_rarg3; // counter byte array initialized from counter array address
4507 // and updated with the incremented counter in the end
4508 #ifndef _WIN64
4509 const Register len_reg = c_rarg4;
4510 const Register saved_encCounter_start = c_rarg5;
4511 const Register used_addr = r10;
4512 const Address used_mem(rbp, 2 * wordSize);
4513 const Register used = r11;
4514 #else
4515 const Address len_mem(rbp, 6 * wordSize); // length is on stack on Win64
4516 const Address saved_encCounter_mem(rbp, 7 * wordSize); // saved encrypted counter is on stack on Win64
4517 const Address used_mem(rbp, 8 * wordSize); // used length is on stack on Win64
4518 const Register len_reg = r10; // pick the first volatile windows register
4519 const Register saved_encCounter_start = r11;
4520 const Register used_addr = r13;
4521 const Register used = r14;
4522 #endif
4523 __ enter();
4524 // Save state before entering routine
4525 __ push(r12);
4526 __ push(r13);
4527 __ push(r14);
4528 __ push(r15);
4529 #ifdef _WIN64
4530 // on win64, fill len_reg from stack position
4531 __ movl(len_reg, len_mem);
4532 __ movptr(saved_encCounter_start, saved_encCounter_mem);
4533 __ movptr(used_addr, used_mem);
4534 __ movl(used, Address(used_addr, 0));
4535 #else
4536 __ push(len_reg); // Save
4537 __ movptr(used_addr, used_mem);
4538 __ movl(used, Address(used_addr, 0));
4539 #endif
4540 __ push(rbx);
4541 __ aesctr_encrypt(from, to, key, counter, len_reg, used, used_addr, saved_encCounter_start);
4542 __ vzeroupper();
4543 // Restore state before leaving routine
4544 __ pop(rbx);
4545 #ifdef _WIN64
4546 __ movl(rax, len_mem); // return length
4547 #else
4548 __ pop(rax); // return length
4549 #endif
4550 __ pop(r15);
4551 __ pop(r14);
4552 __ pop(r13);
4553 __ pop(r12);
4554
4555 __ leave(); // required for proper stackwalking of RuntimeStub frame
4556 __ ret(0);
4557 return start;
4558 }
4559
4560 // This is a version of CTR/AES crypt which does 6 blocks in a loop at a time
4561 // to hide instruction latency
4562 //
4563 // Arguments:
4564 //
4565 // Inputs:
4566 // c_rarg0 - source byte array address
4567 // c_rarg1 - destination byte array address
4568 // c_rarg2 - K (key) in little endian int array
4569 // c_rarg3 - counter vector byte array address
4570 // Linux
4571 // c_rarg4 - input length
4572 // c_rarg5 - saved encryptedCounter start
4573 // rbp + 6 * wordSize - saved used length
4574 // Windows
4575 // rbp + 6 * wordSize - input length
4576 // rbp + 7 * wordSize - saved encryptedCounter start
4577 // rbp + 8 * wordSize - saved used length
4578 //
4579 // Output:
4580 // rax - input length
4581 //
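  // Seen from C++, the generated entry behaves roughly like the following
  // (a sketch only, not a declared prototype; the address is published through
  //  StubRoutines::counterMode_AESCrypt(), and on Win64 the last three arguments
  //  arrive on the stack as listed above):
  //
  //   int counterMode_AESCrypt(address src, address dst, address key,
  //                            address counter, int len,
  //                            address saved_encrypted_counter,
  //                            address used_ptr);   // returns len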
4582 address generate_counterMode_AESCrypt_Parallel() {
4583 assert(UseAES, "need AES instructions and misaligned SSE support");
4584 __ align(CodeEntryAlignment);
4585 StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
4586 address start = __ pc();
4587 const Register from = c_rarg0; // source array address
4588 const Register to = c_rarg1; // destination array address
4589 const Register key = c_rarg2; // key array address
4590 const Register counter = c_rarg3; // counter byte array initialized from counter array address
4591 // and updated with the incremented counter in the end
4592 #ifndef _WIN64
4593 const Register len_reg = c_rarg4;
4594 const Register saved_encCounter_start = c_rarg5;
4595 const Register used_addr = r10;
4596 const Address used_mem(rbp, 2 * wordSize);
4597 const Register used = r11;
4598 #else
4599 const Address len_mem(rbp, 6 * wordSize); // length is on stack on Win64
    const Address saved_encCounter_mem(rbp, 7 * wordSize); // saved encrypted counter is on stack on Win64
    const Address used_mem(rbp, 8 * wordSize); // used length is on stack on Win64
4602 const Register len_reg = r10; // pick the first volatile windows register
4603 const Register saved_encCounter_start = r11;
4604 const Register used_addr = r13;
4605 const Register used = r14;
4606 #endif
4607 const Register pos = rax;
4608
4609 const int PARALLEL_FACTOR = 6;
4610 const XMMRegister xmm_counter_shuf_mask = xmm0;
4611 const XMMRegister xmm_key_shuf_mask = xmm1; // used temporarily to swap key bytes up front
4612 const XMMRegister xmm_curr_counter = xmm2;
4613
4614 const XMMRegister xmm_key_tmp0 = xmm3;
4615 const XMMRegister xmm_key_tmp1 = xmm4;
4616
    // registers holding the six results in the parallelized loop
4618 const XMMRegister xmm_result0 = xmm5;
4619 const XMMRegister xmm_result1 = xmm6;
4620 const XMMRegister xmm_result2 = xmm7;
4621 const XMMRegister xmm_result3 = xmm8;
4622 const XMMRegister xmm_result4 = xmm9;
4623 const XMMRegister xmm_result5 = xmm10;
4624
4625 const XMMRegister xmm_from0 = xmm11;
4626 const XMMRegister xmm_from1 = xmm12;
4627 const XMMRegister xmm_from2 = xmm13;
4628 const XMMRegister xmm_from3 = xmm14; //the last one is xmm14. we have to preserve it on WIN64.
4629 const XMMRegister xmm_from4 = xmm3; //reuse xmm3~4. Because xmm_key_tmp0~1 are useless when loading input text
4630 const XMMRegister xmm_from5 = xmm4;
4631
4632 //for key_128, key_192, key_256
4633 const int rounds[3] = {10, 12, 14};
4634 Label L_exit_preLoop, L_preLoop_start;
4635 Label L_multiBlock_loopTop[3];
4636 Label L_singleBlockLoopTop[3];
4637 Label L__incCounter[3][6]; //for 6 blocks
4638 Label L__incCounter_single[3]; //for single block, key128, key192, key256
4639 Label L_processTail_insr[3], L_processTail_4_insr[3], L_processTail_2_insr[3], L_processTail_1_insr[3], L_processTail_exit_insr[3];
4640 Label L_processTail_4_extr[3], L_processTail_2_extr[3], L_processTail_1_extr[3], L_processTail_exit_extr[3];
4641
4642 Label L_exit;
4643
4644 __ enter(); // required for proper stackwalking of RuntimeStub frame
4645
4646 #ifdef _WIN64
4647 // allocate spill slots for r13, r14
4648 enum {
4649 saved_r13_offset,
4650 saved_r14_offset
4651 };
4652 __ subptr(rsp, 2 * wordSize);
4653 __ movptr(Address(rsp, saved_r13_offset * wordSize), r13);
4654 __ movptr(Address(rsp, saved_r14_offset * wordSize), r14);
4655
4656 // on win64, fill len_reg from stack position
4657 __ movl(len_reg, len_mem);
4658 __ movptr(saved_encCounter_start, saved_encCounter_mem);
4659 __ movptr(used_addr, used_mem);
4660 __ movl(used, Address(used_addr, 0));
4661 #else
4662 __ push(len_reg); // Save
4663 __ movptr(used_addr, used_mem);
4664 __ movl(used, Address(used_addr, 0));
4665 #endif
4666
4667 __ push(rbx); // Save RBX
4668 __ movdqu(xmm_curr_counter, Address(counter, 0x00)); // initialize counter with initial counter
4669 __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr()), pos); // pos as scratch
4670 __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled
4671 __ movptr(pos, 0);
4672
    // Use the partially used encrypted counter from the last invocation
4674 __ BIND(L_preLoop_start);
4675 __ cmpptr(used, 16);
4676 __ jcc(Assembler::aboveEqual, L_exit_preLoop);
4677 __ cmpptr(len_reg, 0);
4678 __ jcc(Assembler::lessEqual, L_exit_preLoop);
4679 __ movb(rbx, Address(saved_encCounter_start, used));
4680 __ xorb(rbx, Address(from, pos));
4681 __ movb(Address(to, pos), rbx);
4682 __ addptr(pos, 1);
4683 __ addptr(used, 1);
4684 __ subptr(len_reg, 1);
4685
4686 __ jmp(L_preLoop_start);
4687
4688 __ BIND(L_exit_preLoop);
4689 __ movl(Address(used_addr, 0), used);
4690
4691 // key length could be only {11, 13, 15} * 4 = {44, 52, 60}
4692 __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()), rbx); // rbx as scratch
4693 __ movl(rbx, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
4694 __ cmpl(rbx, 52);
4695 __ jcc(Assembler::equal, L_multiBlock_loopTop[1]);
4696 __ cmpl(rbx, 60);
4697 __ jcc(Assembler::equal, L_multiBlock_loopTop[2]);
4698
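// CTR_DoSix applies one instruction to all six in-flight result registers,
// keeping the 6-block parallel sections below compact.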
4699 #define CTR_DoSix(opc, src_reg) \
4700 __ opc(xmm_result0, src_reg); \
4701 __ opc(xmm_result1, src_reg); \
4702 __ opc(xmm_result2, src_reg); \
4703 __ opc(xmm_result3, src_reg); \
4704 __ opc(xmm_result4, src_reg); \
4705 __ opc(xmm_result5, src_reg);
4706
4707 // k == 0 : generate code for key_128
4708 // k == 1 : generate code for key_192
4709 // k == 2 : generate code for key_256
4710 for (int k = 0; k < 3; ++k) {
4711 //multi blocks starts here
4712 __ align(OptoLoopAlignment);
4713 __ BIND(L_multiBlock_loopTop[k]);
4714 __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least PARALLEL_FACTOR blocks left
4715 __ jcc(Assembler::less, L_singleBlockLoopTop[k]);
4716 load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask);
4717
4718 //load, then increase counters
4719 CTR_DoSix(movdqa, xmm_curr_counter);
4720 inc_counter(rbx, xmm_result1, 0x01, L__incCounter[k][0]);
4721 inc_counter(rbx, xmm_result2, 0x02, L__incCounter[k][1]);
4722 inc_counter(rbx, xmm_result3, 0x03, L__incCounter[k][2]);
4723 inc_counter(rbx, xmm_result4, 0x04, L__incCounter[k][3]);
4724 inc_counter(rbx, xmm_result5, 0x05, L__incCounter[k][4]);
4725 inc_counter(rbx, xmm_curr_counter, 0x06, L__incCounter[k][5]);
4726 CTR_DoSix(pshufb, xmm_counter_shuf_mask); // after increased, shuffled counters back for PXOR
4727 CTR_DoSix(pxor, xmm_key_tmp0); //PXOR with Round 0 key
4728
4729 //load two ROUND_KEYs at a time
4730 for (int i = 1; i < rounds[k]; ) {
4731 load_key(xmm_key_tmp1, key, (0x10 * i), xmm_key_shuf_mask);
4732 load_key(xmm_key_tmp0, key, (0x10 * (i+1)), xmm_key_shuf_mask);
4733 CTR_DoSix(aesenc, xmm_key_tmp1);
4734 i++;
4735 if (i != rounds[k]) {
4736 CTR_DoSix(aesenc, xmm_key_tmp0);
4737 } else {
4738 CTR_DoSix(aesenclast, xmm_key_tmp0);
4739 }
4740 i++;
4741 }
4742
4743 // get next PARALLEL_FACTOR blocks into xmm_result registers
4744 __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
4745 __ movdqu(xmm_from1, Address(from, pos, Address::times_1, 1 * AESBlockSize));
4746 __ movdqu(xmm_from2, Address(from, pos, Address::times_1, 2 * AESBlockSize));
4747 __ movdqu(xmm_from3, Address(from, pos, Address::times_1, 3 * AESBlockSize));
4748 __ movdqu(xmm_from4, Address(from, pos, Address::times_1, 4 * AESBlockSize));
4749 __ movdqu(xmm_from5, Address(from, pos, Address::times_1, 5 * AESBlockSize));
4750
4751 __ pxor(xmm_result0, xmm_from0);
4752 __ pxor(xmm_result1, xmm_from1);
4753 __ pxor(xmm_result2, xmm_from2);
4754 __ pxor(xmm_result3, xmm_from3);
4755 __ pxor(xmm_result4, xmm_from4);
4756 __ pxor(xmm_result5, xmm_from5);
4757
      // store 6 results into the next 96 bytes of output
4759 __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
4760 __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1);
4761 __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2);
4762 __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3);
4763 __ movdqu(Address(to, pos, Address::times_1, 4 * AESBlockSize), xmm_result4);
4764 __ movdqu(Address(to, pos, Address::times_1, 5 * AESBlockSize), xmm_result5);
4765
4766 __ addptr(pos, PARALLEL_FACTOR * AESBlockSize); // increase the length of crypt text
4767 __ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // decrease the remaining length
4768 __ jmp(L_multiBlock_loopTop[k]);
4769
4770 // singleBlock starts here
4771 __ align(OptoLoopAlignment);
4772 __ BIND(L_singleBlockLoopTop[k]);
4773 __ cmpptr(len_reg, 0);
4774 __ jcc(Assembler::lessEqual, L_exit);
4775 load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask);
4776 __ movdqa(xmm_result0, xmm_curr_counter);
4777 inc_counter(rbx, xmm_curr_counter, 0x01, L__incCounter_single[k]);
4778 __ pshufb(xmm_result0, xmm_counter_shuf_mask);
4779 __ pxor(xmm_result0, xmm_key_tmp0);
4780 for (int i = 1; i < rounds[k]; i++) {
4781 load_key(xmm_key_tmp0, key, (0x10 * i), xmm_key_shuf_mask);
4782 __ aesenc(xmm_result0, xmm_key_tmp0);
4783 }
4784 load_key(xmm_key_tmp0, key, (rounds[k] * 0x10), xmm_key_shuf_mask);
4785 __ aesenclast(xmm_result0, xmm_key_tmp0);
4786 __ cmpptr(len_reg, AESBlockSize);
4787 __ jcc(Assembler::less, L_processTail_insr[k]);
4788 __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
4789 __ pxor(xmm_result0, xmm_from0);
4790 __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
4791 __ addptr(pos, AESBlockSize);
4792 __ subptr(len_reg, AESBlockSize);
4793 __ jmp(L_singleBlockLoopTop[k]);
4794 __ BIND(L_processTail_insr[k]); // Process the tail part of the input array
4795 __ addptr(pos, len_reg); // 1. Insert bytes from src array into xmm_from0 register
4796 __ testptr(len_reg, 8);
4797 __ jcc(Assembler::zero, L_processTail_4_insr[k]);
4798 __ subptr(pos,8);
4799 __ pinsrq(xmm_from0, Address(from, pos), 0);
4800 __ BIND(L_processTail_4_insr[k]);
4801 __ testptr(len_reg, 4);
4802 __ jcc(Assembler::zero, L_processTail_2_insr[k]);
4803 __ subptr(pos,4);
4804 __ pslldq(xmm_from0, 4);
4805 __ pinsrd(xmm_from0, Address(from, pos), 0);
4806 __ BIND(L_processTail_2_insr[k]);
4807 __ testptr(len_reg, 2);
4808 __ jcc(Assembler::zero, L_processTail_1_insr[k]);
4809 __ subptr(pos, 2);
4810 __ pslldq(xmm_from0, 2);
4811 __ pinsrw(xmm_from0, Address(from, pos), 0);
4812 __ BIND(L_processTail_1_insr[k]);
4813 __ testptr(len_reg, 1);
4814 __ jcc(Assembler::zero, L_processTail_exit_insr[k]);
4815 __ subptr(pos, 1);
4816 __ pslldq(xmm_from0, 1);
4817 __ pinsrb(xmm_from0, Address(from, pos), 0);
4818 __ BIND(L_processTail_exit_insr[k]);
4819
4820 __ movdqu(Address(saved_encCounter_start, 0), xmm_result0); // 2. Perform pxor of the encrypted counter and plaintext Bytes.
4821 __ pxor(xmm_result0, xmm_from0); // Also the encrypted counter is saved for next invocation.
4822
4823 __ testptr(len_reg, 8);
4824 __ jcc(Assembler::zero, L_processTail_4_extr[k]); // 3. Extract bytes from xmm_result0 into the dest. array
4825 __ pextrq(Address(to, pos), xmm_result0, 0);
4826 __ psrldq(xmm_result0, 8);
4827 __ addptr(pos, 8);
4828 __ BIND(L_processTail_4_extr[k]);
4829 __ testptr(len_reg, 4);
4830 __ jcc(Assembler::zero, L_processTail_2_extr[k]);
4831 __ pextrd(Address(to, pos), xmm_result0, 0);
4832 __ psrldq(xmm_result0, 4);
4833 __ addptr(pos, 4);
4834 __ BIND(L_processTail_2_extr[k]);
4835 __ testptr(len_reg, 2);
4836 __ jcc(Assembler::zero, L_processTail_1_extr[k]);
4837 __ pextrw(Address(to, pos), xmm_result0, 0);
4838 __ psrldq(xmm_result0, 2);
4839 __ addptr(pos, 2);
4840 __ BIND(L_processTail_1_extr[k]);
4841 __ testptr(len_reg, 1);
4842 __ jcc(Assembler::zero, L_processTail_exit_extr[k]);
4843 __ pextrb(Address(to, pos), xmm_result0, 0);
4844
4845 __ BIND(L_processTail_exit_extr[k]);
4846 __ movl(Address(used_addr, 0), len_reg);
4847 __ jmp(L_exit);
4848
4849 }
4850
4851 __ BIND(L_exit);
4852 __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled back.
4853 __ movdqu(Address(counter, 0), xmm_curr_counter); //save counter back
4854 __ pop(rbx); // pop the saved RBX.
4855 #ifdef _WIN64
4856 __ movl(rax, len_mem);
4857 __ movptr(r13, Address(rsp, saved_r13_offset * wordSize));
4858 __ movptr(r14, Address(rsp, saved_r14_offset * wordSize));
4859 __ addptr(rsp, 2 * wordSize);
4860 #else
4861 __ pop(rax); // return 'len'
4862 #endif
4863 __ leave(); // required for proper stackwalking of RuntimeStub frame
4864 __ ret(0);
4865 return start;
4866 }
4867
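  // Helpers for the AVX-512 CBC decrypt loop below: apply one AES decryption round
  // (or the final round) to the eight 512-bit block registers xmm1..xmm8,
  // i.e. four 16-byte blocks per ZMM register, 32 blocks per call.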
4868 void roundDec(XMMRegister xmm_reg) {
4869 __ vaesdec(xmm1, xmm1, xmm_reg, Assembler::AVX_512bit);
4870 __ vaesdec(xmm2, xmm2, xmm_reg, Assembler::AVX_512bit);
4871 __ vaesdec(xmm3, xmm3, xmm_reg, Assembler::AVX_512bit);
4872 __ vaesdec(xmm4, xmm4, xmm_reg, Assembler::AVX_512bit);
4873 __ vaesdec(xmm5, xmm5, xmm_reg, Assembler::AVX_512bit);
4874 __ vaesdec(xmm6, xmm6, xmm_reg, Assembler::AVX_512bit);
4875 __ vaesdec(xmm7, xmm7, xmm_reg, Assembler::AVX_512bit);
4876 __ vaesdec(xmm8, xmm8, xmm_reg, Assembler::AVX_512bit);
4877 }
4878
4879 void roundDeclast(XMMRegister xmm_reg) {
4880 __ vaesdeclast(xmm1, xmm1, xmm_reg, Assembler::AVX_512bit);
4881 __ vaesdeclast(xmm2, xmm2, xmm_reg, Assembler::AVX_512bit);
4882 __ vaesdeclast(xmm3, xmm3, xmm_reg, Assembler::AVX_512bit);
4883 __ vaesdeclast(xmm4, xmm4, xmm_reg, Assembler::AVX_512bit);
4884 __ vaesdeclast(xmm5, xmm5, xmm_reg, Assembler::AVX_512bit);
4885 __ vaesdeclast(xmm6, xmm6, xmm_reg, Assembler::AVX_512bit);
4886 __ vaesdeclast(xmm7, xmm7, xmm_reg, Assembler::AVX_512bit);
4887 __ vaesdeclast(xmm8, xmm8, xmm_reg, Assembler::AVX_512bit);
4888 }
4889
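  // Load one 128-bit round key, byte-swap it with the supplied key shuffle mask
  // (or the default mask from StubRoutines), and broadcast it to all four
  // 128-bit lanes of the destination ZMM register.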
4890 void ev_load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask = NULL) {
4891 __ movdqu(xmmdst, Address(key, offset));
4892 if (xmm_shuf_mask != NULL) {
4893 __ pshufb(xmmdst, xmm_shuf_mask);
4894 } else {
4895 __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
4896 }
4897 __ evshufi64x2(xmmdst, xmmdst, xmmdst, 0x0, Assembler::AVX_512bit);
4898
4899 }
4900
4901 address generate_cipherBlockChaining_decryptVectorAESCrypt() {
    assert(VM_Version::supports_avx512_vaes(), "need AVX512 VAES instruction support");
4903 __ align(CodeEntryAlignment);
4904 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
4905 address start = __ pc();
4906
4907 const Register from = c_rarg0; // source array address
4908 const Register to = c_rarg1; // destination array address
4909 const Register key = c_rarg2; // key array address
4910 const Register rvec = c_rarg3; // r byte array initialized from initvector array address
                                     // and left with the last ciphertext block, which becomes the next chaining value (IV)
4912 #ifndef _WIN64
4913 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16)
4914 #else
4915 const Address len_mem(rbp, 6 * wordSize); // length is on stack on Win64
4916 const Register len_reg = r11; // pick the volatile windows register
4917 #endif
4918
4919 Label Loop, Loop1, L_128, L_256, L_192, KEY_192, KEY_256, Loop2, Lcbc_dec_rem_loop,
4920 Lcbc_dec_rem_last, Lcbc_dec_ret, Lcbc_dec_rem, Lcbc_exit;
4921
4922 __ enter();
4923
4924 #ifdef _WIN64
4925 // on win64, fill len_reg from stack position
4926 __ movl(len_reg, len_mem);
4927 #else
4928 __ push(len_reg); // Save
4929 #endif
4930 __ push(rbx);
4931 __ vzeroupper();
4932
4933 // Temporary variable declaration for swapping key bytes
4934 const XMMRegister xmm_key_shuf_mask = xmm1;
4935 __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
4936
    // Determine the number of AES rounds from the expanded key length (in ints): 44 for 10 rounds, 52 for 12 rounds, 60 for 14 rounds
4938 const Register rounds = rbx;
4939 __ movl(rounds, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
4940
4941 const XMMRegister IV = xmm0;
4942 // Load IV and broadcast value to 512-bits
4943 __ evbroadcasti64x2(IV, Address(rvec, 0), Assembler::AVX_512bit);
4944
4945 // Temporary variables for storing round keys
4946 const XMMRegister RK0 = xmm30;
4947 const XMMRegister RK1 = xmm9;
4948 const XMMRegister RK2 = xmm18;
4949 const XMMRegister RK3 = xmm19;
4950 const XMMRegister RK4 = xmm20;
4951 const XMMRegister RK5 = xmm21;
4952 const XMMRegister RK6 = xmm22;
4953 const XMMRegister RK7 = xmm23;
4954 const XMMRegister RK8 = xmm24;
4955 const XMMRegister RK9 = xmm25;
4956 const XMMRegister RK10 = xmm26;
4957
4958 // Load and shuffle key
4959 // the java expanded key ordering is rotated one position from what we want
4960 // so we start from 1*16 here and hit 0*16 last
4961 ev_load_key(RK1, key, 1 * 16, xmm_key_shuf_mask);
4962 ev_load_key(RK2, key, 2 * 16, xmm_key_shuf_mask);
4963 ev_load_key(RK3, key, 3 * 16, xmm_key_shuf_mask);
4964 ev_load_key(RK4, key, 4 * 16, xmm_key_shuf_mask);
4965 ev_load_key(RK5, key, 5 * 16, xmm_key_shuf_mask);
4966 ev_load_key(RK6, key, 6 * 16, xmm_key_shuf_mask);
4967 ev_load_key(RK7, key, 7 * 16, xmm_key_shuf_mask);
4968 ev_load_key(RK8, key, 8 * 16, xmm_key_shuf_mask);
4969 ev_load_key(RK9, key, 9 * 16, xmm_key_shuf_mask);
4970 ev_load_key(RK10, key, 10 * 16, xmm_key_shuf_mask);
4971 ev_load_key(RK0, key, 0*16, xmm_key_shuf_mask);
4972
4973 // Variables for storing source cipher text
4974 const XMMRegister S0 = xmm10;
4975 const XMMRegister S1 = xmm11;
4976 const XMMRegister S2 = xmm12;
4977 const XMMRegister S3 = xmm13;
4978 const XMMRegister S4 = xmm14;
4979 const XMMRegister S5 = xmm15;
4980 const XMMRegister S6 = xmm16;
4981 const XMMRegister S7 = xmm17;
4982
4983 // Variables for storing decrypted text
4984 const XMMRegister B0 = xmm1;
4985 const XMMRegister B1 = xmm2;
4986 const XMMRegister B2 = xmm3;
4987 const XMMRegister B3 = xmm4;
4988 const XMMRegister B4 = xmm5;
4989 const XMMRegister B5 = xmm6;
4990 const XMMRegister B6 = xmm7;
4991 const XMMRegister B7 = xmm8;
4992
4993 __ cmpl(rounds, 44);
4994 __ jcc(Assembler::greater, KEY_192);
4995 __ jmp(Loop);
4996
4997 __ BIND(KEY_192);
4998 const XMMRegister RK11 = xmm27;
4999 const XMMRegister RK12 = xmm28;
5000 ev_load_key(RK11, key, 11*16, xmm_key_shuf_mask);
5001 ev_load_key(RK12, key, 12*16, xmm_key_shuf_mask);
5002
5003 __ cmpl(rounds, 52);
5004 __ jcc(Assembler::greater, KEY_256);
5005 __ jmp(Loop);
5006
5007 __ BIND(KEY_256);
5008 const XMMRegister RK13 = xmm29;
5009 const XMMRegister RK14 = xmm31;
5010 ev_load_key(RK13, key, 13*16, xmm_key_shuf_mask);
5011 ev_load_key(RK14, key, 14*16, xmm_key_shuf_mask);
5012
5013 __ BIND(Loop);
5014 __ cmpl(len_reg, 512);
5015 __ jcc(Assembler::below, Lcbc_dec_rem);
5016 __ BIND(Loop1);
5017 __ subl(len_reg, 512);
5018 __ evmovdquq(S0, Address(from, 0 * 64), Assembler::AVX_512bit);
5019 __ evmovdquq(S1, Address(from, 1 * 64), Assembler::AVX_512bit);
5020 __ evmovdquq(S2, Address(from, 2 * 64), Assembler::AVX_512bit);
5021 __ evmovdquq(S3, Address(from, 3 * 64), Assembler::AVX_512bit);
5022 __ evmovdquq(S4, Address(from, 4 * 64), Assembler::AVX_512bit);
5023 __ evmovdquq(S5, Address(from, 5 * 64), Assembler::AVX_512bit);
5024 __ evmovdquq(S6, Address(from, 6 * 64), Assembler::AVX_512bit);
5025 __ evmovdquq(S7, Address(from, 7 * 64), Assembler::AVX_512bit);
5026 __ leaq(from, Address(from, 8 * 64));
5027
5028 __ evpxorq(B0, S0, RK1, Assembler::AVX_512bit);
5029 __ evpxorq(B1, S1, RK1, Assembler::AVX_512bit);
5030 __ evpxorq(B2, S2, RK1, Assembler::AVX_512bit);
5031 __ evpxorq(B3, S3, RK1, Assembler::AVX_512bit);
5032 __ evpxorq(B4, S4, RK1, Assembler::AVX_512bit);
5033 __ evpxorq(B5, S5, RK1, Assembler::AVX_512bit);
5034 __ evpxorq(B6, S6, RK1, Assembler::AVX_512bit);
5035 __ evpxorq(B7, S7, RK1, Assembler::AVX_512bit);
5036
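    // Build the "previous ciphertext" vectors needed for CBC decryption: after these
    // valignq steps, each 128-bit lane of IV, S0..S6 holds the ciphertext block that
    // precedes the block being decrypted in the corresponding lane of B0..B7 (the IV
    // itself precedes the very first block). They are XORed into the outputs in Loop2.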
5037 __ evalignq(IV, S0, IV, 0x06);
5038 __ evalignq(S0, S1, S0, 0x06);
5039 __ evalignq(S1, S2, S1, 0x06);
5040 __ evalignq(S2, S3, S2, 0x06);
5041 __ evalignq(S3, S4, S3, 0x06);
5042 __ evalignq(S4, S5, S4, 0x06);
5043 __ evalignq(S5, S6, S5, 0x06);
5044 __ evalignq(S6, S7, S6, 0x06);
5045
5046 roundDec(RK2);
5047 roundDec(RK3);
5048 roundDec(RK4);
5049 roundDec(RK5);
5050 roundDec(RK6);
5051 roundDec(RK7);
5052 roundDec(RK8);
5053 roundDec(RK9);
5054 roundDec(RK10);
5055
5056 __ cmpl(rounds, 44);
5057 __ jcc(Assembler::belowEqual, L_128);
5058 roundDec(RK11);
5059 roundDec(RK12);
5060
5061 __ cmpl(rounds, 52);
5062 __ jcc(Assembler::belowEqual, L_192);
5063 roundDec(RK13);
5064 roundDec(RK14);
5065
5066 __ BIND(L_256);
5067 roundDeclast(RK0);
5068 __ jmp(Loop2);
5069
5070 __ BIND(L_128);
5071 roundDeclast(RK0);
5072 __ jmp(Loop2);
5073
5074 __ BIND(L_192);
5075 roundDeclast(RK0);
5076
5077 __ BIND(Loop2);
5078 __ evpxorq(B0, B0, IV, Assembler::AVX_512bit);
5079 __ evpxorq(B1, B1, S0, Assembler::AVX_512bit);
5080 __ evpxorq(B2, B2, S1, Assembler::AVX_512bit);
5081 __ evpxorq(B3, B3, S2, Assembler::AVX_512bit);
5082 __ evpxorq(B4, B4, S3, Assembler::AVX_512bit);
5083 __ evpxorq(B5, B5, S4, Assembler::AVX_512bit);
5084 __ evpxorq(B6, B6, S5, Assembler::AVX_512bit);
5085 __ evpxorq(B7, B7, S6, Assembler::AVX_512bit);
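    // S7 still holds the last four ciphertext blocks; keep it as the chaining value.
    // Only its top 128-bit block is actually consumed as the IV of the next iteration.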
5086 __ evmovdquq(IV, S7, Assembler::AVX_512bit);
5087
5088 __ evmovdquq(Address(to, 0 * 64), B0, Assembler::AVX_512bit);
5089 __ evmovdquq(Address(to, 1 * 64), B1, Assembler::AVX_512bit);
5090 __ evmovdquq(Address(to, 2 * 64), B2, Assembler::AVX_512bit);
5091 __ evmovdquq(Address(to, 3 * 64), B3, Assembler::AVX_512bit);
5092 __ evmovdquq(Address(to, 4 * 64), B4, Assembler::AVX_512bit);
5093 __ evmovdquq(Address(to, 5 * 64), B5, Assembler::AVX_512bit);
5094 __ evmovdquq(Address(to, 6 * 64), B6, Assembler::AVX_512bit);
5095 __ evmovdquq(Address(to, 7 * 64), B7, Assembler::AVX_512bit);
5096 __ leaq(to, Address(to, 8 * 64));
5097 __ jmp(Loop);
5098
5099 __ BIND(Lcbc_dec_rem);
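    // Move the most recent ciphertext block (the current chaining value) into the
    // low 128 bits for the one-block-at-a-time tail loop below.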
5100 __ evshufi64x2(IV, IV, IV, 0x03, Assembler::AVX_512bit);
5101
5102 __ BIND(Lcbc_dec_rem_loop);
5103 __ subl(len_reg, 16);
5104 __ jcc(Assembler::carrySet, Lcbc_dec_ret);
5105
5106 __ movdqu(S0, Address(from, 0));
5107 __ evpxorq(B0, S0, RK1, Assembler::AVX_512bit);
5108 __ vaesdec(B0, B0, RK2, Assembler::AVX_512bit);
5109 __ vaesdec(B0, B0, RK3, Assembler::AVX_512bit);
5110 __ vaesdec(B0, B0, RK4, Assembler::AVX_512bit);
5111 __ vaesdec(B0, B0, RK5, Assembler::AVX_512bit);
5112 __ vaesdec(B0, B0, RK6, Assembler::AVX_512bit);
5113 __ vaesdec(B0, B0, RK7, Assembler::AVX_512bit);
5114 __ vaesdec(B0, B0, RK8, Assembler::AVX_512bit);
5115 __ vaesdec(B0, B0, RK9, Assembler::AVX_512bit);
5116 __ vaesdec(B0, B0, RK10, Assembler::AVX_512bit);
5117 __ cmpl(rounds, 44);
5118 __ jcc(Assembler::belowEqual, Lcbc_dec_rem_last);
5119
5120 __ vaesdec(B0, B0, RK11, Assembler::AVX_512bit);
5121 __ vaesdec(B0, B0, RK12, Assembler::AVX_512bit);
5122 __ cmpl(rounds, 52);
5123 __ jcc(Assembler::belowEqual, Lcbc_dec_rem_last);
5124
5125 __ vaesdec(B0, B0, RK13, Assembler::AVX_512bit);
5126 __ vaesdec(B0, B0, RK14, Assembler::AVX_512bit);
5127
5128 __ BIND(Lcbc_dec_rem_last);
5129 __ vaesdeclast(B0, B0, RK0, Assembler::AVX_512bit);
5130
5131 __ evpxorq(B0, B0, IV, Assembler::AVX_512bit);
5132 __ evmovdquq(IV, S0, Assembler::AVX_512bit);
5133 __ movdqu(Address(to, 0), B0);
5134 __ leaq(from, Address(from, 16));
5135 __ leaq(to, Address(to, 16));
5136 __ jmp(Lcbc_dec_rem_loop);
5137
5138 __ BIND(Lcbc_dec_ret);
5139 __ movdqu(Address(rvec, 0), IV);
5140
5141 // Zero out the round keys
5142 __ evpxorq(RK0, RK0, RK0, Assembler::AVX_512bit);
5143 __ evpxorq(RK1, RK1, RK1, Assembler::AVX_512bit);
5144 __ evpxorq(RK2, RK2, RK2, Assembler::AVX_512bit);
5145 __ evpxorq(RK3, RK3, RK3, Assembler::AVX_512bit);
5146 __ evpxorq(RK4, RK4, RK4, Assembler::AVX_512bit);
5147 __ evpxorq(RK5, RK5, RK5, Assembler::AVX_512bit);
5148 __ evpxorq(RK6, RK6, RK6, Assembler::AVX_512bit);
5149 __ evpxorq(RK7, RK7, RK7, Assembler::AVX_512bit);
5150 __ evpxorq(RK8, RK8, RK8, Assembler::AVX_512bit);
5151 __ evpxorq(RK9, RK9, RK9, Assembler::AVX_512bit);
5152 __ evpxorq(RK10, RK10, RK10, Assembler::AVX_512bit);
5153 __ cmpl(rounds, 44);
5154 __ jcc(Assembler::belowEqual, Lcbc_exit);
5155 __ evpxorq(RK11, RK11, RK11, Assembler::AVX_512bit);
5156 __ evpxorq(RK12, RK12, RK12, Assembler::AVX_512bit);
5157 __ cmpl(rounds, 52);
5158 __ jcc(Assembler::belowEqual, Lcbc_exit);
5159 __ evpxorq(RK13, RK13, RK13, Assembler::AVX_512bit);
5160 __ evpxorq(RK14, RK14, RK14, Assembler::AVX_512bit);
5161
5162 __ BIND(Lcbc_exit);
5163 __ vzeroupper();
5164 __ pop(rbx);
5165 #ifdef _WIN64
5166 __ movl(rax, len_mem);
5167 #else
5168 __ pop(rax); // return length
5169 #endif
5170 __ leave(); // required for proper stackwalking of RuntimeStub frame
5171 __ ret(0);
5172 return start;
5173 }
5174
5175 // Polynomial x^128+x^127+x^126+x^121+1
5176 address ghash_polynomial_addr() {
5177 __ align(CodeEntryAlignment);
5178 StubCodeMark mark(this, "StubRoutines", "_ghash_poly_addr");
5179 address start = __ pc();
5180 __ emit_data64(0x0000000000000001, relocInfo::none);
5181 __ emit_data64(0xc200000000000000, relocInfo::none);
5182 return start;
5183 }
5184
5185 address ghash_shufflemask_addr() {
5186 __ align(CodeEntryAlignment);
5187 StubCodeMark mark(this, "StubRoutines", "_ghash_shuffmask_addr");
5188 address start = __ pc();
5189 __ emit_data64(0x0f0f0f0f0f0f0f0f, relocInfo::none);
5190 __ emit_data64(0x0f0f0f0f0f0f0f0f, relocInfo::none);
5191 return start;
5192 }
5193
5194 // Ghash single and multi block operations using AVX instructions
5195 address generate_avx_ghash_processBlocks() {
5196 __ align(CodeEntryAlignment);
5197
5198 StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
5199 address start = __ pc();
5200
5201 // arguments
5202 const Register state = c_rarg0;
5203 const Register htbl = c_rarg1;
5204 const Register data = c_rarg2;
5205 const Register blocks = c_rarg3;
5206 __ enter();
5208 __ avx_ghash(state, htbl, data, blocks);
5209 __ leave(); // required for proper stackwalking of RuntimeStub frame
5210 __ ret(0);
5211 return start;
5212 }
5213
5214 // byte swap x86 long
5215 address generate_ghash_long_swap_mask() {
5216 __ align(CodeEntryAlignment);
5217 StubCodeMark mark(this, "StubRoutines", "ghash_long_swap_mask");
5218 address start = __ pc();
5219 __ emit_data64(0x0f0e0d0c0b0a0908, relocInfo::none );
5220 __ emit_data64(0x0706050403020100, relocInfo::none );
5221 return start;
5222 }
5223
5224 // byte swap x86 byte array
5225 address generate_ghash_byte_swap_mask() {
5226 __ align(CodeEntryAlignment);
5227 StubCodeMark mark(this, "StubRoutines", "ghash_byte_swap_mask");
5228 address start = __ pc();
5229 __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none );
5230 __ emit_data64(0x0001020304050607, relocInfo::none );
5231 return start;
5232 }
5233
5234 /* Single and multi-block ghash operations */
5235 address generate_ghash_processBlocks() {
5236 __ align(CodeEntryAlignment);
5237 Label L_ghash_loop, L_exit;
5238 StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
5239 address start = __ pc();
5240
5241 const Register state = c_rarg0;
5242 const Register subkeyH = c_rarg1;
5243 const Register data = c_rarg2;
5244 const Register blocks = c_rarg3;
5245
5246 const XMMRegister xmm_temp0 = xmm0;
5247 const XMMRegister xmm_temp1 = xmm1;
5248 const XMMRegister xmm_temp2 = xmm2;
5249 const XMMRegister xmm_temp3 = xmm3;
5250 const XMMRegister xmm_temp4 = xmm4;
5251 const XMMRegister xmm_temp5 = xmm5;
5252 const XMMRegister xmm_temp6 = xmm6;
5253 const XMMRegister xmm_temp7 = xmm7;
5254 const XMMRegister xmm_temp8 = xmm8;
5255 const XMMRegister xmm_temp9 = xmm9;
5256 const XMMRegister xmm_temp10 = xmm10;
5257
5258 __ enter();
5259
5260 __ movdqu(xmm_temp10, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
5261
5262 __ movdqu(xmm_temp0, Address(state, 0));
5263 __ pshufb(xmm_temp0, xmm_temp10);
5264
5265
5266 __ BIND(L_ghash_loop);
5267 __ movdqu(xmm_temp2, Address(data, 0));
5268 __ pshufb(xmm_temp2, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));
5269
5270 __ movdqu(xmm_temp1, Address(subkeyH, 0));
5271 __ pshufb(xmm_temp1, xmm_temp10);
5272
5273 __ pxor(xmm_temp0, xmm_temp2);
5274
5275 //
5276 // Multiply with the hash key
5277 //
5278 __ movdqu(xmm_temp3, xmm_temp0);
5279 __ pclmulqdq(xmm_temp3, xmm_temp1, 0); // xmm3 holds a0*b0
5280 __ movdqu(xmm_temp4, xmm_temp0);
5281 __ pclmulqdq(xmm_temp4, xmm_temp1, 16); // xmm4 holds a0*b1
5282
5283 __ movdqu(xmm_temp5, xmm_temp0);
5284 __ pclmulqdq(xmm_temp5, xmm_temp1, 1); // xmm5 holds a1*b0
5285 __ movdqu(xmm_temp6, xmm_temp0);
5286 __ pclmulqdq(xmm_temp6, xmm_temp1, 17); // xmm6 holds a1*b1
5287
5288 __ pxor(xmm_temp4, xmm_temp5); // xmm4 holds a0*b1 + a1*b0
5289
5290 __ movdqu(xmm_temp5, xmm_temp4); // move the contents of xmm4 to xmm5
    __ psrldq(xmm_temp4, 8);    // shift xmm4 right by 64 bits
    __ pslldq(xmm_temp5, 8);    // shift xmm5 left by 64 bits
5293 __ pxor(xmm_temp3, xmm_temp5);
5294 __ pxor(xmm_temp6, xmm_temp4); // Register pair <xmm6:xmm3> holds the result
5295 // of the carry-less multiplication of
5296 // xmm0 by xmm1.
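    // I.e. with a = a1:a0 and b = b1:b0 split into 64-bit halves, the 256-bit
    // carry-less product is  a1*b1 << 128  ^  (a0*b1 ^ a1*b0) << 64  ^  a0*b0,
    // with the middle term folded into xmm3 (low half) and xmm6 (high half).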
5297
    // We shift the result of the multiplication by one bit position
    // to the left to compensate for the fact that the bits are reversed.
5300 __ movdqu(xmm_temp7, xmm_temp3);
5301 __ movdqu(xmm_temp8, xmm_temp6);
5302 __ pslld(xmm_temp3, 1);
5303 __ pslld(xmm_temp6, 1);
5304 __ psrld(xmm_temp7, 31);
5305 __ psrld(xmm_temp8, 31);
5306 __ movdqu(xmm_temp9, xmm_temp7);
5307 __ pslldq(xmm_temp8, 4);
5308 __ pslldq(xmm_temp7, 4);
5309 __ psrldq(xmm_temp9, 12);
5310 __ por(xmm_temp3, xmm_temp7);
5311 __ por(xmm_temp6, xmm_temp8);
5312 __ por(xmm_temp6, xmm_temp9);
5313
5314 //
5315 // First phase of the reduction
5316 //
5317 // Move xmm3 into xmm7, xmm8, xmm9 in order to perform the shifts
5318 // independently.
5319 __ movdqu(xmm_temp7, xmm_temp3);
5320 __ movdqu(xmm_temp8, xmm_temp3);
5321 __ movdqu(xmm_temp9, xmm_temp3);
    __ pslld(xmm_temp7, 31);    // packed left shift, shifting << 31
    __ pslld(xmm_temp8, 30);    // packed left shift, shifting << 30
    __ pslld(xmm_temp9, 25);    // packed left shift, shifting << 25
5325 __ pxor(xmm_temp7, xmm_temp8); // xor the shifted versions
5326 __ pxor(xmm_temp7, xmm_temp9);
5327 __ movdqu(xmm_temp8, xmm_temp7);
5328 __ pslldq(xmm_temp7, 12);
5329 __ psrldq(xmm_temp8, 4);
5330 __ pxor(xmm_temp3, xmm_temp7); // first phase of the reduction complete
5331
5332 //
5333 // Second phase of the reduction
5334 //
5335 // Make 3 copies of xmm3 in xmm2, xmm4, xmm5 for doing these
5336 // shift operations.
5337 __ movdqu(xmm_temp2, xmm_temp3);
5338 __ movdqu(xmm_temp4, xmm_temp3);
5339 __ movdqu(xmm_temp5, xmm_temp3);
    __ psrld(xmm_temp2, 1);     // packed right shift, shifting >> 1
    __ psrld(xmm_temp4, 2);     // packed right shift, shifting >> 2
    __ psrld(xmm_temp5, 7);     // packed right shift, shifting >> 7
5343 __ pxor(xmm_temp2, xmm_temp4); // xor the shifted versions
5344 __ pxor(xmm_temp2, xmm_temp5);
5345 __ pxor(xmm_temp2, xmm_temp8);
5346 __ pxor(xmm_temp3, xmm_temp2);
5347 __ pxor(xmm_temp6, xmm_temp3); // the result is in xmm6
5348
5349 __ decrement(blocks);
5350 __ jcc(Assembler::zero, L_exit);
5351 __ movdqu(xmm_temp0, xmm_temp6);
5352 __ addptr(data, 16);
5353 __ jmp(L_ghash_loop);
5354
5355 __ BIND(L_exit);
5356 __ pshufb(xmm_temp6, xmm_temp10); // Byte swap 16-byte result
5357 __ movdqu(Address(state, 0), xmm_temp6); // store the result
5358 __ leave();
5359 __ ret(0);
5360 return start;
5361 }
5362
5363 address base64_shuffle_addr()
5364 {
5365 __ align64();
5366 StubCodeMark mark(this, "StubRoutines", "shuffle_base64");
5367 address start = __ pc();
5368 assert(((unsigned long long)start & 0x3f) == 0,
5369 "Alignment problem (0x%08llx)", (unsigned long long)start);
5370 __ emit_data64(0x0405030401020001, relocInfo::none);
5371 __ emit_data64(0x0a0b090a07080607, relocInfo::none);
5372 __ emit_data64(0x10110f100d0e0c0d, relocInfo::none);
5373 __ emit_data64(0x1617151613141213, relocInfo::none);
5374 __ emit_data64(0x1c1d1b1c191a1819, relocInfo::none);
5375 __ emit_data64(0x222321221f201e1f, relocInfo::none);
5376 __ emit_data64(0x2829272825262425, relocInfo::none);
5377 __ emit_data64(0x2e2f2d2e2b2c2a2b, relocInfo::none);
5378 return start;
5379 }
5380
5381 address base64_avx2_shuffle_addr()
5382 {
5383 __ align32();
5384 StubCodeMark mark(this, "StubRoutines", "avx2_shuffle_base64");
5385 address start = __ pc();
5386 __ emit_data64(0x0809070805060405, relocInfo::none);
5387 __ emit_data64(0x0e0f0d0e0b0c0a0b, relocInfo::none);
5388 __ emit_data64(0x0405030401020001, relocInfo::none);
5389 __ emit_data64(0x0a0b090a07080607, relocInfo::none);
5390 return start;
5391 }
5392
5393 address base64_avx2_input_mask_addr()
5394 {
5395 __ align32();
5396 StubCodeMark mark(this, "StubRoutines", "avx2_input_mask_base64");
5397 address start = __ pc();
5398 __ emit_data64(0x8000000000000000, relocInfo::none);
5399 __ emit_data64(0x8000000080000000, relocInfo::none);
5400 __ emit_data64(0x8000000080000000, relocInfo::none);
5401 __ emit_data64(0x8000000080000000, relocInfo::none);
5402 return start;
5403 }
5404
5405 address base64_avx2_lut_addr()
5406 {
5407 __ align32();
5408 StubCodeMark mark(this, "StubRoutines", "avx2_lut_base64");
5409 address start = __ pc();
5410 __ emit_data64(0xfcfcfcfcfcfc4741, relocInfo::none);
5411 __ emit_data64(0x0000f0edfcfcfcfc, relocInfo::none);
5412 __ emit_data64(0xfcfcfcfcfcfc4741, relocInfo::none);
5413 __ emit_data64(0x0000f0edfcfcfcfc, relocInfo::none);
5414
5415 // URL LUT
5416 __ emit_data64(0xfcfcfcfcfcfc4741, relocInfo::none);
5417 __ emit_data64(0x000020effcfcfcfc, relocInfo::none);
5418 __ emit_data64(0xfcfcfcfcfcfc4741, relocInfo::none);
5419 __ emit_data64(0x000020effcfcfcfc, relocInfo::none);
5420 return start;
5421 }
5422
5423 address base64_encoding_table_addr()
5424 {
5425 __ align64();
5426 StubCodeMark mark(this, "StubRoutines", "encoding_table_base64");
5427 address start = __ pc();
5428 assert(((unsigned long long)start & 0x3f) == 0, "Alignment problem (0x%08llx)", (unsigned long long)start);
5429 __ emit_data64(0x4847464544434241, relocInfo::none);
5430 __ emit_data64(0x504f4e4d4c4b4a49, relocInfo::none);
5431 __ emit_data64(0x5857565554535251, relocInfo::none);
5432 __ emit_data64(0x6665646362615a59, relocInfo::none);
5433 __ emit_data64(0x6e6d6c6b6a696867, relocInfo::none);
5434 __ emit_data64(0x767574737271706f, relocInfo::none);
5435 __ emit_data64(0x333231307a797877, relocInfo::none);
5436 __ emit_data64(0x2f2b393837363534, relocInfo::none);
5437
5438 // URL table
5439 __ emit_data64(0x4847464544434241, relocInfo::none);
5440 __ emit_data64(0x504f4e4d4c4b4a49, relocInfo::none);
5441 __ emit_data64(0x5857565554535251, relocInfo::none);
5442 __ emit_data64(0x6665646362615a59, relocInfo::none);
5443 __ emit_data64(0x6e6d6c6b6a696867, relocInfo::none);
5444 __ emit_data64(0x767574737271706f, relocInfo::none);
5445 __ emit_data64(0x333231307a797877, relocInfo::none);
5446 __ emit_data64(0x5f2d393837363534, relocInfo::none);
5447 return start;
5448 }
5449
5450 // Code for generating Base64 encoding.
5451 // Intrinsic function prototype in Base64.java:
5452 // private void encodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp,
5453 // boolean isURL) {
5454 address generate_base64_encodeBlock()
5455 {
5456 __ align(CodeEntryAlignment);
5457 StubCodeMark mark(this, "StubRoutines", "implEncode");
5458 address start = __ pc();
5459 __ enter();
5460
5461 // Save callee-saved registers before using them
5462 __ push(r12);
5463 __ push(r13);
5464 __ push(r14);
5465 __ push(r15);
5466
5467 // arguments
5468 const Register source = c_rarg0; // Source Array
5469 const Register start_offset = c_rarg1; // start offset
5470 const Register end_offset = c_rarg2; // end offset
5471 const Register dest = c_rarg3; // destination array
5472
5473 #ifndef _WIN64
5474 const Register dp = c_rarg4; // Position for writing to dest array
5475 const Register isURL = c_rarg5; // Base64 or URL character set
5476 #else
    const Address dp_mem(rbp, 6 * wordSize);  // dp (destination position) is on stack on Win64
5478 const Address isURL_mem(rbp, 7 * wordSize);
5479 const Register isURL = r10; // pick the volatile windows register
5480 const Register dp = r12;
5481 __ movl(dp, dp_mem);
5482 __ movl(isURL, isURL_mem);
5483 #endif
5484
5485 const Register length = r14;
5486 const Register encode_table = r13;
5487 Label L_process3, L_exit, L_processdata, L_vbmiLoop, L_not512, L_32byteLoop;
5488
5489 // calculate length from offsets
5490 __ movl(length, end_offset);
5491 __ subl(length, start_offset);
5492 __ cmpl(length, 0);
5493 __ jcc(Assembler::lessEqual, L_exit);
5494
5495 // Code for 512-bit VBMI encoding. Encodes 48 input bytes into 64
5496 // output bytes. We read 64 input bytes and ignore the last 16, so be
5497 // sure not to read past the end of the input buffer.
5498 if (VM_Version::supports_avx512_vbmi()) {
5499 __ cmpl(length, 64); // Do not overrun input buffer.
5500 __ jcc(Assembler::below, L_not512);
5501
5502 __ shll(isURL, 6); // index into decode table based on isURL
5503 __ lea(encode_table, ExternalAddress(StubRoutines::x86::base64_encoding_table_addr()));
5504 __ addptr(encode_table, isURL);
5505 __ shrl(isURL, 6); // restore isURL
5506
5507 __ mov64(rax, 0x3036242a1016040aull); // Shifts
5508 __ evmovdquq(xmm3, ExternalAddress(StubRoutines::x86::base64_shuffle_addr()), Assembler::AVX_512bit, r15);
5509 __ evmovdquq(xmm2, Address(encode_table, 0), Assembler::AVX_512bit);
5510 __ evpbroadcastq(xmm1, rax, Assembler::AVX_512bit);
5511
5512 __ align32();
5513 __ BIND(L_vbmiLoop);
5514
5515 __ vpermb(xmm0, xmm3, Address(source, start_offset), Assembler::AVX_512bit);
5516 __ subl(length, 48);
5517
5518 // Put the input bytes into the proper lanes for writing, then
5519 // encode them.
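      // (Roughly: vpmultishiftqb selects, for each output byte, the 8 bits of the
      // source qword starting at the bit offset given by the corresponding control
      // byte, so with the shift constant loaded above each output byte carries one
      // 6-bit group in its low bits; vpermb then uses the low six bits of every byte
      // to index the 64-entry encoding table.)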
5520 __ evpmultishiftqb(xmm0, xmm1, xmm0, Assembler::AVX_512bit);
5521 __ vpermb(xmm0, xmm0, xmm2, Assembler::AVX_512bit);
5522
5523 // Write to destination
5524 __ evmovdquq(Address(dest, dp), xmm0, Assembler::AVX_512bit);
5525
5526 __ addptr(dest, 64);
5527 __ addptr(source, 48);
5528 __ cmpl(length, 64);
5529 __ jcc(Assembler::aboveEqual, L_vbmiLoop);
5530
5531 __ vzeroupper();
5532 }
5533
5534 __ BIND(L_not512);
5535 if (VM_Version::supports_avx2()
5536 && VM_Version::supports_avx512vlbw()) {
5537 /*
5538 ** This AVX2 encoder is based off the paper at:
5539 ** https://dl.acm.org/doi/10.1145/3132709
5540 **
5541 ** We use AVX2 SIMD instructions to encode 24 bytes into 32
5542 ** output bytes.
5543 **
5544 */
5545 // Lengths under 32 bytes are done with scalar routine
5546 __ cmpl(length, 31);
5547 __ jcc(Assembler::belowEqual, L_process3);
5548
5549 // Set up supporting constant table data
5550 __ vmovdqu(xmm9, ExternalAddress(StubRoutines::x86::base64_avx2_shuffle_addr()), rax);
5551 // 6-bit mask for 2nd and 4th (and multiples) 6-bit values
5552 __ movl(rax, 0x0fc0fc00);
5553 __ vmovdqu(xmm1, ExternalAddress(StubRoutines::x86::base64_avx2_input_mask_addr()), rax);
5554 __ evpbroadcastd(xmm8, rax, Assembler::AVX_256bit);
5555
5556 // Multiplication constant for "shifting" right by 6 and 10
5557 // bits
5558 __ movl(rax, 0x04000040);
5559
5560 __ subl(length, 24);
5561 __ evpbroadcastd(xmm7, rax, Assembler::AVX_256bit);
5562
5563 // For the first load, we mask off reading of the first 4
5564 // bytes into the register. This is so we can get 4 3-byte
5565 // chunks into each lane of the register, avoiding having to
5566 // handle end conditions. We then shuffle these bytes into a
5567 // specific order so that manipulation is easier.
5568 //
5569 // The initial read loads the XMM register like this:
5570 //
5571 // Lower 128-bit lane:
5572 // +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
      // | XX | XX | XX | XX | A0 | A1 | A2 | B0 | B1 | B2 | C0 | C1 | C2 | D0 | D1 | D2 |
5575 // +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
5576 //
5577 // Upper 128-bit lane:
5578 // +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
      // | E0 | E1 | E2 | F0 | F1 | F2 | G0 | G1 | G2 | H0 | H1 | H2 | XX | XX | XX | XX |
5581 // +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
5582 //
5583 // Where A0 is the first input byte, B0 is the fourth, etc.
5584 // The alphabetical significance denotes the 3 bytes to be
5585 // consumed and encoded into 4 bytes.
5586 //
5587 // We then shuffle the register so each 32-bit word contains
5588 // the sequence:
5589 // A1 A0 A2 A1, B1, B0, B2, B1, etc.
5590 // Each of these byte sequences are then manipulated into 4
5591 // 6-bit values ready for encoding.
5592 //
5593 // If we focus on one set of 3-byte chunks, changing the
5594 // nomenclature such that A0 => a, A1 => b, and A2 => c, we
5595 // shuffle such that each 24-bit chunk contains:
5596 //
      // b7 b6 b5 b4 b3 b2 b1 b0 | a7 a6 a5 a4 a3 a2 a1 a0 | c7 c6 c5 c4 c3 c2 c1 c0 | b7 b6 b5 b4 b3 b2 b1 b0
      // Explain this step.
      // b3 b2 b1 b0 c5 c4 c3 c2 | c1 c0 d5 d4 d3 d2 d1 d0 | a5 a4 a3 a2 a1 a0 b5 b4 | b3 b2 b1 b0 c5 c4 c3 c2
5602 //
      // We first mask off all but bits 4-9 and 16-21 (c5..c0 and
5604 // a5..a0) and shift them using a vector multiplication
5605 // operation (vpmulhuw) which effectively shifts c right by 6
5606 // bits and a right by 10 bits. We similarly mask bits 10-15
5607 // (d5..d0) and 22-27 (b5..b0) and shift them left by 8 and 4
5608 // bits respectively. This is done using vpmullw. We end up
5609 // with 4 6-bit values, thus splitting the 3 input bytes,
5610 // ready for encoding:
5611 // 0 0 d5..d0 0 0 c5..c0 0 0 b5..b0 0 0 a5..a0
5612 //
5613 // For translation, we recognize that there are 5 distinct
5614 // ranges of legal Base64 characters as below:
5615 //
5616 // +-------------+-------------+------------+
5617 // | 6-bit value | ASCII range | offset |
5618 // +-------------+-------------+------------+
5619 // | 0..25 | A..Z | 65 |
5620 // | 26..51 | a..z | 71 |
5621 // | 52..61 | 0..9 | -4 |
5622 // | 62 | + or - | -19 or -17 |
5623 // | 63 | / or _ | -16 or 32 |
5624 // +-------------+-------------+------------+
5625 //
5626 // We note that vpshufb does a parallel lookup in a
5627 // destination register using the lower 4 bits of bytes from a
5628 // source register. If we use a saturated subtraction and
5629 // subtract 51 from each 6-bit value, bytes from [0,51]
5630 // saturate to 0, and [52,63] map to a range of [1,12]. We
5631 // distinguish the [0,25] and [26,51] ranges by assigning a
5632 // value of 13 for all 6-bit values less than 26. We end up
5633 // with:
5634 //
5635 // +-------------+-------------+------------+
5636 // | 6-bit value | Reduced | offset |
5637 // +-------------+-------------+------------+
5638 // | 0..25 | 13 | 65 |
5639 // | 26..51 | 0 | 71 |
5640 // | 52..61 | 0..9 | -4 |
5641 // | 62 | 11 | -19 or -17 |
5642 // | 63 | 12 | -16 or 32 |
5643 // +-------------+-------------+------------+
5644 //
5645 // We then use a final vpshufb to add the appropriate offset,
5646 // translating the bytes.
5647 //
5648 // Load input bytes - only 28 bytes. Mask the first load to
5649 // not load into the full register.
5650 __ vpmaskmovd(xmm1, xmm1, Address(source, start_offset, Address::times_1, -4), Assembler::AVX_256bit);
5651
5652 // Move 3-byte chunks of input (12 bytes) into 16 bytes,
5653 // ordering by:
5654 // 1, 0, 2, 1; 4, 3, 5, 4; etc. This groups 6-bit chunks
5655 // for easy masking
5656 __ vpshufb(xmm1, xmm1, xmm9, Assembler::AVX_256bit);
5657
5658 __ addl(start_offset, 24);
5659
5660 // Load masking register for first and third (and multiples)
5661 // 6-bit values.
5662 __ movl(rax, 0x003f03f0);
5663 __ evpbroadcastd(xmm6, rax, Assembler::AVX_256bit);
5664 // Multiplication constant for "shifting" left by 4 and 8 bits
5665 __ movl(rax, 0x01000010);
5666 __ evpbroadcastd(xmm5, rax, Assembler::AVX_256bit);
5667
5668 // Isolate 6-bit chunks of interest
5669 __ vpand(xmm0, xmm8, xmm1, Assembler::AVX_256bit);
5670
5671 // Load constants for encoding
5672 __ movl(rax, 0x19191919);
5673 __ evpbroadcastd(xmm3, rax, Assembler::AVX_256bit);
5674 __ movl(rax, 0x33333333);
5675 __ evpbroadcastd(xmm4, rax, Assembler::AVX_256bit);
5676
5677 // Shift output bytes 0 and 2 into proper lanes
5678 __ vpmulhuw(xmm2, xmm0, xmm7, Assembler::AVX_256bit);
5679
5680 // Mask and shift output bytes 1 and 3 into proper lanes and
5681 // combine
5682 __ vpand(xmm0, xmm6, xmm1, Assembler::AVX_256bit);
5683 __ vpmullw(xmm0, xmm5, xmm0, Assembler::AVX_256bit);
5684 __ vpor(xmm0, xmm0, xmm2, Assembler::AVX_256bit);
5685
5686 // Find out which are 0..25. This indicates which input
5687 // values fall in the range of 'A'-'Z', which require an
5688 // additional offset (see comments above)
5689 __ vpcmpgtb(xmm2, xmm0, xmm3, Assembler::AVX_256bit);
5690 __ vpsubusb(xmm1, xmm0, xmm4, Assembler::AVX_256bit);
5691 __ vpsubb(xmm1, xmm1, xmm2, Assembler::AVX_256bit);
5692
5693 // Load the proper lookup table
5694 __ lea(r11, ExternalAddress(StubRoutines::x86::base64_avx2_lut_addr()));
5695 __ movl(r15, isURL);
5696 __ shll(r15, 5);
5697 __ vmovdqu(xmm2, Address(r11, r15));
5698
5699 // Shuffle the offsets based on the range calculation done
5700 // above. This allows us to add the correct offset to the
5701 // 6-bit value corresponding to the range documented above.
5702 __ vpshufb(xmm1, xmm2, xmm1, Assembler::AVX_256bit);
5703 __ vpaddb(xmm0, xmm1, xmm0, Assembler::AVX_256bit);
5704
5705 // Store the encoded bytes
5706 __ vmovdqu(Address(dest, dp), xmm0);
5707 __ addl(dp, 32);
5708
5709 __ cmpl(length, 31);
5710 __ jcc(Assembler::belowEqual, L_process3);
5711
5712 __ align32();
5713 __ BIND(L_32byteLoop);
5714
5715 // Get next 32 bytes
5716 __ vmovdqu(xmm1, Address(source, start_offset, Address::times_1, -4));
5717
5718 __ subl(length, 24);
5719 __ addl(start_offset, 24);
5720
5721 // This logic is identical to the above, with only constant
5722 // register loads removed. Shuffle the input, mask off 6-bit
5723 // chunks, shift them into place, then add the offset to
5724 // encode.
5725 __ vpshufb(xmm1, xmm1, xmm9, Assembler::AVX_256bit);
5726
5727 __ vpand(xmm0, xmm8, xmm1, Assembler::AVX_256bit);
5728 __ vpmulhuw(xmm10, xmm0, xmm7, Assembler::AVX_256bit);
5729 __ vpand(xmm0, xmm6, xmm1, Assembler::AVX_256bit);
5730 __ vpmullw(xmm0, xmm5, xmm0, Assembler::AVX_256bit);
5731 __ vpor(xmm0, xmm0, xmm10, Assembler::AVX_256bit);
5732 __ vpcmpgtb(xmm10, xmm0, xmm3, Assembler::AVX_256bit);
5733 __ vpsubusb(xmm1, xmm0, xmm4, Assembler::AVX_256bit);
5734 __ vpsubb(xmm1, xmm1, xmm10, Assembler::AVX_256bit);
5735 __ vpshufb(xmm1, xmm2, xmm1, Assembler::AVX_256bit);
5736 __ vpaddb(xmm0, xmm1, xmm0, Assembler::AVX_256bit);
5737
5738 // Store the encoded bytes
5739 __ vmovdqu(Address(dest, dp), xmm0);
5740 __ addl(dp, 32);
5741
5742 __ cmpl(length, 31);
5743 __ jcc(Assembler::above, L_32byteLoop);
5744
5745 __ BIND(L_process3);
5746 __ vzeroupper();
5747 } else {
5748 __ BIND(L_process3);
5749 }
5750
5751 __ cmpl(length, 3);
5752 __ jcc(Assembler::below, L_exit);
5753
5754 // Load the encoding table based on isURL
5755 __ lea(r11, ExternalAddress(StubRoutines::x86::base64_encoding_table_addr()));
5756 __ movl(r15, isURL);
5757 __ shll(r15, 6);
5758 __ addptr(r11, r15);
5759
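    // The scalar loop below performs the textbook 3-bytes-to-4-symbols step; in
    // C-like pseudocode (b0, b1, b2 are the next three input bytes and table is
    // the alphabet selected above):
    //   out[0] = table[ b0 >> 2 ];
    //   out[1] = table[ ((b0 & 0x03) << 4) | (b1 >> 4) ];
    //   out[2] = table[ ((b1 & 0x0f) << 2) | (b2 >> 6) ];
    //   out[3] = table[ b2 & 0x3f ];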
5760 __ BIND(L_processdata);
5761
5762 // Load 3 bytes
5763 __ load_unsigned_byte(r15, Address(source, start_offset));
5764 __ load_unsigned_byte(r10, Address(source, start_offset, Address::times_1, 1));
5765 __ load_unsigned_byte(r13, Address(source, start_offset, Address::times_1, 2));
5766
5767 // Build a 32-bit word with bytes 1, 2, 0, 1
5768 __ movl(rax, r10);
5769 __ shll(r10, 24);
5770 __ orl(rax, r10);
5771
5772 __ subl(length, 3);
5773
5774 __ shll(r15, 8);
5775 __ shll(r13, 16);
5776 __ orl(rax, r15);
5777
5778 __ addl(start_offset, 3);
5779
5780 __ orl(rax, r13);
5781 // At this point, rax contains | byte1 | byte2 | byte0 | byte1
5782 // r13 has byte2 << 16 - need low-order 6 bits to translate.
5783 // This translated byte is the fourth output byte.
5784 __ shrl(r13, 16);
5785 __ andl(r13, 0x3f);
5786
    // The high-order 6 bits of r15 (byte0) are translated.
5788 // The translated byte is the first output byte.
5789 __ shrl(r15, 10);
5790
5791 __ load_unsigned_byte(r13, Address(r11, r13));
5792 __ load_unsigned_byte(r15, Address(r11, r15));
5793
5794 __ movb(Address(dest, dp, Address::times_1, 3), r13);
5795
5796 // Extract high-order 4 bits of byte1 and low-order 2 bits of byte0.
5797 // This translated byte is the second output byte.
5798 __ shrl(rax, 4);
5799 __ movl(r10, rax);
5800 __ andl(rax, 0x3f);
5801
5802 __ movb(Address(dest, dp, Address::times_1, 0), r15);
5803
5804 __ load_unsigned_byte(rax, Address(r11, rax));
5805
    // Extract low-order 4 bits of byte1 and high-order 2 bits of byte2.
5807 // This translated byte is the third output byte.
5808 __ shrl(r10, 18);
5809 __ andl(r10, 0x3f);
5810
5811 __ load_unsigned_byte(r10, Address(r11, r10));
5812
5813 __ movb(Address(dest, dp, Address::times_1, 1), rax);
5814 __ movb(Address(dest, dp, Address::times_1, 2), r10);
5815
5816 __ addl(dp, 4);
5817 __ cmpl(length, 3);
5818 __ jcc(Assembler::aboveEqual, L_processdata);
5819
5820 __ BIND(L_exit);
5821 __ pop(r15);
5822 __ pop(r14);
5823 __ pop(r13);
5824 __ pop(r12);
5825 __ leave();
5826 __ ret(0);
5827 return start;
5828 }
5829
5830 // base64 AVX512vbmi tables
5831 address base64_vbmi_lookup_lo_addr() {
5832 __ align64();
5833 StubCodeMark mark(this, "StubRoutines", "lookup_lo_base64");
5834 address start = __ pc();
5835 assert(((unsigned long long)start & 0x3f) == 0,
5836 "Alignment problem (0x%08llx)", (unsigned long long)start);
5837 __ emit_data64(0x8080808080808080, relocInfo::none);
5838 __ emit_data64(0x8080808080808080, relocInfo::none);
5839 __ emit_data64(0x8080808080808080, relocInfo::none);
5840 __ emit_data64(0x8080808080808080, relocInfo::none);
5841 __ emit_data64(0x8080808080808080, relocInfo::none);
5842 __ emit_data64(0x3f8080803e808080, relocInfo::none);
5843 __ emit_data64(0x3b3a393837363534, relocInfo::none);
5844 __ emit_data64(0x8080808080803d3c, relocInfo::none);
5845 return start;
5846 }
5847
5848 address base64_vbmi_lookup_hi_addr() {
5849 __ align64();
5850 StubCodeMark mark(this, "StubRoutines", "lookup_hi_base64");
5851 address start = __ pc();
5852 assert(((unsigned long long)start & 0x3f) == 0,
5853 "Alignment problem (0x%08llx)", (unsigned long long)start);
5854 __ emit_data64(0x0605040302010080, relocInfo::none);
5855 __ emit_data64(0x0e0d0c0b0a090807, relocInfo::none);
5856 __ emit_data64(0x161514131211100f, relocInfo::none);
5857 __ emit_data64(0x8080808080191817, relocInfo::none);
5858 __ emit_data64(0x201f1e1d1c1b1a80, relocInfo::none);
5859 __ emit_data64(0x2827262524232221, relocInfo::none);
5860 __ emit_data64(0x302f2e2d2c2b2a29, relocInfo::none);
5861 __ emit_data64(0x8080808080333231, relocInfo::none);
5862 return start;
5863 }
5864 address base64_vbmi_lookup_lo_url_addr() {
5865 __ align64();
5866 StubCodeMark mark(this, "StubRoutines", "lookup_lo_base64url");
5867 address start = __ pc();
5868 assert(((unsigned long long)start & 0x3f) == 0,
5869 "Alignment problem (0x%08llx)", (unsigned long long)start);
5870 __ emit_data64(0x8080808080808080, relocInfo::none);
5871 __ emit_data64(0x8080808080808080, relocInfo::none);
5872 __ emit_data64(0x8080808080808080, relocInfo::none);
5873 __ emit_data64(0x8080808080808080, relocInfo::none);
5874 __ emit_data64(0x8080808080808080, relocInfo::none);
5875 __ emit_data64(0x80803e8080808080, relocInfo::none);
5876 __ emit_data64(0x3b3a393837363534, relocInfo::none);
5877 __ emit_data64(0x8080808080803d3c, relocInfo::none);
5878 return start;
5879 }
5880
5881 address base64_vbmi_lookup_hi_url_addr() {
5882 __ align64();
5883 StubCodeMark mark(this, "StubRoutines", "lookup_hi_base64url");
5884 address start = __ pc();
5885 assert(((unsigned long long)start & 0x3f) == 0,
5886 "Alignment problem (0x%08llx)", (unsigned long long)start);
5887 __ emit_data64(0x0605040302010080, relocInfo::none);
5888 __ emit_data64(0x0e0d0c0b0a090807, relocInfo::none);
5889 __ emit_data64(0x161514131211100f, relocInfo::none);
5890 __ emit_data64(0x3f80808080191817, relocInfo::none);
5891 __ emit_data64(0x201f1e1d1c1b1a80, relocInfo::none);
5892 __ emit_data64(0x2827262524232221, relocInfo::none);
5893 __ emit_data64(0x302f2e2d2c2b2a29, relocInfo::none);
5894 __ emit_data64(0x8080808080333231, relocInfo::none);
5895 return start;
5896 }
5897
5898 address base64_vbmi_pack_vec_addr() {
5899 __ align64();
5900 StubCodeMark mark(this, "StubRoutines", "pack_vec_base64");
5901 address start = __ pc();
5902 assert(((unsigned long long)start & 0x3f) == 0,
5903 "Alignment problem (0x%08llx)", (unsigned long long)start);
5904 __ emit_data64(0x090a040506000102, relocInfo::none);
5905 __ emit_data64(0x161011120c0d0e08, relocInfo::none);
5906 __ emit_data64(0x1c1d1e18191a1415, relocInfo::none);
5907 __ emit_data64(0x292a242526202122, relocInfo::none);
5908 __ emit_data64(0x363031322c2d2e28, relocInfo::none);
5909 __ emit_data64(0x3c3d3e38393a3435, relocInfo::none);
5910 __ emit_data64(0x0000000000000000, relocInfo::none);
5911 __ emit_data64(0x0000000000000000, relocInfo::none);
5912 return start;
5913 }
5914
5915 address base64_vbmi_join_0_1_addr() {
5916 __ align64();
5917 StubCodeMark mark(this, "StubRoutines", "join_0_1_base64");
5918 address start = __ pc();
5919 assert(((unsigned long long)start & 0x3f) == 0,
5920 "Alignment problem (0x%08llx)", (unsigned long long)start);
5921 __ emit_data64(0x090a040506000102, relocInfo::none);
5922 __ emit_data64(0x161011120c0d0e08, relocInfo::none);
5923 __ emit_data64(0x1c1d1e18191a1415, relocInfo::none);
5924 __ emit_data64(0x292a242526202122, relocInfo::none);
5925 __ emit_data64(0x363031322c2d2e28, relocInfo::none);
5926 __ emit_data64(0x3c3d3e38393a3435, relocInfo::none);
5927 __ emit_data64(0x494a444546404142, relocInfo::none);
5928 __ emit_data64(0x565051524c4d4e48, relocInfo::none);
5929 return start;
5930 }
5931
5932 address base64_vbmi_join_1_2_addr() {
5933 __ align64();
5934 StubCodeMark mark(this, "StubRoutines", "join_1_2_base64");
5935 address start = __ pc();
5936 assert(((unsigned long long)start & 0x3f) == 0,
5937 "Alignment problem (0x%08llx)", (unsigned long long)start);
5938 __ emit_data64(0x1c1d1e18191a1415, relocInfo::none);
5939 __ emit_data64(0x292a242526202122, relocInfo::none);
5940 __ emit_data64(0x363031322c2d2e28, relocInfo::none);
5941 __ emit_data64(0x3c3d3e38393a3435, relocInfo::none);
5942 __ emit_data64(0x494a444546404142, relocInfo::none);
5943 __ emit_data64(0x565051524c4d4e48, relocInfo::none);
5944 __ emit_data64(0x5c5d5e58595a5455, relocInfo::none);
5945 __ emit_data64(0x696a646566606162, relocInfo::none);
5946 return start;
5947 }
5948
5949 address base64_vbmi_join_2_3_addr() {
5950 __ align64();
5951 StubCodeMark mark(this, "StubRoutines", "join_2_3_base64");
5952 address start = __ pc();
5953 assert(((unsigned long long)start & 0x3f) == 0,
5954 "Alignment problem (0x%08llx)", (unsigned long long)start);
5955 __ emit_data64(0x363031322c2d2e28, relocInfo::none);
5956 __ emit_data64(0x3c3d3e38393a3435, relocInfo::none);
5957 __ emit_data64(0x494a444546404142, relocInfo::none);
5958 __ emit_data64(0x565051524c4d4e48, relocInfo::none);
5959 __ emit_data64(0x5c5d5e58595a5455, relocInfo::none);
5960 __ emit_data64(0x696a646566606162, relocInfo::none);
5961 __ emit_data64(0x767071726c6d6e68, relocInfo::none);
5962 __ emit_data64(0x7c7d7e78797a7475, relocInfo::none);
5963 return start;
5964 }
5965
5966 address base64_decoding_table_addr() {
5967 StubCodeMark mark(this, "StubRoutines", "decoding_table_base64");
5968 address start = __ pc();
5969 __ emit_data64(0xffffffffffffffff, relocInfo::none);
5970 __ emit_data64(0xffffffffffffffff, relocInfo::none);
5971 __ emit_data64(0xffffffffffffffff, relocInfo::none);
5972 __ emit_data64(0xffffffffffffffff, relocInfo::none);
5973 __ emit_data64(0xffffffffffffffff, relocInfo::none);
5974 __ emit_data64(0x3fffffff3effffff, relocInfo::none);
5975 __ emit_data64(0x3b3a393837363534, relocInfo::none);
5976 __ emit_data64(0xffffffffffff3d3c, relocInfo::none);
5977 __ emit_data64(0x06050403020100ff, relocInfo::none);
5978 __ emit_data64(0x0e0d0c0b0a090807, relocInfo::none);
5979 __ emit_data64(0x161514131211100f, relocInfo::none);
5980 __ emit_data64(0xffffffffff191817, relocInfo::none);
5981 __ emit_data64(0x201f1e1d1c1b1aff, relocInfo::none);
5982 __ emit_data64(0x2827262524232221, relocInfo::none);
5983 __ emit_data64(0x302f2e2d2c2b2a29, relocInfo::none);
5984 __ emit_data64(0xffffffffff333231, relocInfo::none);
5985 __ emit_data64(0xffffffffffffffff, relocInfo::none);
5986 __ emit_data64(0xffffffffffffffff, relocInfo::none);
5987 __ emit_data64(0xffffffffffffffff, relocInfo::none);
5988 __ emit_data64(0xffffffffffffffff, relocInfo::none);
5989 __ emit_data64(0xffffffffffffffff, relocInfo::none);
5990 __ emit_data64(0xffffffffffffffff, relocInfo::none);
5991 __ emit_data64(0xffffffffffffffff, relocInfo::none);
5992 __ emit_data64(0xffffffffffffffff, relocInfo::none);
5993 __ emit_data64(0xffffffffffffffff, relocInfo::none);
5994 __ emit_data64(0xffffffffffffffff, relocInfo::none);
5995 __ emit_data64(0xffffffffffffffff, relocInfo::none);
5996 __ emit_data64(0xffffffffffffffff, relocInfo::none);
5997 __ emit_data64(0xffffffffffffffff, relocInfo::none);
5998 __ emit_data64(0xffffffffffffffff, relocInfo::none);
5999 __ emit_data64(0xffffffffffffffff, relocInfo::none);
6000 __ emit_data64(0xffffffffffffffff, relocInfo::none);
6001
6002 // URL table
6003 __ emit_data64(0xffffffffffffffff, relocInfo::none);
6004 __ emit_data64(0xffffffffffffffff, relocInfo::none);
6005 __ emit_data64(0xffffffffffffffff, relocInfo::none);
6006 __ emit_data64(0xffffffffffffffff, relocInfo::none);
6007 __ emit_data64(0xffffffffffffffff, relocInfo::none);
6008 __ emit_data64(0xffff3effffffffff, relocInfo::none);
6009 __ emit_data64(0x3b3a393837363534, relocInfo::none);
6010 __ emit_data64(0xffffffffffff3d3c, relocInfo::none);
6011 __ emit_data64(0x06050403020100ff, relocInfo::none);
6012 __ emit_data64(0x0e0d0c0b0a090807, relocInfo::none);
6013 __ emit_data64(0x161514131211100f, relocInfo::none);
6014 __ emit_data64(0x3fffffffff191817, relocInfo::none);
6015 __ emit_data64(0x201f1e1d1c1b1aff, relocInfo::none);
6016 __ emit_data64(0x2827262524232221, relocInfo::none);
6017 __ emit_data64(0x302f2e2d2c2b2a29, relocInfo::none);
6018 __ emit_data64(0xffffffffff333231, relocInfo::none);
6019 __ emit_data64(0xffffffffffffffff, relocInfo::none);
6020 __ emit_data64(0xffffffffffffffff, relocInfo::none);
6021 __ emit_data64(0xffffffffffffffff, relocInfo::none);
6022 __ emit_data64(0xffffffffffffffff, relocInfo::none);
6023 __ emit_data64(0xffffffffffffffff, relocInfo::none);
6024 __ emit_data64(0xffffffffffffffff, relocInfo::none);
6025 __ emit_data64(0xffffffffffffffff, relocInfo::none);
6026 __ emit_data64(0xffffffffffffffff, relocInfo::none);
6027 __ emit_data64(0xffffffffffffffff, relocInfo::none);
6028 __ emit_data64(0xffffffffffffffff, relocInfo::none);
6029 __ emit_data64(0xffffffffffffffff, relocInfo::none);
6030 __ emit_data64(0xffffffffffffffff, relocInfo::none);
6031 __ emit_data64(0xffffffffffffffff, relocInfo::none);
6032 __ emit_data64(0xffffffffffffffff, relocInfo::none);
6033 __ emit_data64(0xffffffffffffffff, relocInfo::none);
6034 __ emit_data64(0xffffffffffffffff, relocInfo::none);
6035 return start;
6036 }
6037
6038
6039 // Code for generating Base64 decoding.
6040 //
6041 // Based on the article (and associated code) from https://arxiv.org/abs/1910.05109.
6042 //
6043 // Intrinsic function prototype in Base64.java:
  // private int decodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL, boolean isMIME)
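  //
  // Roughly, each group of four base64 characters decodes to three output bytes.
  // A scalar C-like reference sketch (not the code the stub emits):
  //
  //   int b0 = table[src[sp] & 0xff],     b1 = table[src[sp + 1] & 0xff],
  //       b2 = table[src[sp + 2] & 0xff], b3 = table[src[sp + 3] & 0xff]; // -1 if invalid
  //   int bits = b0 << 18 | b1 << 12 | b2 << 6 | b3;                      // 24 payload bits
  //   dst[dp]     = (byte)(bits >> 16);
  //   dst[dp + 1] = (byte)(bits >> 8);
  //   dst[dp + 2] = (byte)(bits);
  //
  // The stub returns the number of output bytes produced (in rax).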
6045 address generate_base64_decodeBlock() {
6046 __ align(CodeEntryAlignment);
6047 StubCodeMark mark(this, "StubRoutines", "implDecode");
6048 address start = __ pc();
6049 __ enter();
6050
6051 // Save callee-saved registers before using them
6052 __ push(r12);
6053 __ push(r13);
6054 __ push(r14);
6055 __ push(r15);
6056 __ push(rbx);
6057
6058 // arguments
6059 const Register source = c_rarg0; // Source Array
6060 const Register start_offset = c_rarg1; // start offset
6061 const Register end_offset = c_rarg2; // end offset
6062 const Register dest = c_rarg3; // destination array
6063 const Register isMIME = rbx;
6064
6065 #ifndef _WIN64
6066 const Register dp = c_rarg4; // Position for writing to dest array
6067 const Register isURL = c_rarg5;// Base64 or URL character set
6068 __ movl(isMIME, Address(rbp, 2 * wordSize));
6069 #else
    const Address dp_mem(rbp, 6 * wordSize);  // dp is on the stack on Win64
6071 const Address isURL_mem(rbp, 7 * wordSize);
6072 const Register isURL = r10; // pick the volatile windows register
6073 const Register dp = r12;
6074 __ movl(dp, dp_mem);
6075 __ movl(isURL, isURL_mem);
6076 __ movl(isMIME, Address(rbp, 8 * wordSize));
6077 #endif
6078
6079 const XMMRegister lookup_lo = xmm5;
6080 const XMMRegister lookup_hi = xmm6;
6081 const XMMRegister errorvec = xmm7;
6082 const XMMRegister pack16_op = xmm9;
6083 const XMMRegister pack32_op = xmm8;
6084 const XMMRegister input0 = xmm3;
6085 const XMMRegister input1 = xmm20;
6086 const XMMRegister input2 = xmm21;
6087 const XMMRegister input3 = xmm19;
6088 const XMMRegister join01 = xmm12;
6089 const XMMRegister join12 = xmm11;
6090 const XMMRegister join23 = xmm10;
6091 const XMMRegister translated0 = xmm2;
6092 const XMMRegister translated1 = xmm1;
6093 const XMMRegister translated2 = xmm0;
6094 const XMMRegister translated3 = xmm4;
6095
6096 const XMMRegister merged0 = xmm2;
6097 const XMMRegister merged1 = xmm1;
6098 const XMMRegister merged2 = xmm0;
6099 const XMMRegister merged3 = xmm4;
6100 const XMMRegister merge_ab_bc0 = xmm2;
6101 const XMMRegister merge_ab_bc1 = xmm1;
6102 const XMMRegister merge_ab_bc2 = xmm0;
6103 const XMMRegister merge_ab_bc3 = xmm4;
6104
6105 const XMMRegister pack24bits = xmm4;
6106
6107 const Register length = r14;
6108 const Register output_size = r13;
6109 const Register output_mask = r15;
6110 const KRegister input_mask = k1;
6111
6112 const XMMRegister input_initial_valid_b64 = xmm0;
6113 const XMMRegister tmp = xmm10;
6114 const XMMRegister mask = xmm0;
6115 const XMMRegister invalid_b64 = xmm1;
6116
6117 Label L_process256, L_process64, L_process64Loop, L_exit, L_processdata, L_loadURL;
6118 Label L_continue, L_finalBit, L_padding, L_donePadding, L_bruteForce;
6119 Label L_forceLoop, L_bottomLoop, L_checkMIME, L_exit_no_vzero;
6120
6121 // calculate length from offsets
6122 __ movl(length, end_offset);
6123 __ subl(length, start_offset);
6124 __ push(dest); // Save for return value calc
6125
    // If AVX512 VBMI is not supported, generate only the non-AVX decode code
    if (VM_Version::supports_avx512_vbmi() &&
        VM_Version::supports_avx512bw()) {
      __ cmpl(length, 128);     // 128 bytes is the break-even point for AVX-512
6130 __ jcc(Assembler::lessEqual, L_bruteForce);
6131
6132 __ cmpl(isMIME, 0);
6133 __ jcc(Assembler::notEqual, L_bruteForce);
6134
6135 // Load lookup tables based on isURL
6136 __ cmpl(isURL, 0);
6137 __ jcc(Assembler::notZero, L_loadURL);
6138
6139 __ evmovdquq(lookup_lo, ExternalAddress(StubRoutines::x86::base64_vbmi_lookup_lo_addr()), Assembler::AVX_512bit, r13);
6140 __ evmovdquq(lookup_hi, ExternalAddress(StubRoutines::x86::base64_vbmi_lookup_hi_addr()), Assembler::AVX_512bit, r13);
6141
6142 __ BIND(L_continue);
6143
6144 __ movl(r15, 0x01400140);
6145 __ evpbroadcastd(pack16_op, r15, Assembler::AVX_512bit);
6146
6147 __ movl(r15, 0x00011000);
6148 __ evpbroadcastd(pack32_op, r15, Assembler::AVX_512bit);
6149
6150 __ cmpl(length, 0xff);
6151 __ jcc(Assembler::lessEqual, L_process64);
6152
6153 // load masks required for decoding data
6154 __ BIND(L_processdata);
      __ evmovdquq(join01, ExternalAddress(StubRoutines::x86::base64_vbmi_join_0_1_addr()), Assembler::AVX_512bit, r13);
6156 __ evmovdquq(join12, ExternalAddress(StubRoutines::x86::base64_vbmi_join_1_2_addr()), Assembler::AVX_512bit, r13);
6157 __ evmovdquq(join23, ExternalAddress(StubRoutines::x86::base64_vbmi_join_2_3_addr()), Assembler::AVX_512bit, r13);
6158
6159 __ align32();
6160 __ BIND(L_process256);
6161 // Grab input data
6162 __ evmovdquq(input0, Address(source, start_offset, Address::times_1, 0x00), Assembler::AVX_512bit);
6163 __ evmovdquq(input1, Address(source, start_offset, Address::times_1, 0x40), Assembler::AVX_512bit);
6164 __ evmovdquq(input2, Address(source, start_offset, Address::times_1, 0x80), Assembler::AVX_512bit);
6165 __ evmovdquq(input3, Address(source, start_offset, Address::times_1, 0xc0), Assembler::AVX_512bit);
6166
6167 // Copy the low part of the lookup table into the destination of the permutation
6168 __ evmovdquq(translated0, lookup_lo, Assembler::AVX_512bit);
6169 __ evmovdquq(translated1, lookup_lo, Assembler::AVX_512bit);
6170 __ evmovdquq(translated2, lookup_lo, Assembler::AVX_512bit);
6171 __ evmovdquq(translated3, lookup_lo, Assembler::AVX_512bit);
6172
6173 // Translate the base64 input into "decoded" bytes
6174 __ evpermt2b(translated0, input0, lookup_hi, Assembler::AVX_512bit);
6175 __ evpermt2b(translated1, input1, lookup_hi, Assembler::AVX_512bit);
6176 __ evpermt2b(translated2, input2, lookup_hi, Assembler::AVX_512bit);
6177 __ evpermt2b(translated3, input3, lookup_hi, Assembler::AVX_512bit);
6178
6179 // OR all of the translations together to check for errors (high-order bit of byte set)
6180 __ vpternlogd(input0, 0xfe, input1, input2, Assembler::AVX_512bit);
6181
6182 __ vpternlogd(input3, 0xfe, translated0, translated1, Assembler::AVX_512bit);
6183 __ vpternlogd(input0, 0xfe, translated2, translated3, Assembler::AVX_512bit);
6184 __ vpor(errorvec, input3, input0, Assembler::AVX_512bit);
6185
6186 // Check if there was an error - if so, try 64-byte chunks
6187 __ evpmovb2m(k3, errorvec, Assembler::AVX_512bit);
6188 __ kortestql(k3, k3);
6189 __ jcc(Assembler::notZero, L_process64);
6190
6191 // The merging and shuffling happens here
      // Each dword holds four decoded 6-bit values: [00dddddd | 00cccccc | 00bbbbbb | 00aaaaaa]
      // Multiplying [00cccccc] by 2^6 and adding [00dddddd] gives [0000cccc | ccdddddd]
6194 // The pack16_op is a vector of 0x01400140, so multiply D by 1 and C by 0x40
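      // For example, with c = 0b000011 and d = 0b111111 the pair yields
      // 0x03 * 0x40 + 0x3f * 1 = 0x00ff, i.e. [0000cccc | ccdddddd] = 0b0000000011111111.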
6195 __ vpmaddubsw(merge_ab_bc0, translated0, pack16_op, Assembler::AVX_512bit);
6196 __ vpmaddubsw(merge_ab_bc1, translated1, pack16_op, Assembler::AVX_512bit);
6197 __ vpmaddubsw(merge_ab_bc2, translated2, pack16_op, Assembler::AVX_512bit);
6198 __ vpmaddubsw(merge_ab_bc3, translated3, pack16_op, Assembler::AVX_512bit);
6199
6200 // Now do the same with packed 16-bit values.
6201 // We start with [0000cccc | ccdddddd | 0000aaaa | aabbbbbb]
6202 // pack32_op is 0x00011000 (2^12, 1), so this multiplies [0000aaaa | aabbbbbb] by 2^12
6203 // and adds [0000cccc | ccdddddd] to yield [00000000 | aaaaaabb | bbbbcccc | ccdddddd]
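      // Continuing the example with a = b = 0b111111: [0000aaaa | aabbbbbb] = 0x0fff and
      // [0000cccc | ccdddddd] = 0x00ff give 0x0fff * 0x1000 + 0x00ff = 0x00fff0ff,
      // i.e. [00000000 | aaaaaabb | bbbbcccc | ccdddddd].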
6204 __ vpmaddwd(merged0, merge_ab_bc0, pack32_op, Assembler::AVX_512bit);
6205 __ vpmaddwd(merged1, merge_ab_bc1, pack32_op, Assembler::AVX_512bit);
6206 __ vpmaddwd(merged2, merge_ab_bc2, pack32_op, Assembler::AVX_512bit);
6207 __ vpmaddwd(merged3, merge_ab_bc3, pack32_op, Assembler::AVX_512bit);
6208
6209 // The join vectors specify which byte from which vector goes into the outputs
6210 // One of every 4 bytes in the extended vector is zero, so we pack them into their
6211 // final positions in the register for storing (256 bytes in, 192 bytes out)
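      // E.g. join01 above starts 02 01 00 06 05 04 ..., emitting the three payload
      // bytes of each dword in big-endian order and dropping every fourth (zero)
      // byte, so four 64-byte input vectors shrink to three 64-byte output vectors.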
6212 __ evpermt2b(merged0, join01, merged1, Assembler::AVX_512bit);
6213 __ evpermt2b(merged1, join12, merged2, Assembler::AVX_512bit);
6214 __ evpermt2b(merged2, join23, merged3, Assembler::AVX_512bit);
6215
6216 // Store result
6217 __ evmovdquq(Address(dest, dp, Address::times_1, 0x00), merged0, Assembler::AVX_512bit);
6218 __ evmovdquq(Address(dest, dp, Address::times_1, 0x40), merged1, Assembler::AVX_512bit);
6219 __ evmovdquq(Address(dest, dp, Address::times_1, 0x80), merged2, Assembler::AVX_512bit);
6220
6221 __ addptr(source, 0x100);
6222 __ addptr(dest, 0xc0);
6223 __ subl(length, 0x100);
6224 __ cmpl(length, 64 * 4);
6225 __ jcc(Assembler::greaterEqual, L_process256);
6226
6227 // At this point, we've decoded 64 * 4 * n bytes.
6228 // The remaining length will be <= 64 * 4 - 1.
6229 // UNLESS there was an error decoding the first 256-byte chunk. In this
6230 // case, the length will be arbitrarily long.
6231 //
6232 // Note that this will be the path for MIME-encoded strings.
6233
6234 __ BIND(L_process64);
6235
6236 __ evmovdquq(pack24bits, ExternalAddress(StubRoutines::x86::base64_vbmi_pack_vec_addr()), Assembler::AVX_512bit, r13);
6237
6238 __ cmpl(length, 63);
6239 __ jcc(Assembler::lessEqual, L_finalBit);
6240
6241 __ mov64(rax, 0x0000ffffffffffff);
6242 __ kmovql(k2, rax);
6243
6244 __ align32();
6245 __ BIND(L_process64Loop);
6246
6247 // Handle first 64-byte block
6248
6249 __ evmovdquq(input0, Address(source, start_offset), Assembler::AVX_512bit);
6250 __ evmovdquq(translated0, lookup_lo, Assembler::AVX_512bit);
6251 __ evpermt2b(translated0, input0, lookup_hi, Assembler::AVX_512bit);
6252
6253 __ vpor(errorvec, translated0, input0, Assembler::AVX_512bit);
6254
6255 // Check for error and bomb out before updating dest
6256 __ evpmovb2m(k3, errorvec, Assembler::AVX_512bit);
6257 __ kortestql(k3, k3);
6258 __ jcc(Assembler::notZero, L_exit);
6259
6260 // Pack output register, selecting correct byte ordering
6261 __ vpmaddubsw(merge_ab_bc0, translated0, pack16_op, Assembler::AVX_512bit);
6262 __ vpmaddwd(merged0, merge_ab_bc0, pack32_op, Assembler::AVX_512bit);
6263 __ vpermb(merged0, pack24bits, merged0, Assembler::AVX_512bit);
6264
6265 __ evmovdqub(Address(dest, dp), k2, merged0, true, Assembler::AVX_512bit);
6266
6267 __ subl(length, 64);
6268 __ addptr(source, 64);
6269 __ addptr(dest, 48);
6270
6271 __ cmpl(length, 64);
6272 __ jcc(Assembler::greaterEqual, L_process64Loop);
6273
6274 __ cmpl(length, 0);
6275 __ jcc(Assembler::lessEqual, L_exit);
6276
6277 __ BIND(L_finalBit);
6278 // Now have 1 to 63 bytes left to decode
6279
      // I was going to let Java take care of the final fragment;
      // however, it would repeatedly call this routine for every 4 bytes
      // of input data, so handle the rest here.
6283 __ movq(rax, -1);
6284 __ bzhiq(rax, rax, length); // Input mask in rax
6285
6286 __ movl(output_size, length);
6287 __ shrl(output_size, 2); // Find (len / 4) * 3 (output length)
6288 __ lea(output_size, Address(output_size, output_size, Address::times_2, 0));
6289 // output_size in r13
6290
6291 // Strip pad characters, if any, and adjust length and mask
6292 __ cmpb(Address(source, length, Address::times_1, -1), '=');
6293 __ jcc(Assembler::equal, L_padding);
6294
6295 __ BIND(L_donePadding);
6296
      // Build the store masks: input_mask keeps the low 'length' bytes of the load,
      // output_mask keeps the low 'output_size' bytes of the store.
6298 __ kmovql(input_mask, rax);
6299 __ movq(output_mask, -1);
6300 __ bzhiq(output_mask, output_mask, output_size);
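      // For example, for length = 20 the input mask is the low 20 bits of rax
      // (0xfffff) and output_size = (20 >> 2) * 3 = 15, so output_mask = 0x7fff;
      // a trailing '=' would have dropped output_size to 14 and removed one bit
      // from the input mask.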
6301
6302 // Load initial input with all valid base64 characters. Will be used
6303 // in merging source bytes to avoid masking when determining if an error occurred.
6304 __ movl(rax, 0x61616161);
6305 __ evpbroadcastd(input_initial_valid_b64, rax, Assembler::AVX_512bit);
6306
6307 // A register containing all invalid base64 decoded values
6308 __ movl(rax, 0x80808080);
6309 __ evpbroadcastd(invalid_b64, rax, Assembler::AVX_512bit);
6310
      // At this point:
      //   input_mask  - k1
      //   output_size - r13
      //   output_mask - r15
      //   zmm0 - input_initial_valid_b64 (0x61616161 broadcast)
      //   zmm1 - invalid_b64 (0x80808080 broadcast)
      //   zmm4 - pack24bits permute vector
      //   zmm5 - lookup_lo
      //   zmm6 - lookup_hi
      //   zmm8 - pack32_op (0x00011000)
      //   zmm9 - pack16_op (0x01400140)
6324
6325 // Load only the bytes from source, merging into our "fully-valid" register
6326 __ evmovdqub(input_initial_valid_b64, input_mask, Address(source, start_offset, Address::times_1, 0x0), true, Assembler::AVX_512bit);
6327
6328 // Decode all bytes within our merged input
6329 __ evmovdquq(tmp, lookup_lo, Assembler::AVX_512bit);
6330 __ evpermt2b(tmp, input_initial_valid_b64, lookup_hi, Assembler::AVX_512bit);
6331 __ vporq(mask, tmp, input_initial_valid_b64, Assembler::AVX_512bit);
6332
6333 // Check for error. Compare (decoded | initial) to all invalid.
6334 // If any bytes have their high-order bit set, then we have an error.
6335 __ evptestmb(k2, mask, invalid_b64, Assembler::AVX_512bit);
6336 __ kortestql(k2, k2);
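      // OR-ing in the raw source bytes also catches input >= 0x80 (non-ASCII):
      // e.g. a 0xc3 source byte keeps bit 7 set in the merged value, which sets a
      // bit of k2 and sends us to the scalar loop below.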
6337
6338 // If we have an error, use the brute force loop to decode what we can (4-byte chunks).
6339 __ jcc(Assembler::notZero, L_bruteForce);
6340
6341 // Shuffle output bytes
6342 __ vpmaddubsw(tmp, tmp, pack16_op, Assembler::AVX_512bit);
6343 __ vpmaddwd(tmp, tmp, pack32_op, Assembler::AVX_512bit);
6344
6345 __ vpermb(tmp, pack24bits, tmp, Assembler::AVX_512bit);
6346 __ kmovql(k1, output_mask);
6347 __ evmovdqub(Address(dest, dp), k1, tmp, true, Assembler::AVX_512bit);
6348
6349 __ addptr(dest, output_size);
6350
6351 __ BIND(L_exit);
6352 __ vzeroupper();
6353 __ pop(rax); // Get original dest value
6354 __ subptr(dest, rax); // Number of bytes converted
6355 __ movptr(rax, dest);
6356 __ pop(rbx);
6357 __ pop(r15);
6358 __ pop(r14);
6359 __ pop(r13);
6360 __ pop(r12);
6361 __ leave();
6362 __ ret(0);
6363
6364 __ BIND(L_loadURL);
6365 __ evmovdquq(lookup_lo, ExternalAddress(StubRoutines::x86::base64_vbmi_lookup_lo_url_addr()), Assembler::AVX_512bit, r13);
6366 __ evmovdquq(lookup_hi, ExternalAddress(StubRoutines::x86::base64_vbmi_lookup_hi_url_addr()), Assembler::AVX_512bit, r13);
6367 __ jmp(L_continue);
6368
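      // Padding: for each trailing '=' drop one output byte and one bit from the
      // input mask computed above (e.g. "QUI=" yields 2 bytes, "QQ==" yields 1).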
6369 __ BIND(L_padding);
6370 __ decrementq(output_size, 1);
6371 __ shrq(rax, 1);
6372
6373 __ cmpb(Address(source, length, Address::times_1, -2), '=');
6374 __ jcc(Assembler::notEqual, L_donePadding);
6375
6376 __ decrementq(output_size, 1);
6377 __ shrq(rax, 1);
6378 __ jmp(L_donePadding);
6379
6380 __ align32();
6381 __ BIND(L_bruteForce);
6382 } // End of if(avx512_vbmi)
6383
6384 // Use non-AVX code to decode 4-byte chunks into 3 bytes of output
6385
6386 // Register state (Linux):
6387 // r12-15 - saved on stack
6388 // rdi - src
6389 // rsi - sp
6390 // rdx - sl
6391 // rcx - dst
6392 // r8 - dp
6393 // r9 - isURL
6394
6395 // Register state (Windows):
6396 // r12-15 - saved on stack
6397 // rcx - src
6398 // rdx - sp
6399 // r8 - sl
6400 // r9 - dst
6401 // r12 - dp
6402 // r10 - isURL
6403
6404 // Registers (common):
6405 // length (r14) - bytes in src
6406
6407 const Register decode_table = r11;
6408 const Register out_byte_count = rbx;
6409 const Register byte1 = r13;
6410 const Register byte2 = r15;
6411 const Register byte3 = WINDOWS_ONLY(r8) NOT_WINDOWS(rdx);
6412 const Register byte4 = WINDOWS_ONLY(r10) NOT_WINDOWS(r9);
6413
6414 __ shrl(length, 2); // Multiple of 4 bytes only - length is # 4-byte chunks
6415 __ cmpl(length, 0);
6416 __ jcc(Assembler::lessEqual, L_exit_no_vzero);
6417
6418 __ shll(isURL, 8); // index into decode table based on isURL
6419 __ lea(decode_table, ExternalAddress(StubRoutines::x86::base64_decoding_table_addr()));
6420 __ addptr(decode_table, isURL);
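    // e.g. isURL = 1 adds 256, selecting the URL-safe table emitted right after
    // the standard one in base64_decoding_table_addr() above.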
6421
6422 __ jmp(L_bottomLoop);
6423
6424 __ align32();
6425 __ BIND(L_forceLoop);
6426 __ shll(byte1, 18);
6427 __ shll(byte2, 12);
6428 __ shll(byte3, 6);
6429 __ orl(byte1, byte2);
6430 __ orl(byte1, byte3);
6431 __ orl(byte1, byte4);
6432
6433 __ addptr(source, 4);
6434
6435 __ movb(Address(dest, dp, Address::times_1, 2), byte1);
6436 __ shrl(byte1, 8);
6437 __ movb(Address(dest, dp, Address::times_1, 1), byte1);
6438 __ shrl(byte1, 8);
6439 __ movb(Address(dest, dp, Address::times_1, 0), byte1);
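    // e.g. "TWFu" -> 19, 22, 5, 46 -> byte1 = 0x4d616e, stored as 'M','a','n'
    //      (low byte to dest[2] first, then the higher bytes).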
6440
6441 __ addptr(dest, 3);
6442 __ decrementl(length, 1);
6443 __ jcc(Assembler::zero, L_exit_no_vzero);
6444
6445 __ BIND(L_bottomLoop);
6446 __ load_unsigned_byte(byte1, Address(source, start_offset, Address::times_1, 0x00));
6447 __ load_unsigned_byte(byte2, Address(source, start_offset, Address::times_1, 0x01));
6448 __ load_signed_byte(byte1, Address(decode_table, byte1));
6449 __ load_signed_byte(byte2, Address(decode_table, byte2));
6450 __ load_unsigned_byte(byte3, Address(source, start_offset, Address::times_1, 0x02));
6451 __ load_unsigned_byte(byte4, Address(source, start_offset, Address::times_1, 0x03));
6452 __ load_signed_byte(byte3, Address(decode_table, byte3));
6453 __ load_signed_byte(byte4, Address(decode_table, byte4));
6454
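    // The table maps invalid characters to -1 (0xff sign-extended), so the OR of
    // the four values is negative exactly when any character was invalid; in that
    // case fall out of the loop without storing this group.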
6455 __ mov(rax, byte1);
6456 __ orl(rax, byte2);
6457 __ orl(rax, byte3);
6458 __ orl(rax, byte4);
6459 __ jcc(Assembler::positive, L_forceLoop);
6460
6461 __ BIND(L_exit_no_vzero);
6462 __ pop(rax); // Get original dest value
6463 __ subptr(dest, rax); // Number of bytes converted
6464 __ movptr(rax, dest);
6465 __ pop(rbx);
6466 __ pop(r15);
6467 __ pop(r14);
6468 __ pop(r13);
6469 __ pop(r12);
6470 __ leave();
6471 __ ret(0);
6472
6473 return start;
6474 }
6475
6476
6477 /**
6478 * Arguments:
6479 *
6480 * Inputs:
6481 * c_rarg0 - int crc
6482 * c_rarg1 - byte* buf
6483 * c_rarg2 - int length
6484 *
6485 * Output:
6486 * rax - int crc result
6487 */
6488 address generate_updateBytesCRC32() {
6489 assert(UseCRC32Intrinsics, "need AVX and CLMUL instructions");
6490
6491 __ align(CodeEntryAlignment);
6492 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
6493
6494 address start = __ pc();
6495 // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
6496 // Unix: rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
6497 // rscratch1: r10
6498 const Register crc = c_rarg0; // crc
6499 const Register buf = c_rarg1; // source java byte array address
6500 const Register len = c_rarg2; // length
6501 const Register table = c_rarg3; // crc_table address (reuse register)
6502 const Register tmp1 = r11;
6503 const Register tmp2 = r10;
6504 assert_different_registers(crc, buf, len, table, tmp1, tmp2, rax);
6505
6506 BLOCK_COMMENT("Entry:");
6507 __ enter(); // required for proper stackwalking of RuntimeStub frame
6508
6509 if (VM_Version::supports_sse4_1() && VM_Version::supports_avx512_vpclmulqdq() &&
6510 VM_Version::supports_avx512bw() &&
6511 VM_Version::supports_avx512vl()) {
      // The constants used in the CRC32 algorithm require the one's complement of the initial crc value.
      // However, the constant table for CRC32-C assumes the original crc value. Account for this
      // difference before calling and after returning.
6515 __ lea(table, ExternalAddress(StubRoutines::x86::crc_table_avx512_addr()));
6516 __ notl(crc);
6517 __ kernel_crc32_avx512(crc, buf, len, table, tmp1, tmp2);
6518 __ notl(crc);
6519 } else {
6520 __ kernel_crc32(crc, buf, len, table, tmp1);
6521 }
6522
6523 __ movl(rax, crc);
6524 __ vzeroupper();
6525 __ leave(); // required for proper stackwalking of RuntimeStub frame
6526 __ ret(0);
6527
6528 return start;
6529 }
6530
6531 /**
6532 * Arguments:
6533 *
6534 * Inputs:
6535 * c_rarg0 - int crc
6536 * c_rarg1 - byte* buf
6537 * c_rarg2 - long length
6538 * c_rarg3 - table_start - optional (present only when doing a library_call,
6539 * not used by x86 algorithm)
6540 *
6541 * Output:
6542 * rax - int crc result
6543 */
6544 address generate_updateBytesCRC32C(bool is_pclmulqdq_supported) {
6545 assert(UseCRC32CIntrinsics, "need SSE4_2");
6546 __ align(CodeEntryAlignment);
6547 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
6548 address start = __ pc();
    //reg.arg   int#0        int#1        int#2        int#3        int#4        int#5        float regs
    //Windows   RCX          RDX          R8           R9           none         none         XMM0..XMM3
    //Lin / Sol RDI          RSI          RDX          RCX          R8           R9           XMM0..XMM7
6552 const Register crc = c_rarg0; // crc
6553 const Register buf = c_rarg1; // source java byte array address
6554 const Register len = c_rarg2; // length
6555 const Register a = rax;
6556 const Register j = r9;
6557 const Register k = r10;
6558 const Register l = r11;
6559 #ifdef _WIN64
6560 const Register y = rdi;
6561 const Register z = rsi;
6562 #else
6563 const Register y = rcx;
6564 const Register z = r8;
6565 #endif
6566 assert_different_registers(crc, buf, len, a, j, k, l, y, z);
6567
6568 BLOCK_COMMENT("Entry:");
6569 __ enter(); // required for proper stackwalking of RuntimeStub frame
6570 if (VM_Version::supports_sse4_1() && VM_Version::supports_avx512_vpclmulqdq() &&
6571 VM_Version::supports_avx512bw() &&
6572 VM_Version::supports_avx512vl()) {
6573 __ lea(j, ExternalAddress(StubRoutines::x86::crc32c_table_avx512_addr()));
6574 __ kernel_crc32_avx512(crc, buf, len, j, l, k);
6575 } else {
6576 #ifdef _WIN64
6577 __ push(y);
6578 __ push(z);
6579 #endif
6580 __ crc32c_ipl_alg2_alt2(crc, buf, len,
6581 a, j, k,
6582 l, y, z,
6583 c_farg0, c_farg1, c_farg2,
6584 is_pclmulqdq_supported);
6585 #ifdef _WIN64
6586 __ pop(z);
6587 __ pop(y);
6588 #endif
6589 }
6590 __ movl(rax, crc);
6591 __ vzeroupper();
6592 __ leave(); // required for proper stackwalking of RuntimeStub frame
6593 __ ret(0);
6594
6595 return start;
6596 }
6597
6598
  /**
6600 * Arguments:
6601 *
6602 * Inputs:
6603 * c_rarg0 - int adler
6604 * c_rarg1 - byte* buff
6605 * c_rarg2 - int len
6606 *
6607 * Output:
6608 * rax - int adler result
6609 */
6610
6611 address generate_updateBytesAdler32() {
6612 assert(UseAdler32Intrinsics, "need AVX2");
6613
6614 __ align(CodeEntryAlignment);
6615 StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
6616
6617 address start = __ pc();
6618
6619 const Register data = r9;
6620 const Register size = r10;
6621
6622 const XMMRegister yshuf0 = xmm6;
6623 const XMMRegister yshuf1 = xmm7;
6624 assert_different_registers(c_rarg0, c_rarg1, c_rarg2, data, size);
6625
6626 BLOCK_COMMENT("Entry:");
6627 __ enter(); // required for proper stackwalking of RuntimeStub frame
6628
6629 __ vmovdqu(yshuf0, ExternalAddress((address) StubRoutines::x86::_adler32_shuf0_table), r9);
6630 __ vmovdqu(yshuf1, ExternalAddress((address) StubRoutines::x86::_adler32_shuf1_table), r9);
6631 __ movptr(data, c_rarg1); //data
6632 __ movl(size, c_rarg2); //length
6633 __ updateBytesAdler32(c_rarg0, data, size, yshuf0, yshuf1, ExternalAddress((address) StubRoutines::x86::_adler32_ascale_table));
6634 __ leave();
6635 __ ret(0);
6636 return start;
6637 }
6638
6639 /**
6640 * Arguments:
6641 *
6642 * Input:
6643 * c_rarg0 - x address
6644 * c_rarg1 - x length
6645 * c_rarg2 - y address
6646 * c_rarg3 - y length
6647 * not Win64
6648 * c_rarg4 - z address
6649 * c_rarg5 - z length
6650 * Win64
6651 * rsp+40 - z address
6652 * rsp+48 - z length
6653 */
6654 address generate_multiplyToLen() {
6655 __ align(CodeEntryAlignment);
6656 StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
6657
6658 address start = __ pc();
6659 // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
6660 // Unix: rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
6661 const Register x = rdi;
6662 const Register xlen = rax;
6663 const Register y = rsi;
6664 const Register ylen = rcx;
6665 const Register z = r8;
6666 const Register zlen = r11;
6667
6668 // Next registers will be saved on stack in multiply_to_len().
6669 const Register tmp1 = r12;
6670 const Register tmp2 = r13;
6671 const Register tmp3 = r14;
6672 const Register tmp4 = r15;
6673 const Register tmp5 = rbx;
6674
6675 BLOCK_COMMENT("Entry:");
6676 __ enter(); // required for proper stackwalking of RuntimeStub frame
6677
6678 #ifndef _WIN64
6679 __ movptr(zlen, r9); // Save r9 in r11 - zlen
6680 #endif
6681 setup_arg_regs(4); // x => rdi, xlen => rsi, y => rdx
6682 // ylen => rcx, z => r8, zlen => r11
6683 // r9 and r10 may be used to save non-volatile registers
6684 #ifdef _WIN64
6685 // last 2 arguments (#4, #5) are on stack on Win64
6686 __ movptr(z, Address(rsp, 6 * wordSize));
6687 __ movptr(zlen, Address(rsp, 7 * wordSize));
6688 #endif
6689
6690 __ movptr(xlen, rsi);
6691 __ movptr(y, rdx);
6692 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5);
6693
6694 restore_arg_regs();
6695
6696 __ leave(); // required for proper stackwalking of RuntimeStub frame
6697 __ ret(0);
6698
6699 return start;
6700 }
6701
6702 /**
6703 * Arguments:
6704 *
6705 * Input:
6706 * c_rarg0 - obja address
6707 * c_rarg1 - objb address
   *   c_rarg2   - length   length
   *   c_rarg3   - scale    log2_array_indexscale
   *
   *  Output:
   *        rax   - >= 0: index of first mismatch, < 0: bitwise complement of the tail
6713 */
6714 address generate_vectorizedMismatch() {
6715 __ align(CodeEntryAlignment);
6716 StubCodeMark mark(this, "StubRoutines", "vectorizedMismatch");
6717 address start = __ pc();
6718
6719 BLOCK_COMMENT("Entry:");
6720 __ enter();
6721
6722 #ifdef _WIN64 // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
6723 const Register scale = c_rarg0; //rcx, will exchange with r9
6724 const Register objb = c_rarg1; //rdx
6725 const Register length = c_rarg2; //r8
6726 const Register obja = c_rarg3; //r9
6727 __ xchgq(obja, scale); //now obja and scale contains the correct contents
6728
6729 const Register tmp1 = r10;
6730 const Register tmp2 = r11;
6731 #endif
6732 #ifndef _WIN64 // Unix: rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
6733 const Register obja = c_rarg0; //U:rdi
6734 const Register objb = c_rarg1; //U:rsi
6735 const Register length = c_rarg2; //U:rdx
6736 const Register scale = c_rarg3; //U:rcx
6737 const Register tmp1 = r8;
6738 const Register tmp2 = r9;
6739 #endif
6740 const Register result = rax; //return value
6741 const XMMRegister vec0 = xmm0;
6742 const XMMRegister vec1 = xmm1;
6743 const XMMRegister vec2 = xmm2;
6744
6745 __ vectorized_mismatch(obja, objb, length, scale, result, tmp1, tmp2, vec0, vec1, vec2);
6746
6747 __ vzeroupper();
6748 __ leave();
6749 __ ret(0);
6750
6751 return start;
6752 }
6753
6754 /**
6755 * Arguments:
6756 *
6757 // Input:
6758 // c_rarg0 - x address
6759 // c_rarg1 - x length
6760 // c_rarg2 - z address
6761 // c_rarg3 - z length
6762 *
6763 */
6764 address generate_squareToLen() {
6765
6766 __ align(CodeEntryAlignment);
6767 StubCodeMark mark(this, "StubRoutines", "squareToLen");
6768
6769 address start = __ pc();
6770 // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
6771 // Unix: rdi, rsi, rdx, rcx (c_rarg0, c_rarg1, ...)
6772 const Register x = rdi;
6773 const Register len = rsi;
6774 const Register z = r8;
6775 const Register zlen = rcx;
6776
6777 const Register tmp1 = r12;
6778 const Register tmp2 = r13;
6779 const Register tmp3 = r14;
6780 const Register tmp4 = r15;
6781 const Register tmp5 = rbx;
6782
6783 BLOCK_COMMENT("Entry:");
6784 __ enter(); // required for proper stackwalking of RuntimeStub frame
6785
6786 setup_arg_regs(4); // x => rdi, len => rsi, z => rdx
6787 // zlen => rcx
6788 // r9 and r10 may be used to save non-volatile registers
6789 __ movptr(r8, rdx);
6790 __ square_to_len(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, rdx, rax);
6791
6792 restore_arg_regs();
6793
6794 __ leave(); // required for proper stackwalking of RuntimeStub frame
6795 __ ret(0);
6796
6797 return start;
6798 }
6799
6800 address generate_method_entry_barrier() {
6801 __ align(CodeEntryAlignment);
6802 StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier");
6803
6804 Label deoptimize_label;
6805
6806 address start = __ pc();
6807
6808 __ push(-1); // cookie, this is used for writing the new rsp when deoptimizing
6809
6810 BLOCK_COMMENT("Entry:");
6811 __ enter(); // save rbp
6812
6813 // save c_rarg0, because we want to use that value.
6814 // We could do without it but then we depend on the number of slots used by pusha
6815 __ push(c_rarg0);
6816
6817 __ lea(c_rarg0, Address(rsp, wordSize * 3)); // 1 for cookie, 1 for rbp, 1 for c_rarg0 - this should be the return address
6818
6819 __ pusha();
6820
6821 // The method may have floats as arguments, and we must spill them before calling
6822 // the VM runtime.
6823 assert(Argument::n_float_register_parameters_j == 8, "Assumption");
6824 const int xmm_size = wordSize * 2;
6825 const int xmm_spill_size = xmm_size * Argument::n_float_register_parameters_j;
6826 __ subptr(rsp, xmm_spill_size);
6827 __ movdqu(Address(rsp, xmm_size * 7), xmm7);
6828 __ movdqu(Address(rsp, xmm_size * 6), xmm6);
6829 __ movdqu(Address(rsp, xmm_size * 5), xmm5);
6830 __ movdqu(Address(rsp, xmm_size * 4), xmm4);
6831 __ movdqu(Address(rsp, xmm_size * 3), xmm3);
6832 __ movdqu(Address(rsp, xmm_size * 2), xmm2);
6833 __ movdqu(Address(rsp, xmm_size * 1), xmm1);
6834 __ movdqu(Address(rsp, xmm_size * 0), xmm0);
6835
6836 __ call_VM_leaf(CAST_FROM_FN_PTR(address, static_cast<int (*)(address*)>(BarrierSetNMethod::nmethod_stub_entry_barrier)), 1);
6837
6838 __ movdqu(xmm0, Address(rsp, xmm_size * 0));
6839 __ movdqu(xmm1, Address(rsp, xmm_size * 1));
6840 __ movdqu(xmm2, Address(rsp, xmm_size * 2));
6841 __ movdqu(xmm3, Address(rsp, xmm_size * 3));
6842 __ movdqu(xmm4, Address(rsp, xmm_size * 4));
6843 __ movdqu(xmm5, Address(rsp, xmm_size * 5));
6844 __ movdqu(xmm6, Address(rsp, xmm_size * 6));
6845 __ movdqu(xmm7, Address(rsp, xmm_size * 7));
6846 __ addptr(rsp, xmm_spill_size);
6847
6848 __ cmpl(rax, 1); // 1 means deoptimize
6849 __ jcc(Assembler::equal, deoptimize_label);
6850
6851 __ popa();
6852 __ pop(c_rarg0);
6853
6854 __ leave();
6855
6856 __ addptr(rsp, 1 * wordSize); // cookie
6857 __ ret(0);
6858
6859
6860 __ BIND(deoptimize_label);
6861
6862 __ popa();
6863 __ pop(c_rarg0);
6864
6865 __ leave();
6866
    // This can be taken out, but it is good for verification purposes: getting a SIGSEGV
    // here while still having a correct stack is valuable when debugging.
6869 __ testptr(rsp, Address(rsp, 0));
6870
6871 __ movptr(rsp, Address(rsp, 0)); // new rsp was written in the barrier
6872 __ jmp(Address(rsp, -1 * wordSize)); // jmp target should be callers verified_entry_point
6873
6874 return start;
6875 }
6876
6877 /**
6878 * Arguments:
6879 *
6880 * Input:
6881 * c_rarg0 - out address
6882 * c_rarg1 - in address
6883 * c_rarg2 - offset
6884 * c_rarg3 - len
6885 * not Win64
6886 * c_rarg4 - k
6887 * Win64
6888 * rsp+40 - k
6889 */
6890 address generate_mulAdd() {
6891 __ align(CodeEntryAlignment);
6892 StubCodeMark mark(this, "StubRoutines", "mulAdd");
6893
6894 address start = __ pc();
6895 // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
6896 // Unix: rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
6897 const Register out = rdi;
6898 const Register in = rsi;
6899 const Register offset = r11;
6900 const Register len = rcx;
6901 const Register k = r8;
6902
6903 // Next registers will be saved on stack in mul_add().
6904 const Register tmp1 = r12;
6905 const Register tmp2 = r13;
6906 const Register tmp3 = r14;
6907 const Register tmp4 = r15;
6908 const Register tmp5 = rbx;
6909
6910 BLOCK_COMMENT("Entry:");
6911 __ enter(); // required for proper stackwalking of RuntimeStub frame
6912
6913 setup_arg_regs(4); // out => rdi, in => rsi, offset => rdx
6914 // len => rcx, k => r8
6915 // r9 and r10 may be used to save non-volatile registers
6916 #ifdef _WIN64
6917 // last argument is on stack on Win64
6918 __ movl(k, Address(rsp, 6 * wordSize));
6919 #endif
6920 __ movptr(r11, rdx); // move offset in rdx to offset(r11)
6921 __ mul_add(out, in, offset, len, k, tmp1, tmp2, tmp3, tmp4, tmp5, rdx, rax);
6922
6923 restore_arg_regs();
6924
6925 __ leave(); // required for proper stackwalking of RuntimeStub frame
6926 __ ret(0);
6927
6928 return start;
6929 }
6930
6931 address generate_bigIntegerRightShift() {
6932 __ align(CodeEntryAlignment);
6933 StubCodeMark mark(this, "StubRoutines", "bigIntegerRightShiftWorker");
6934
6935 address start = __ pc();
6936 Label Shift512Loop, ShiftTwo, ShiftTwoLoop, ShiftOne, Exit;
6937 // For Unix, the arguments are as follows: rdi, rsi, rdx, rcx, r8.
6938 const Register newArr = rdi;
6939 const Register oldArr = rsi;
6940 const Register newIdx = rdx;
6941 const Register shiftCount = rcx; // It was intentional to have shiftCount in rcx since it is used implicitly for shift.
6942 const Register totalNumIter = r8;
6943
6944 // For windows, we use r9 and r10 as temps to save rdi and rsi. Thus we cannot allocate them for our temps.
6945 // For everything else, we prefer using r9 and r10 since we do not have to save them before use.
6946 const Register tmp1 = r11; // Caller save.
6947 const Register tmp2 = rax; // Caller save.
6948 const Register tmp3 = WINDOWS_ONLY(r12) NOT_WINDOWS(r9); // Windows: Callee save. Linux: Caller save.
6949 const Register tmp4 = WINDOWS_ONLY(r13) NOT_WINDOWS(r10); // Windows: Callee save. Linux: Caller save.
6950 const Register tmp5 = r14; // Callee save.
6951 const Register tmp6 = r15;
6952
6953 const XMMRegister x0 = xmm0;
6954 const XMMRegister x1 = xmm1;
6955 const XMMRegister x2 = xmm2;
6956
6957 BLOCK_COMMENT("Entry:");
6958 __ enter(); // required for proper stackwalking of RuntimeStub frame
6959
6960 #ifdef _WINDOWS
6961 setup_arg_regs(4);
6962 // For windows, since last argument is on stack, we need to move it to the appropriate register.
6963 __ movl(totalNumIter, Address(rsp, 6 * wordSize));
6964 // Save callee save registers.
6965 __ push(tmp3);
6966 __ push(tmp4);
6967 #endif
6968 __ push(tmp5);
6969
6970 // Rename temps used throughout the code.
6971 const Register idx = tmp1;
6972 const Register nIdx = tmp2;
6973
6974 __ xorl(idx, idx);
6975
6976 // Start right shift from end of the array.
6977 // For example, if #iteration = 4 and newIdx = 1
    // then dest[4] = src[4] >> shiftCount | src[3] << (32 - shiftCount)
    // if #iteration = 4 and newIdx = 0
    // then dest[3] = src[4] >> shiftCount | src[3] << (32 - shiftCount)
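    // e.g. with shiftCount = 4, src[4] = 0x9abcdef0 and src[3] = 0x12345678 give
    //      (0x9abcdef0 >> 4) | (0x12345678 << 28) = 0x89abcdef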
6981 __ movl(idx, totalNumIter);
6982 __ movl(nIdx, idx);
6983 __ addl(nIdx, newIdx);
6984
6985 // If vectorization is enabled, check if the number of iterations is at least 64
    // If not, then go to ShiftTwo, which processes 2 iterations at a time
6987 if (VM_Version::supports_avx512_vbmi2()) {
6988 __ cmpptr(totalNumIter, (AVX3Threshold/64));
6989 __ jcc(Assembler::less, ShiftTwo);
6990
6991 if (AVX3Threshold < 16 * 64) {
6992 __ cmpl(totalNumIter, 16);
6993 __ jcc(Assembler::less, ShiftTwo);
6994 }
6995 __ evpbroadcastd(x0, shiftCount, Assembler::AVX_512bit);
6996 __ subl(idx, 16);
6997 __ subl(nIdx, 16);
6998 __ BIND(Shift512Loop);
6999 __ evmovdqul(x2, Address(oldArr, idx, Address::times_4, 4), Assembler::AVX_512bit);
7000 __ evmovdqul(x1, Address(oldArr, idx, Address::times_4), Assembler::AVX_512bit);
7001 __ vpshrdvd(x2, x1, x0, Assembler::AVX_512bit);
7002 __ evmovdqul(Address(newArr, nIdx, Address::times_4), x2, Assembler::AVX_512bit);
7003 __ subl(nIdx, 16);
7004 __ subl(idx, 16);
7005 __ jcc(Assembler::greaterEqual, Shift512Loop);
7006 __ addl(idx, 16);
7007 __ addl(nIdx, 16);
7008 }
7009 __ BIND(ShiftTwo);
7010 __ cmpl(idx, 2);
7011 __ jcc(Assembler::less, ShiftOne);
7012 __ subl(idx, 2);
7013 __ subl(nIdx, 2);
7014 __ BIND(ShiftTwoLoop);
7015 __ movl(tmp5, Address(oldArr, idx, Address::times_4, 8));
7016 __ movl(tmp4, Address(oldArr, idx, Address::times_4, 4));
7017 __ movl(tmp3, Address(oldArr, idx, Address::times_4));
7018 __ shrdl(tmp5, tmp4);
7019 __ shrdl(tmp4, tmp3);
7020 __ movl(Address(newArr, nIdx, Address::times_4, 4), tmp5);
7021 __ movl(Address(newArr, nIdx, Address::times_4), tmp4);
7022 __ subl(nIdx, 2);
7023 __ subl(idx, 2);
7024 __ jcc(Assembler::greaterEqual, ShiftTwoLoop);
7025 __ addl(idx, 2);
7026 __ addl(nIdx, 2);
7027
7028 // Do the last iteration
7029 __ BIND(ShiftOne);
7030 __ cmpl(idx, 1);
7031 __ jcc(Assembler::less, Exit);
7032 __ subl(idx, 1);
7033 __ subl(nIdx, 1);
7034 __ movl(tmp4, Address(oldArr, idx, Address::times_4, 4));
7035 __ movl(tmp3, Address(oldArr, idx, Address::times_4));
7036 __ shrdl(tmp4, tmp3);
7037 __ movl(Address(newArr, nIdx, Address::times_4), tmp4);
7038 __ BIND(Exit);
7039 __ vzeroupper();
7040 // Restore callee save registers.
7041 __ pop(tmp5);
7042 #ifdef _WINDOWS
7043 __ pop(tmp4);
7044 __ pop(tmp3);
7045 restore_arg_regs();
7046 #endif
7047 __ leave(); // required for proper stackwalking of RuntimeStub frame
7048 __ ret(0);
7049 return start;
7050 }
7051
7052 /**
7053 * Arguments:
7054 *
7055 * Input:
7056 * c_rarg0 - newArr address
7057 * c_rarg1 - oldArr address
7058 * c_rarg2 - newIdx
7059 * c_rarg3 - shiftCount
7060 * not Win64
7061 * c_rarg4 - numIter
7062 * Win64
   *    rsp+40  - numIter
7064 */
7065 address generate_bigIntegerLeftShift() {
7066 __ align(CodeEntryAlignment);
7067 StubCodeMark mark(this, "StubRoutines", "bigIntegerLeftShiftWorker");
7068 address start = __ pc();
7069 Label Shift512Loop, ShiftTwo, ShiftTwoLoop, ShiftOne, Exit;
7070 // For Unix, the arguments are as follows: rdi, rsi, rdx, rcx, r8.
7071 const Register newArr = rdi;
7072 const Register oldArr = rsi;
7073 const Register newIdx = rdx;
7074 const Register shiftCount = rcx; // It was intentional to have shiftCount in rcx since it is used implicitly for shift.
7075 const Register totalNumIter = r8;
7076 // For windows, we use r9 and r10 as temps to save rdi and rsi. Thus we cannot allocate them for our temps.
7077 // For everything else, we prefer using r9 and r10 since we do not have to save them before use.
7078 const Register tmp1 = r11; // Caller save.
7079 const Register tmp2 = rax; // Caller save.
7080 const Register tmp3 = WINDOWS_ONLY(r12) NOT_WINDOWS(r9); // Windows: Callee save. Linux: Caller save.
7081 const Register tmp4 = WINDOWS_ONLY(r13) NOT_WINDOWS(r10); // Windows: Callee save. Linux: Caller save.
7082 const Register tmp5 = r14; // Callee save.
7083
7084 const XMMRegister x0 = xmm0;
7085 const XMMRegister x1 = xmm1;
7086 const XMMRegister x2 = xmm2;
7087 BLOCK_COMMENT("Entry:");
7088 __ enter(); // required for proper stackwalking of RuntimeStub frame
7089
7090 #ifdef _WINDOWS
7091 setup_arg_regs(4);
7092 // For windows, since last argument is on stack, we need to move it to the appropriate register.
7093 __ movl(totalNumIter, Address(rsp, 6 * wordSize));
7094 // Save callee save registers.
7095 __ push(tmp3);
7096 __ push(tmp4);
7097 #endif
7098 __ push(tmp5);
7099
7100 // Rename temps used throughout the code
7101 const Register idx = tmp1;
7102 const Register numIterTmp = tmp2;
7103
7104 // Start idx from zero.
7105 __ xorl(idx, idx);
    // Compute an interior pointer for the new array. We do this so that we can use the same index for both the old and new arrays.
7107 __ lea(newArr, Address(newArr, newIdx, Address::times_4));
7108 __ movl(numIterTmp, totalNumIter);
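    // Each output word is dest[i] = src[i] << shiftCount | src[i+1] >> (32 - shiftCount);
    // e.g. shiftCount = 4, src[i] = 0x12345678, src[i+1] = 0x9abcdef0 -> 0x23456789.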
7109
7110 // If vectorization is enabled, check if the number of iterations is at least 64
7111 // If not, then go to ShiftTwo shifting two numbers at a time
7112 if (VM_Version::supports_avx512_vbmi2()) {
7113 __ cmpl(totalNumIter, (AVX3Threshold/64));
7114 __ jcc(Assembler::less, ShiftTwo);
7115
7116 if (AVX3Threshold < 16 * 64) {
7117 __ cmpl(totalNumIter, 16);
7118 __ jcc(Assembler::less, ShiftTwo);
7119 }
7120 __ evpbroadcastd(x0, shiftCount, Assembler::AVX_512bit);
7121 __ subl(numIterTmp, 16);
7122 __ BIND(Shift512Loop);
7123 __ evmovdqul(x1, Address(oldArr, idx, Address::times_4), Assembler::AVX_512bit);
7124 __ evmovdqul(x2, Address(oldArr, idx, Address::times_4, 0x4), Assembler::AVX_512bit);
7125 __ vpshldvd(x1, x2, x0, Assembler::AVX_512bit);
7126 __ evmovdqul(Address(newArr, idx, Address::times_4), x1, Assembler::AVX_512bit);
7127 __ addl(idx, 16);
7128 __ subl(numIterTmp, 16);
7129 __ jcc(Assembler::greaterEqual, Shift512Loop);
7130 __ addl(numIterTmp, 16);
7131 }
7132 __ BIND(ShiftTwo);
7133 __ cmpl(totalNumIter, 1);
7134 __ jcc(Assembler::less, Exit);
7135 __ movl(tmp3, Address(oldArr, idx, Address::times_4));
7136 __ subl(numIterTmp, 2);
7137 __ jcc(Assembler::less, ShiftOne);
7138
7139 __ BIND(ShiftTwoLoop);
7140 __ movl(tmp4, Address(oldArr, idx, Address::times_4, 0x4));
7141 __ movl(tmp5, Address(oldArr, idx, Address::times_4, 0x8));
7142 __ shldl(tmp3, tmp4);
7143 __ shldl(tmp4, tmp5);
7144 __ movl(Address(newArr, idx, Address::times_4), tmp3);
7145 __ movl(Address(newArr, idx, Address::times_4, 0x4), tmp4);
7146 __ movl(tmp3, tmp5);
7147 __ addl(idx, 2);
7148 __ subl(numIterTmp, 2);
7149 __ jcc(Assembler::greaterEqual, ShiftTwoLoop);
7150
7151 // Do the last iteration
7152 __ BIND(ShiftOne);
7153 __ addl(numIterTmp, 2);
7154 __ cmpl(numIterTmp, 1);
7155 __ jcc(Assembler::less, Exit);
7156 __ movl(tmp4, Address(oldArr, idx, Address::times_4, 0x4));
7157 __ shldl(tmp3, tmp4);
7158 __ movl(Address(newArr, idx, Address::times_4), tmp3);
7159
7160 __ BIND(Exit);
7161 __ vzeroupper();
7162 // Restore callee save registers.
7163 __ pop(tmp5);
7164 #ifdef _WINDOWS
7165 __ pop(tmp4);
7166 __ pop(tmp3);
7167 restore_arg_regs();
7168 #endif
7169 __ leave(); // required for proper stackwalking of RuntimeStub frame
7170 __ ret(0);
7171 return start;
7172 }
7173
7174 address generate_libmExp() {
7175 StubCodeMark mark(this, "StubRoutines", "libmExp");
7176
7177 address start = __ pc();
7178
7179 const XMMRegister x0 = xmm0;
7180 const XMMRegister x1 = xmm1;
7181 const XMMRegister x2 = xmm2;
7182 const XMMRegister x3 = xmm3;
7183
7184 const XMMRegister x4 = xmm4;
7185 const XMMRegister x5 = xmm5;
7186 const XMMRegister x6 = xmm6;
7187 const XMMRegister x7 = xmm7;
7188
7189 const Register tmp = r11;
7190
7191 BLOCK_COMMENT("Entry:");
7192 __ enter(); // required for proper stackwalking of RuntimeStub frame
7193
7194 __ fast_exp(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
7195
7196 __ leave(); // required for proper stackwalking of RuntimeStub frame
7197 __ ret(0);
7198
7199 return start;
7200
7201 }
7202
7203 address generate_libmLog() {
7204 StubCodeMark mark(this, "StubRoutines", "libmLog");
7205
7206 address start = __ pc();
7207
7208 const XMMRegister x0 = xmm0;
7209 const XMMRegister x1 = xmm1;
7210 const XMMRegister x2 = xmm2;
7211 const XMMRegister x3 = xmm3;
7212
7213 const XMMRegister x4 = xmm4;
7214 const XMMRegister x5 = xmm5;
7215 const XMMRegister x6 = xmm6;
7216 const XMMRegister x7 = xmm7;
7217
7218 const Register tmp1 = r11;
7219 const Register tmp2 = r8;
7220
7221 BLOCK_COMMENT("Entry:");
7222 __ enter(); // required for proper stackwalking of RuntimeStub frame
7223
7224 __ fast_log(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2);
7225
7226 __ leave(); // required for proper stackwalking of RuntimeStub frame
7227 __ ret(0);
7228
7229 return start;
7230
7231 }
7232
7233 address generate_libmLog10() {
7234 StubCodeMark mark(this, "StubRoutines", "libmLog10");
7235
7236 address start = __ pc();
7237
7238 const XMMRegister x0 = xmm0;
7239 const XMMRegister x1 = xmm1;
7240 const XMMRegister x2 = xmm2;
7241 const XMMRegister x3 = xmm3;
7242
7243 const XMMRegister x4 = xmm4;
7244 const XMMRegister x5 = xmm5;
7245 const XMMRegister x6 = xmm6;
7246 const XMMRegister x7 = xmm7;
7247
7248 const Register tmp = r11;
7249
7250 BLOCK_COMMENT("Entry:");
7251 __ enter(); // required for proper stackwalking of RuntimeStub frame
7252
7253 __ fast_log10(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
7254
7255 __ leave(); // required for proper stackwalking of RuntimeStub frame
7256 __ ret(0);
7257
7258 return start;
7259
7260 }
7261
7262 address generate_libmPow() {
7263 StubCodeMark mark(this, "StubRoutines", "libmPow");
7264
7265 address start = __ pc();
7266
7267 const XMMRegister x0 = xmm0;
7268 const XMMRegister x1 = xmm1;
7269 const XMMRegister x2 = xmm2;
7270 const XMMRegister x3 = xmm3;
7271
7272 const XMMRegister x4 = xmm4;
7273 const XMMRegister x5 = xmm5;
7274 const XMMRegister x6 = xmm6;
7275 const XMMRegister x7 = xmm7;
7276
7277 const Register tmp1 = r8;
7278 const Register tmp2 = r9;
7279 const Register tmp3 = r10;
7280 const Register tmp4 = r11;
7281
7282 BLOCK_COMMENT("Entry:");
7283 __ enter(); // required for proper stackwalking of RuntimeStub frame
7284
7285 __ fast_pow(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
7286
7287 __ leave(); // required for proper stackwalking of RuntimeStub frame
7288 __ ret(0);
7289
7290 return start;
7291
7292 }
7293
7294 address generate_libmSin() {
7295 StubCodeMark mark(this, "StubRoutines", "libmSin");
7296
7297 address start = __ pc();
7298
7299 const XMMRegister x0 = xmm0;
7300 const XMMRegister x1 = xmm1;
7301 const XMMRegister x2 = xmm2;
7302 const XMMRegister x3 = xmm3;
7303
7304 const XMMRegister x4 = xmm4;
7305 const XMMRegister x5 = xmm5;
7306 const XMMRegister x6 = xmm6;
7307 const XMMRegister x7 = xmm7;
7308
7309 const Register tmp1 = r8;
7310 const Register tmp2 = r9;
7311 const Register tmp3 = r10;
7312 const Register tmp4 = r11;
7313
7314 BLOCK_COMMENT("Entry:");
7315 __ enter(); // required for proper stackwalking of RuntimeStub frame
7316
7317 #ifdef _WIN64
7318 __ push(rsi);
7319 __ push(rdi);
7320 #endif
7321 __ fast_sin(x0, x1, x2, x3, x4, x5, x6, x7, rax, rbx, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
7322
7323 #ifdef _WIN64
7324 __ pop(rdi);
7325 __ pop(rsi);
7326 #endif
7327
7328 __ leave(); // required for proper stackwalking of RuntimeStub frame
7329 __ ret(0);
7330
7331 return start;
7332
7333 }
7334
7335 address generate_libmCos() {
7336 StubCodeMark mark(this, "StubRoutines", "libmCos");
7337
7338 address start = __ pc();
7339
7340 const XMMRegister x0 = xmm0;
7341 const XMMRegister x1 = xmm1;
7342 const XMMRegister x2 = xmm2;
7343 const XMMRegister x3 = xmm3;
7344
7345 const XMMRegister x4 = xmm4;
7346 const XMMRegister x5 = xmm5;
7347 const XMMRegister x6 = xmm6;
7348 const XMMRegister x7 = xmm7;
7349
7350 const Register tmp1 = r8;
7351 const Register tmp2 = r9;
7352 const Register tmp3 = r10;
7353 const Register tmp4 = r11;
7354
7355 BLOCK_COMMENT("Entry:");
7356 __ enter(); // required for proper stackwalking of RuntimeStub frame
7357
7358 #ifdef _WIN64
7359 __ push(rsi);
7360 __ push(rdi);
7361 #endif
7362 __ fast_cos(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
7363
7364 #ifdef _WIN64
7365 __ pop(rdi);
7366 __ pop(rsi);
7367 #endif
7368
7369 __ leave(); // required for proper stackwalking of RuntimeStub frame
7370 __ ret(0);
7371
7372 return start;
7373
7374 }
7375
7376 address generate_libmTan() {
7377 StubCodeMark mark(this, "StubRoutines", "libmTan");
7378
7379 address start = __ pc();
7380
7381 const XMMRegister x0 = xmm0;
7382 const XMMRegister x1 = xmm1;
7383 const XMMRegister x2 = xmm2;
7384 const XMMRegister x3 = xmm3;
7385
7386 const XMMRegister x4 = xmm4;
7387 const XMMRegister x5 = xmm5;
7388 const XMMRegister x6 = xmm6;
7389 const XMMRegister x7 = xmm7;
7390
7391 const Register tmp1 = r8;
7392 const Register tmp2 = r9;
7393 const Register tmp3 = r10;
7394 const Register tmp4 = r11;
7395
7396 BLOCK_COMMENT("Entry:");
7397 __ enter(); // required for proper stackwalking of RuntimeStub frame
7398
7399 #ifdef _WIN64
7400 __ push(rsi);
7401 __ push(rdi);
7402 #endif
7403 __ fast_tan(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
7404
7405 #ifdef _WIN64
7406 __ pop(rdi);
7407 __ pop(rsi);
7408 #endif
7409
7410 __ leave(); // required for proper stackwalking of RuntimeStub frame
7411 __ ret(0);
7412
7413 return start;
7414
7415 }
7416
7417 #undef __
7418 #define __ masm->
7419
7420 // Continuation point for throwing of implicit exceptions that are
7421 // not handled in the current activation. Fabricates an exception
7422 // oop and initiates normal exception dispatching in this
7423 // frame. Since we need to preserve callee-saved values (currently
7424 // only for C2, but done for C1 as well) we need a callee-saved oop
7425 // map and therefore have to make these stubs into RuntimeStubs
7426 // rather than BufferBlobs. If the compiler needs all registers to
7427 // be preserved between the fault point and the exception handler
7428 // then it must assume responsibility for that in
7429 // AbstractCompiler::continuation_for_implicit_null_exception or
7430 // continuation_for_implicit_division_by_zero_exception. All other
7431 // implicit exceptions (e.g., NullPointerException or
7432 // AbstractMethodError on entry) are either at call sites or
7433 // otherwise assume that stack unwinding will be initiated, so
7434 // caller saved registers were assumed volatile in the compiler.
7435 address generate_throw_exception(const char* name,
7436 address runtime_entry,
7437 Register arg1 = noreg,
7438 Register arg2 = noreg) {
7439 // Information about frame layout at time of blocking runtime call.
7440 // Note that we only have to preserve callee-saved registers since
7441 // the compilers are responsible for supplying a continuation point
7442 // if they expect all registers to be preserved.
7443 enum layout {
7444 rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
7445 rbp_off2,
7446 return_off,
7447 return_off2,
7448 framesize // inclusive of return address
7449 };
7450
7451 int insts_size = 512;
7452 int locs_size = 64;
7453
7454 CodeBuffer code(name, insts_size, locs_size);
7455 OopMapSet* oop_maps = new OopMapSet();
7456 MacroAssembler* masm = new MacroAssembler(&code);
7457
7458 address start = __ pc();
7459
7460 // This is an inlined and slightly modified version of call_VM
7461 // which has the ability to fetch the return PC out of
7462 // thread-local storage and also sets up last_Java_sp slightly
7463 // differently than the real call_VM
7464
7465 __ enter(); // required for proper stackwalking of RuntimeStub frame
7466
7467 assert(is_even(framesize/2), "sp not 16-byte aligned");
7468
7469 // return address and rbp are already in place
7470 __ subptr(rsp, (framesize-4) << LogBytesPerInt); // prolog
7471
7472 int frame_complete = __ pc() - start;
7473
7474 // Set up last_Java_sp and last_Java_fp
7475 address the_pc = __ pc();
7476 __ set_last_Java_frame(rsp, rbp, the_pc);
7477 __ andptr(rsp, -(StackAlignmentInBytes)); // Align stack
7478
7479 // Call runtime
7480 if (arg1 != noreg) {
7481 assert(arg2 != c_rarg1, "clobbered");
7482 __ movptr(c_rarg1, arg1);
7483 }
7484 if (arg2 != noreg) {
7485 __ movptr(c_rarg2, arg2);
7486 }
7487 __ movptr(c_rarg0, r15_thread);
7488 BLOCK_COMMENT("call runtime_entry");
7489 __ call(RuntimeAddress(runtime_entry));
7490
7491 // Generate oop map
7492 OopMap* map = new OopMap(framesize, 0);
7493
7494 oop_maps->add_gc_map(the_pc - start, map);
7495
7496 __ reset_last_Java_frame(true);
7497
7498 __ leave(); // required for proper stackwalking of RuntimeStub frame
7499
7500 // check for pending exceptions
7501 #ifdef ASSERT
7502 Label L;
7503 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()),
7504 (int32_t) NULL_WORD);
7505 __ jcc(Assembler::notEqual, L);
7506 __ should_not_reach_here();
7507 __ bind(L);
7508 #endif // ASSERT
7509 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
7510
7511
7512 // codeBlob framesize is in words (not VMRegImpl::slot_size)
7513 RuntimeStub* stub =
7514 RuntimeStub::new_runtime_stub(name,
7515 &code,
7516 frame_complete,
7517 (framesize >> (LogBytesPerWord - LogBytesPerInt)),
7518 oop_maps, false);
7519 return stub->entry_point();
7520 }
7521
7522 void create_control_words() {
7523 // Round to nearest, 64-bit mode, exceptions masked
7524 StubRoutines::x86::_mxcsr_std = 0x1F80;
7525 }
7526
7527 // Initialization
7528 void generate_initial() {
7529 // Generates all stubs and initializes the entry points
7530
    // These platform-specific settings are needed by generate_call_stub()
7532 create_control_words();
7533
    // Entry points that exist on all platforms. Note: This is code
    // that could be shared among different platforms - however the
    // benefit seems to be smaller than the disadvantage of having a
    // much more complicated generator structure. See also the comment in
    // stubRoutines.hpp.
7539
7540 StubRoutines::_forward_exception_entry = generate_forward_exception();
7541
7542 StubRoutines::_call_stub_entry =
7543 generate_call_stub(StubRoutines::_call_stub_return_address);
7544
7545 // is referenced by megamorphic call
7546 StubRoutines::_catch_exception_entry = generate_catch_exception();
7547
7548 // atomic calls
7549 StubRoutines::_fence_entry = generate_orderaccess_fence();
7550
7551 // platform dependent
7552 StubRoutines::x86::_get_previous_sp_entry = generate_get_previous_sp();
7553
7554 StubRoutines::x86::_verify_mxcsr_entry = generate_verify_mxcsr();
7555
7556 StubRoutines::x86::_f2i_fixup = generate_f2i_fixup();
7557 StubRoutines::x86::_f2l_fixup = generate_f2l_fixup();
7558 StubRoutines::x86::_d2i_fixup = generate_d2i_fixup();
7559 StubRoutines::x86::_d2l_fixup = generate_d2l_fixup();
7560
7561 StubRoutines::x86::_float_sign_mask = generate_fp_mask("float_sign_mask", 0x7FFFFFFF7FFFFFFF);
7562 StubRoutines::x86::_float_sign_flip = generate_fp_mask("float_sign_flip", 0x8000000080000000);
7563 StubRoutines::x86::_double_sign_mask = generate_fp_mask("double_sign_mask", 0x7FFFFFFFFFFFFFFF);
7564 StubRoutines::x86::_double_sign_flip = generate_fp_mask("double_sign_flip", 0x8000000000000000);
7565
7566 // Build this early so it's available for the interpreter.
7567 StubRoutines::_throw_StackOverflowError_entry =
7568 generate_throw_exception("StackOverflowError throw_exception",
7569 CAST_FROM_FN_PTR(address,
7570 SharedRuntime::
7571 throw_StackOverflowError));
7572 StubRoutines::_throw_delayed_StackOverflowError_entry =
7573 generate_throw_exception("delayed StackOverflowError throw_exception",
7574 CAST_FROM_FN_PTR(address,
7575 SharedRuntime::
7576 throw_delayed_StackOverflowError));
7577 if (UseCRC32Intrinsics) {
7578 // set table address before stub generation which use it
7579 StubRoutines::_crc_table_adr = (address)StubRoutines::x86::_crc_table;
7580 StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
7581 }
7582
7583 if (UseCRC32CIntrinsics) {
7584 bool supports_clmul = VM_Version::supports_clmul();
7585 StubRoutines::x86::generate_CRC32C_table(supports_clmul);
7586 StubRoutines::_crc32c_table_addr = (address)StubRoutines::x86::_crc32c_table;
7587 StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(supports_clmul);
7588 }
7589
7590 if (UseAdler32Intrinsics) {
7591 StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
7592 }
7593
7594 if (UseLibmIntrinsic && InlineIntrinsics) {
7595 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin) ||
7596 vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos) ||
7597 vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dtan)) {
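      // Publish the addresses of the constant tables shared by the
      // sin/cos/tan stubs generated below.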
7598 StubRoutines::x86::_ONEHALF_adr = (address)StubRoutines::x86::_ONEHALF;
7599 StubRoutines::x86::_P_2_adr = (address)StubRoutines::x86::_P_2;
7600 StubRoutines::x86::_SC_4_adr = (address)StubRoutines::x86::_SC_4;
7601 StubRoutines::x86::_Ctable_adr = (address)StubRoutines::x86::_Ctable;
7602 StubRoutines::x86::_SC_2_adr = (address)StubRoutines::x86::_SC_2;
7603 StubRoutines::x86::_SC_3_adr = (address)StubRoutines::x86::_SC_3;
7604 StubRoutines::x86::_SC_1_adr = (address)StubRoutines::x86::_SC_1;
7605 StubRoutines::x86::_PI_INV_TABLE_adr = (address)StubRoutines::x86::_PI_INV_TABLE;
7606 StubRoutines::x86::_PI_4_adr = (address)StubRoutines::x86::_PI_4;
7607 StubRoutines::x86::_PI32INV_adr = (address)StubRoutines::x86::_PI32INV;
7608 StubRoutines::x86::_SIGN_MASK_adr = (address)StubRoutines::x86::_SIGN_MASK;
7609 StubRoutines::x86::_P_1_adr = (address)StubRoutines::x86::_P_1;
7610 StubRoutines::x86::_P_3_adr = (address)StubRoutines::x86::_P_3;
7611 StubRoutines::x86::_NEG_ZERO_adr = (address)StubRoutines::x86::_NEG_ZERO;
7612 }
7613 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dexp)) {
7614 StubRoutines::_dexp = generate_libmExp();
7615 }
7616 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
7617 StubRoutines::_dlog = generate_libmLog();
7618 }
7619 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog10)) {
7620 StubRoutines::_dlog10 = generate_libmLog10();
7621 }
7622 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dpow)) {
7623 StubRoutines::_dpow = generate_libmPow();
7624 }
7625 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
7626 StubRoutines::_dsin = generate_libmSin();
7627 }
7628 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
7629 StubRoutines::_dcos = generate_libmCos();
7630 }
7631 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dtan)) {
7632 StubRoutines::_dtan = generate_libmTan();
7633 }
7634 }
7635 }
7636
7637 void generate_all() {
7638     // Generates the remaining stubs and initializes their entry points
7639
7640 // These entry points require SharedInfo::stack0 to be set up in
7641 // non-core builds and need to be relocatable, so they each
7642 // fabricate a RuntimeStub internally.
7643 StubRoutines::_throw_AbstractMethodError_entry =
7644 generate_throw_exception("AbstractMethodError throw_exception",
7645 CAST_FROM_FN_PTR(address,
7646 SharedRuntime::
7647 throw_AbstractMethodError));
7648
7649 StubRoutines::_throw_IncompatibleClassChangeError_entry =
7650 generate_throw_exception("IncompatibleClassChangeError throw_exception",
7651 CAST_FROM_FN_PTR(address,
7652 SharedRuntime::
7653 throw_IncompatibleClassChangeError));
7654
7655 StubRoutines::_throw_NullPointerException_at_call_entry =
7656 generate_throw_exception("NullPointerException at call throw_exception",
7657 CAST_FROM_FN_PTR(address,
7658 SharedRuntime::
7659 throw_NullPointerException_at_call));
7660
7661 // entry points that are platform specific
7662 StubRoutines::x86::_vector_float_sign_mask = generate_vector_mask("vector_float_sign_mask", 0x7FFFFFFF7FFFFFFF);
7663 StubRoutines::x86::_vector_float_sign_flip = generate_vector_mask("vector_float_sign_flip", 0x8000000080000000);
7664 StubRoutines::x86::_vector_double_sign_mask = generate_vector_mask("vector_double_sign_mask", 0x7FFFFFFFFFFFFFFF);
7665 StubRoutines::x86::_vector_double_sign_flip = generate_vector_mask("vector_double_sign_flip", 0x8000000000000000);
7666 StubRoutines::x86::_vector_all_bits_set = generate_vector_mask("vector_all_bits_set", 0xFFFFFFFFFFFFFFFF);
7667 StubRoutines::x86::_vector_int_mask_cmp_bits = generate_vector_mask("vector_int_mask_cmp_bits", 0x0000000100000001);
7668 StubRoutines::x86::_vector_short_to_byte_mask = generate_vector_mask("vector_short_to_byte_mask", 0x00ff00ff00ff00ff);
7669 StubRoutines::x86::_vector_byte_perm_mask = generate_vector_byte_perm_mask("vector_byte_perm_mask");
7670 StubRoutines::x86::_vector_int_to_byte_mask = generate_vector_mask("vector_int_to_byte_mask", 0x000000ff000000ff);
7671 StubRoutines::x86::_vector_int_to_short_mask = generate_vector_mask("vector_int_to_short_mask", 0x0000ffff0000ffff);
7672 StubRoutines::x86::_vector_32_bit_mask = generate_vector_custom_i32("vector_32_bit_mask", Assembler::AVX_512bit,
7673 0xFFFFFFFF, 0, 0, 0);
7674 StubRoutines::x86::_vector_64_bit_mask = generate_vector_custom_i32("vector_64_bit_mask", Assembler::AVX_512bit,
7675 0xFFFFFFFF, 0xFFFFFFFF, 0, 0);
7676 StubRoutines::x86::_vector_int_shuffle_mask = generate_vector_mask("vector_int_shuffle_mask", 0x0302010003020100);
7677 StubRoutines::x86::_vector_byte_shuffle_mask = generate_vector_byte_shuffle_mask("vector_byte_shuffle_mask");
7678 StubRoutines::x86::_vector_short_shuffle_mask = generate_vector_mask("vector_short_shuffle_mask", 0x0100010001000100);
7679 StubRoutines::x86::_vector_long_shuffle_mask = generate_vector_mask("vector_long_shuffle_mask", 0x0000000100000000);
7680 StubRoutines::x86::_vector_long_sign_mask = generate_vector_mask("vector_long_sign_mask", 0x8000000000000000);
7681 StubRoutines::x86::_vector_iota_indices = generate_iota_indices("iota_indices");
7682
7683 if (UsePopCountInstruction && VM_Version::supports_avx2() && !VM_Version::supports_avx512_vpopcntdq()) {
7684       // LUT implementation influenced by the counting-1s algorithm from section 5-1 of Hacker's Delight.
7685 StubRoutines::x86::_vector_popcount_lut = generate_popcount_avx_lut("popcount_lut");
7686 }
7687
7688 // support for verify_oop (must happen after universe_init)
7689 if (VerifyOops) {
7690 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
7691 }
7692
7693 // data cache line writeback
7694 StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
7695 StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
7696
7697 // arraycopy stubs used by compilers
7698 generate_arraycopy_stubs();
7699
7700     // Don't bother generating these AES intrinsic stubs unless the global flag is set
7701 if (UseAESIntrinsics) {
7702 StubRoutines::x86::_key_shuffle_mask_addr = generate_key_shuffle_mask(); // needed by the others
7703 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
7704 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
7705 StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
7706 if (VM_Version::supports_avx512_vaes() && VM_Version::supports_avx512vl() && VM_Version::supports_avx512dq() ) {
7707 StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptVectorAESCrypt();
7708 StubRoutines::_electronicCodeBook_encryptAESCrypt = generate_electronicCodeBook_encryptAESCrypt();
7709 StubRoutines::_electronicCodeBook_decryptAESCrypt = generate_electronicCodeBook_decryptAESCrypt();
7710 StubRoutines::x86::_counter_mask_addr = counter_mask_addr();
7711 StubRoutines::x86::_ghash_poly512_addr = ghash_polynomial512_addr();
7712 StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask();
7713 StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
7714 } else {
7715 StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
7716 }
7717 }
7718
7719 if (UseAESCTRIntrinsics) {
7720 if (VM_Version::supports_avx512_vaes() && VM_Version::supports_avx512bw() && VM_Version::supports_avx512vl()) {
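        // The counter mask may already have been generated by the AES-GCM path above.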
7721 if (StubRoutines::x86::_counter_mask_addr == NULL) {
7722 StubRoutines::x86::_counter_mask_addr = counter_mask_addr();
7723 }
7724 StubRoutines::_counterMode_AESCrypt = generate_counterMode_VectorAESCrypt();
7725 } else {
7726 StubRoutines::x86::_counter_shuffle_mask_addr = generate_counter_shuffle_mask();
7727 StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt_Parallel();
7728 }
7729 }
7730
7731 if (UseMD5Intrinsics) {
7732 StubRoutines::_md5_implCompress = generate_md5_implCompress(false, "md5_implCompress");
7733 StubRoutines::_md5_implCompressMB = generate_md5_implCompress(true, "md5_implCompressMB");
7734 }
7735 if (UseSHA1Intrinsics) {
7736 StubRoutines::x86::_upper_word_mask_addr = generate_upper_word_mask();
7737 StubRoutines::x86::_shuffle_byte_flip_mask_addr = generate_shuffle_byte_flip_mask();
7738 StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress");
7739 StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB");
7740 }
7741 if (UseSHA256Intrinsics) {
7742 StubRoutines::x86::_k256_adr = (address)StubRoutines::x86::_k256;
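      // Duplicate each 16-byte group of the K256 round constants into both
      // 128-bit lanes of a 32-byte entry in _k256_W, so the AVX2 code can
      // fetch the same four constants for both lanes with one 256-bit load.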
7743 char* dst = (char*)StubRoutines::x86::_k256_W;
7744 char* src = (char*)StubRoutines::x86::_k256;
7745 for (int ii = 0; ii < 16; ++ii) {
7746 memcpy(dst + 32 * ii, src + 16 * ii, 16);
7747 memcpy(dst + 32 * ii + 16, src + 16 * ii, 16);
7748 }
7749 StubRoutines::x86::_k256_W_adr = (address)StubRoutines::x86::_k256_W;
7750 StubRoutines::x86::_pshuffle_byte_flip_mask_addr = generate_pshuffle_byte_flip_mask();
7751 StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress");
7752 StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB");
7753 }
7754 if (UseSHA512Intrinsics) {
7755 StubRoutines::x86::_k512_W_addr = (address)StubRoutines::x86::_k512_W;
7756 StubRoutines::x86::_pshuffle_byte_flip_mask_addr_sha512 = generate_pshuffle_byte_flip_mask_sha512();
7757 StubRoutines::_sha512_implCompress = generate_sha512_implCompress(false, "sha512_implCompress");
7758 StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true, "sha512_implCompressMB");
7759 }
7760
7761 // Generate GHASH intrinsics code
7762 if (UseGHASHIntrinsics) {
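      // The long swap mask may already have been generated by the AES-GCM path above.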
7763 if (StubRoutines::x86::_ghash_long_swap_mask_addr == NULL) {
7764 StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask();
7765 }
7766 StubRoutines::x86::_ghash_byte_swap_mask_addr = generate_ghash_byte_swap_mask();
7767 if (VM_Version::supports_avx()) {
7768 StubRoutines::x86::_ghash_shuffmask_addr = ghash_shufflemask_addr();
7769 StubRoutines::x86::_ghash_poly_addr = ghash_polynomial_addr();
7770 StubRoutines::_ghash_processBlocks = generate_avx_ghash_processBlocks();
7771 } else {
7772 StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
7773 }
7774 }
7775
7776
7777 if (UseBASE64Intrinsics) {
7778       if (VM_Version::supports_avx2() &&
7779 VM_Version::supports_avx512bw() &&
7780 VM_Version::supports_avx512vl()) {
7781 StubRoutines::x86::_avx2_shuffle_base64 = base64_avx2_shuffle_addr();
7782 StubRoutines::x86::_avx2_input_mask_base64 = base64_avx2_input_mask_addr();
7783 StubRoutines::x86::_avx2_lut_base64 = base64_avx2_lut_addr();
7784 }
7785 StubRoutines::x86::_encoding_table_base64 = base64_encoding_table_addr();
7786 if (VM_Version::supports_avx512_vbmi()) {
7787 StubRoutines::x86::_shuffle_base64 = base64_shuffle_addr();
7788 StubRoutines::x86::_lookup_lo_base64 = base64_vbmi_lookup_lo_addr();
7789 StubRoutines::x86::_lookup_hi_base64 = base64_vbmi_lookup_hi_addr();
7790 StubRoutines::x86::_lookup_lo_base64url = base64_vbmi_lookup_lo_url_addr();
7791 StubRoutines::x86::_lookup_hi_base64url = base64_vbmi_lookup_hi_url_addr();
7792 StubRoutines::x86::_pack_vec_base64 = base64_vbmi_pack_vec_addr();
7793 StubRoutines::x86::_join_0_1_base64 = base64_vbmi_join_0_1_addr();
7794 StubRoutines::x86::_join_1_2_base64 = base64_vbmi_join_1_2_addr();
7795 StubRoutines::x86::_join_2_3_base64 = base64_vbmi_join_2_3_addr();
7796 }
7797 StubRoutines::x86::_decoding_table_base64 = base64_decoding_table_addr();
7798 StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
7799 StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
7800 }
7801
7802 BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
7803 if (bs_nm != NULL) {
7804 StubRoutines::x86::_method_entry_barrier = generate_method_entry_barrier();
7805 }
7806 #ifdef COMPILER2
7807 if (UseMultiplyToLenIntrinsic) {
7808 StubRoutines::_multiplyToLen = generate_multiplyToLen();
7809 }
7810 if (UseSquareToLenIntrinsic) {
7811 StubRoutines::_squareToLen = generate_squareToLen();
7812 }
7813 if (UseMulAddIntrinsic) {
7814 StubRoutines::_mulAdd = generate_mulAdd();
7815 }
7816 if (VM_Version::supports_avx512_vbmi2()) {
7817 StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
7818 StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift();
7819 }
7820 if (UseMontgomeryMultiplyIntrinsic) {
7821 StubRoutines::_montgomeryMultiply
7822 = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_multiply);
7823 }
7824 if (UseMontgomerySquareIntrinsic) {
7825 StubRoutines::_montgomerySquare
7826 = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square);
7827 }
7828
7829     // Look up SVML (vector math library) stub routine addresses
7830 void *libjsvml = NULL;
7831 char ebuf[1024];
7832 char dll_name[JVM_MAXPATHLEN];
7833 if (os::dll_locate_lib(dll_name, sizeof(dll_name), Arguments::get_dll_dir(), "jsvml")) {
7834 libjsvml = os::dll_load(dll_name, ebuf, sizeof ebuf);
7835 }
7836 if (libjsvml != NULL) {
7837 // SVML method naming convention
7838       //   All the methods are named as __jsvml_<op><T><N>_ha_<VV>
7839 // Where:
7840 // ha stands for high accuracy
7841 // <T> is optional to indicate float/double
7842 // Set to f for vector float operation
7843 // Omitted for vector double operation
7844 // <N> is the number of elements in the vector
7845 // 1, 2, 4, 8, 16
7846 // e.g. 128 bit float vector has 4 float elements
7847 // <VV> indicates the avx/sse level:
7848 // z0 is AVX512, l9 is AVX2, e9 is AVX1 and ex is for SSE2
7849 // e.g. __jsvml_expf16_ha_z0 is the method for computing 16 element vector float exp using AVX 512 insns
7850 // __jsvml_exp8_ha_z0 is the method for computing 8 element vector double exp using AVX 512 insns
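      //   For instance, assuming "sin" appears in VectorSupport::svmlname, a
      //   128-bit float sin on an AVX2-capable CPU would resolve to
      //   __jsvml_sinf4_ha_l9 via the lookups below.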
7851
7852 log_info(library)("Loaded library %s, handle " INTPTR_FORMAT, JNI_LIB_PREFIX "jsvml" JNI_LIB_SUFFIX, p2i(libjsvml));
7853 if (UseAVX > 2) {
7854 for (int op = 0; op < VectorSupport::NUM_SVML_OP; op++) {
7855 int vop = VectorSupport::VECTOR_OP_SVML_START + op;
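          // The 512-bit log, log10 and pow lookups are only done when AVX512DQ is supported.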
7856 if ((!VM_Version::supports_avx512dq()) &&
7857 (vop == VectorSupport::VECTOR_OP_LOG || vop == VectorSupport::VECTOR_OP_LOG10 || vop == VectorSupport::VECTOR_OP_POW)) {
7858 continue;
7859 }
7860 snprintf(ebuf, sizeof(ebuf), "__jsvml_%sf16_ha_z0", VectorSupport::svmlname[op]);
7861 StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_512][op] = (address)os::dll_lookup(libjsvml, ebuf);
7862
7863 snprintf(ebuf, sizeof(ebuf), "__jsvml_%s8_ha_z0", VectorSupport::svmlname[op]);
7864 StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_512][op] = (address)os::dll_lookup(libjsvml, ebuf);
7865 }
7866 }
7867 const char* avx_sse_str = (UseAVX >= 2) ? "l9" : ((UseAVX == 1) ? "e9" : "ex");
7868 for (int op = 0; op < VectorSupport::NUM_SVML_OP; op++) {
7869 int vop = VectorSupport::VECTOR_OP_SVML_START + op;
7870 if (vop == VectorSupport::VECTOR_OP_POW) {
7871 continue;
7872 }
7873 snprintf(ebuf, sizeof(ebuf), "__jsvml_%sf4_ha_%s", VectorSupport::svmlname[op], avx_sse_str);
7874 StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_64][op] = (address)os::dll_lookup(libjsvml, ebuf);
7875
7876 snprintf(ebuf, sizeof(ebuf), "__jsvml_%sf4_ha_%s", VectorSupport::svmlname[op], avx_sse_str);
7877 StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_128][op] = (address)os::dll_lookup(libjsvml, ebuf);
7878
7879 snprintf(ebuf, sizeof(ebuf), "__jsvml_%sf8_ha_%s", VectorSupport::svmlname[op], avx_sse_str);
7880 StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_256][op] = (address)os::dll_lookup(libjsvml, ebuf);
7881
7882 snprintf(ebuf, sizeof(ebuf), "__jsvml_%s1_ha_%s", VectorSupport::svmlname[op], avx_sse_str);
7883 StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_64][op] = (address)os::dll_lookup(libjsvml, ebuf);
7884
7885 snprintf(ebuf, sizeof(ebuf), "__jsvml_%s2_ha_%s", VectorSupport::svmlname[op], avx_sse_str);
7886 StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_128][op] = (address)os::dll_lookup(libjsvml, ebuf);
7887
7888 snprintf(ebuf, sizeof(ebuf), "__jsvml_%s4_ha_%s", VectorSupport::svmlname[op], avx_sse_str);
7889 StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_256][op] = (address)os::dll_lookup(libjsvml, ebuf);
7890 }
7891 }
7892 #endif // COMPILER2
7893
7894 if (UseVectorizedMismatchIntrinsic) {
7895 StubRoutines::_vectorizedMismatch = generate_vectorizedMismatch();
7896 }
7897 }
7898
7899 public:
7900 StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
7901 if (all) {
7902 generate_all();
7903 } else {
7904 generate_initial();
7905 }
7906 }
7907 }; // end class StubGenerator
7908
7909 #define UCM_TABLE_MAX_ENTRIES 16
7910 void StubGenerator_generate(CodeBuffer* code, bool all) {
7911 if (UnsafeCopyMemory::_table == NULL) {
7912 UnsafeCopyMemory::create_table(UCM_TABLE_MAX_ENTRIES);
7913 }
7914 StubGenerator g(code, all);
7915 }
--- EOF ---