1 /*
2 * Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
24
25 #include "asm/assembler.hpp"
26 #include "asm/assembler.inline.hpp"
27 #include "gc/shared/barrierSet.hpp"
28 #include "gc/shared/barrierSetAssembler.hpp"
29 #include "oops/methodData.hpp"
30 #include "opto/c2_MacroAssembler.hpp"
31 #include "opto/intrinsicnode.hpp"
32 #include "opto/output.hpp"
33 #include "opto/opcodes.hpp"
34 #include "opto/subnode.hpp"
35 #include "runtime/globals.hpp"
36 #include "runtime/objectMonitor.hpp"
37 #include "runtime/stubRoutines.hpp"
38 #include "utilities/checkedCast.hpp"
39 #include "utilities/globalDefinitions.hpp"
40 #include "utilities/powerOfTwo.hpp"
41 #include "utilities/sizes.hpp"
42
43 #ifdef PRODUCT
44 #define BLOCK_COMMENT(str) /* nothing */
45 #define STOP(error) stop(error)
46 #else
47 #define BLOCK_COMMENT(str) block_comment(str)
48 #define STOP(error) block_comment(error); stop(error)
49 #endif
50
51 // C2 compiled method's prolog code.
52 void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {
53 assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
54
55 assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
56 // Remove word for return addr
57 framesize -= wordSize;
58 stack_bang_size -= wordSize;
59
60 // Calls to C2R adapters often do not accept exceptional returns.
61 // We require that their callers must bang for them. But be careful, because
62 // some VM calls (such as call site linkage) can use several kilobytes of
  // stack. The stack safety zone should account for that, though.
64 // See bugs 4446381, 4468289, 4497237.
65 if (stack_bang_size > 0) {
66 generate_stack_overflow_check(stack_bang_size);
67
    // We always push rbp so that, on return to the interpreter, rbp will be
    // restored correctly and we can correct the stack.
70 push(rbp);
71 // Save caller's stack pointer into RBP if the frame pointer is preserved.
72 if (PreserveFramePointer) {
73 mov(rbp, rsp);
74 }
75 // Remove word for ebp
76 framesize -= wordSize;
77
78 // Create frame
79 if (framesize) {
80 subptr(rsp, framesize);
81 }
82 } else {
83 subptr(rsp, framesize);
84
85 // Save RBP register now.
86 framesize -= wordSize;
87 movptr(Address(rsp, framesize), rbp);
88 // Save caller's stack pointer into RBP if the frame pointer is preserved.
89 if (PreserveFramePointer) {
90 movptr(rbp, rsp);
91 if (framesize > 0) {
92 addptr(rbp, framesize);
93 }
94 }
95 }
96
97 if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
98 framesize -= wordSize;
99 movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
100 }
101
102 #ifdef ASSERT
103 if (VerifyStackAtCalls) {
104 Label L;
105 push(rax);
106 mov(rax, rsp);
107 andptr(rax, StackAlignmentInBytes-1);
108 cmpptr(rax, StackAlignmentInBytes-wordSize);
109 pop(rax);
110 jcc(Assembler::equal, L);
111 STOP("Stack is not properly aligned!");
112 bind(L);
113 }
114 #endif
115
116 if (!is_stub) {
117 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
118 // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
119 Label dummy_slow_path;
120 Label dummy_continuation;
121 Label* slow_path = &dummy_slow_path;
122 Label* continuation = &dummy_continuation;
123 if (!Compile::current()->output()->in_scratch_emit_size()) {
124 // Use real labels from actual stub when not emitting code for the purpose of measuring its size
125 C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
126 Compile::current()->output()->add_stub(stub);
127 slow_path = &stub->entry();
128 continuation = &stub->continuation();
129 }
130 bs->nmethod_entry_barrier(this, slow_path, continuation);
131 }
132 }
133
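// Map a vector length in bytes to the Assembler::AvxVectorLen encoding expected by
// the vector instruction emitters. Sub-16-byte operands (4 and 8 bytes) share the
// 128-bit encoding; for example, a 32-byte (8 x T_INT) operand maps to
// Assembler::AVX_256bit.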
134 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
135 switch (vlen_in_bytes) {
136 case 4: // fall-through
137 case 8: // fall-through
138 case 16: return Assembler::AVX_128bit;
139 case 32: return Assembler::AVX_256bit;
140 case 64: return Assembler::AVX_512bit;
141
142 default: {
143 ShouldNotReachHere();
144 return Assembler::AVX_NoVec;
145 }
146 }
147 }
148
149 // fast_lock and fast_unlock used by C2
150
151 // Because the transitions from emitted code to the runtime
152 // monitorenter/exit helper stubs are so slow it's critical that
153 // we inline both the stack-locking fast path and the inflated fast path.
154 //
155 // See also: cmpFastLock and cmpFastUnlock.
156 //
157 // What follows is a specialized inline transliteration of the code
158 // in enter() and exit(). If we're concerned about I$ bloat another
159 // option would be to emit TrySlowEnter and TrySlowExit methods
160 // at startup-time. These methods would accept arguments as
// (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
162 // indications in the icc.ZFlag. fast_lock and fast_unlock would simply
163 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
164 // In practice, however, the # of lock sites is bounded and is usually small.
165 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
168 // sites.
169 //
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
// in Java - using j.u.c and unsafe - and just bind the lock and unlock sites
172 // to those specialized methods. That'd give us a mostly platform-independent
173 // implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
175 // to park() or unpark() threads. We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) issue explicit barriers or fence operations.
178 //
179 // TODO:
180 //
181 // * Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
182 // This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
183 // Given TLAB allocation, Self is usually manifested in a register, so passing it into
184 // the lock operators would typically be faster than reifying Self.
185 //
186 // * Ideally I'd define the primitives as:
187 // fast_lock (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
188 // fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
189 // Unfortunately ADLC bugs prevent us from expressing the ideal form.
// Instead, we're stuck with the rather awkward and brittle register assignments below.
191 // Furthermore the register assignments are overconstrained, possibly resulting in
192 // sub-optimal code near the synchronization site.
193 //
194 // * Eliminate the sp-proximity tests and just use "== Self" tests instead.
195 // Alternately, use a better sp-proximity test.
196 //
197 // * Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
198 // Either one is sufficient to uniquely identify a thread.
199 // TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
200 //
201 // * Intrinsify notify() and notifyAll() for the common cases where the
//   object is locked by the calling thread but the waitlist is empty,
//   avoiding the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
204 //
205 // * use jccb and jmpb instead of jcc and jmp to improve code density.
206 // But beware of excessive branch density on AMD Opterons.
207 //
208 // * Both fast_lock and fast_unlock set the ICC.ZF to indicate success
209 // or failure of the fast path. If the fast path fails then we pass
210 // control to the slow path, typically in C. In fast_lock and
211 // fast_unlock we often branch to DONE_LABEL, just to find that C2
212 // will emit a conditional branch immediately after the node.
213 // So we have branches to branches and lots of ICC.ZF games.
214 // Instead, it might be better to have C2 pass a "FailureLabel"
215 // into fast_lock and fast_unlock. In the case of success, control
216 // will drop through the node. ICC.ZF is undefined at exit.
217 // In the case of failure, the node will branch directly to the
218 // FailureLabel
219
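// A minimal sketch (pseudo code, not the exact emitted sequence) of the ZF-based
// contract described above, as consumed by the cmpFastLock/cmpFastUnlock nodes
// mentioned earlier:
//
//   fast_lock(obj, box, ...)   // ZF == 1 on success, ZF == 0 on failure
//   jne   slow_path_call       // C2 emits this conditional branch right after the node
//   ...                        // critical section
//   fast_unlock(obj, ...)      // same ZF convention
//   jne   slow_path_call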
220
221 // obj: object to lock
222 // box: on-stack box address -- KILLED
223 // rax: tmp -- KILLED
224 // t : tmp -- KILLED
225 void C2_MacroAssembler::fast_lock(Register obj, Register box, Register rax_reg,
226 Register t, Register thread) {
227 assert(rax_reg == rax, "Used for CAS");
228 assert_different_registers(obj, box, rax_reg, t, thread);
229
230 // Handle inflated monitor.
231 Label inflated;
232 // Finish fast lock successfully. ZF value is irrelevant.
233 Label locked;
234 // Finish fast lock unsuccessfully. MUST jump with ZF == 0
235 Label slow_path;
236
237 if (UseObjectMonitorTable) {
238 // Clear cache in case fast locking succeeds or we need to take the slow-path.
239 movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), 0);
240 }
241
242 if (DiagnoseSyncOnValueBasedClasses != 0) {
243 load_klass(rax_reg, obj, t);
244 testb(Address(rax_reg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
245 jcc(Assembler::notZero, slow_path);
246 }
247
248 const Register mark = t;
249
250 { // Fast Lock
251
252 Label push;
253
254 const Register top = UseObjectMonitorTable ? rax_reg : box;
255
256 // Load the mark.
257 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
258
259 // Prefetch top.
260 movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
261
262 // Check for monitor (0b10).
263 testptr(mark, markWord::monitor_value);
264 jcc(Assembler::notZero, inflated);
265
266 // Check if lock-stack is full.
267 cmpl(top, LockStack::end_offset() - 1);
268 jcc(Assembler::greater, slow_path);
269
270 // Check if recursive.
271 cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
272 jccb(Assembler::equal, push);
273
274 // Try to lock. Transition lock bits 0b01 => 0b00
275 movptr(rax_reg, mark);
276 orptr(rax_reg, markWord::unlocked_value);
277 andptr(mark, ~(int32_t)markWord::unlocked_value);
278 lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
279 jcc(Assembler::notEqual, slow_path);
280
281 if (UseObjectMonitorTable) {
282 // Need to reload top, clobbered by CAS.
283 movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
284 }
285 bind(push);
286 // After successful lock, push object on lock-stack.
287 movptr(Address(thread, top), obj);
288 addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
289 jmpb(locked);
290 }
291
292 { // Handle inflated monitor.
293 bind(inflated);
294
295 const Register monitor = t;
296
297 if (!UseObjectMonitorTable) {
298 assert(mark == monitor, "should be the same here");
299 } else {
300 // Uses ObjectMonitorTable. Look for the monitor in the om_cache.
301 // Fetch ObjectMonitor* from the cache or take the slow-path.
302 Label monitor_found;
303
304 // Load cache address
305 lea(t, Address(thread, JavaThread::om_cache_oops_offset()));
306
307 const int num_unrolled = 2;
308 for (int i = 0; i < num_unrolled; i++) {
309 cmpptr(obj, Address(t));
310 jccb(Assembler::equal, monitor_found);
311 increment(t, in_bytes(OMCache::oop_to_oop_difference()));
312 }
313
314 Label loop;
315
316 // Search for obj in cache.
317 bind(loop);
318
319 // Check for match.
320 cmpptr(obj, Address(t));
321 jccb(Assembler::equal, monitor_found);
322
323 // Search until null encountered, guaranteed _null_sentinel at end.
324 cmpptr(Address(t), 1);
325 jcc(Assembler::below, slow_path); // 0 check, but with ZF=0 when *t == 0
326 increment(t, in_bytes(OMCache::oop_to_oop_difference()));
327 jmpb(loop);
328
329 // Cache hit.
330 bind(monitor_found);
331 movptr(monitor, Address(t, OMCache::oop_to_monitor_difference()));
332 }
333 const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
334 const Address recursions_address(monitor, ObjectMonitor::recursions_offset() - monitor_tag);
335 const Address owner_address(monitor, ObjectMonitor::owner_offset() - monitor_tag);
336
337 Label monitor_locked;
338 // Lock the monitor.
339
340 if (UseObjectMonitorTable) {
341 // Cache the monitor for unlock before trashing box. On failure to acquire
342 // the lock, the slow path will reset the entry accordingly (see CacheSetter).
343 movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), monitor);
344 }
345
346 // Try to CAS owner (no owner => current thread's _monitor_owner_id).
347 xorptr(rax_reg, rax_reg);
348 movptr(box, Address(thread, JavaThread::monitor_owner_id_offset()));
349 lock(); cmpxchgptr(box, owner_address);
350 jccb(Assembler::equal, monitor_locked);
351
352 // Check if recursive.
353 cmpptr(box, rax_reg);
354 jccb(Assembler::notEqual, slow_path);
355
356 // Recursive.
357 increment(recursions_address);
358
359 bind(monitor_locked);
360 }
361
362 bind(locked);
363 // Set ZF = 1
364 xorl(rax_reg, rax_reg);
365
366 #ifdef ASSERT
367 // Check that locked label is reached with ZF set.
368 Label zf_correct;
369 Label zf_bad_zero;
370 jcc(Assembler::zero, zf_correct);
371 jmp(zf_bad_zero);
372 #endif
373
374 bind(slow_path);
375 #ifdef ASSERT
376 // Check that slow_path label is reached with ZF not set.
377 jcc(Assembler::notZero, zf_correct);
378 stop("Fast Lock ZF != 0");
379 bind(zf_bad_zero);
380 stop("Fast Lock ZF != 1");
381 bind(zf_correct);
382 #endif
383 // C2 uses the value of ZF to determine the continuation.
384 }
385
386 // obj: object to lock
387 // rax: tmp -- KILLED
388 // t : tmp - cannot be obj nor rax -- KILLED
389 //
390 // Some commentary on balanced locking:
391 //
392 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
393 // Methods that don't have provably balanced locking are forced to run in the
394 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
395 // The interpreter provides two properties:
396 // I1: At return-time the interpreter automatically and quietly unlocks any
397 // objects acquired in the current activation (frame). Recall that the
398 // interpreter maintains an on-stack list of locks currently held by
399 // a frame.
400 // I2: If a method attempts to unlock an object that is not held by the
401 // frame the interpreter throws IMSX.
402 //
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
404 // B() doesn't have provably balanced locking so it runs in the interpreter.
405 // Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O
406 // is still locked by A().
407 //
408 // The only other source of unbalanced locking would be JNI. The "Java Native Interface
409 // Specification" states that an object locked by JNI's MonitorEnter should not be
410 // unlocked by "normal" java-level locking and vice-versa. The specification doesn't
411 // specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably, given that the spec legislates the JNI case as undefined, our implementation
// could reasonably *avoid* checking the owner in fast_unlock().
// In the interest of performance we elide the m->Owner==Self check in unlock.
415 // A perfectly viable alternative is to elide the owner check except when
416 // Xcheck:jni is enabled.
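//
// A small illustration (pseudo-Java, hypothetical names) of why provably balanced
// sites are safe to compile this way:
//
//   void A() {                 // compiled: fast_lock(O) is emitted here
//     synchronized (O) {
//       B();                   // interpreted; by I1 it unlocks only what it locked,
//     }                        // so O is still held when B() returns and the
//   }                          // matching fast_unlock(O) in A() is balanced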
417
418 void C2_MacroAssembler::fast_unlock(Register obj, Register reg_rax, Register t, Register thread) {
419 assert(reg_rax == rax, "Used for CAS");
420 assert_different_registers(obj, reg_rax, t);
421
422 // Handle inflated monitor.
423 Label inflated, inflated_check_lock_stack;
424 // Finish fast unlock successfully. MUST jump with ZF == 1
425 Label unlocked, slow_path;
426
427 const Register mark = t;
428 const Register monitor = t;
429 const Register top = UseObjectMonitorTable ? t : reg_rax;
430 const Register box = reg_rax;
431
432 Label dummy;
433 C2FastUnlockStub* stub = nullptr;
434
435 if (!Compile::current()->output()->in_scratch_emit_size()) {
436 stub = new (Compile::current()->comp_arena()) C2FastUnlockStub(obj, mark, reg_rax, thread);
437 Compile::current()->output()->add_stub(stub);
438 }
439
440 Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();
441
442 { // Fast Unlock
443
444 // Load top.
445 movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
446
447 if (!UseObjectMonitorTable) {
448 // Prefetch mark.
449 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
450 }
451
452 // Check if obj is top of lock-stack.
453 cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
454 // Top of lock stack was not obj. Must be monitor.
455 jcc(Assembler::notEqual, inflated_check_lock_stack);
456
457 // Pop lock-stack.
458 DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
459 subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
460
461 // Check if recursive.
462 cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
463 jcc(Assembler::equal, unlocked);
464
465 // We elide the monitor check, let the CAS fail instead.
466
467 if (UseObjectMonitorTable) {
468 // Load mark.
469 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
470 }
471
472 // Try to unlock. Transition lock bits 0b00 => 0b01
473 movptr(reg_rax, mark);
474 andptr(reg_rax, ~(int32_t)markWord::lock_mask);
475 orptr(mark, markWord::unlocked_value);
476 lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
477 jcc(Assembler::notEqual, push_and_slow_path);
478 jmp(unlocked);
479 }
480
481
482 { // Handle inflated monitor.
483 bind(inflated_check_lock_stack);
484 #ifdef ASSERT
485 Label check_done;
486 subl(top, oopSize);
487 cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
488 jcc(Assembler::below, check_done);
489 cmpptr(obj, Address(thread, top));
490 jccb(Assembler::notEqual, inflated_check_lock_stack);
491 stop("Fast Unlock lock on stack");
492 bind(check_done);
493 if (UseObjectMonitorTable) {
494 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
495 }
496 testptr(mark, markWord::monitor_value);
497 jccb(Assembler::notZero, inflated);
498 stop("Fast Unlock not monitor");
499 #endif
500
501 bind(inflated);
502
503 if (!UseObjectMonitorTable) {
504 assert(mark == monitor, "should be the same here");
505 } else {
506 // Uses ObjectMonitorTable. Look for the monitor in our BasicLock on the stack.
507 movptr(monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
508 // null check with ZF == 0, no valid pointer below alignof(ObjectMonitor*)
509 cmpptr(monitor, alignof(ObjectMonitor*));
510 jcc(Assembler::below, slow_path);
511 }
512 const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
513 const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag};
514 const Address succ_address{monitor, ObjectMonitor::succ_offset() - monitor_tag};
515 const Address entry_list_address{monitor, ObjectMonitor::entry_list_offset() - monitor_tag};
516 const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag};
517
518 Label recursive;
519
520 // Check if recursive.
521 cmpptr(recursions_address, 0);
522 jccb(Assembler::notZero, recursive);
523
524 // Set owner to null.
525 // Release to satisfy the JMM
526 movptr(owner_address, NULL_WORD);
527 // We need a full fence after clearing owner to avoid stranding.
528 // StoreLoad achieves this.
529 membar(StoreLoad);
530
531 // Check if the entry_list is empty.
532 cmpptr(entry_list_address, NULL_WORD);
533 jccb(Assembler::zero, unlocked); // If so we are done.
534
535 // Check if there is a successor.
536 cmpptr(succ_address, NULL_WORD);
537 jccb(Assembler::notZero, unlocked); // If so we are done.
538
539 // Save the monitor pointer in the current thread, so we can try to
540 // reacquire the lock in SharedRuntime::monitor_exit_helper().
541 if (!UseObjectMonitorTable) {
542 andptr(monitor, ~(int32_t)markWord::monitor_value);
543 }
544 movptr(Address(thread, JavaThread::unlocked_inflated_monitor_offset()), monitor);
545
546 orl(t, 1); // Fast Unlock ZF = 0
547 jmpb(slow_path);
548
549 // Recursive unlock.
550 bind(recursive);
551 decrement(recursions_address);
552 }
553
554 bind(unlocked);
555 xorl(t, t); // Fast Unlock ZF = 1
556
557 #ifdef ASSERT
558 // Check that unlocked label is reached with ZF set.
559 Label zf_correct;
560 Label zf_bad_zero;
561 jcc(Assembler::zero, zf_correct);
562 jmp(zf_bad_zero);
563 #endif
564
565 bind(slow_path);
566 if (stub != nullptr) {
567 bind(stub->slow_path_continuation());
568 }
569 #ifdef ASSERT
  // Check that the slow_path label (or the stub's slow-path continuation) is reached with ZF not set.
571 jcc(Assembler::notZero, zf_correct);
572 stop("Fast Unlock ZF != 0");
573 bind(zf_bad_zero);
574 stop("Fast Unlock ZF != 1");
575 bind(zf_correct);
576 #endif
577 // C2 uses the value of ZF to determine the continuation.
578 }
579
580 static void abort_verify_int_in_range(uint idx, jint val, jint lo, jint hi) {
581 fatal("Invalid CastII, idx: %u, val: %d, lo: %d, hi: %d", idx, val, lo, hi);
582 }
583
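// Compute the frame pointer of the current C2 frame as rsp + framesize - 2 * wordSize,
// i.e. skipping the slots occupied by the saved rbp and the return address. With
// PreserveFramePointer this is the same value verified_entry() leaves in rbp, which
// the ASSERT block in reconstruct_frame_pointer() checks.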
584 static void reconstruct_frame_pointer_helper(MacroAssembler* masm, Register dst) {
585 const int framesize = Compile::current()->output()->frame_size_in_bytes();
586 masm->movptr(dst, rsp);
587 if (framesize > 2 * wordSize) {
588 masm->addptr(dst, framesize - 2 * wordSize);
589 }
590 }
591
592 void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) {
593 if (PreserveFramePointer) {
594 // frame pointer is valid
595 #ifdef ASSERT
596 // Verify frame pointer value in rbp.
597 reconstruct_frame_pointer_helper(this, rtmp);
598 Label L_success;
599 cmpq(rbp, rtmp);
600 jccb(Assembler::equal, L_success);
601 STOP("frame pointer mismatch");
602 bind(L_success);
603 #endif // ASSERT
604 } else {
605 reconstruct_frame_pointer_helper(this, rbp);
606 }
607 }
608
609 void C2_MacroAssembler::verify_int_in_range(uint idx, const TypeInt* t, Register val) {
610 jint lo = t->_lo;
611 jint hi = t->_hi;
612 assert(lo < hi, "type should not be empty or constant, idx: %u, lo: %d, hi: %d", idx, lo, hi);
613 if (t == TypeInt::INT) {
614 return;
615 }
616
617 BLOCK_COMMENT("CastII {");
618 Label fail;
619 Label succeed;
620
621 if (lo != min_jint) {
622 cmpl(val, lo);
623 jccb(Assembler::less, fail);
624 }
625 if (hi != max_jint) {
626 cmpl(val, hi);
627 jccb(Assembler::greater, fail);
628 }
629 jmpb(succeed);
630
631 bind(fail);
632 movl(c_rarg0, idx);
633 movl(c_rarg1, val);
634 movl(c_rarg2, lo);
635 movl(c_rarg3, hi);
636 reconstruct_frame_pointer(rscratch1);
637 call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_int_in_range)));
638 hlt();
639 bind(succeed);
640 BLOCK_COMMENT("} // CastII");
641 }
642
643 static void abort_verify_long_in_range(uint idx, jlong val, jlong lo, jlong hi) {
644 fatal("Invalid CastLL, idx: %u, val: " JLONG_FORMAT ", lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, val, lo, hi);
645 }
646
647 void C2_MacroAssembler::verify_long_in_range(uint idx, const TypeLong* t, Register val, Register tmp) {
648 jlong lo = t->_lo;
649 jlong hi = t->_hi;
650 assert(lo < hi, "type should not be empty or constant, idx: %u, lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, lo, hi);
651 if (t == TypeLong::LONG) {
652 return;
653 }
654
655 BLOCK_COMMENT("CastLL {");
656 Label fail;
657 Label succeed;
658
659 auto cmp_val = [&](jlong bound) {
660 if (is_simm32(bound)) {
661 cmpq(val, checked_cast<int>(bound));
662 } else {
663 mov64(tmp, bound);
664 cmpq(val, tmp);
665 }
666 };
667
668 if (lo != min_jlong) {
669 cmp_val(lo);
670 jccb(Assembler::less, fail);
671 }
672 if (hi != max_jlong) {
673 cmp_val(hi);
674 jccb(Assembler::greater, fail);
675 }
676 jmpb(succeed);
677
678 bind(fail);
679 movl(c_rarg0, idx);
680 movq(c_rarg1, val);
681 mov64(c_rarg2, lo);
682 mov64(c_rarg3, hi);
683 reconstruct_frame_pointer(rscratch1);
684 call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_long_in_range)));
685 hlt();
686 bind(succeed);
687 BLOCK_COMMENT("} // CastLL");
688 }
689
690 //-------------------------------------------------------------------------------------------
691 // Generic instructions support for use in .ad files C2 code generation
692
693 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
694 if (dst != src) {
695 movdqu(dst, src);
696 }
697 if (opcode == Op_AbsVD) {
698 andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
699 } else {
    assert((opcode == Op_NegVD), "opcode should be Op_NegVD");
701 xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
702 }
703 }
704
705 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
706 if (opcode == Op_AbsVD) {
707 vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
708 } else {
    assert((opcode == Op_NegVD), "opcode should be Op_NegVD");
710 vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
711 }
712 }
713
714 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
715 if (dst != src) {
716 movdqu(dst, src);
717 }
718 if (opcode == Op_AbsVF) {
719 andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
720 } else {
    assert((opcode == Op_NegVF), "opcode should be Op_NegVF");
722 xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
723 }
724 }
725
726 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
727 if (opcode == Op_AbsVF) {
728 vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
729 } else {
    assert((opcode == Op_NegVF), "opcode should be Op_NegVF");
731 vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
732 }
733 }
734
735 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
736 assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
737 assert(tmp == xnoreg || elem_bt == T_LONG, "unused");
738
739 if (opcode == Op_MinV) {
740 if (elem_bt == T_BYTE) {
741 pminsb(dst, src);
742 } else if (elem_bt == T_SHORT) {
743 pminsw(dst, src);
744 } else if (elem_bt == T_INT) {
745 pminsd(dst, src);
746 } else {
747 assert(elem_bt == T_LONG, "required");
748 assert(tmp == xmm0, "required");
749 assert_different_registers(dst, src, tmp);
750 movdqu(xmm0, dst);
751 pcmpgtq(xmm0, src);
752 blendvpd(dst, src); // xmm0 as mask
753 }
754 } else { // opcode == Op_MaxV
755 if (elem_bt == T_BYTE) {
756 pmaxsb(dst, src);
757 } else if (elem_bt == T_SHORT) {
758 pmaxsw(dst, src);
759 } else if (elem_bt == T_INT) {
760 pmaxsd(dst, src);
761 } else {
762 assert(elem_bt == T_LONG, "required");
763 assert(tmp == xmm0, "required");
764 assert_different_registers(dst, src, tmp);
765 movdqu(xmm0, src);
766 pcmpgtq(xmm0, dst);
767 blendvpd(dst, src); // xmm0 as mask
768 }
769 }
770 }
771
772 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
773 XMMRegister src1, Address src2, int vlen_enc) {
774 assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
775 if (opcode == Op_UMinV) {
776 switch(elem_bt) {
777 case T_BYTE: vpminub(dst, src1, src2, vlen_enc); break;
778 case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
779 case T_INT: vpminud(dst, src1, src2, vlen_enc); break;
780 case T_LONG: evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
781 default: fatal("Unsupported type %s", type2name(elem_bt)); break;
782 }
783 } else {
784 assert(opcode == Op_UMaxV, "required");
785 switch(elem_bt) {
786 case T_BYTE: vpmaxub(dst, src1, src2, vlen_enc); break;
787 case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
788 case T_INT: vpmaxud(dst, src1, src2, vlen_enc); break;
789 case T_LONG: evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
790 default: fatal("Unsupported type %s", type2name(elem_bt)); break;
791 }
792 }
793 }
794
795 void C2_MacroAssembler::vpuminmaxq(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
796 // For optimality, leverage a full vector width of 512 bits
797 // for operations over smaller vector sizes on AVX512 targets.
798 if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
799 if (opcode == Op_UMaxV) {
800 evpmaxuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
801 } else {
802 assert(opcode == Op_UMinV, "required");
803 evpminuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
804 }
805 } else {
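    // Note: AVX/AVX2 have no unsigned 64-bit max/min instruction, so bias both
    // operands by 2^63 (flip the sign bit); unsigned order then matches signed
    // order, and the signed vpcmpgtq below produces the blend mask.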
806 // T1 = -1
807 vpcmpeqq(xtmp1, xtmp1, xtmp1, vlen_enc);
808 // T1 = -1 << 63
809 vpsllq(xtmp1, xtmp1, 63, vlen_enc);
810 // Convert SRC2 to signed value i.e. T2 = T1 + SRC2
811 vpaddq(xtmp2, xtmp1, src2, vlen_enc);
812 // Convert SRC1 to signed value i.e. T1 = T1 + SRC1
813 vpaddq(xtmp1, xtmp1, src1, vlen_enc);
814 // Mask = T2 > T1
815 vpcmpgtq(xtmp1, xtmp2, xtmp1, vlen_enc);
816 if (opcode == Op_UMaxV) {
817 // Res = Mask ? Src2 : Src1
818 vpblendvb(dst, src1, src2, xtmp1, vlen_enc);
819 } else {
820 // Res = Mask ? Src1 : Src2
821 vpblendvb(dst, src2, src1, xtmp1, vlen_enc);
822 }
823 }
824 }
825
826 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
827 XMMRegister src1, XMMRegister src2, int vlen_enc) {
828 assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
829 if (opcode == Op_UMinV) {
830 switch(elem_bt) {
831 case T_BYTE: vpminub(dst, src1, src2, vlen_enc); break;
832 case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
833 case T_INT: vpminud(dst, src1, src2, vlen_enc); break;
834 case T_LONG: evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
835 default: fatal("Unsupported type %s", type2name(elem_bt)); break;
836 }
837 } else {
838 assert(opcode == Op_UMaxV, "required");
839 switch(elem_bt) {
840 case T_BYTE: vpmaxub(dst, src1, src2, vlen_enc); break;
841 case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
842 case T_INT: vpmaxud(dst, src1, src2, vlen_enc); break;
843 case T_LONG: evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
844 default: fatal("Unsupported type %s", type2name(elem_bt)); break;
845 }
846 }
847 }
848
849 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
850 XMMRegister dst, XMMRegister src1, XMMRegister src2,
851 int vlen_enc) {
852 assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
853
854 if (opcode == Op_MinV) {
855 if (elem_bt == T_BYTE) {
856 vpminsb(dst, src1, src2, vlen_enc);
857 } else if (elem_bt == T_SHORT) {
858 vpminsw(dst, src1, src2, vlen_enc);
859 } else if (elem_bt == T_INT) {
860 vpminsd(dst, src1, src2, vlen_enc);
861 } else {
862 assert(elem_bt == T_LONG, "required");
863 if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
864 vpminsq(dst, src1, src2, vlen_enc);
865 } else {
866 assert_different_registers(dst, src1, src2);
867 vpcmpgtq(dst, src1, src2, vlen_enc);
868 vblendvpd(dst, src1, src2, dst, vlen_enc);
869 }
870 }
871 } else { // opcode == Op_MaxV
872 if (elem_bt == T_BYTE) {
873 vpmaxsb(dst, src1, src2, vlen_enc);
874 } else if (elem_bt == T_SHORT) {
875 vpmaxsw(dst, src1, src2, vlen_enc);
876 } else if (elem_bt == T_INT) {
877 vpmaxsd(dst, src1, src2, vlen_enc);
878 } else {
879 assert(elem_bt == T_LONG, "required");
880 if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
881 vpmaxsq(dst, src1, src2, vlen_enc);
882 } else {
883 assert_different_registers(dst, src1, src2);
884 vpcmpgtq(dst, src1, src2, vlen_enc);
885 vblendvpd(dst, src2, src1, dst, vlen_enc);
886 }
887 }
888 }
889 }
890
891 // Float/Double min max
892
893 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
894 XMMRegister dst, XMMRegister a, XMMRegister b,
895 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
896 int vlen_enc) {
897 assert(UseAVX > 0, "required");
898 assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
899 opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
900 assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
901 assert_different_registers(a, tmp, atmp, btmp);
902 assert_different_registers(b, tmp, atmp, btmp);
903
904 bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
905 bool is_double_word = is_double_word_type(elem_bt);
906
907 /* Note on 'non-obvious' assembly sequence:
908 *
909 * While there are vminps/vmaxps instructions, there are two important differences between hardware
910 * and Java on how they handle floats:
911 * a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
   * b. NaN is not necessarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
913 *
914 * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
915 * a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
916 * (only useful when signs differ, noop otherwise)
917 * b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])
918
919 * Following pseudo code describes the algorithm for max[FD] (Min algorithm is on similar lines):
920 * btmp = (b < +0.0) ? a : b
921 * atmp = (b < +0.0) ? b : a
922 * Tmp = Max_Float(atmp , btmp)
923 * Res = (atmp == NaN) ? atmp : Tmp
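   *
   * A small worked example for max[F] (min is symmetric), relying on the
   * "return second parameter" behavior noted in (a) and (b) above:
   *   max(-0.0f, +0.0f): the blends leave +0.0 in btmp, so vmaxps(atmp, btmp)
   *                      compares equal and returns btmp == +0.0, as Java requires.
   *   max(NaN, 1.0f):    vmaxps would return 1.0 (the second parameter), but the
   *                      vcmp UNORD_Q mask detects the NaN in atmp and the final
   *                      blend selects atmp (NaN) instead.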
924 */
925
926 void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
927 void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
928 void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
929 XMMRegister mask;
930
931 if (!is_double_word && is_min) {
932 mask = a;
933 vblend = &MacroAssembler::vblendvps;
934 vmaxmin = &MacroAssembler::vminps;
935 vcmp = &MacroAssembler::vcmpps;
936 } else if (!is_double_word && !is_min) {
937 mask = b;
938 vblend = &MacroAssembler::vblendvps;
939 vmaxmin = &MacroAssembler::vmaxps;
940 vcmp = &MacroAssembler::vcmpps;
941 } else if (is_double_word && is_min) {
942 mask = a;
943 vblend = &MacroAssembler::vblendvpd;
944 vmaxmin = &MacroAssembler::vminpd;
945 vcmp = &MacroAssembler::vcmppd;
946 } else {
947 assert(is_double_word && !is_min, "sanity");
948 mask = b;
949 vblend = &MacroAssembler::vblendvpd;
950 vmaxmin = &MacroAssembler::vmaxpd;
951 vcmp = &MacroAssembler::vcmppd;
952 }
953
954 // Make sure EnableX86ECoreOpts isn't disabled on register overlaps
955 XMMRegister maxmin, scratch;
956 if (dst == btmp) {
957 maxmin = btmp;
958 scratch = tmp;
959 } else {
960 maxmin = tmp;
961 scratch = btmp;
962 }
963
964 bool precompute_mask = EnableX86ECoreOpts && UseAVX>1;
965 if (precompute_mask && !is_double_word) {
966 vpsrad(tmp, mask, 32, vlen_enc);
967 mask = tmp;
968 } else if (precompute_mask && is_double_word) {
969 vpxor(tmp, tmp, tmp, vlen_enc);
970 vpcmpgtq(tmp, tmp, mask, vlen_enc);
971 mask = tmp;
972 }
973
974 (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);
975 (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);
976 (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
977 (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
978 (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch);
979 }
980
981 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
982 XMMRegister dst, XMMRegister a, XMMRegister b,
983 KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
984 int vlen_enc) {
985 assert(UseAVX > 2, "required");
986 assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
987 opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
988 assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
989 assert_different_registers(dst, a, atmp, btmp);
990 assert_different_registers(dst, b, atmp, btmp);
991
992 bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
993 bool is_double_word = is_double_word_type(elem_bt);
994 bool merge = true;
995
996 if (!is_double_word && is_min) {
997 evpmovd2m(ktmp, a, vlen_enc);
998 evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
999 evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1000 vminps(dst, atmp, btmp, vlen_enc);
1001 evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1002 evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1003 } else if (!is_double_word && !is_min) {
1004 evpmovd2m(ktmp, b, vlen_enc);
1005 evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1006 evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1007 vmaxps(dst, atmp, btmp, vlen_enc);
1008 evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1009 evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1010 } else if (is_double_word && is_min) {
1011 evpmovq2m(ktmp, a, vlen_enc);
1012 evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1013 evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1014 vminpd(dst, atmp, btmp, vlen_enc);
1015 evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1016 evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1017 } else {
1018 assert(is_double_word && !is_min, "sanity");
1019 evpmovq2m(ktmp, b, vlen_enc);
1020 evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1021 evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1022 vmaxpd(dst, atmp, btmp, vlen_enc);
1023 evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1024 evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1025 }
1026 }
1027
1028 void C2_MacroAssembler::vminmax_fp(int opc, BasicType elem_bt, XMMRegister dst, KRegister mask,
1029 XMMRegister src1, XMMRegister src2, int vlen_enc) {
1030 assert(opc == Op_MinV || opc == Op_MinReductionV ||
1031 opc == Op_MaxV || opc == Op_MaxReductionV, "sanity");
1032
1033 int imm8 = (opc == Op_MinV || opc == Op_MinReductionV) ? AVX10_2_MINMAX_MIN_COMPARE_SIGN
1034 : AVX10_2_MINMAX_MAX_COMPARE_SIGN;
1035 if (elem_bt == T_FLOAT) {
1036 evminmaxps(dst, mask, src1, src2, true, imm8, vlen_enc);
1037 } else {
1038 assert(elem_bt == T_DOUBLE, "");
1039 evminmaxpd(dst, mask, src1, src2, true, imm8, vlen_enc);
1040 }
1041 }
1042
1043 // Float/Double signum
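// Computes Math.signum in place: dst holds the argument on entry and the result on
// exit. +0.0/-0.0 and NaN are returned unchanged, a positive argument yields 1.0 and
// a negative argument yields -1.0 (1.0 with its sign bit flipped).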
1044 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
1045 assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");
1046
1047 Label DONE_LABEL;
1048
1049 if (opcode == Op_SignumF) {
1050 ucomiss(dst, zero);
1051 jcc(Assembler::equal, DONE_LABEL); // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1052 jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN
1053 movflt(dst, one);
1054 jcc(Assembler::above, DONE_LABEL);
1055 xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
1056 } else if (opcode == Op_SignumD) {
1057 ucomisd(dst, zero);
1058 jcc(Assembler::equal, DONE_LABEL); // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1059 jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN
1060 movdbl(dst, one);
1061 jcc(Assembler::above, DONE_LABEL);
1062 xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
1063 }
1064
1065 bind(DONE_LABEL);
1066 }
1067
1068 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1069 if (sign) {
1070 pmovsxbw(dst, src);
1071 } else {
1072 pmovzxbw(dst, src);
1073 }
1074 }
1075
1076 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1077 if (sign) {
1078 vpmovsxbw(dst, src, vector_len);
1079 } else {
1080 vpmovzxbw(dst, src, vector_len);
1081 }
1082 }
1083
1084 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1085 if (sign) {
1086 vpmovsxbd(dst, src, vector_len);
1087 } else {
1088 vpmovzxbd(dst, src, vector_len);
1089 }
1090 }
1091
1092 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1093 if (sign) {
1094 vpmovsxwd(dst, src, vector_len);
1095 } else {
1096 vpmovzxwd(dst, src, vector_len);
1097 }
1098 }
1099
1100 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1101 int shift, int vector_len) {
1102 if (opcode == Op_RotateLeftV) {
1103 if (etype == T_INT) {
1104 evprold(dst, src, shift, vector_len);
1105 } else {
1106 assert(etype == T_LONG, "expected type T_LONG");
1107 evprolq(dst, src, shift, vector_len);
1108 }
1109 } else {
1110 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1111 if (etype == T_INT) {
1112 evprord(dst, src, shift, vector_len);
1113 } else {
1114 assert(etype == T_LONG, "expected type T_LONG");
1115 evprorq(dst, src, shift, vector_len);
1116 }
1117 }
1118 }
1119
1120 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1121 XMMRegister shift, int vector_len) {
1122 if (opcode == Op_RotateLeftV) {
1123 if (etype == T_INT) {
1124 evprolvd(dst, src, shift, vector_len);
1125 } else {
1126 assert(etype == T_LONG, "expected type T_LONG");
1127 evprolvq(dst, src, shift, vector_len);
1128 }
1129 } else {
1130 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1131 if (etype == T_INT) {
1132 evprorvd(dst, src, shift, vector_len);
1133 } else {
1134 assert(etype == T_LONG, "expected type T_LONG");
1135 evprorvq(dst, src, shift, vector_len);
1136 }
1137 }
1138 }
1139
1140 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1141 if (opcode == Op_RShiftVI) {
1142 psrad(dst, shift);
1143 } else if (opcode == Op_LShiftVI) {
1144 pslld(dst, shift);
1145 } else {
1146 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1147 psrld(dst, shift);
1148 }
1149 }
1150
1151 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1152 switch (opcode) {
1153 case Op_RShiftVI: psrad(dst, shift); break;
1154 case Op_LShiftVI: pslld(dst, shift); break;
1155 case Op_URShiftVI: psrld(dst, shift); break;
1156
1157 default: assert(false, "%s", NodeClassNames[opcode]);
1158 }
1159 }
1160
1161 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1162 if (opcode == Op_RShiftVI) {
1163 vpsrad(dst, nds, shift, vector_len);
1164 } else if (opcode == Op_LShiftVI) {
1165 vpslld(dst, nds, shift, vector_len);
1166 } else {
1167 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1168 vpsrld(dst, nds, shift, vector_len);
1169 }
1170 }
1171
1172 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1173 switch (opcode) {
1174 case Op_RShiftVI: vpsrad(dst, src, shift, vlen_enc); break;
1175 case Op_LShiftVI: vpslld(dst, src, shift, vlen_enc); break;
1176 case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1177
1178 default: assert(false, "%s", NodeClassNames[opcode]);
1179 }
1180 }
1181
1182 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1183 switch (opcode) {
1184 case Op_RShiftVB: // fall-through
1185 case Op_RShiftVS: psraw(dst, shift); break;
1186
1187 case Op_LShiftVB: // fall-through
1188 case Op_LShiftVS: psllw(dst, shift); break;
1189
1190 case Op_URShiftVS: // fall-through
1191 case Op_URShiftVB: psrlw(dst, shift); break;
1192
1193 default: assert(false, "%s", NodeClassNames[opcode]);
1194 }
1195 }
1196
1197 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1198 switch (opcode) {
1199 case Op_RShiftVB: // fall-through
1200 case Op_RShiftVS: vpsraw(dst, src, shift, vlen_enc); break;
1201
1202 case Op_LShiftVB: // fall-through
1203 case Op_LShiftVS: vpsllw(dst, src, shift, vlen_enc); break;
1204
1205 case Op_URShiftVS: // fall-through
1206 case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1207
1208 default: assert(false, "%s", NodeClassNames[opcode]);
1209 }
1210 }
1211
1212 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1213 switch (opcode) {
    case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
1215 case Op_LShiftVL: psllq(dst, shift); break;
1216 case Op_URShiftVL: psrlq(dst, shift); break;
1217
1218 default: assert(false, "%s", NodeClassNames[opcode]);
1219 }
1220 }
1221
1222 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1223 if (opcode == Op_RShiftVL) {
    psrlq(dst, shift); // using srl to implement sra on pre-avx512 systems
1225 } else if (opcode == Op_LShiftVL) {
1226 psllq(dst, shift);
1227 } else {
1228 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1229 psrlq(dst, shift);
1230 }
1231 }
1232
1233 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1234 switch (opcode) {
1235 case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1236 case Op_LShiftVL: vpsllq(dst, src, shift, vlen_enc); break;
1237 case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1238
1239 default: assert(false, "%s", NodeClassNames[opcode]);
1240 }
1241 }
1242
1243 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1244 if (opcode == Op_RShiftVL) {
1245 evpsraq(dst, nds, shift, vector_len);
1246 } else if (opcode == Op_LShiftVL) {
1247 vpsllq(dst, nds, shift, vector_len);
1248 } else {
1249 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1250 vpsrlq(dst, nds, shift, vector_len);
1251 }
1252 }
1253
1254 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1255 switch (opcode) {
1256 case Op_RShiftVB: // fall-through
1257 case Op_RShiftVS: // fall-through
1258 case Op_RShiftVI: vpsravd(dst, src, shift, vlen_enc); break;
1259
1260 case Op_LShiftVB: // fall-through
1261 case Op_LShiftVS: // fall-through
1262 case Op_LShiftVI: vpsllvd(dst, src, shift, vlen_enc); break;
1263
1264 case Op_URShiftVB: // fall-through
1265 case Op_URShiftVS: // fall-through
1266 case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1267
1268 default: assert(false, "%s", NodeClassNames[opcode]);
1269 }
1270 }
1271
1272 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1273 switch (opcode) {
1274 case Op_RShiftVB: // fall-through
1275 case Op_RShiftVS: evpsravw(dst, src, shift, vlen_enc); break;
1276
1277 case Op_LShiftVB: // fall-through
1278 case Op_LShiftVS: evpsllvw(dst, src, shift, vlen_enc); break;
1279
1280 case Op_URShiftVB: // fall-through
1281 case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1282
1283 default: assert(false, "%s", NodeClassNames[opcode]);
1284 }
1285 }
1286
1287 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
1288 assert(UseAVX >= 2, "required");
1289 switch (opcode) {
1290 case Op_RShiftVL: {
1291 if (UseAVX > 2) {
1292 assert(tmp == xnoreg, "not used");
1293 if (!VM_Version::supports_avx512vl()) {
1294 vlen_enc = Assembler::AVX_512bit;
1295 }
1296 evpsravq(dst, src, shift, vlen_enc);
1297 } else {
1298 vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
1299 vpsrlvq(dst, src, shift, vlen_enc);
1300 vpsrlvq(tmp, tmp, shift, vlen_enc);
1301 vpxor(dst, dst, tmp, vlen_enc);
1302 vpsubq(dst, dst, tmp, vlen_enc);
1303 }
1304 break;
1305 }
1306 case Op_LShiftVL: {
1307 assert(tmp == xnoreg, "not used");
1308 vpsllvq(dst, src, shift, vlen_enc);
1309 break;
1310 }
1311 case Op_URShiftVL: {
1312 assert(tmp == xnoreg, "not used");
1313 vpsrlvq(dst, src, shift, vlen_enc);
1314 break;
1315 }
1316 default: assert(false, "%s", NodeClassNames[opcode]);
1317 }
1318 }
1319
// Variable shift src by shift using vtmp as a TEMP, giving a word result in dst
1321 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1322 assert(opcode == Op_LShiftVB ||
1323 opcode == Op_RShiftVB ||
1324 opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1325 bool sign = (opcode != Op_URShiftVB);
1326 assert(vector_len == 0, "required");
1327 vextendbd(sign, dst, src, 1);
1328 vpmovzxbd(vtmp, shift, 1);
1329 varshiftd(opcode, dst, dst, vtmp, 1);
1330 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
1331 vextracti128_high(vtmp, dst);
1332 vpackusdw(dst, dst, vtmp, 0);
1333 }
1334
// Variable shift src by shift using vtmp as a TEMP, giving a byte result in dst
1336 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1337 assert(opcode == Op_LShiftVB ||
1338 opcode == Op_RShiftVB ||
1339 opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1340 bool sign = (opcode != Op_URShiftVB);
1341 int ext_vector_len = vector_len + 1;
1342 vextendbw(sign, dst, src, ext_vector_len);
1343 vpmovzxbw(vtmp, shift, ext_vector_len);
1344 varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
1345 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
1346 if (vector_len == 0) {
1347 vextracti128_high(vtmp, dst);
1348 vpackuswb(dst, dst, vtmp, vector_len);
1349 } else {
1350 vextracti64x4_high(vtmp, dst);
1351 vpackuswb(dst, dst, vtmp, vector_len);
1352 vpermq(dst, dst, 0xD8, vector_len);
1353 }
1354 }
1355
1356 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1357 switch(typ) {
1358 case T_BYTE:
1359 pinsrb(dst, val, idx);
1360 break;
1361 case T_SHORT:
1362 pinsrw(dst, val, idx);
1363 break;
1364 case T_INT:
1365 pinsrd(dst, val, idx);
1366 break;
1367 case T_LONG:
1368 pinsrq(dst, val, idx);
1369 break;
1370 default:
1371 assert(false,"Should not reach here.");
1372 break;
1373 }
1374 }
1375
1376 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1377 switch(typ) {
1378 case T_BYTE:
1379 vpinsrb(dst, src, val, idx);
1380 break;
1381 case T_SHORT:
1382 vpinsrw(dst, src, val, idx);
1383 break;
1384 case T_INT:
1385 vpinsrd(dst, src, val, idx);
1386 break;
1387 case T_LONG:
1388 vpinsrq(dst, src, val, idx);
1389 break;
1390 default:
1391 assert(false,"Should not reach here.");
1392 break;
1393 }
1394 }
1395
1396 void C2_MacroAssembler::vgather8b_masked(BasicType elem_bt, XMMRegister dst,
1397 Register base, Register idx_base,
1398 Register mask, Register mask_idx,
1399 Register rtmp, int vlen_enc) {
1400 vpxor(dst, dst, dst, vlen_enc);
1401 if (elem_bt == T_SHORT) {
1402 for (int i = 0; i < 4; i++) {
1403 // dst[i] = mask[i] ? src[idx_base[i]] : 0
1404 Label skip_load;
1405 btq(mask, mask_idx);
1406 jccb(Assembler::carryClear, skip_load);
1407 movl(rtmp, Address(idx_base, i * 4));
1408 pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1409 bind(skip_load);
1410 incq(mask_idx);
1411 }
1412 } else {
1413 assert(elem_bt == T_BYTE, "");
1414 for (int i = 0; i < 8; i++) {
1415 // dst[i] = mask[i] ? src[idx_base[i]] : 0
1416 Label skip_load;
1417 btq(mask, mask_idx);
1418 jccb(Assembler::carryClear, skip_load);
1419 movl(rtmp, Address(idx_base, i * 4));
1420 pinsrb(dst, Address(base, rtmp), i);
1421 bind(skip_load);
1422 incq(mask_idx);
1423 }
1424 }
1425 }
1426
1427 void C2_MacroAssembler::vgather8b(BasicType elem_bt, XMMRegister dst,
1428 Register base, Register idx_base,
1429 Register rtmp, int vlen_enc) {
1430 vpxor(dst, dst, dst, vlen_enc);
1431 if (elem_bt == T_SHORT) {
1432 for (int i = 0; i < 4; i++) {
1433 // dst[i] = src[idx_base[i]]
1434 movl(rtmp, Address(idx_base, i * 4));
1435 pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1436 }
1437 } else {
1438 assert(elem_bt == T_BYTE, "");
1439 for (int i = 0; i < 8; i++) {
1440 // dst[i] = src[idx_base[i]]
1441 movl(rtmp, Address(idx_base, i * 4));
1442 pinsrb(dst, Address(base, rtmp), i);
1443 }
1444 }
1445 }
1446
1447 /*
 * Gather using a hybrid algorithm: first, a partially unrolled scalar loop
 * accumulates values from the gather indices into a quad-word (64-bit) slice.
 * A slice may hold 8 byte values or 4 short values. This is followed by a vector
 * permutation to place the slice into the appropriate vector lane
 * locations in the destination vector. The following pseudo code describes the
 * algorithm in detail:
1454 *
1455 * DST_VEC = ZERO_VEC
1456 * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..}
1457 * TWO_VEC = {2, 2, 2, 2, 2, 2, 2, 2, 2..}
1458 * FOREACH_ITER:
1459 * TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES
1460 * TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX
1461 * DST_VEC = DST_VEC OR TEMP_PERM_VEC
1462 * PERM_INDEX = PERM_INDEX - TWO_VEC
1463 *
 * With each iteration, the doubleword permute indices (0, 1) corresponding
 * to the gathered quadword are right shifted by two lane positions.
1466 *
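 * For example (treating vector_len as the element count, as the loop bookkeeping
 * below implies), a 256-bit T_SHORT gather has vector_len == 16 and runs the loop
 * four times, picking up one 64-bit slice (4 short values) per iteration.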
1467 */
1468 void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst,
1469 Register base, Register idx_base,
1470 Register mask, XMMRegister xtmp1,
1471 XMMRegister xtmp2, XMMRegister temp_dst,
1472 Register rtmp, Register mask_idx,
1473 Register length, int vector_len, int vlen_enc) {
1474 Label GATHER8_LOOP;
1475 assert(is_subword_type(elem_ty), "");
1476 movl(length, vector_len);
1477 vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...}
1478 vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...}
1479 vallones(xtmp2, vlen_enc);
1480 vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc);
1481 vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...}
1482 load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...}
1483
1484 bind(GATHER8_LOOP);
1485 // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES
1486 if (mask == noreg) {
1487 vgather8b(elem_ty, temp_dst, base, idx_base, rtmp, vlen_enc);
1488 } else {
1489 vgather8b_masked(elem_ty, temp_dst, base, idx_base, mask, mask_idx, rtmp, vlen_enc);
1490 }
1491 // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1)
1492 vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? vlen_enc : Assembler::AVX_256bit);
1493 // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2)
1494 vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc);
1495 // DST_VEC = DST_VEC OR TEMP_PERM_VEC
1496 vpor(dst, dst, temp_dst, vlen_enc);
1497 addptr(idx_base, 32 >> (type2aelembytes(elem_ty) - 1));
1498 subl(length, 8 >> (type2aelembytes(elem_ty) - 1));
1499 jcc(Assembler::notEqual, GATHER8_LOOP);
1500 }
1501
1502 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1503 switch(typ) {
1504 case T_INT:
1505 vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1506 break;
1507 case T_FLOAT:
1508 vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1509 break;
1510 case T_LONG:
1511 vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1512 break;
1513 case T_DOUBLE:
1514 vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1515 break;
1516 default:
1517 assert(false,"Should not reach here.");
1518 break;
1519 }
1520 }
1521
1522 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1523 switch(typ) {
1524 case T_INT:
1525 evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1526 break;
1527 case T_FLOAT:
1528 evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1529 break;
1530 case T_LONG:
1531 evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1532 break;
1533 case T_DOUBLE:
1534 evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1535 break;
1536 default:
1537 assert(false,"Should not reach here.");
1538 break;
1539 }
1540 }
1541
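// AVX-512 scatter: store the lanes of 'src' selected by the opmask 'mask' to
// base[idx[i]], with dword indices scaled by the element size.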
1542 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1543 switch(typ) {
1544 case T_INT:
1545 evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1546 break;
1547 case T_FLOAT:
1548 evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1549 break;
1550 case T_LONG:
1551 evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1552 break;
1553 case T_DOUBLE:
1554 evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1555 break;
1556 default:
1557 assert(false,"Should not reach here.");
1558 break;
1559 }
1560 }
1561
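// Expand a vector of byte-sized booleans (0/1) in 'src' into a vector mask in 'dst':
// negate the bytes (0 -> 0x00, 1 -> 0xFF) and sign-extend each one to the element
// size, e.g. {1, 0, 1, 0} with elem_bt == T_INT becomes {-1, 0, -1, 0}.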
1562 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
1563 if (vlen_in_bytes <= 16) {
1564 pxor (dst, dst);
1565 psubb(dst, src);
1566 switch (elem_bt) {
1567 case T_BYTE: /* nothing to do */ break;
1568 case T_SHORT: pmovsxbw(dst, dst); break;
1569 case T_INT: pmovsxbd(dst, dst); break;
1570 case T_FLOAT: pmovsxbd(dst, dst); break;
1571 case T_LONG: pmovsxbq(dst, dst); break;
1572 case T_DOUBLE: pmovsxbq(dst, dst); break;
1573
1574 default: assert(false, "%s", type2name(elem_bt));
1575 }
1576 } else {
1577 assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
1578 int vlen_enc = vector_length_encoding(vlen_in_bytes);
1579
1580 vpxor (dst, dst, dst, vlen_enc);
1581 vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);
1582
1583 switch (elem_bt) {
1584 case T_BYTE: /* nothing to do */ break;
1585 case T_SHORT: vpmovsxbw(dst, dst, vlen_enc); break;
1586 case T_INT: vpmovsxbd(dst, dst, vlen_enc); break;
1587 case T_FLOAT: vpmovsxbd(dst, dst, vlen_enc); break;
1588 case T_LONG: vpmovsxbq(dst, dst, vlen_enc); break;
1589 case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
1590
1591 default: assert(false, "%s", type2name(elem_bt));
1592 }
1593 }
1594 }
1595
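// Same as above, but produce the mask in an opmask register: when byte/dword
// AVX-512 operations are not usable at this vector length (novlbwdq), sign-extend
// the booleans to dwords and compare them against the vector_int_mask_cmp_bits
// constant; otherwise negate the bytes and extract their sign bits with evpmovb2m.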
1596 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) {
1597 if (novlbwdq) {
1598 vpmovsxbd(xtmp, src, vlen_enc);
1599 evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
1600 Assembler::eq, true, vlen_enc, noreg);
1601 } else {
1602 vpxor(xtmp, xtmp, xtmp, vlen_enc);
1603 vpsubb(xtmp, xtmp, src, vlen_enc);
1604 evpmovb2m(dst, xtmp, vlen_enc);
1605 }
1606 }
1607
1608 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, Address src, int vlen_in_bytes) {
1609 if (is_integral_type(bt)) {
1610 switch (vlen_in_bytes) {
1611 case 4: movdl(dst, src); break;
1612 case 8: movq(dst, src); break;
1613 case 16: movdqu(dst, src); break;
1614 case 32: vmovdqu(dst, src); break;
1615 case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break;
1616 default: ShouldNotReachHere();
1617 }
1618 } else {
1619 switch (vlen_in_bytes) {
1620 case 4: movflt(dst, src); break;
1621 case 8: movdbl(dst, src); break;
1622 case 16: movups(dst, src); break;
1623 case 32: vmovups(dst, src, Assembler::AVX_256bit); break;
1624 case 64: vmovups(dst, src, Assembler::AVX_512bit); break;
1625 default: ShouldNotReachHere();
1626 }
1627 }
1628 }
1629
1630 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) {
1631 assert(rscratch != noreg || always_reachable(src), "missing");
1632
1633 if (reachable(src)) {
1634 load_vector(bt, dst, as_Address(src), vlen_in_bytes);
1635 } else {
1636 lea(rscratch, src);
1637 load_vector(bt, dst, Address(rscratch, 0), vlen_in_bytes);
1638 }
1639 }
1640
1641 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) {
1642 int vlen_enc = vector_length_encoding(vlen);
1643 if (VM_Version::supports_avx()) {
1644 if (bt == T_LONG) {
1645 if (VM_Version::supports_avx2()) {
1646 vpbroadcastq(dst, src, vlen_enc);
1647 } else {
1648 vmovddup(dst, src, vlen_enc);
1649 }
1650 } else if (bt == T_DOUBLE) {
1651 if (vlen_enc != Assembler::AVX_128bit) {
1652 vbroadcastsd(dst, src, vlen_enc, noreg);
1653 } else {
1654 vmovddup(dst, src, vlen_enc);
1655 }
1656 } else {
1657 if (VM_Version::supports_avx2() && is_integral_type(bt)) {
1658 vpbroadcastd(dst, src, vlen_enc);
1659 } else {
1660 vbroadcastss(dst, src, vlen_enc);
1661 }
1662 }
1663 } else if (VM_Version::supports_sse3()) {
1664 movddup(dst, src);
1665 } else {
1666 load_vector(bt, dst, src, vlen);
1667 }
1668 }
1669
1670 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) {
1671 // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64.
1672 int offset = exact_log2(type2aelembytes(bt)) << 6;
1673 if (is_floating_point_type(bt)) {
1674 offset += 128;
1675 }
1676 ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset);
1677 load_vector(T_BYTE, dst, addr, vlen_in_bytes);
1678 }
1679
1680 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
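//
// These helpers fold a vector down to a scalar, either by repeatedly combining the
// upper half of the vector with the lower half using the reduction operation, or,
// for integer addition, via horizontal-add instructions. For integral types the
// remaining element is then combined with the scalar accumulator in src1 and moved
// to a general-purpose register. The reduceF/reduceD variants accumulate the
// floating-point elements strictly in order, while the unorderedReduce* variants
// fold pairwise.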
1681
1682 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1683 int vector_len = Assembler::AVX_128bit;
1684
1685 switch (opcode) {
1686 case Op_AndReductionV: pand(dst, src); break;
1687 case Op_OrReductionV: por (dst, src); break;
1688 case Op_XorReductionV: pxor(dst, src); break;
1689 case Op_MinReductionV:
1690 switch (typ) {
1691 case T_BYTE: pminsb(dst, src); break;
1692 case T_SHORT: pminsw(dst, src); break;
1693 case T_INT: pminsd(dst, src); break;
1694 case T_LONG: assert(UseAVX > 2, "required");
1695 vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
1696 default: assert(false, "wrong type");
1697 }
1698 break;
1699 case Op_MaxReductionV:
1700 switch (typ) {
1701 case T_BYTE: pmaxsb(dst, src); break;
1702 case T_SHORT: pmaxsw(dst, src); break;
1703 case T_INT: pmaxsd(dst, src); break;
1704 case T_LONG: assert(UseAVX > 2, "required");
1705 vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
1706 default: assert(false, "wrong type");
1707 }
1708 break;
1709 case Op_AddReductionVF: addss(dst, src); break;
1710 case Op_AddReductionVD: addsd(dst, src); break;
1711 case Op_AddReductionVI:
1712 switch (typ) {
1713 case T_BYTE: paddb(dst, src); break;
1714 case T_SHORT: paddw(dst, src); break;
1715 case T_INT: paddd(dst, src); break;
1716 default: assert(false, "wrong type");
1717 }
1718 break;
1719 case Op_AddReductionVL: paddq(dst, src); break;
1720 case Op_MulReductionVF: mulss(dst, src); break;
1721 case Op_MulReductionVD: mulsd(dst, src); break;
1722 case Op_MulReductionVI:
1723 switch (typ) {
1724 case T_SHORT: pmullw(dst, src); break;
1725 case T_INT: pmulld(dst, src); break;
1726 default: assert(false, "wrong type");
1727 }
1728 break;
1729 case Op_MulReductionVL: assert(UseAVX > 2, "required");
1730 evpmullq(dst, dst, src, vector_len); break;
1731 default: assert(false, "wrong opcode");
1732 }
1733 }
1734
1735 void C2_MacroAssembler::unordered_reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1736 switch (opcode) {
1737 case Op_AddReductionVF: addps(dst, src); break;
1738 case Op_AddReductionVD: addpd(dst, src); break;
1739 case Op_MulReductionVF: mulps(dst, src); break;
1740 case Op_MulReductionVD: mulpd(dst, src); break;
1741 default: assert(false, "%s", NodeClassNames[opcode]);
1742 }
1743 }
1744
1745 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
1746 int vector_len = Assembler::AVX_256bit;
1747
1748 switch (opcode) {
1749 case Op_AndReductionV: vpand(dst, src1, src2, vector_len); break;
1750 case Op_OrReductionV: vpor (dst, src1, src2, vector_len); break;
1751 case Op_XorReductionV: vpxor(dst, src1, src2, vector_len); break;
1752 case Op_MinReductionV:
1753 switch (typ) {
1754 case T_BYTE: vpminsb(dst, src1, src2, vector_len); break;
1755 case T_SHORT: vpminsw(dst, src1, src2, vector_len); break;
1756 case T_INT: vpminsd(dst, src1, src2, vector_len); break;
1757 case T_LONG: assert(UseAVX > 2, "required");
1758 vpminsq(dst, src1, src2, vector_len); break;
1759 default: assert(false, "wrong type");
1760 }
1761 break;
1762 case Op_MaxReductionV:
1763 switch (typ) {
1764 case T_BYTE: vpmaxsb(dst, src1, src2, vector_len); break;
1765 case T_SHORT: vpmaxsw(dst, src1, src2, vector_len); break;
1766 case T_INT: vpmaxsd(dst, src1, src2, vector_len); break;
1767 case T_LONG: assert(UseAVX > 2, "required");
1768 vpmaxsq(dst, src1, src2, vector_len); break;
1769 default: assert(false, "wrong type");
1770 }
1771 break;
1772 case Op_AddReductionVI:
1773 switch (typ) {
1774 case T_BYTE: vpaddb(dst, src1, src2, vector_len); break;
1775 case T_SHORT: vpaddw(dst, src1, src2, vector_len); break;
1776 case T_INT: vpaddd(dst, src1, src2, vector_len); break;
1777 default: assert(false, "wrong type");
1778 }
1779 break;
1780 case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
1781 case Op_MulReductionVI:
1782 switch (typ) {
1783 case T_SHORT: vpmullw(dst, src1, src2, vector_len); break;
1784 case T_INT: vpmulld(dst, src1, src2, vector_len); break;
1785 default: assert(false, "wrong type");
1786 }
1787 break;
1788 case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break;
1789 default: assert(false, "wrong opcode");
1790 }
1791 }
1792
1793 void C2_MacroAssembler::unordered_reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
1794 int vector_len = Assembler::AVX_256bit;
1795
1796 switch (opcode) {
1797 case Op_AddReductionVF: vaddps(dst, src1, src2, vector_len); break;
1798 case Op_AddReductionVD: vaddpd(dst, src1, src2, vector_len); break;
1799 case Op_MulReductionVF: vmulps(dst, src1, src2, vector_len); break;
1800 case Op_MulReductionVD: vmulpd(dst, src1, src2, vector_len); break;
1801 default: assert(false, "%s", NodeClassNames[opcode]);
1802 }
1803 }
1804
1805 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1806 XMMRegister dst, XMMRegister src,
1807 XMMRegister vtmp1, XMMRegister vtmp2) {
1808 switch (opcode) {
1809 case Op_AddReductionVF:
1810 case Op_MulReductionVF:
1811 reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1812 break;
1813
1814 case Op_AddReductionVD:
1815 case Op_MulReductionVD:
1816 reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1817 break;
1818
1819 default: assert(false, "wrong opcode");
1820 }
1821 }
1822
1823 void C2_MacroAssembler::unordered_reduce_fp(int opcode, int vlen,
1824 XMMRegister dst, XMMRegister src,
1825 XMMRegister vtmp1, XMMRegister vtmp2) {
1826 switch (opcode) {
1827 case Op_AddReductionVF:
1828 case Op_MulReductionVF:
1829 unorderedReduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1830 break;
1831
1832 case Op_AddReductionVD:
1833 case Op_MulReductionVD:
1834 unorderedReduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1835 break;
1836
1837 default: assert(false, "%s", NodeClassNames[opcode]);
1838 }
1839 }
1840
1841 void C2_MacroAssembler::reduceB(int opcode, int vlen,
1842 Register dst, Register src1, XMMRegister src2,
1843 XMMRegister vtmp1, XMMRegister vtmp2) {
1844 switch (vlen) {
1845 case 8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1846 case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1847 case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1848 case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1849
1850 default: assert(false, "wrong vector length");
1851 }
1852 }
1853
1854 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
1855 Register dst, Register src1, XMMRegister src2,
1856 XMMRegister vtmp1, XMMRegister vtmp2) {
1857 switch (vlen) {
1858 case 8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1859 case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1860 case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1861 case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1862
1863 default: assert(false, "wrong vector length");
1864 }
1865 }
1866
1867 void C2_MacroAssembler::reduceS(int opcode, int vlen,
1868 Register dst, Register src1, XMMRegister src2,
1869 XMMRegister vtmp1, XMMRegister vtmp2) {
1870 switch (vlen) {
1871 case 4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1872 case 8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1873 case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1874 case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1875
1876 default: assert(false, "wrong vector length");
1877 }
1878 }
1879
1880 void C2_MacroAssembler::reduceI(int opcode, int vlen,
1881 Register dst, Register src1, XMMRegister src2,
1882 XMMRegister vtmp1, XMMRegister vtmp2) {
1883 switch (vlen) {
1884 case 2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1885 case 4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1886 case 8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1887 case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1888
1889 default: assert(false, "wrong vector length");
1890 }
1891 }
1892
1893 void C2_MacroAssembler::reduceL(int opcode, int vlen,
1894 Register dst, Register src1, XMMRegister src2,
1895 XMMRegister vtmp1, XMMRegister vtmp2) {
1896 switch (vlen) {
1897 case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1898 case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1899 case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1900
1901 default: assert(false, "wrong vector length");
1902 }
1903 }
1904
1905 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1906 switch (vlen) {
1907 case 2:
1908 assert(vtmp2 == xnoreg, "");
1909 reduce2F(opcode, dst, src, vtmp1);
1910 break;
1911 case 4:
1912 assert(vtmp2 == xnoreg, "");
1913 reduce4F(opcode, dst, src, vtmp1);
1914 break;
1915 case 8:
1916 reduce8F(opcode, dst, src, vtmp1, vtmp2);
1917 break;
1918 case 16:
1919 reduce16F(opcode, dst, src, vtmp1, vtmp2);
1920 break;
1921 default: assert(false, "wrong vector length");
1922 }
1923 }
1924
1925 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1926 switch (vlen) {
1927 case 2:
1928 assert(vtmp2 == xnoreg, "");
1929 reduce2D(opcode, dst, src, vtmp1);
1930 break;
1931 case 4:
1932 reduce4D(opcode, dst, src, vtmp1, vtmp2);
1933 break;
1934 case 8:
1935 reduce8D(opcode, dst, src, vtmp1, vtmp2);
1936 break;
1937 default: assert(false, "wrong vector length");
1938 }
1939 }
1940
1941 void C2_MacroAssembler::unorderedReduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1942 switch (vlen) {
1943 case 2:
1944 assert(vtmp1 == xnoreg, "");
1945 assert(vtmp2 == xnoreg, "");
1946 unorderedReduce2F(opcode, dst, src);
1947 break;
1948 case 4:
1949 assert(vtmp2 == xnoreg, "");
1950 unorderedReduce4F(opcode, dst, src, vtmp1);
1951 break;
1952 case 8:
1953 unorderedReduce8F(opcode, dst, src, vtmp1, vtmp2);
1954 break;
1955 case 16:
1956 unorderedReduce16F(opcode, dst, src, vtmp1, vtmp2);
1957 break;
1958 default: assert(false, "wrong vector length");
1959 }
1960 }
1961
1962 void C2_MacroAssembler::unorderedReduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1963 switch (vlen) {
1964 case 2:
1965 assert(vtmp1 == xnoreg, "");
1966 assert(vtmp2 == xnoreg, "");
1967 unorderedReduce2D(opcode, dst, src);
1968 break;
1969 case 4:
1970 assert(vtmp2 == xnoreg, "");
1971 unorderedReduce4D(opcode, dst, src, vtmp1);
1972 break;
1973 case 8:
1974 unorderedReduce8D(opcode, dst, src, vtmp1, vtmp2);
1975 break;
1976 default: assert(false, "wrong vector length");
1977 }
1978 }
1979
1980 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1981 if (opcode == Op_AddReductionVI) {
1982 if (vtmp1 != src2) {
1983 movdqu(vtmp1, src2);
1984 }
1985 phaddd(vtmp1, vtmp1);
1986 } else {
1987 pshufd(vtmp1, src2, 0x1);
1988 reduce_operation_128(T_INT, opcode, vtmp1, src2);
1989 }
1990 movdl(vtmp2, src1);
1991 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1992 movdl(dst, vtmp1);
1993 }
1994
1995 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1996 if (opcode == Op_AddReductionVI) {
1997 if (vtmp1 != src2) {
1998 movdqu(vtmp1, src2);
1999 }
2000 phaddd(vtmp1, src2);
2001 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2002 } else {
2003 pshufd(vtmp2, src2, 0xE);
2004 reduce_operation_128(T_INT, opcode, vtmp2, src2);
2005 reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2006 }
2007 }
2008
2009 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2010 if (opcode == Op_AddReductionVI) {
2011 vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
2012 vextracti128_high(vtmp2, vtmp1);
2013 vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
2014 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2015 } else {
2016 vextracti128_high(vtmp1, src2);
2017 reduce_operation_128(T_INT, opcode, vtmp1, src2);
2018 reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2019 }
2020 }
2021
2022 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2023 vextracti64x4_high(vtmp2, src2);
2024 reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
2025 reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2026 }
2027
2028 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2029 pshufd(vtmp2, src2, 0x1);
2030 reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2031 movdqu(vtmp1, vtmp2);
2032 psrldq(vtmp1, 2);
2033 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2034 movdqu(vtmp2, vtmp1);
2035 psrldq(vtmp2, 1);
2036 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2037 movdl(vtmp2, src1);
2038 pmovsxbd(vtmp1, vtmp1);
2039 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2040 pextrb(dst, vtmp1, 0x0);
2041 movsbl(dst, dst);
2042 }
2043
2044 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2045 pshufd(vtmp1, src2, 0xE);
2046 reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
2047 reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2048 }
2049
2050 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2051 vextracti128_high(vtmp2, src2);
2052 reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2053 reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2054 }
2055
2056 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2057 vextracti64x4_high(vtmp1, src2);
2058 reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
2059 reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2060 }
2061
2062 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2063 pmovsxbw(vtmp2, src2);
2064 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2065 }
2066
2067 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2068 if (UseAVX > 1) {
2069 int vector_len = Assembler::AVX_256bit;
2070 vpmovsxbw(vtmp1, src2, vector_len);
2071 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2072 } else {
2073 pmovsxbw(vtmp2, src2);
2074 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2075 pshufd(vtmp2, src2, 0x1);
2076 pmovsxbw(vtmp2, src2);
2077 reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2078 }
2079 }
2080
2081 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2082 if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
2083 int vector_len = Assembler::AVX_512bit;
2084 vpmovsxbw(vtmp1, src2, vector_len);
2085 reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2086 } else {
2087 assert(UseAVX >= 2,"Should not reach here.");
2088 mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
2089 vextracti128_high(vtmp2, src2);
2090 mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2091 }
2092 }
2093
2094 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2095 mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
2096 vextracti64x4_high(vtmp2, src2);
2097 mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2098 }
2099
2100 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2101 if (opcode == Op_AddReductionVI) {
2102 if (vtmp1 != src2) {
2103 movdqu(vtmp1, src2);
2104 }
2105 phaddw(vtmp1, vtmp1);
2106 phaddw(vtmp1, vtmp1);
2107 } else {
2108 pshufd(vtmp2, src2, 0x1);
2109 reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2110 movdqu(vtmp1, vtmp2);
2111 psrldq(vtmp1, 2);
2112 reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
2113 }
2114 movdl(vtmp2, src1);
2115 pmovsxwd(vtmp1, vtmp1);
2116 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2117 pextrw(dst, vtmp1, 0x0);
2118 movswl(dst, dst);
2119 }
2120
2121 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2122 if (opcode == Op_AddReductionVI) {
2123 if (vtmp1 != src2) {
2124 movdqu(vtmp1, src2);
2125 }
2126 phaddw(vtmp1, src2);
2127 } else {
2128 pshufd(vtmp1, src2, 0xE);
2129 reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
2130 }
2131 reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2132 }
2133
2134 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2135 if (opcode == Op_AddReductionVI) {
2136 int vector_len = Assembler::AVX_256bit;
2137 vphaddw(vtmp2, src2, src2, vector_len);
2138 vpermq(vtmp2, vtmp2, 0xD8, vector_len);
2139 } else {
2140 vextracti128_high(vtmp2, src2);
2141 reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2142 }
2143 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2144 }
2145
2146 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2147 int vector_len = Assembler::AVX_256bit;
2148 vextracti64x4_high(vtmp1, src2);
2149 reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
2150 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2151 }
2152
2153 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2154 pshufd(vtmp2, src2, 0xE);
2155 reduce_operation_128(T_LONG, opcode, vtmp2, src2);
2156 movdq(vtmp1, src1);
2157 reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
2158 movdq(dst, vtmp1);
2159 }
2160
2161 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2162 vextracti128_high(vtmp1, src2);
2163 reduce_operation_128(T_LONG, opcode, vtmp1, src2);
2164 reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2165 }
2166
2167 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2168 vextracti64x4_high(vtmp2, src2);
2169 reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
2170 reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2171 }
2172
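// Build an opmask with the low 'len' bits set: BZHI zeroes all bits of an all-ones
// value from bit index 'len' upward, e.g. len == 3 yields 0b0111.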
2173 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
2174 mov64(temp, -1L);
2175 bzhiq(temp, temp, len);
2176 kmovql(dst, temp);
2177 }
2178
2179 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2180 reduce_operation_128(T_FLOAT, opcode, dst, src);
2181 pshufd(vtmp, src, 0x1);
2182 reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2183 }
2184
2185 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2186 reduce2F(opcode, dst, src, vtmp);
2187 pshufd(vtmp, src, 0x2);
2188 reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2189 pshufd(vtmp, src, 0x3);
2190 reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2191 }
2192
2193 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2194 reduce4F(opcode, dst, src, vtmp2);
2195 vextractf128_high(vtmp2, src);
2196 reduce4F(opcode, dst, vtmp2, vtmp1);
2197 }
2198
2199 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2200 reduce8F(opcode, dst, src, vtmp1, vtmp2);
2201 vextracti64x4_high(vtmp1, src);
2202 reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
2203 }
2204
2205 void C2_MacroAssembler::unorderedReduce2F(int opcode, XMMRegister dst, XMMRegister src) {
2206 pshufd(dst, src, 0x1);
2207 reduce_operation_128(T_FLOAT, opcode, dst, src);
2208 }
2209
2210 void C2_MacroAssembler::unorderedReduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2211 pshufd(vtmp, src, 0xE);
2212 unordered_reduce_operation_128(T_FLOAT, opcode, vtmp, src);
2213 unorderedReduce2F(opcode, dst, vtmp);
2214 }
2215
2216 void C2_MacroAssembler::unorderedReduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2217 vextractf128_high(vtmp1, src);
2218 unordered_reduce_operation_128(T_FLOAT, opcode, vtmp1, src);
2219 unorderedReduce4F(opcode, dst, vtmp1, vtmp2);
2220 }
2221
2222 void C2_MacroAssembler::unorderedReduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2223 vextractf64x4_high(vtmp2, src);
2224 unordered_reduce_operation_256(T_FLOAT, opcode, vtmp2, vtmp2, src);
2225 unorderedReduce8F(opcode, dst, vtmp2, vtmp1, vtmp2);
2226 }
2227
2228 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2229 reduce_operation_128(T_DOUBLE, opcode, dst, src);
2230 pshufd(vtmp, src, 0xE);
2231 reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
2232 }
2233
2234 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2235 reduce2D(opcode, dst, src, vtmp2);
2236 vextractf128_high(vtmp2, src);
2237 reduce2D(opcode, dst, vtmp2, vtmp1);
2238 }
2239
2240 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2241 reduce4D(opcode, dst, src, vtmp1, vtmp2);
2242 vextracti64x4_high(vtmp1, src);
2243 reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
2244 }
2245
2246 void C2_MacroAssembler::unorderedReduce2D(int opcode, XMMRegister dst, XMMRegister src) {
2247 pshufd(dst, src, 0xE);
2248 reduce_operation_128(T_DOUBLE, opcode, dst, src);
2249 }
2250
2251 void C2_MacroAssembler::unorderedReduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2252 vextractf128_high(vtmp, src);
2253 unordered_reduce_operation_128(T_DOUBLE, opcode, vtmp, src);
2254 unorderedReduce2D(opcode, dst, vtmp);
2255 }
2256
2257 void C2_MacroAssembler::unorderedReduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2258 vextractf64x4_high(vtmp2, src);
2259 unordered_reduce_operation_256(T_DOUBLE, opcode, vtmp2, vtmp2, src);
2260 unorderedReduce4D(opcode, dst, vtmp2, vtmp1);
2261 }
2262
2263 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
2264 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2265 }
2266
2267 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
2268 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2269 }
2270
2271 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, XMMRegister src, bool merge, int vector_len) {
2272 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2273 }
2274
2275 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask,
2276 int vec_enc) {
2277 switch(elem_bt) {
2278 case T_INT:
2279 case T_FLOAT:
2280 vmaskmovps(dst, src, mask, vec_enc);
2281 break;
2282 case T_LONG:
2283 case T_DOUBLE:
2284 vmaskmovpd(dst, src, mask, vec_enc);
2285 break;
2286 default:
2287 fatal("Unsupported type %s", type2name(elem_bt));
2288 break;
2289 }
2290 }
2291
2292 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask,
2293 int vec_enc) {
2294 switch(elem_bt) {
2295 case T_INT:
2296 case T_FLOAT:
2297 vmaskmovps(dst, src, mask, vec_enc);
2298 break;
2299 case T_LONG:
2300 case T_DOUBLE:
2301 vmaskmovpd(dst, src, mask, vec_enc);
2302 break;
2303 default:
2304 fatal("Unsupported type %s", type2name(elem_bt));
2305 break;
2306 }
2307 }
2308
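// Min/max reduction for float vectors: repeatedly halve the vector (extract the
// upper 256/128 bits, or permute within a 128-bit lane) and combine the halves with
// vminmax_fp, which honors the NaN and -0.0 semantics required by Math.min/max.
// When 'is_dst_valid' the value already in 'dst' is folded into the result at the end.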
2309 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
2310 XMMRegister dst, XMMRegister src,
2311 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2312 XMMRegister xmm_0, XMMRegister xmm_1) {
2313 const int permconst[] = {1, 14};
2314 XMMRegister wsrc = src;
2315 XMMRegister wdst = xmm_0;
2316 XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2317
2318 int vlen_enc = Assembler::AVX_128bit;
2319 if (vlen == 16) {
2320 vlen_enc = Assembler::AVX_256bit;
2321 }
2322
2323 for (int i = log2(vlen) - 1; i >=0; i--) {
2324 if (i == 0 && !is_dst_valid) {
2325 wdst = dst;
2326 }
2327 if (i == 3) {
2328 vextracti64x4_high(wtmp, wsrc);
2329 } else if (i == 2) {
2330 vextracti128_high(wtmp, wsrc);
2331 } else { // i = [0,1]
2332 vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
2333 }
2334
2335 if (VM_Version::supports_avx10_2()) {
2336 vminmax_fp(opcode, T_FLOAT, wdst, k0, wtmp, wsrc, vlen_enc);
2337 } else {
2338 vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2339 }
2340 wsrc = wdst;
2341 vlen_enc = Assembler::AVX_128bit;
2342 }
2343 if (is_dst_valid) {
2344 if (VM_Version::supports_avx10_2()) {
2345 vminmax_fp(opcode, T_FLOAT, dst, k0, wdst, dst, Assembler::AVX_128bit);
2346 } else {
2347 vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2348 }
2349 }
2350 }
2351
2352 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
2353 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2354 XMMRegister xmm_0, XMMRegister xmm_1) {
2355 XMMRegister wsrc = src;
2356 XMMRegister wdst = xmm_0;
2357 XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2358 int vlen_enc = Assembler::AVX_128bit;
2359 if (vlen == 8) {
2360 vlen_enc = Assembler::AVX_256bit;
2361 }
2362 for (int i = log2(vlen) - 1; i >=0; i--) {
2363 if (i == 0 && !is_dst_valid) {
2364 wdst = dst;
2365 }
2366 if (i == 1) {
2367 vextracti128_high(wtmp, wsrc);
2368 } else if (i == 2) {
2369 vextracti64x4_high(wtmp, wsrc);
2370 } else {
2371 assert(i == 0, "%d", i);
2372 vpermilpd(wtmp, wsrc, 1, vlen_enc);
2373 }
2374
2375 if (VM_Version::supports_avx10_2()) {
2376 vminmax_fp(opcode, T_DOUBLE, wdst, k0, wtmp, wsrc, vlen_enc);
2377 } else {
2378 vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2379 }
2380
2381 wsrc = wdst;
2382 vlen_enc = Assembler::AVX_128bit;
2383 }
2384
2385 if (is_dst_valid) {
2386 if (VM_Version::supports_avx10_2()) {
2387 vminmax_fp(opcode, T_DOUBLE, dst, k0, wdst, dst, Assembler::AVX_128bit);
2388 } else {
2389 vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2390 }
2391 }
2392 }
2393
2394 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2395 switch (bt) {
2396 case T_BYTE: pextrb(dst, src, idx); break;
2397 case T_SHORT: pextrw(dst, src, idx); break;
2398 case T_INT: pextrd(dst, src, idx); break;
2399 case T_LONG: pextrq(dst, src, idx); break;
2400
2401 default:
2402 assert(false,"Should not reach here.");
2403 break;
2404 }
2405 }
2406
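// Return a register holding the 128-bit lane of 'src' that contains element
// 'elemindex': the lane is extracted into 'dst' unless it is already the lowest
// lane, in which case 'src' itself is returned.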
2407 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2408 int esize = type2aelembytes(typ);
2409 int elem_per_lane = 16/esize;
2410 int lane = elemindex / elem_per_lane;
2411 int eindex = elemindex % elem_per_lane;
2412
2413 if (lane >= 2) {
2414 assert(UseAVX > 2, "required");
2415 vextractf32x4(dst, src, lane & 3);
2416 return dst;
2417 } else if (lane > 0) {
2418 assert(UseAVX > 0, "required");
2419 vextractf128(dst, src, lane);
2420 return dst;
2421 } else {
2422 return src;
2423 }
2424 }
2425
2426 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) {
2427 if (typ == T_BYTE) {
2428 movsbl(dst, dst);
2429 } else if (typ == T_SHORT) {
2430 movswl(dst, dst);
2431 }
2432 }
2433
2434 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
2435 int esize = type2aelembytes(typ);
2436 int elem_per_lane = 16/esize;
2437 int eindex = elemindex % elem_per_lane;
2438 assert(is_integral_type(typ),"required");
2439
2440 if (eindex == 0) {
2441 if (typ == T_LONG) {
2442 movq(dst, src);
2443 } else {
2444 movdl(dst, src);
2445 movsxl(typ, dst);
2446 }
2447 } else {
2448 extract(typ, dst, src, eindex);
2449 movsxl(typ, dst);
2450 }
2451 }
2452
2453 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) {
2454 int esize = type2aelembytes(typ);
2455 int elem_per_lane = 16/esize;
2456 int eindex = elemindex % elem_per_lane;
2457 assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
2458
2459 if (eindex == 0) {
2460 movq(dst, src);
2461 } else {
2462 if (typ == T_FLOAT) {
2463 if (UseAVX == 0) {
2464 movdqu(dst, src);
2465 shufps(dst, dst, eindex);
2466 } else {
2467 vshufps(dst, src, src, eindex, Assembler::AVX_128bit);
2468 }
2469 } else {
2470 if (UseAVX == 0) {
2471 movdqu(dst, src);
2472 psrldq(dst, eindex*esize);
2473 } else {
2474 vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
2475 }
2476 movq(dst, dst);
2477 }
2478 }
2479 // Zero upper bits
2480 if (typ == T_FLOAT) {
2481 if (UseAVX == 0) {
2482 assert(vtmp != xnoreg, "required.");
2483 movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg);
2484 pand(dst, vtmp);
2485 } else {
2486 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg);
2487 }
2488 }
2489 }
2490
2491 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2492 switch(typ) {
2493 case T_BYTE:
2494 case T_BOOLEAN:
2495 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2496 break;
2497 case T_SHORT:
2498 case T_CHAR:
2499 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2500 break;
2501 case T_INT:
2502 case T_FLOAT:
2503 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2504 break;
2505 case T_LONG:
2506 case T_DOUBLE:
2507 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2508 break;
2509 default:
2510 assert(false,"Should not reach here.");
2511 break;
2512 }
2513 }
2514
2515 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) {
2516 assert(rscratch != noreg || always_reachable(src2), "missing");
2517
2518 switch(typ) {
2519 case T_BOOLEAN:
2520 case T_BYTE:
2521 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2522 break;
2523 case T_CHAR:
2524 case T_SHORT:
2525 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2526 break;
2527 case T_INT:
2528 case T_FLOAT:
2529 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2530 break;
2531 case T_LONG:
2532 case T_DOUBLE:
2533 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2534 break;
2535 default:
2536 assert(false,"Should not reach here.");
2537 break;
2538 }
2539 }
2540
2541 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2542 switch(typ) {
2543 case T_BYTE:
2544 evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2545 break;
2546 case T_SHORT:
2547 evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2548 break;
2549 case T_INT:
2550 case T_FLOAT:
2551 evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2552 break;
2553 case T_LONG:
2554 case T_DOUBLE:
2555 evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2556 break;
2557 default:
2558 assert(false,"Should not reach here.");
2559 break;
2560 }
2561 }
2562
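// Set the condition flags from a packed test of src1 against src2 (ptest/vtestps).
// For vectors shorter than 16 bytes the valid low part of src1 is duplicated into
// 'vtmp' so that the unused upper lanes cannot affect the result.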
2563 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) {
2564 assert(vlen_in_bytes <= 32, "");
2565 int esize = type2aelembytes(bt);
2566 if (vlen_in_bytes == 32) {
2567 assert(vtmp == xnoreg, "required.");
2568 if (esize >= 4) {
2569 vtestps(src1, src2, AVX_256bit);
2570 } else {
2571 vptest(src1, src2, AVX_256bit);
2572 }
2573 return;
2574 }
2575 if (vlen_in_bytes < 16) {
2576     // Duplicate the lower part to fill the whole register;
2577     // there is no need to do so for src2.
2578 assert(vtmp != xnoreg, "required");
2579 int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04;
2580 pshufd(vtmp, src1, shuffle_imm);
2581 } else {
2582 assert(vtmp == xnoreg, "required");
2583 vtmp = src1;
2584 }
2585 if (esize >= 4 && VM_Version::supports_avx()) {
2586 vtestps(vtmp, src2, AVX_128bit);
2587 } else {
2588 ptest(vtmp, src2);
2589 }
2590 }
2591
2592 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
2593 #ifdef ASSERT
2594 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2595 bool is_bw_supported = VM_Version::supports_avx512bw();
2596 if (is_bw && !is_bw_supported) {
2597 assert(vlen_enc != Assembler::AVX_512bit, "required");
2598 assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16),
2599 "XMM register should be 0-15");
2600 }
2601 #endif // ASSERT
2602 switch (elem_bt) {
2603 case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return;
2604 case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return;
2605 case T_INT: vpaddd(dst, src1, src2, vlen_enc); return;
2606 case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return;
2607 case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return;
2608 case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return;
2609 default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2610 }
2611 }
2612
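// Broadcast the scalar in general-purpose register 'src' to every lane of 'dst'.
// The EVEX broadcasts with a GPR source are used when AVX-512 (plus BW/VL as
// required by the element type and vector length) is available; otherwise the value
// is first moved into an XMM register and broadcast with the AVX/AVX2 forms.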
2613 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) {
2614 assert(UseAVX >= 2, "required");
2615 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2616 bool is_vl = vlen_enc != Assembler::AVX_512bit;
2617 if ((UseAVX > 2) &&
2618 (!is_bw || VM_Version::supports_avx512bw()) &&
2619 (!is_vl || VM_Version::supports_avx512vl())) {
2620 switch (elem_bt) {
2621 case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return;
2622 case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
2623 case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
2624 case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
2625 default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2626 }
2627 } else {
2628 assert(vlen_enc != Assembler::AVX_512bit, "required");
2629 assert((dst->encoding() < 16),"XMM register should be 0-15");
2630 switch (elem_bt) {
2631 case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return;
2632 case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return;
2633 case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return;
2634 case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
2635 case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
2636 case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
2637 default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2638 }
2639 }
2640 }
2641
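// Sign-extend a vector of bytes in 'src' to elements of type 'to_elem_bt' in 'dst',
// converting to float/double with vcvtdq2ps/vcvtdq2pd when the target type is
// floating point.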
2642 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
2643 switch (to_elem_bt) {
2644 case T_SHORT:
2645 vpmovsxbw(dst, src, vlen_enc);
2646 break;
2647 case T_INT:
2648 vpmovsxbd(dst, src, vlen_enc);
2649 break;
2650 case T_FLOAT:
2651 vpmovsxbd(dst, src, vlen_enc);
2652 vcvtdq2ps(dst, dst, vlen_enc);
2653 break;
2654 case T_LONG:
2655 vpmovsxbq(dst, src, vlen_enc);
2656 break;
2657 case T_DOUBLE: {
2658 int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
2659 vpmovsxbd(dst, src, mid_vlen_enc);
2660 vcvtdq2pd(dst, dst, vlen_enc);
2661 break;
2662 }
2663 default:
2664 fatal("Unsupported type %s", type2name(to_elem_bt));
2665 break;
2666 }
2667 }
2668
2669 //-------------------------------------------------------------------------------------------
2670
2671 // IndexOf for constant substrings with size >= 8 chars
2672 // which don't need to be loaded through stack.
2673 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
2674 Register cnt1, Register cnt2,
2675 int int_cnt2, Register result,
2676 XMMRegister vec, Register tmp,
2677 int ae) {
2678 ShortBranchVerifier sbv(this);
2679 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2680 assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2681
2682 // This method uses the pcmpestri instruction with bound registers
2683 // inputs:
2684 // xmm - substring
2685 // rax - substring length (elements count)
2686 // mem - scanned string
2687 // rdx - string length (elements count)
2688 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2689 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2690 // outputs:
2691 // rcx - matched index in string
2692 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2693 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2694 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2695 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2696 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2697
2698 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
2699 RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
2700 MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
2701
2702 // Note, inline_string_indexOf() generates checks:
2703 // if (substr.count > string.count) return -1;
2704 // if (substr.count == 0) return 0;
2705 assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
2706
2707 // Load substring.
2708 if (ae == StrIntrinsicNode::UL) {
2709 pmovzxbw(vec, Address(str2, 0));
2710 } else {
2711 movdqu(vec, Address(str2, 0));
2712 }
2713 movl(cnt2, int_cnt2);
2714 movptr(result, str1); // string addr
2715
2716 if (int_cnt2 > stride) {
2717 jmpb(SCAN_TO_SUBSTR);
2718
2719     // Reload substr for rescan; this code
2720     // is executed only for large substrings (> 8 chars).
2721 bind(RELOAD_SUBSTR);
2722 if (ae == StrIntrinsicNode::UL) {
2723 pmovzxbw(vec, Address(str2, 0));
2724 } else {
2725 movdqu(vec, Address(str2, 0));
2726 }
2727 negptr(cnt2); // Jumped here with negative cnt2, convert to positive
2728
2729 bind(RELOAD_STR);
2730     // We came here after the beginning of the substring was
2731     // matched but the rest of it was not, so we need to search
2732     // again. Start from the next element after the previous match.
2733
2734     // cnt2 is the number of remaining substring elements and
2735     // cnt1 is the number of remaining string elements when the compare failed.
2736     // Restored cnt1 = cnt1 - cnt2 + int_cnt2
2737 subl(cnt1, cnt2);
2738 addl(cnt1, int_cnt2);
2739 movl(cnt2, int_cnt2); // Now restore cnt2
2740
2741 decrementl(cnt1); // Shift to next element
2742 cmpl(cnt1, cnt2);
2743     jcc(Assembler::negative, RET_NOT_FOUND);  // Fewer chars left than substring
2744
2745 addptr(result, (1<<scale1));
2746
2747 } // (int_cnt2 > 8)
2748
2749 // Scan string for start of substr in 16-byte vectors
2750 bind(SCAN_TO_SUBSTR);
2751 pcmpestri(vec, Address(result, 0), mode);
2752 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1
2753 subl(cnt1, stride);
2754 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2755 cmpl(cnt1, cnt2);
2756   jccb(Assembler::negative, RET_NOT_FOUND);  // Fewer chars left than substring
2757 addptr(result, 16);
2758 jmpb(SCAN_TO_SUBSTR);
2759
2760 // Found a potential substr
2761 bind(FOUND_CANDIDATE);
2762 // Matched whole vector if first element matched (tmp(rcx) == 0).
2763 if (int_cnt2 == stride) {
2764 jccb(Assembler::overflow, RET_FOUND); // OF == 1
2765 } else { // int_cnt2 > 8
2766 jccb(Assembler::overflow, FOUND_SUBSTR);
2767 }
2768 // After pcmpestri tmp(rcx) contains matched element index
2769 // Compute start addr of substr
2770 lea(result, Address(result, tmp, scale1));
2771
2772 // Make sure string is still long enough
2773 subl(cnt1, tmp);
2774 cmpl(cnt1, cnt2);
2775 if (int_cnt2 == stride) {
2776 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2777 } else { // int_cnt2 > 8
2778 jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
2779 }
2780   // Fewer chars left than the substring.
2781
2782 bind(RET_NOT_FOUND);
2783 movl(result, -1);
2784 jmp(EXIT);
2785
2786 if (int_cnt2 > stride) {
2787     // This code is optimized for the case when the whole substring
2788     // is matched if its head is matched.
2789 bind(MATCH_SUBSTR_HEAD);
2790 pcmpestri(vec, Address(result, 0), mode);
2791     // Reload only the string if it does not match
2792 jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
2793
2794 Label CONT_SCAN_SUBSTR;
2795 // Compare the rest of substring (> 8 chars).
2796 bind(FOUND_SUBSTR);
2797 // First 8 chars are already matched.
2798 negptr(cnt2);
2799 addptr(cnt2, stride);
2800
2801 bind(SCAN_SUBSTR);
2802 subl(cnt1, stride);
2803 cmpl(cnt2, -stride); // Do not read beyond substring
2804 jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
2805 // Back-up strings to avoid reading beyond substring:
2806 // cnt1 = cnt1 - cnt2 + 8
2807 addl(cnt1, cnt2); // cnt2 is negative
2808 addl(cnt1, stride);
2809 movl(cnt2, stride); negptr(cnt2);
2810 bind(CONT_SCAN_SUBSTR);
2811 if (int_cnt2 < (int)G) {
2812 int tail_off1 = int_cnt2<<scale1;
2813 int tail_off2 = int_cnt2<<scale2;
2814 if (ae == StrIntrinsicNode::UL) {
2815 pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
2816 } else {
2817 movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
2818 }
2819 pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
2820 } else {
2821 // calculate index in register to avoid integer overflow (int_cnt2*2)
2822 movl(tmp, int_cnt2);
2823 addptr(tmp, cnt2);
2824 if (ae == StrIntrinsicNode::UL) {
2825 pmovzxbw(vec, Address(str2, tmp, scale2, 0));
2826 } else {
2827 movdqu(vec, Address(str2, tmp, scale2, 0));
2828 }
2829 pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
2830 }
2831     // Need to reload string pointers if we did not match the whole vector
2832 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2833 addptr(cnt2, stride);
2834 jcc(Assembler::negative, SCAN_SUBSTR);
2835 // Fall through if found full substring
2836
2837 } // (int_cnt2 > 8)
2838
2839 bind(RET_FOUND);
2840 // Found result if we matched full small substring.
2841 // Compute substr offset
2842 subptr(result, str1);
2843 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2844 shrl(result, 1); // index
2845 }
2846 bind(EXIT);
2847
2848 } // string_indexofC8
2849
2850 // Small strings are loaded through the stack if they cross a page boundary.
2851 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
2852 Register cnt1, Register cnt2,
2853 int int_cnt2, Register result,
2854 XMMRegister vec, Register tmp,
2855 int ae) {
2856 ShortBranchVerifier sbv(this);
2857 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2858 assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2859
2860 //
2861   // int_cnt2 is the length of a small (< 8 chars) constant substring,
2862   // or -1 for a non-constant substring, in which case its length
2863   // is in the cnt2 register.
2864 //
2865 // Note, inline_string_indexOf() generates checks:
2866 // if (substr.count > string.count) return -1;
2867 // if (substr.count == 0) return 0;
2868 //
2869 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2870 assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
2871 // This method uses the pcmpestri instruction with bound registers
2872 // inputs:
2873 // xmm - substring
2874 // rax - substring length (elements count)
2875 // mem - scanned string
2876 // rdx - string length (elements count)
2877 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2878 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2879 // outputs:
2880 // rcx - matched index in string
2881 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2882 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2883 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2884 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2885
2886 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
2887 RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
2888 FOUND_CANDIDATE;
2889
2890 { //========================================================
2891     // We don't know where these strings are located
2892     // and we can't read beyond them. Load them through the stack.
2893 Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
2894
2895 movptr(tmp, rsp); // save old SP
2896
2897 if (int_cnt2 > 0) { // small (< 8 chars) constant substring
2898 if (int_cnt2 == (1>>scale2)) { // One byte
2899 assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
2900 load_unsigned_byte(result, Address(str2, 0));
2901 movdl(vec, result); // move 32 bits
2902 } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) { // Three bytes
2903 // Not enough header space in 32-bit VM: 12+3 = 15.
2904 movl(result, Address(str2, -1));
2905 shrl(result, 8);
2906 movdl(vec, result); // move 32 bits
2907 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) { // One char
2908 load_unsigned_short(result, Address(str2, 0));
2909 movdl(vec, result); // move 32 bits
2910 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
2911 movdl(vec, Address(str2, 0)); // move 32 bits
2912 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
2913 movq(vec, Address(str2, 0)); // move 64 bits
2914 } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7})
2915 // Array header size is 12 bytes in 32-bit VM
2916 // + 6 bytes for 3 chars == 18 bytes,
2917 // enough space to load vec and shift.
2918 assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
2919 if (ae == StrIntrinsicNode::UL) {
2920 int tail_off = int_cnt2-8;
2921 pmovzxbw(vec, Address(str2, tail_off));
2922 psrldq(vec, -2*tail_off);
2923 }
2924 else {
2925 int tail_off = int_cnt2*(1<<scale2);
2926 movdqu(vec, Address(str2, tail_off-16));
2927 psrldq(vec, 16-tail_off);
2928 }
2929 }
2930 } else { // not constant substring
2931 cmpl(cnt2, stride);
2932 jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
2933
2934       // We can read beyond the string if str+16 does not cross a page boundary
2935       // since heaps are aligned and mapped by pages.
2936 assert(os::vm_page_size() < (int)G, "default page should be small");
2937 movl(result, str2); // We need only low 32 bits
2938 andl(result, ((int)os::vm_page_size()-1));
2939 cmpl(result, ((int)os::vm_page_size()-16));
2940 jccb(Assembler::belowEqual, CHECK_STR);
2941
2942       // Move small strings to the stack to allow loading 16 bytes into vec.
2943 subptr(rsp, 16);
2944 int stk_offset = wordSize-(1<<scale2);
2945 push(cnt2);
2946
2947 bind(COPY_SUBSTR);
2948 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
2949 load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
2950 movb(Address(rsp, cnt2, scale2, stk_offset), result);
2951 } else if (ae == StrIntrinsicNode::UU) {
2952 load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
2953 movw(Address(rsp, cnt2, scale2, stk_offset), result);
2954 }
2955 decrement(cnt2);
2956 jccb(Assembler::notZero, COPY_SUBSTR);
2957
2958 pop(cnt2);
2959 movptr(str2, rsp); // New substring address
2960 } // non constant
2961
2962 bind(CHECK_STR);
2963 cmpl(cnt1, stride);
2964 jccb(Assembler::aboveEqual, BIG_STRINGS);
2965
2966 // Check cross page boundary.
2967 movl(result, str1); // We need only low 32 bits
2968 andl(result, ((int)os::vm_page_size()-1));
2969 cmpl(result, ((int)os::vm_page_size()-16));
2970 jccb(Assembler::belowEqual, BIG_STRINGS);
2971
2972 subptr(rsp, 16);
2973 int stk_offset = -(1<<scale1);
2974 if (int_cnt2 < 0) { // not constant
2975 push(cnt2);
2976 stk_offset += wordSize;
2977 }
2978 movl(cnt2, cnt1);
2979
2980 bind(COPY_STR);
2981 if (ae == StrIntrinsicNode::LL) {
2982 load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
2983 movb(Address(rsp, cnt2, scale1, stk_offset), result);
2984 } else {
2985 load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
2986 movw(Address(rsp, cnt2, scale1, stk_offset), result);
2987 }
2988 decrement(cnt2);
2989 jccb(Assembler::notZero, COPY_STR);
2990
2991 if (int_cnt2 < 0) { // not constant
2992 pop(cnt2);
2993 }
2994 movptr(str1, rsp); // New string address
2995
2996 bind(BIG_STRINGS);
2997 // Load substring.
2998 if (int_cnt2 < 0) { // -1
2999 if (ae == StrIntrinsicNode::UL) {
3000 pmovzxbw(vec, Address(str2, 0));
3001 } else {
3002 movdqu(vec, Address(str2, 0));
3003 }
3004 push(cnt2); // substr count
3005 push(str2); // substr addr
3006 push(str1); // string addr
3007 } else {
3008 // Small (< 8 chars) constant substrings are loaded already.
3009 movl(cnt2, int_cnt2);
3010 }
3011 push(tmp); // original SP
3012
3013 } // Finished loading
3014
3015 //========================================================
3016 // Start search
3017 //
3018
3019 movptr(result, str1); // string addr
3020
3021 if (int_cnt2 < 0) { // Only for non constant substring
3022 jmpb(SCAN_TO_SUBSTR);
3023
3024 // SP saved at sp+0
3025 // String saved at sp+1*wordSize
3026 // Substr saved at sp+2*wordSize
3027 // Substr count saved at sp+3*wordSize
3028
3029     // Reload substr for rescan; this code
3030     // is executed only for large substrings (> 8 chars).
3031 bind(RELOAD_SUBSTR);
3032 movptr(str2, Address(rsp, 2*wordSize));
3033 movl(cnt2, Address(rsp, 3*wordSize));
3034 if (ae == StrIntrinsicNode::UL) {
3035 pmovzxbw(vec, Address(str2, 0));
3036 } else {
3037 movdqu(vec, Address(str2, 0));
3038 }
3039     // We came here after the beginning of the substring was
3040     // matched but the rest of it was not, so we need to search
3041     // again. Start from the next element after the previous match.
3042 subptr(str1, result); // Restore counter
3043 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3044 shrl(str1, 1);
3045 }
3046 addl(cnt1, str1);
3047 decrementl(cnt1); // Shift to next element
3048 cmpl(cnt1, cnt2);
3049     jcc(Assembler::negative, RET_NOT_FOUND);  // Fewer chars left than substring
3050
3051 addptr(result, (1<<scale1));
3052 } // non constant
3053
3054 // Scan string for start of substr in 16-byte vectors
3055 bind(SCAN_TO_SUBSTR);
3056 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
3057 pcmpestri(vec, Address(result, 0), mode);
3058 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1
3059 subl(cnt1, stride);
3060 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
3061 cmpl(cnt1, cnt2);
3062   jccb(Assembler::negative, RET_NOT_FOUND); // Fewer chars left than substring
3063 addptr(result, 16);
3064
3065 bind(ADJUST_STR);
3066 cmpl(cnt1, stride); // Do not read beyond string
3067 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
3068 // Back-up string to avoid reading beyond string.
3069 lea(result, Address(result, cnt1, scale1, -16));
3070 movl(cnt1, stride);
3071 jmpb(SCAN_TO_SUBSTR);
3072
3073 // Found a potential substr
3074 bind(FOUND_CANDIDATE);
3075 // After pcmpestri tmp(rcx) contains matched element index
3076
3077 // Make sure string is still long enough
3078 subl(cnt1, tmp);
3079 cmpl(cnt1, cnt2);
3080 jccb(Assembler::greaterEqual, FOUND_SUBSTR);
3081 // Left less than substring.
3082
3083 bind(RET_NOT_FOUND);
3084 movl(result, -1);
3085 jmp(CLEANUP);
3086
3087 bind(FOUND_SUBSTR);
3088 // Compute start addr of substr
3089 lea(result, Address(result, tmp, scale1));
3090 if (int_cnt2 > 0) { // Constant substring
3091 // Repeat search for small substring (< 8 chars)
3092 // from new point without reloading substring.
3093 // Have to check that we don't read beyond string.
3094 cmpl(tmp, stride-int_cnt2);
3095 jccb(Assembler::greater, ADJUST_STR);
3096 // Fall through if matched whole substring.
3097 } else { // non constant
3098 assert(int_cnt2 == -1, "should be -1");
3099
3100 addl(tmp, cnt2);
3101 // Found result if we matched whole substring.
3102 cmpl(tmp, stride);
3103 jcc(Assembler::lessEqual, RET_FOUND);
3104
3105 // Repeat search for small substring (<= 8 chars)
3106 // from new point 'str1' without reloading substring.
3107 cmpl(cnt2, stride);
3108 // Have to check that we don't read beyond string.
3109 jccb(Assembler::lessEqual, ADJUST_STR);
3110
3111 Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
3112 // Compare the rest of substring (> 8 chars).
3113 movptr(str1, result);
3114
3115 cmpl(tmp, cnt2);
3116 // First 8 chars are already matched.
3117 jccb(Assembler::equal, CHECK_NEXT);
3118
3119 bind(SCAN_SUBSTR);
3120 pcmpestri(vec, Address(str1, 0), mode);
3121 // Need to reload the string pointers if the whole vector did not match
3122 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
3123
3124 bind(CHECK_NEXT);
3125 subl(cnt2, stride);
3126 jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
3127 addptr(str1, 16);
3128 if (ae == StrIntrinsicNode::UL) {
3129 addptr(str2, 8);
3130 } else {
3131 addptr(str2, 16);
3132 }
3133 subl(cnt1, stride);
3134 cmpl(cnt2, stride); // Do not read beyond substring
3135 jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
3136 // Back up the string pointers to avoid reading beyond the substring.
3137
3138 if (ae == StrIntrinsicNode::UL) {
3139 lea(str2, Address(str2, cnt2, scale2, -8));
3140 lea(str1, Address(str1, cnt2, scale1, -16));
3141 } else {
3142 lea(str2, Address(str2, cnt2, scale2, -16));
3143 lea(str1, Address(str1, cnt2, scale1, -16));
3144 }
3145 subl(cnt1, cnt2);
3146 movl(cnt2, stride);
3147 addl(cnt1, stride);
3148 bind(CONT_SCAN_SUBSTR);
3149 if (ae == StrIntrinsicNode::UL) {
3150 pmovzxbw(vec, Address(str2, 0));
3151 } else {
3152 movdqu(vec, Address(str2, 0));
3153 }
3154 jmp(SCAN_SUBSTR);
3155
3156 bind(RET_FOUND_LONG);
3157 movptr(str1, Address(rsp, wordSize));
3158 } // non constant
3159
3160 bind(RET_FOUND);
3161 // Compute substr offset
3162 subptr(result, str1);
3163 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3164 shrl(result, 1); // index
3165 }
3166 bind(CLEANUP);
3167 pop(rsp); // restore SP
3168
3169 } // string_indexof
3170
3171 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3172 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3173 ShortBranchVerifier sbv(this);
3174 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3175
3176 int stride = 8;
3177
3178 Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
3179 SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
3180 RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
3181 FOUND_SEQ_CHAR, DONE_LABEL;
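// Roughly equivalent Java-style pseudocode (illustration only; str1 is a
// UTF-16 char sequence of length cnt1, ch the code unit searched for):
//   for (int i = 0; i < cnt1; i++) {
//     if (str1[i] == ch) return i;
//   }
//   return -1;
// The vector paths below simply test 16 (AVX2) or 8 (SSE4.2) chars per step.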
3182
3183 movptr(result, str1);
3184 if (UseAVX >= 2) {
3185 cmpl(cnt1, stride);
3186 jcc(Assembler::less, SCAN_TO_CHAR);
3187 cmpl(cnt1, 2*stride);
3188 jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
3189 movdl(vec1, ch);
3190 vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
3191 vpxor(vec2, vec2);
3192 movl(tmp, cnt1);
3193 andl(tmp, 0xFFFFFFF0); //vector count (in chars)
3194 andl(cnt1,0x0000000F); //tail count (in chars)
3195
3196 bind(SCAN_TO_16_CHAR_LOOP);
3197 vmovdqu(vec3, Address(result, 0));
3198 vpcmpeqw(vec3, vec3, vec1, 1);
3199 vptest(vec2, vec3);
3200 jcc(Assembler::carryClear, FOUND_CHAR);
3201 addptr(result, 32);
3202 subl(tmp, 2*stride);
3203 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
3204 jmp(SCAN_TO_8_CHAR);
3205 bind(SCAN_TO_8_CHAR_INIT);
3206 movdl(vec1, ch);
3207 pshuflw(vec1, vec1, 0x00);
3208 pshufd(vec1, vec1, 0);
3209 pxor(vec2, vec2);
3210 }
3211 bind(SCAN_TO_8_CHAR);
3212 cmpl(cnt1, stride);
3213 jcc(Assembler::less, SCAN_TO_CHAR);
3214 if (UseAVX < 2) {
3215 movdl(vec1, ch);
3216 pshuflw(vec1, vec1, 0x00);
3217 pshufd(vec1, vec1, 0);
3218 pxor(vec2, vec2);
3219 }
3220 movl(tmp, cnt1);
3221 andl(tmp, 0xFFFFFFF8); //vector count (in chars)
3222 andl(cnt1,0x00000007); //tail count (in chars)
3223
3224 bind(SCAN_TO_8_CHAR_LOOP);
3225 movdqu(vec3, Address(result, 0));
3226 pcmpeqw(vec3, vec1);
3227 ptest(vec2, vec3);
3228 jcc(Assembler::carryClear, FOUND_CHAR);
3229 addptr(result, 16);
3230 subl(tmp, stride);
3231 jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
3232 bind(SCAN_TO_CHAR);
3233 testl(cnt1, cnt1);
3234 jcc(Assembler::zero, RET_NOT_FOUND);
3235 bind(SCAN_TO_CHAR_LOOP);
3236 load_unsigned_short(tmp, Address(result, 0));
3237 cmpl(ch, tmp);
3238 jccb(Assembler::equal, FOUND_SEQ_CHAR);
3239 addptr(result, 2);
3240 subl(cnt1, 1);
3241 jccb(Assembler::zero, RET_NOT_FOUND);
3242 jmp(SCAN_TO_CHAR_LOOP);
3243
3244 bind(RET_NOT_FOUND);
3245 movl(result, -1);
3246 jmpb(DONE_LABEL);
3247
3248 bind(FOUND_CHAR);
3249 if (UseAVX >= 2) {
3250 vpmovmskb(tmp, vec3);
3251 } else {
3252 pmovmskb(tmp, vec3);
3253 }
3254 bsfl(ch, tmp);
3255 addptr(result, ch);
3256
3257 bind(FOUND_SEQ_CHAR);
3258 subptr(result, str1);
3259 shrl(result, 1);
3260
3261 bind(DONE_LABEL);
3262 } // string_indexof_char
3263
3264 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3265 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3266 ShortBranchVerifier sbv(this);
3267 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3268
3269 int stride = 16;
3270
3271 Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
3272 SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
3273 RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
3274 FOUND_SEQ_CHAR, DONE_LABEL;
3275
3276 movptr(result, str1);
3277 if (UseAVX >= 2) {
3278 cmpl(cnt1, stride);
3279 jcc(Assembler::less, SCAN_TO_CHAR_INIT);
3280 cmpl(cnt1, stride*2);
3281 jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
3282 movdl(vec1, ch);
3283 vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
3284 vpxor(vec2, vec2);
3285 movl(tmp, cnt1);
3286 andl(tmp, 0xFFFFFFE0); //vector count (in chars)
3287 andl(cnt1,0x0000001F); //tail count (in chars)
3288
3289 bind(SCAN_TO_32_CHAR_LOOP);
3290 vmovdqu(vec3, Address(result, 0));
3291 vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
3292 vptest(vec2, vec3);
3293 jcc(Assembler::carryClear, FOUND_CHAR);
3294 addptr(result, 32);
3295 subl(tmp, stride*2);
3296 jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
3297 jmp(SCAN_TO_16_CHAR);
3298
3299 bind(SCAN_TO_16_CHAR_INIT);
3300 movdl(vec1, ch);
3301 pxor(vec2, vec2);
3302 pshufb(vec1, vec2);
3303 }
3304
3305 bind(SCAN_TO_16_CHAR);
3306 cmpl(cnt1, stride);
3307 jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left
3308 if (UseAVX < 2) {
3309 movdl(vec1, ch);
3310 pxor(vec2, vec2);
3311 pshufb(vec1, vec2);
3312 }
3313 movl(tmp, cnt1);
3314 andl(tmp, 0xFFFFFFF0); //vector count (in bytes)
3315 andl(cnt1,0x0000000F); //tail count (in bytes)
3316
3317 bind(SCAN_TO_16_CHAR_LOOP);
3318 movdqu(vec3, Address(result, 0));
3319 pcmpeqb(vec3, vec1);
3320 ptest(vec2, vec3);
3321 jcc(Assembler::carryClear, FOUND_CHAR);
3322 addptr(result, 16);
3323 subl(tmp, stride);
3324 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items...
3325
3326 bind(SCAN_TO_CHAR_INIT);
3327 testl(cnt1, cnt1);
3328 jcc(Assembler::zero, RET_NOT_FOUND);
3329 bind(SCAN_TO_CHAR_LOOP);
3330 load_unsigned_byte(tmp, Address(result, 0));
3331 cmpl(ch, tmp);
3332 jccb(Assembler::equal, FOUND_SEQ_CHAR);
3333 addptr(result, 1);
3334 subl(cnt1, 1);
3335 jccb(Assembler::zero, RET_NOT_FOUND);
3336 jmp(SCAN_TO_CHAR_LOOP);
3337
3338 bind(RET_NOT_FOUND);
3339 movl(result, -1);
3340 jmpb(DONE_LABEL);
3341
3342 bind(FOUND_CHAR);
3343 if (UseAVX >= 2) {
3344 vpmovmskb(tmp, vec3);
3345 } else {
3346 pmovmskb(tmp, vec3);
3347 }
3348 bsfl(ch, tmp);
3349 addptr(result, ch);
3350
3351 bind(FOUND_SEQ_CHAR);
3352 subptr(result, str1);
3353
3354 bind(DONE_LABEL);
3355 } // stringL_indexof_char
3356
3357 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
3358 switch (eltype) {
3359 case T_BOOLEAN: return sizeof(jboolean);
3360 case T_BYTE: return sizeof(jbyte);
3361 case T_SHORT: return sizeof(jshort);
3362 case T_CHAR: return sizeof(jchar);
3363 case T_INT: return sizeof(jint);
3364 default:
3365 ShouldNotReachHere();
3366 return -1;
3367 }
3368 }
3369
3370 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
3371 switch (eltype) {
3372 // T_BOOLEAN used as surrogate for unsigned byte
3373 case T_BOOLEAN: movzbl(dst, src); break;
3374 case T_BYTE: movsbl(dst, src); break;
3375 case T_SHORT: movswl(dst, src); break;
3376 case T_CHAR: movzwl(dst, src); break;
3377 case T_INT: movl(dst, src); break;
3378 default:
3379 ShouldNotReachHere();
3380 }
3381 }
3382
3383 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) {
3384 load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
3385 }
3386
3387 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) {
3388 load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
3389 }
3390
3391 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) {
3392 const int vlen = Assembler::AVX_256bit;
3393 switch (eltype) {
3394 case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT); break;
3395 case T_BYTE: vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT); break;
3396 case T_SHORT: vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3397 case T_CHAR: vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3398 case T_INT:
3399 // do nothing
3400 break;
3401 default:
3402 ShouldNotReachHere();
3403 }
3404 }
3405
3406 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result,
3407 Register index, Register tmp2, Register tmp3, XMMRegister vnext,
3408 XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3,
3409 XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3,
3410 XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3,
3411 BasicType eltype) {
3412 ShortBranchVerifier sbv(this);
3413 assert(UseAVX >= 2, "AVX2 intrinsics are required");
3414 assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3);
3415 assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3);
3416
3417 Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN,
3418 SHORT_UNROLLED_LOOP_EXIT,
3419 UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME,
3420 UNROLLED_VECTOR_LOOP_BEGIN,
3421 END;
3422 switch (eltype) {
3423 case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
3424 case T_CHAR: BLOCK_COMMENT("arrays_hashcode(char) {"); break;
3425 case T_BYTE: BLOCK_COMMENT("arrays_hashcode(byte) {"); break;
3426 case T_SHORT: BLOCK_COMMENT("arrays_hashcode(short) {"); break;
3427 case T_INT: BLOCK_COMMENT("arrays_hashcode(int) {"); break;
3428 default: BLOCK_COMMENT("arrays_hashcode {"); break;
3429 }
3430
3431 // Register aliases ("renaming") for readability of the code
3432 const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 },
3433 vresult[] = { vresult0, vresult1, vresult2, vresult3 },
3434 vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 };
3435
3436 const int elsize = arrays_hashcode_elsize(eltype);
3437
3438 /*
3439 if (cnt1 >= 2) {
3440 if (cnt1 >= 32) {
3441 UNROLLED VECTOR LOOP
3442 }
3443 UNROLLED SCALAR LOOP
3444 }
3445 SINGLE SCALAR
3446 */
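// Illustrative math, not generated code: the scalar hash is
//   h = a[0]*31^(n-1) + a[1]*31^(n-2) + ... + a[n-1]
// The vectorized loop keeps 32 interleaved partial sums h_j (j = 0..31): each
// iteration computes h_j = h_j*31^32 + a[index+j] (vnext broadcasts what is
// assumed to be 31^32, the first entry of the powers-of-31 table), and the
// epilogue multiplies lane j by 31^(31-j) from the same table before reducing
// the lanes into 'result', which is itself scaled by 31^32 per iteration.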
3447
3448 cmpl(cnt1, 32);
3449 jcc(Assembler::less, SHORT_UNROLLED_BEGIN);
3450
3451 // cnt1 >= 32 && generate_vectorized_loop
3452 xorl(index, index);
3453
3454 // vresult = IntVector.zero(I256);
3455 for (int idx = 0; idx < 4; idx++) {
3456 vpxor(vresult[idx], vresult[idx]);
3457 }
3458 // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]);
3459 Register bound = tmp2;
3460 Register next = tmp3;
3461 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint))));
3462 movl(next, Address(tmp2, 0));
3463 movdl(vnext, next);
3464 vpbroadcastd(vnext, vnext, Assembler::AVX_256bit);
3465
3466 // index = 0;
3467 // bound = cnt1 & ~(32 - 1);
3468 movl(bound, cnt1);
3469 andl(bound, ~(32 - 1));
3470 // for (; index < bound; index += 32) {
3471 bind(UNROLLED_VECTOR_LOOP_BEGIN);
3472 // result *= next;
3473 imull(result, next);
3474 // Loop fission to front-load the cost of fetching from memory; OOO execution
3475 // can then hopefully do a better job of prefetching
3476 for (int idx = 0; idx < 4; idx++) {
3477 arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype);
3478 }
3479 // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7];
3480 for (int idx = 0; idx < 4; idx++) {
3481 vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit);
3482 arrays_hashcode_elvcast(vtmp[idx], eltype);
3483 vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit);
3484 }
3485 // index += 32;
3486 addl(index, 32);
3487 // index < bound;
3488 cmpl(index, bound);
3489 jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN);
3490 // }
3491
3492 lea(ary1, Address(ary1, bound, Address::times(elsize)));
3493 subl(cnt1, bound);
3494 // release bound
3495
3496 // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1);
3497 for (int idx = 0; idx < 4; idx++) {
3498 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint))));
3499 arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT);
3500 vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit);
3501 }
3502 // result += vresult.reduceLanes(ADD);
3503 for (int idx = 0; idx < 4; idx++) {
3504 reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]);
3505 }
3506
3507 // } else if (cnt1 < 32) {
3508
3509 bind(SHORT_UNROLLED_BEGIN);
3510 // int i = 1;
3511 movl(index, 1);
3512 cmpl(index, cnt1);
3513 jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT);
3514
3515 // for (; i < cnt1 ; i += 2) {
3516 bind(SHORT_UNROLLED_LOOP_BEGIN);
3517 movl(tmp3, 961); // 961 = 31*31: two elements are folded per iteration
3518 imull(result, tmp3);
3519 arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3520 movl(tmp3, tmp2);
3521 shll(tmp3, 5);
3522 subl(tmp3, tmp2);
3523 addl(result, tmp3);
3524 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype);
3525 addl(result, tmp3);
3526 addl(index, 2);
3527 cmpl(index, cnt1);
3528 jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN);
3529
3530 // }
3531 // if (i >= cnt1) {
3532 bind(SHORT_UNROLLED_LOOP_EXIT);
3533 jccb(Assembler::greater, END);
3534 movl(tmp2, result);
3535 shll(result, 5);
3536 subl(result, tmp2);
3537 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3538 addl(result, tmp3);
3539 // }
3540 bind(END);
3541
3542 BLOCK_COMMENT("} // arrays_hashcode");
3543
3544 } // arrays_hashcode
3545
3546 // helper function for string_compare
3547 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3548 Address::ScaleFactor scale, Address::ScaleFactor scale1,
3549 Address::ScaleFactor scale2, Register index, int ae) {
3550 if (ae == StrIntrinsicNode::LL) {
3551 load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3552 load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3553 } else if (ae == StrIntrinsicNode::UU) {
3554 load_unsigned_short(elem1, Address(str1, index, scale, 0));
3555 load_unsigned_short(elem2, Address(str2, index, scale, 0));
3556 } else {
3557 load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3558 load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3559 }
3560 }
3561
3562 // Compare strings, used for char[] and byte[].
3563 void C2_MacroAssembler::string_compare(Register str1, Register str2,
3564 Register cnt1, Register cnt2, Register result,
3565 XMMRegister vec1, int ae, KRegister mask) {
3566 ShortBranchVerifier sbv(this);
3567 Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
3568 Label COMPARE_WIDE_VECTORS_LOOP_FAILED; // used only AVX3
3569 int stride, stride2, adr_stride, adr_stride1, adr_stride2;
3570 int stride2x2 = 0x40;
3571 Address::ScaleFactor scale = Address::no_scale;
3572 Address::ScaleFactor scale1 = Address::no_scale;
3573 Address::ScaleFactor scale2 = Address::no_scale;
3574
3575 if (ae != StrIntrinsicNode::LL) {
3576 stride2x2 = 0x20;
3577 }
3578
3579 if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
3580 shrl(cnt2, 1);
3581 }
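// Roughly equivalent Java-style pseudocode for the whole routine
// (illustration only, element types depending on 'ae'):
//   int min = Math.min(cnt1, cnt2);
//   for (int i = 0; i < min; i++) {
//     if (str1[i] != str2[i]) return str1[i] - str2[i];
//   }
//   return cnt1 - cnt2;
// (For UL the final result is additionally negated at DONE_LABEL.)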
3582 // Compute the minimum of the string lengths and the
3583 // difference of the string lengths (stack).
3584 // (the difference is pushed on the stack; a conditional move then yields the minimum)
3585 movl(result, cnt1);
3586 subl(cnt1, cnt2);
3587 push(cnt1);
3588 cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2)
3589
3590 // Is the minimum length zero?
3591 testl(cnt2, cnt2);
3592 jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3593 if (ae == StrIntrinsicNode::LL) {
3594 // Load first bytes
3595 load_unsigned_byte(result, Address(str1, 0)); // result = str1[0]
3596 load_unsigned_byte(cnt1, Address(str2, 0)); // cnt1 = str2[0]
3597 } else if (ae == StrIntrinsicNode::UU) {
3598 // Load first characters
3599 load_unsigned_short(result, Address(str1, 0));
3600 load_unsigned_short(cnt1, Address(str2, 0));
3601 } else {
3602 load_unsigned_byte(result, Address(str1, 0));
3603 load_unsigned_short(cnt1, Address(str2, 0));
3604 }
3605 subl(result, cnt1);
3606 jcc(Assembler::notZero, POP_LABEL);
3607
3608 if (ae == StrIntrinsicNode::UU) {
3609 // Divide length by 2 to get number of chars
3610 shrl(cnt2, 1);
3611 }
3612 cmpl(cnt2, 1);
3613 jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3614
3615 // Check if the strings start at the same location and setup scale and stride
3616 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3617 cmpptr(str1, str2);
3618 jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3619 if (ae == StrIntrinsicNode::LL) {
3620 scale = Address::times_1;
3621 stride = 16;
3622 } else {
3623 scale = Address::times_2;
3624 stride = 8;
3625 }
3626 } else {
3627 scale1 = Address::times_1;
3628 scale2 = Address::times_2;
3629 // scale not used
3630 stride = 8;
3631 }
3632
3633 if (UseAVX >= 2 && UseSSE42Intrinsics) {
3634 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
3635 Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
3636 Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
3637 Label COMPARE_TAIL_LONG;
3638 Label COMPARE_WIDE_VECTORS_LOOP_AVX3; // used only AVX3
3639
3640 int pcmpmask = 0x19;
3641 if (ae == StrIntrinsicNode::LL) {
3642 pcmpmask &= ~0x01;
3643 }
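// For reference (SSE4.2 pcmpestri imm8 encoding): 0x19 = 0b011001 selects
// unsigned words (bits 1:0 = 01), the "equal each" string compare
// (bits 3:2 = 10) and negative polarity (bits 5:4 = 01); clearing bit 0
// for LL switches the element size to unsigned bytes.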
3644
3645 // Set up to compare 16-char (32-byte) vectors,
3646 // starting from the first character again because it has an aligned address.
3647 if (ae == StrIntrinsicNode::LL) {
3648 stride2 = 32;
3649 } else {
3650 stride2 = 16;
3651 }
3652 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3653 adr_stride = stride << scale;
3654 } else {
3655 adr_stride1 = 8; //stride << scale1;
3656 adr_stride2 = 16; //stride << scale2;
3657 }
3658
3659 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3660 // rax and rdx are used by pcmpestri as element counters
3661 movl(result, cnt2);
3662 andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count
3663 jcc(Assembler::zero, COMPARE_TAIL_LONG);
3664
3665 // Fast path: compare the first two 8-char vectors.
3666 bind(COMPARE_16_CHARS);
3667 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3668 movdqu(vec1, Address(str1, 0));
3669 } else {
3670 pmovzxbw(vec1, Address(str1, 0));
3671 }
3672 pcmpestri(vec1, Address(str2, 0), pcmpmask);
3673 jccb(Assembler::below, COMPARE_INDEX_CHAR);
3674
3675 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3676 movdqu(vec1, Address(str1, adr_stride));
3677 pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
3678 } else {
3679 pmovzxbw(vec1, Address(str1, adr_stride1));
3680 pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
3681 }
3682 jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
3683 addl(cnt1, stride);
3684
3685 // Compare the characters at index in cnt1
3686 bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
3687 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3688 subl(result, cnt2);
3689 jmp(POP_LABEL);
3690
3691 // Setup the registers to start vector comparison loop
3692 bind(COMPARE_WIDE_VECTORS);
3693 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3694 lea(str1, Address(str1, result, scale));
3695 lea(str2, Address(str2, result, scale));
3696 } else {
3697 lea(str1, Address(str1, result, scale1));
3698 lea(str2, Address(str2, result, scale2));
3699 }
3700 subl(result, stride2);
3701 subl(cnt2, stride2);
3702 jcc(Assembler::zero, COMPARE_WIDE_TAIL);
3703 negptr(result);
3704
3705 // In a loop, compare 16 chars (32 bytes) at once using vpxor+vptest
3706 bind(COMPARE_WIDE_VECTORS_LOOP);
3707
3708 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3709 cmpl(cnt2, stride2x2);
3710 jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3711 testl(cnt2, stride2x2-1); // cnt2 holds the vector count
3712 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2); // means we cannot subtract by 0x40
3713
3714 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3715 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3716 evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
3717 evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3718 } else {
3719 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
3720 evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3721 }
3722 kortestql(mask, mask);
3723 jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED); // miscompare
3724 addptr(result, stride2x2); // update since we already compared at this addr
3725 subl(cnt2, stride2x2); // and sub the size too
3726 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3727
3728 vpxor(vec1, vec1);
3729 jmpb(COMPARE_WIDE_TAIL);
3730 }//if (VM_Version::supports_avx512vlbw())
3731
3732 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3733 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3734 vmovdqu(vec1, Address(str1, result, scale));
3735 vpxor(vec1, Address(str2, result, scale));
3736 } else {
3737 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
3738 vpxor(vec1, Address(str2, result, scale2));
3739 }
3740 vptest(vec1, vec1);
3741 jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
3742 addptr(result, stride2);
3743 subl(cnt2, stride2);
3744 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
3745 // clean upper bits of YMM registers
3746 vpxor(vec1, vec1);
3747
3748 // compare wide vectors tail
3749 bind(COMPARE_WIDE_TAIL);
3750 testptr(result, result);
3751 jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3752
3753 movl(result, stride2);
3754 movl(cnt2, result);
3755 negptr(result);
3756 jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3757
3758 // Identify the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
3759 bind(VECTOR_NOT_EQUAL);
3760 // clean upper bits of YMM registers
3761 vpxor(vec1, vec1);
3762 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3763 lea(str1, Address(str1, result, scale));
3764 lea(str2, Address(str2, result, scale));
3765 } else {
3766 lea(str1, Address(str1, result, scale1));
3767 lea(str2, Address(str2, result, scale2));
3768 }
3769 jmp(COMPARE_16_CHARS);
3770
3771 // Compare tail chars, length between 1 and 15 chars
3772 bind(COMPARE_TAIL_LONG);
3773 movl(cnt2, result);
3774 cmpl(cnt2, stride);
3775 jcc(Assembler::less, COMPARE_SMALL_STR);
3776
3777 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3778 movdqu(vec1, Address(str1, 0));
3779 } else {
3780 pmovzxbw(vec1, Address(str1, 0));
3781 }
3782 pcmpestri(vec1, Address(str2, 0), pcmpmask);
3783 jcc(Assembler::below, COMPARE_INDEX_CHAR);
3784 subptr(cnt2, stride);
3785 jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3786 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3787 lea(str1, Address(str1, result, scale));
3788 lea(str2, Address(str2, result, scale));
3789 } else {
3790 lea(str1, Address(str1, result, scale1));
3791 lea(str2, Address(str2, result, scale2));
3792 }
3793 negptr(cnt2);
3794 jmpb(WHILE_HEAD_LABEL);
3795
3796 bind(COMPARE_SMALL_STR);
3797 } else if (UseSSE42Intrinsics) {
3798 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
3799 int pcmpmask = 0x19;
3800 // Set up to compare 8-char (16-byte) vectors,
3801 // starting from the first character again because it has an aligned address.
3802 movl(result, cnt2);
3803 andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count
3804 if (ae == StrIntrinsicNode::LL) {
3805 pcmpmask &= ~0x01;
3806 }
3807 jcc(Assembler::zero, COMPARE_TAIL);
3808 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3809 lea(str1, Address(str1, result, scale));
3810 lea(str2, Address(str2, result, scale));
3811 } else {
3812 lea(str1, Address(str1, result, scale1));
3813 lea(str2, Address(str2, result, scale2));
3814 }
3815 negptr(result);
3816
3817 // pcmpestri
3818 // inputs:
3819 // vec1- substring
3820 // rax - negative string length (elements count)
3821 // mem - scanned string
3822 // rdx - string length (elements count)
3823 // pcmpmask - cmp mode: 11000 (string compare with negated result)
3824 // + 00 (unsigned bytes) or + 01 (unsigned shorts)
3825 // outputs:
3826 // rcx - first mismatched element index
3827 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3828
3829 bind(COMPARE_WIDE_VECTORS);
3830 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3831 movdqu(vec1, Address(str1, result, scale));
3832 pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3833 } else {
3834 pmovzxbw(vec1, Address(str1, result, scale1));
3835 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3836 }
3837 // After pcmpestri cnt1(rcx) contains mismatched element index
3838
3839 jccb(Assembler::below, VECTOR_NOT_EQUAL); // CF==1
3840 addptr(result, stride);
3841 subptr(cnt2, stride);
3842 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3843
3844 // compare wide vectors tail
3845 testptr(result, result);
3846 jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3847
3848 movl(cnt2, stride);
3849 movl(result, stride);
3850 negptr(result);
3851 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3852 movdqu(vec1, Address(str1, result, scale));
3853 pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3854 } else {
3855 pmovzxbw(vec1, Address(str1, result, scale1));
3856 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3857 }
3858 jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
3859
3860 // Mismatched characters in the vectors
3861 bind(VECTOR_NOT_EQUAL);
3862 addptr(cnt1, result);
3863 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3864 subl(result, cnt2);
3865 jmpb(POP_LABEL);
3866
3867 bind(COMPARE_TAIL); // limit is zero
3868 movl(cnt2, result);
3869 // Fallthru to tail compare
3870 }
3871 // Shift str2 and str1 to the end of the arrays, negate min
3872 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3873 lea(str1, Address(str1, cnt2, scale));
3874 lea(str2, Address(str2, cnt2, scale));
3875 } else {
3876 lea(str1, Address(str1, cnt2, scale1));
3877 lea(str2, Address(str2, cnt2, scale2));
3878 }
3879 decrementl(cnt2); // first character was compared already
3880 negptr(cnt2);
3881
3882 // Compare the rest of the elements
3883 bind(WHILE_HEAD_LABEL);
3884 load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
3885 subl(result, cnt1);
3886 jccb(Assembler::notZero, POP_LABEL);
3887 increment(cnt2);
3888 jccb(Assembler::notZero, WHILE_HEAD_LABEL);
3889
3890 // Strings are equal up to min length. Return the length difference.
3891 bind(LENGTH_DIFF_LABEL);
3892 pop(result);
3893 if (ae == StrIntrinsicNode::UU) {
3894 // Divide diff by 2 to get number of chars
3895 sarl(result, 1);
3896 }
3897 jmpb(DONE_LABEL);
3898
3899 if (VM_Version::supports_avx512vlbw()) {
3900
3901 bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
3902
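// mask has a 1 bit for every byte lane that compared equal, so ~mask marks the
// mismatches and bsf yields the byte offset of the first one (halved below to
// get a char offset for 2-byte elements).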
3903 kmovql(cnt1, mask);
3904 notq(cnt1);
3905 bsfq(cnt2, cnt1);
3906 if (ae != StrIntrinsicNode::LL) {
3907 // Divide diff by 2 to get number of chars
3908 sarl(cnt2, 1);
3909 }
3910 addq(result, cnt2);
3911 if (ae == StrIntrinsicNode::LL) {
3912 load_unsigned_byte(cnt1, Address(str2, result));
3913 load_unsigned_byte(result, Address(str1, result));
3914 } else if (ae == StrIntrinsicNode::UU) {
3915 load_unsigned_short(cnt1, Address(str2, result, scale));
3916 load_unsigned_short(result, Address(str1, result, scale));
3917 } else {
3918 load_unsigned_short(cnt1, Address(str2, result, scale2));
3919 load_unsigned_byte(result, Address(str1, result, scale1));
3920 }
3921 subl(result, cnt1);
3922 jmpb(POP_LABEL);
3923 }//if (VM_Version::supports_avx512vlbw())
3924
3925 // Discard the stored length difference
3926 bind(POP_LABEL);
3927 pop(cnt1);
3928
3929 // That's it
3930 bind(DONE_LABEL);
3931 if (ae == StrIntrinsicNode::UL) {
3932 negl(result);
3933 }
3934
3935 }
3936
3937 // Search for Non-ASCII character (Negative byte value) in a byte array,
3938 // return the index of the first such character, otherwise the length
3939 // of the array segment searched.
3940 // ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
3941 // @IntrinsicCandidate
3942 // public static int countPositives(byte[] ba, int off, int len) {
3943 // for (int i = off; i < off + len; i++) {
3944 // if (ba[i] < 0) {
3945 // return i - off;
3946 // }
3947 // }
3948 // return len;
3949 // }
3950 void C2_MacroAssembler::count_positives(Register ary1, Register len,
3951 Register result, Register tmp1,
3952 XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
3953 // rsi: byte array
3954 // rcx: len
3955 // rax: result
3956 ShortBranchVerifier sbv(this);
3957 assert_different_registers(ary1, len, result, tmp1);
3958 assert_different_registers(vec1, vec2);
3959 Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
3960
3961 movl(result, len); // copy
3962 // len == 0
3963 testl(len, len);
3964 jcc(Assembler::zero, DONE);
3965
3966 if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
3967 VM_Version::supports_avx512vlbw() &&
3968 VM_Version::supports_bmi2()) {
3969
3970 Label test_64_loop, test_tail, BREAK_LOOP;
3971 movl(tmp1, len);
3972 vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
3973
3974 andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F
3975 andl(len, 0xffffffc0); // vector count (in chars)
3976 jccb(Assembler::zero, test_tail);
3977
3978 lea(ary1, Address(ary1, len, Address::times_1));
3979 negptr(len);
3980
3981 bind(test_64_loop);
3982 // Check whether our 64 elements of size byte contain negatives
3983 evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
3984 kortestql(mask1, mask1);
3985 jcc(Assembler::notZero, BREAK_LOOP);
3986
3987 addptr(len, 64);
3988 jccb(Assembler::notZero, test_64_loop);
3989
3990 bind(test_tail);
3991 // bail out when there is nothing to be done
3992 testl(tmp1, -1);
3993 jcc(Assembler::zero, DONE);
3994
3995
3996 // check the tail for absence of negatives
3997 // ~(~0 << len) keeps only the low 'len' bits (the tail bytes)
3998 {
3999 Register tmp3_aliased = len;
4000 mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
4001 shlxq(tmp3_aliased, tmp3_aliased, tmp1);
4002 notq(tmp3_aliased);
4003 kmovql(mask2, tmp3_aliased);
4004 }
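// Worked example (illustrative): for a tail of tmp1 == 3 bytes the sequence
// above yields ~(~0 << 3) == 0b111, so mask2 enables only the first three
// byte lanes and the masked compare below ignores the remaining positions.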
4005
4006 evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
4007 ktestq(mask1, mask2);
4008 jcc(Assembler::zero, DONE);
4009
4010 // do a full check for negative bytes in the tail
4011 movl(len, tmp1); // tmp1 holds the low 6 bits of the original len;
4012 // ary1 already pointing to the right place
4013 jmpb(TAIL_START);
4014
4015 bind(BREAK_LOOP);
4016 // At least one byte in the last 64-byte block was negative.
4017 // Set up to look at the last 64 bytes as if they were a tail
4018 lea(ary1, Address(ary1, len, Address::times_1));
4019 addptr(result, len);
4020 // Ignore the very last byte: if all others are positive,
4021 // it must be negative, so we can skip right to the 2+1 byte
4022 // end comparison at this point
4023 orl(result, 63);
4024 movl(len, 63);
4025 // Fallthru to tail compare
4026 } else {
4027
4028 if (UseAVX >= 2) {
4029 // With AVX2, use 32-byte vector compare
4030 Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4031
4032 // Compare 32-byte vectors
4033 testl(len, 0xffffffe0); // vector count (in bytes)
4034 jccb(Assembler::zero, TAIL_START);
4035
4036 andl(len, 0xffffffe0);
4037 lea(ary1, Address(ary1, len, Address::times_1));
4038 negptr(len);
4039
4040 movl(tmp1, 0x80808080); // create mask to test for Unicode chars in vector
4041 movdl(vec2, tmp1);
4042 vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
4043
4044 bind(COMPARE_WIDE_VECTORS);
4045 vmovdqu(vec1, Address(ary1, len, Address::times_1));
4046 vptest(vec1, vec2);
4047 jccb(Assembler::notZero, BREAK_LOOP);
4048 addptr(len, 32);
4049 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4050
4051 testl(result, 0x0000001f); // any bytes remaining?
4052 jcc(Assembler::zero, DONE);
4053
4054 // Quick test using the already prepared vector mask
4055 movl(len, result);
4056 andl(len, 0x0000001f);
4057 vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
4058 vptest(vec1, vec2);
4059 jcc(Assembler::zero, DONE);
4060 // There are zeros, jump to the tail to determine exactly where
4061 jmpb(TAIL_START);
4062
4063 bind(BREAK_LOOP);
4064 // At least one byte in the last 32-byte vector is negative.
4065 // Set up to look at the last 32 bytes as if they were a tail
4066 lea(ary1, Address(ary1, len, Address::times_1));
4067 addptr(result, len);
4068 // Ignore the very last byte: if all others are positive,
4069 // it must be negative, so we can skip right to the 2+1 byte
4070 // end comparison at this point
4071 orl(result, 31);
4072 movl(len, 31);
4073 // Fallthru to tail compare
4074 } else if (UseSSE42Intrinsics) {
4075 // With SSE4.2, use double quad vector compare
4076 Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4077
4078 // Compare 16-byte vectors
4079 testl(len, 0xfffffff0); // vector count (in bytes)
4080 jcc(Assembler::zero, TAIL_START);
4081
4082 andl(len, 0xfffffff0);
4083 lea(ary1, Address(ary1, len, Address::times_1));
4084 negptr(len);
4085
4086 movl(tmp1, 0x80808080);
4087 movdl(vec2, tmp1);
4088 pshufd(vec2, vec2, 0);
4089
4090 bind(COMPARE_WIDE_VECTORS);
4091 movdqu(vec1, Address(ary1, len, Address::times_1));
4092 ptest(vec1, vec2);
4093 jccb(Assembler::notZero, BREAK_LOOP);
4094 addptr(len, 16);
4095 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4096
4097 testl(result, 0x0000000f); // len is zero, any bytes remaining?
4098 jcc(Assembler::zero, DONE);
4099
4100 // Quick test using the already prepared vector mask
4101 movl(len, result);
4102 andl(len, 0x0000000f); // tail count (in bytes)
4103 movdqu(vec1, Address(ary1, len, Address::times_1, -16));
4104 ptest(vec1, vec2);
4105 jcc(Assembler::zero, DONE);
4106 jmpb(TAIL_START);
4107
4108 bind(BREAK_LOOP);
4109 // At least one byte in the last 16-byte vector is negative.
4110 // Set up and look at the last 16 bytes as if they were a tail
4111 lea(ary1, Address(ary1, len, Address::times_1));
4112 addptr(result, len);
4113 // Ignore the very last byte: if all others are positive,
4114 // it must be negative, so we can skip right to the 2+1 byte
4115 // end comparison at this point
4116 orl(result, 15);
4117 movl(len, 15);
4118 // Fallthru to tail compare
4119 }
4120 }
4121
4122 bind(TAIL_START);
4123 // Compare 4-byte vectors
4124 andl(len, 0xfffffffc); // vector count (in bytes)
4125 jccb(Assembler::zero, COMPARE_CHAR);
4126
4127 lea(ary1, Address(ary1, len, Address::times_1));
4128 negptr(len);
4129
4130 bind(COMPARE_VECTORS);
4131 movl(tmp1, Address(ary1, len, Address::times_1));
4132 andl(tmp1, 0x80808080);
4133 jccb(Assembler::notZero, TAIL_ADJUST);
4134 addptr(len, 4);
4135 jccb(Assembler::notZero, COMPARE_VECTORS);
4136
4137 // Compare trailing char (final 2-3 bytes), if any
4138 bind(COMPARE_CHAR);
4139
4140 testl(result, 0x2); // tail char
4141 jccb(Assembler::zero, COMPARE_BYTE);
4142 load_unsigned_short(tmp1, Address(ary1, 0));
4143 andl(tmp1, 0x00008080);
4144 jccb(Assembler::notZero, CHAR_ADJUST);
4145 lea(ary1, Address(ary1, 2));
4146
4147 bind(COMPARE_BYTE);
4148 testl(result, 0x1); // tail byte
4149 jccb(Assembler::zero, DONE);
4150 load_unsigned_byte(tmp1, Address(ary1, 0));
4151 testl(tmp1, 0x00000080);
4152 jccb(Assembler::zero, DONE);
4153 subptr(result, 1);
4154 jmpb(DONE);
4155
4156 bind(TAIL_ADJUST);
4157 // There is a negative byte in the last 4-byte block.
4158 // Adjust result and check the next three bytes
4159 addptr(result, len);
4160 orl(result, 3);
4161 lea(ary1, Address(ary1, len, Address::times_1));
4162 jmpb(COMPARE_CHAR);
4163
4164 bind(CHAR_ADJUST);
4165 // We are looking at a char + optional byte tail, and found that one
4166 // of the bytes in the char is negative. Adjust the result, check the
4167 // first byte and readjust if needed.
4168 andl(result, 0xfffffffc);
4169 testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
4170 jccb(Assembler::notZero, DONE);
4171 addptr(result, 1);
4172
4173 // That's it
4174 bind(DONE);
4175 if (UseAVX >= 2) {
4176 // clean upper bits of YMM registers
4177 vpxor(vec1, vec1);
4178 vpxor(vec2, vec2);
4179 }
4180 }
4181
4182 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
4183 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
4184 Register limit, Register result, Register chr,
4185 XMMRegister vec1, XMMRegister vec2, bool is_char,
4186 KRegister mask, bool expand_ary2) {
4187 // for expand_ary2, limit is the (smaller) size of the second array.
4188 ShortBranchVerifier sbv(this);
4189 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
4190
4191 assert((!expand_ary2) || ((expand_ary2) && (UseAVX == 2)),
4192 "Expansion only implemented for AVX2");
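// When expand_ary2 is set, ary1 is a char sequence and ary2 a byte sequence
// that is zero-extended to chars before comparing; roughly (illustration only):
//   for (int i = 0; i < limit; i++) {
//     if (ary1_as_char[i] != (ary2_as_byte[i] & 0xff)) return false;
//   }
//   return true;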
4193
4194 int length_offset = arrayOopDesc::length_offset_in_bytes();
4195 int base_offset = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
4196
4197 Address::ScaleFactor scaleFactor = expand_ary2 ? Address::times_2 : Address::times_1;
4198 int scaleIncr = expand_ary2 ? 8 : 16;
4199
4200 if (is_array_equ) {
4201 // Check the input args
4202 cmpoop(ary1, ary2);
4203 jcc(Assembler::equal, TRUE_LABEL);
4204
4205 // Need additional checks for arrays_equals.
4206 testptr(ary1, ary1);
4207 jcc(Assembler::zero, FALSE_LABEL);
4208 testptr(ary2, ary2);
4209 jcc(Assembler::zero, FALSE_LABEL);
4210
4211 // Check the lengths
4212 movl(limit, Address(ary1, length_offset));
4213 cmpl(limit, Address(ary2, length_offset));
4214 jcc(Assembler::notEqual, FALSE_LABEL);
4215 }
4216
4217 // count == 0
4218 testl(limit, limit);
4219 jcc(Assembler::zero, TRUE_LABEL);
4220
4221 if (is_array_equ) {
4222 // Load array address
4223 lea(ary1, Address(ary1, base_offset));
4224 lea(ary2, Address(ary2, base_offset));
4225 }
4226
4227 if (is_array_equ && is_char) {
4228 // arrays_equals when used for char[].
4229 shll(limit, 1); // byte count != 0
4230 }
4231 movl(result, limit); // copy
4232
4233 if (UseAVX >= 2) {
4234 // With AVX2, use 32-byte vector compare
4235 Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16;
4236
4237 // Compare 32-byte vectors
4238 if (expand_ary2) {
4239 andl(result, 0x0000000f); // tail count (in bytes)
4240 andl(limit, 0xfffffff0); // vector count (in bytes)
4241 jcc(Assembler::zero, COMPARE_TAIL);
4242 } else {
4243 andl(result, 0x0000001f); // tail count (in bytes)
4244 andl(limit, 0xffffffe0); // vector count (in bytes)
4245 jcc(Assembler::zero, COMPARE_TAIL_16);
4246 }
4247
4248 lea(ary1, Address(ary1, limit, scaleFactor));
4249 lea(ary2, Address(ary2, limit, Address::times_1));
4250 negptr(limit);
4251
4252 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
4253 Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
4254
4255 cmpl(limit, -64);
4256 jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
4257
4258 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
4259
4260 evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
4261 evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
4262 kortestql(mask, mask);
4263 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare
4264 addptr(limit, 64); // update since we already compared at this addr
4265 cmpl(limit, -64);
4266 jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
4267
4268 // At this point we may still need to compare -limit+result bytes.
4269 // We could execute the next two instructions and just continue via the non-wide path:
4270 // cmpl(limit, 0);
4271 // jcc(Assembler::equal, COMPARE_TAIL); // true
4272 // But since we stopped at the points ary{1,2}+limit which are
4273 // not farther than 64 bytes from the ends of arrays ary{1,2}+result
4274 // (|limit| <= 32 and result < 32),
4275 // we may just compare the last 64 bytes.
4276 //
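// E.g. (illustrative, plain byte arrays): for two 70-byte arrays the loop
// above verifies bytes [0, 64) and exits with limit == 0 and result == 6;
// result then becomes -58 and the 64-byte compare below re-checks bytes
// [6, 70), which covers the six outstanding tail bytes.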
4277 addptr(result, -64); // this is safe because we just came from this area
4278 evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
4279 evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
4280 kortestql(mask, mask);
4281 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare
4282
4283 jmp(TRUE_LABEL);
4284
4285 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
4286
4287 }//if (VM_Version::supports_avx512vlbw())
4288
4289 bind(COMPARE_WIDE_VECTORS);
4290 vmovdqu(vec1, Address(ary1, limit, scaleFactor));
4291 if (expand_ary2) {
4292 vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit);
4293 } else {
4294 vmovdqu(vec2, Address(ary2, limit, Address::times_1));
4295 }
4296 vpxor(vec1, vec2);
4297
4298 vptest(vec1, vec1);
4299 jcc(Assembler::notZero, FALSE_LABEL);
4300 addptr(limit, scaleIncr * 2);
4301 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4302
4303 testl(result, result);
4304 jcc(Assembler::zero, TRUE_LABEL);
4305
4306 vmovdqu(vec1, Address(ary1, result, scaleFactor, -32));
4307 if (expand_ary2) {
4308 vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit);
4309 } else {
4310 vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
4311 }
4312 vpxor(vec1, vec2);
4313
4314 vptest(vec1, vec1);
4315 jcc(Assembler::notZero, FALSE_LABEL);
4316 jmp(TRUE_LABEL);
4317
4318 bind(COMPARE_TAIL_16); // limit is zero
4319 movl(limit, result);
4320
4321 // Compare 16-byte chunks
4322 andl(result, 0x0000000f); // tail count (in bytes)
4323 andl(limit, 0xfffffff0); // vector count (in bytes)
4324 jcc(Assembler::zero, COMPARE_TAIL);
4325
4326 lea(ary1, Address(ary1, limit, scaleFactor));
4327 lea(ary2, Address(ary2, limit, Address::times_1));
4328 negptr(limit);
4329
4330 bind(COMPARE_WIDE_VECTORS_16);
4331 movdqu(vec1, Address(ary1, limit, scaleFactor));
4332 if (expand_ary2) {
4333 vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit);
4334 } else {
4335 movdqu(vec2, Address(ary2, limit, Address::times_1));
4336 }
4337 pxor(vec1, vec2);
4338
4339 ptest(vec1, vec1);
4340 jcc(Assembler::notZero, FALSE_LABEL);
4341 addptr(limit, scaleIncr);
4342 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16);
4343
4344 bind(COMPARE_TAIL); // limit is zero
4345 movl(limit, result);
4346 // Fallthru to tail compare
4347 } else if (UseSSE42Intrinsics) {
4348 // With SSE4.2, use double quad vector compare
4349 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4350
4351 // Compare 16-byte vectors
4352 andl(result, 0x0000000f); // tail count (in bytes)
4353 andl(limit, 0xfffffff0); // vector count (in bytes)
4354 jcc(Assembler::zero, COMPARE_TAIL);
4355
4356 lea(ary1, Address(ary1, limit, Address::times_1));
4357 lea(ary2, Address(ary2, limit, Address::times_1));
4358 negptr(limit);
4359
4360 bind(COMPARE_WIDE_VECTORS);
4361 movdqu(vec1, Address(ary1, limit, Address::times_1));
4362 movdqu(vec2, Address(ary2, limit, Address::times_1));
4363 pxor(vec1, vec2);
4364
4365 ptest(vec1, vec1);
4366 jcc(Assembler::notZero, FALSE_LABEL);
4367 addptr(limit, 16);
4368 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4369
4370 testl(result, result);
4371 jcc(Assembler::zero, TRUE_LABEL);
4372
4373 movdqu(vec1, Address(ary1, result, Address::times_1, -16));
4374 movdqu(vec2, Address(ary2, result, Address::times_1, -16));
4375 pxor(vec1, vec2);
4376
4377 ptest(vec1, vec1);
4378 jccb(Assembler::notZero, FALSE_LABEL);
4379 jmpb(TRUE_LABEL);
4380
4381 bind(COMPARE_TAIL); // limit is zero
4382 movl(limit, result);
4383 // Fallthru to tail compare
4384 }
4385
4386 // Compare 4-byte vectors
4387 if (expand_ary2) {
4388 testl(result, result);
4389 jccb(Assembler::zero, TRUE_LABEL);
4390 } else {
4391 andl(limit, 0xfffffffc); // vector count (in bytes)
4392 jccb(Assembler::zero, COMPARE_CHAR);
4393 }
4394
4395 lea(ary1, Address(ary1, limit, scaleFactor));
4396 lea(ary2, Address(ary2, limit, Address::times_1));
4397 negptr(limit);
4398
4399 bind(COMPARE_VECTORS);
4400 if (expand_ary2) {
4401 // There is no vector compare between bytes and shorts, so compare element by element
4402 movzbl(chr, Address(ary2, limit, Address::times_1));
4403 cmpw(Address(ary1, limit, Address::times_2), chr);
4404 jccb(Assembler::notEqual, FALSE_LABEL);
4405 addptr(limit, 1);
4406 jcc(Assembler::notZero, COMPARE_VECTORS);
4407 jmp(TRUE_LABEL);
4408 } else {
4409 movl(chr, Address(ary1, limit, Address::times_1));
4410 cmpl(chr, Address(ary2, limit, Address::times_1));
4411 jccb(Assembler::notEqual, FALSE_LABEL);
4412 addptr(limit, 4);
4413 jcc(Assembler::notZero, COMPARE_VECTORS);
4414 }
4415
4416 // Compare trailing char (final 2 bytes), if any
4417 bind(COMPARE_CHAR);
4418 testl(result, 0x2); // tail char
4419 jccb(Assembler::zero, COMPARE_BYTE);
4420 load_unsigned_short(chr, Address(ary1, 0));
4421 load_unsigned_short(limit, Address(ary2, 0));
4422 cmpl(chr, limit);
4423 jccb(Assembler::notEqual, FALSE_LABEL);
4424
4425 if (is_array_equ && is_char) {
4426 bind(COMPARE_BYTE);
4427 } else {
4428 lea(ary1, Address(ary1, 2));
4429 lea(ary2, Address(ary2, 2));
4430
4431 bind(COMPARE_BYTE);
4432 testl(result, 0x1); // tail byte
4433 jccb(Assembler::zero, TRUE_LABEL);
4434 load_unsigned_byte(chr, Address(ary1, 0));
4435 load_unsigned_byte(limit, Address(ary2, 0));
4436 cmpl(chr, limit);
4437 jccb(Assembler::notEqual, FALSE_LABEL);
4438 }
4439 bind(TRUE_LABEL);
4440 movl(result, 1); // return true
4441 jmpb(DONE);
4442
4443 bind(FALSE_LABEL);
4444 xorl(result, result); // return false
4445
4446 // That's it
4447 bind(DONE);
4448 if (UseAVX >= 2) {
4449 // clean upper bits of YMM registers
4450 vpxor(vec1, vec1);
4451 vpxor(vec2, vec2);
4452 }
4453 }
4454
4455 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) {
4456 #define __ masm.
4457 Register dst = stub.data<0>();
4458 XMMRegister src = stub.data<1>();
4459 address target = stub.data<2>();
4460 __ bind(stub.entry());
4461 __ subptr(rsp, 8);
4462 __ movdbl(Address(rsp), src);
4463 __ call(RuntimeAddress(target));
4464 // APX REX2 encoding for pop(dst) increases the stub size by 1 byte.
4465 __ pop(dst);
4466 __ jmp(stub.continuation());
4467 #undef __
4468 }
4469
4470 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) {
4471 assert(dst_bt == T_INT || dst_bt == T_LONG, "");
4472 assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, "");
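// Note: cvttss2si/cvttsd2si return the "integer indefinite" value
// (0x80000000, or 0x8000000000000000 for 64-bit results) when the source is
// NaN or out of range; comparing against that sentinel below is what routes
// such inputs to the matching fixup stub.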
4473
4474 address slowpath_target;
4475 if (dst_bt == T_INT) {
4476 if (src_bt == T_FLOAT) {
4477 cvttss2sil(dst, src);
4478 cmpl(dst, 0x80000000);
4479 slowpath_target = StubRoutines::x86::f2i_fixup();
4480 } else {
4481 cvttsd2sil(dst, src);
4482 cmpl(dst, 0x80000000);
4483 slowpath_target = StubRoutines::x86::d2i_fixup();
4484 }
4485 } else {
4486 if (src_bt == T_FLOAT) {
4487 cvttss2siq(dst, src);
4488 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4489 slowpath_target = StubRoutines::x86::f2l_fixup();
4490 } else {
4491 cvttsd2siq(dst, src);
4492 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4493 slowpath_target = StubRoutines::x86::d2l_fixup();
4494 }
4495 }
4496
4497 // Using the APX extended general purpose registers increases the instruction encoding size by 1 byte.
4498 int max_size = 23 + (UseAPX ? 1 : 0);
4499 auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, max_size, convertF2I_slowpath);
4500 jcc(Assembler::equal, stub->entry());
4501 bind(stub->continuation());
4502 }
4503
4504 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4505 XMMRegister src1, int imm8, bool merge, int vlen_enc) {
4506 switch(ideal_opc) {
4507 case Op_LShiftVS:
4508 Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
4509 case Op_LShiftVI:
4510 Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
4511 case Op_LShiftVL:
4512 Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
4513 case Op_RShiftVS:
4514 Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
4515 case Op_RShiftVI:
4516 Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
4517 case Op_RShiftVL:
4518 Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
4519 case Op_URShiftVS:
4520 Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
4521 case Op_URShiftVI:
4522 Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
4523 case Op_URShiftVL:
4524 Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
4525 case Op_RotateRightV:
4526 evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4527 case Op_RotateLeftV:
4528 evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4529 default:
4530 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]);
4531 break;
4532 }
4533 }
4534
4535 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4536 XMMRegister src2, bool is_unsigned, bool merge, int vlen_enc) {
4537 if (is_unsigned) {
4538 evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4539 } else {
4540 evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4541 }
4542 }
4543
4544 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4545 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
4546 switch (elem_bt) {
4547 case T_BYTE:
4548 if (ideal_opc == Op_SaturatingAddV) {
4549 evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
4550 } else {
4551 assert(ideal_opc == Op_SaturatingSubV, "");
4552 evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
4553 }
4554 break;
4555 case T_SHORT:
4556 if (ideal_opc == Op_SaturatingAddV) {
4557 evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
4558 } else {
4559 assert(ideal_opc == Op_SaturatingSubV, "");
4560 evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
4561 }
4562 break;
4563 default:
4564 fatal("Unsupported type %s", type2name(elem_bt));
4565 break;
4566 }
4567 }
4568
4569 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4570 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
4571 switch (elem_bt) {
4572 case T_BYTE:
4573 if (ideal_opc == Op_SaturatingAddV) {
4574 evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
4575 } else {
4576 assert(ideal_opc == Op_SaturatingSubV, "");
4577 evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
4578 }
4579 break;
4580 case T_SHORT:
4581 if (ideal_opc == Op_SaturatingAddV) {
4582 evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
4583 } else {
4584 assert(ideal_opc == Op_SaturatingSubV, "");
4585 evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
4586 }
4587 break;
4588 default:
4589 fatal("Unsupported type %s", type2name(elem_bt));
4590 break;
4591 }
4592 }
4593
4594 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4595 Address src2, bool is_unsigned, bool merge, int vlen_enc) {
4596 if (is_unsigned) {
4597 evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4598 } else {
4599 evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4600 }
4601 }
4602
4603 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4604 XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4605 switch (elem_bt) {
4606 case T_BYTE:
4607 if (ideal_opc == Op_SaturatingAddV) {
4608 evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
4609 } else {
4610 assert(ideal_opc == Op_SaturatingSubV, "");
4611 evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
4612 }
4613 break;
4614 case T_SHORT:
4615 if (ideal_opc == Op_SaturatingAddV) {
4616 evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
4617 } else {
4618 assert(ideal_opc == Op_SaturatingSubV, "");
4619 evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
4620 }
4621 break;
4622 default:
4623 fatal("Unsupported type %s", type2name(elem_bt));
4624 break;
4625 }
4626 }
4627
4628 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4629 XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4630 switch (elem_bt) {
4631 case T_BYTE:
4632 if (ideal_opc == Op_SaturatingAddV) {
4633 evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
4634 } else {
4635 assert(ideal_opc == Op_SaturatingSubV, "");
4636 evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
4637 }
4638 break;
4639 case T_SHORT:
4640 if (ideal_opc == Op_SaturatingAddV) {
4641 evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
4642 } else {
4643 assert(ideal_opc == Op_SaturatingSubV, "");
4644 evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
4645 }
4646 break;
4647 default:
4648 fatal("Unsupported type %s", type2name(elem_bt));
4649 break;
4650 }
4651 }
4652
4653 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4654 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc,
4655 bool is_varshift) {
4656 switch (ideal_opc) {
4657 case Op_AddVB:
4658 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4659 case Op_AddVS:
4660 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4661 case Op_AddVI:
4662 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4663 case Op_AddVL:
4664 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4665 case Op_AddVF:
4666 evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4667 case Op_AddVD:
4668 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4669 case Op_SubVB:
4670 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4671 case Op_SubVS:
4672 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4673 case Op_SubVI:
4674 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4675 case Op_SubVL:
4676 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4677 case Op_SubVF:
4678 evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4679 case Op_SubVD:
4680 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4681 case Op_MulVS:
4682 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4683 case Op_MulVI:
4684 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4685 case Op_MulVL:
4686 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4687 case Op_MulVF:
4688 evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4689 case Op_MulVD:
4690 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4691 case Op_DivVF:
4692 evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4693 case Op_DivVD:
4694 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4695 case Op_SqrtVF:
4696 evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break;
4697 case Op_SqrtVD:
4698 evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break;
4699 case Op_AbsVB:
4700 evpabsb(dst, mask, src2, merge, vlen_enc); break;
4701 case Op_AbsVS:
4702 evpabsw(dst, mask, src2, merge, vlen_enc); break;
4703 case Op_AbsVI:
4704 evpabsd(dst, mask, src2, merge, vlen_enc); break;
4705 case Op_AbsVL:
4706 evpabsq(dst, mask, src2, merge, vlen_enc); break;
4707 case Op_FmaVF:
4708 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4709 case Op_FmaVD:
4710 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4711 case Op_VectorRearrange:
4712 evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break;
4713 case Op_LShiftVS:
4714 evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4715 case Op_LShiftVI:
4716 evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4717 case Op_LShiftVL:
4718 evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4719 case Op_RShiftVS:
4720 evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4721 case Op_RShiftVI:
4722 evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4723 case Op_RShiftVL:
4724 evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4725 case Op_URShiftVS:
4726 evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4727 case Op_URShiftVI:
4728 evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4729 case Op_URShiftVL:
4730 evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4731 case Op_RotateLeftV:
4732 evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4733 case Op_RotateRightV:
4734 evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4735 case Op_MaxV:
4736 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4737 case Op_MinV:
4738 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4739 case Op_UMinV:
4740 evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4741 case Op_UMaxV:
4742 evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4743 case Op_XorV:
4744 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4745 case Op_OrV:
4746 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4747 case Op_AndV:
4748 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4749 default:
4750 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]);
4751 break;
4752 }
4753 }
4754
4755 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4756 XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4757 switch (ideal_opc) {
4758 case Op_AddVB:
4759 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4760 case Op_AddVS:
4761 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4762 case Op_AddVI:
4763 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4764 case Op_AddVL:
4765 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4766 case Op_AddVF:
4767 evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4768 case Op_AddVD:
4769 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4770 case Op_SubVB:
4771 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4772 case Op_SubVS:
4773 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4774 case Op_SubVI:
4775 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4776 case Op_SubVL:
4777 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4778 case Op_SubVF:
4779 evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4780 case Op_SubVD:
4781 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4782 case Op_MulVS:
4783 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4784 case Op_MulVI:
4785 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4786 case Op_MulVL:
4787 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4788 case Op_MulVF:
4789 evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4790 case Op_MulVD:
4791 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4792 case Op_DivVF:
4793 evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4794 case Op_DivVD:
4795 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4796 case Op_FmaVF:
4797 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4798 case Op_FmaVD:
4799 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4800 case Op_MaxV:
4801 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4802 case Op_MinV:
4803 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4804 case Op_UMaxV:
4805 evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4806 case Op_UMinV:
4807 evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4808 case Op_XorV:
4809 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4810 case Op_OrV:
4811 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4812 case Op_AndV:
4813 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4814 default:
4815 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]);
4816 break;
4817 }
4818 }
4819
4820 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
4821 KRegister src1, KRegister src2) {
4822 BasicType etype = T_ILLEGAL;
4823 switch(mask_len) {
4824 case 2:
4825 case 4:
4826 case 8: etype = T_BYTE; break;
4827 case 16: etype = T_SHORT; break;
4828 case 32: etype = T_INT; break;
4829 case 64: etype = T_LONG; break;
    default: fatal("Unsupported mask length %d", mask_len); break;
4831 }
4832 assert(etype != T_ILLEGAL, "");
4833 switch(ideal_opc) {
4834 case Op_AndVMask:
4835 kand(etype, dst, src1, src2); break;
4836 case Op_OrVMask:
4837 kor(etype, dst, src1, src2); break;
4838 case Op_XorVMask:
4839 kxor(etype, dst, src1, src2); break;
4840 default:
4841 fatal("Unsupported masked operation"); break;
4842 }
4843 }
4844
4845 /*
 * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the cast operation.
 * If src is NaN, the result is 0.
 * If src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE,
 * the result is equal to the value of Integer.MIN_VALUE.
 * If src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE,
 * the result is equal to the value of Integer.MAX_VALUE.
4852 */
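// Illustrative note (not part of the algorithm): vcvttps2dq writes the integer
// indefinite value 0x80000000 for NaN and for out-of-range inputs, so every lane
// that needs fixing can be found by comparing the raw conversion result against
// float_sign_flip (a broadcast of 0x80000000). For example, src = {NaN, 1e20f, -1.5f, 3.0f}
// converts to {0x80000000, 0x80000000, -1, 3}; the routine below then rewrites the first
// lane to 0 and the second to Integer.MAX_VALUE while leaving the remaining lanes intact.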
4853 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4854 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
4855 Register rscratch, AddressLiteral float_sign_flip,
4856 int vec_enc) {
4857 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4858 Label done;
4859 vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch);
4860 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
4861 vptest(xtmp2, xtmp2, vec_enc);
4862 jccb(Assembler::equal, done);
4863
4864 vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc);
4865 vpxor(xtmp1, xtmp1, xtmp4, vec_enc);
4866
4867 vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
4868 vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc);
4869 vblendvps(dst, dst, xtmp4, xtmp3, vec_enc);
4870
4871 // Recompute the mask for remaining special value.
4872 vpxor(xtmp2, xtmp2, xtmp3, vec_enc);
4873 // Extract SRC values corresponding to TRUE mask lanes.
4874 vpand(xtmp4, xtmp2, src, vec_enc);
  // Flip the mask bits so that the MSB is set in MASK lanes corresponding to +ve
  // special values.
4877 vpxor(xtmp3, xtmp2, xtmp4, vec_enc);
4878
4879 vblendvps(dst, dst, xtmp1, xtmp3, vec_enc);
4880 bind(done);
4881 }
4882
4883 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4884 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4885 Register rscratch, AddressLiteral float_sign_flip,
4886 int vec_enc) {
4887 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4888 Label done;
4889 evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
4890 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
4891 kortestwl(ktmp1, ktmp1);
4892 jccb(Assembler::equal, done);
4893
4894 vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4895 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4896 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
4897
4898 kxorwl(ktmp1, ktmp1, ktmp2);
4899 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4900 vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4901 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
4902 bind(done);
4903 }
4904
4905 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4906 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4907 Register rscratch, AddressLiteral double_sign_flip,
4908 int vec_enc) {
4909 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4910
4911 Label done;
4912 evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
4913 Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc);
4914 kortestwl(ktmp1, ktmp1);
4915 jccb(Assembler::equal, done);
4916
4917 vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4918 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4919 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
4920
4921 kxorwl(ktmp1, ktmp1, ktmp2);
4922 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4923 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4924 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
4925 bind(done);
4926 }
4927
4928 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4929 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4930 Register rscratch, AddressLiteral float_sign_flip,
4931 int vec_enc) {
4932 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4933 Label done;
4934 evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
4935 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
4936 kortestwl(ktmp1, ktmp1);
4937 jccb(Assembler::equal, done);
4938
4939 vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4940 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4941 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
4942
4943 kxorwl(ktmp1, ktmp1, ktmp2);
4944 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4945 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4946 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
4947 bind(done);
4948 }
4949
4950 /*
 * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the cast operation.
 * If src is NaN, the result is 0.
 * If src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE,
 * the result is equal to the value of Long.MIN_VALUE.
 * If src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
 * the result is equal to the value of Long.MAX_VALUE.
4957 */
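// Illustrative note (not part of the algorithm): vcvttpd2qq writes the integer
// indefinite value 0x8000000000000000 for NaN and for out-of-range inputs, so
// comparing the raw conversion result against double_sign_flip (a broadcast of
// 0x8000000000000000) identifies every lane that needs fixing. For example,
// src = {NaN, 1e300} converts to {0x8000000000000000, 0x8000000000000000}; the
// routine below rewrites the first lane to 0 and the second to Long.MAX_VALUE.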
4958 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4959 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4960 Register rscratch, AddressLiteral double_sign_flip,
4961 int vec_enc) {
4962 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4963
4964 Label done;
4965 evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
4966 evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
4967 kortestwl(ktmp1, ktmp1);
4968 jccb(Assembler::equal, done);
4969
4970 vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4971 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4972 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
4973
4974 kxorwl(ktmp1, ktmp1, ktmp2);
4975 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4976 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4977 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
4978 bind(done);
4979 }
4980
4981 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
4982 XMMRegister xtmp, int index, int vec_enc) {
4983 assert(vec_enc < Assembler::AVX_512bit, "");
4984 if (vec_enc == Assembler::AVX_256bit) {
4985 vextractf128_high(xtmp, src);
4986 vshufps(dst, src, xtmp, index, vec_enc);
4987 } else {
4988 vshufps(dst, src, zero, index, vec_enc);
4989 }
4990 }
4991
4992 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
4993 XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
4994 AddressLiteral float_sign_flip, int src_vec_enc) {
4995 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4996
4997 Label done;
4998 // Compare the destination lanes with float_sign_flip
4999 // value to get mask for all special values.
5000 movdqu(xtmp1, float_sign_flip, rscratch);
5001 vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
5002 ptest(xtmp2, xtmp2);
5003 jccb(Assembler::equal, done);
5004
5005 // Flip float_sign_flip to get max integer value.
5006 vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
5007 pxor(xtmp1, xtmp4);
5008
  // Set destination lanes corresponding to unordered source lanes to zero.
5010 vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
5011 vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);
5012
  // Shuffle the mask vector and pack the lower double word from each quadword lane.
5014 vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5015 vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);
5016
5017 // Recompute the mask for remaining special value.
5018 pxor(xtmp2, xtmp3);
5019 // Extract mask corresponding to non-negative source lanes.
5020 vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);
5021
  // Shuffle the mask vector and pack the lower double word from each quadword lane.
5023 vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5024 pand(xtmp3, xtmp2);
5025
  // Replace destination lanes holding the special value (0x80000000) with max int
  // if the corresponding source lane holds a +ve value.
5028 vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
5029 bind(done);
5030 }
5031
5032
5033 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
5034 XMMRegister xtmp, Register rscratch, int vec_enc) {
5035 switch(to_elem_bt) {
5036 case T_SHORT:
5037 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
5038 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
5039 vpackusdw(dst, dst, zero, vec_enc);
5040 if (vec_enc == Assembler::AVX_256bit) {
5041 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5042 }
5043 break;
5044 case T_BYTE:
5045 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
5046 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
5047 vpackusdw(dst, dst, zero, vec_enc);
5048 if (vec_enc == Assembler::AVX_256bit) {
5049 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5050 }
5051 vpackuswb(dst, dst, zero, vec_enc);
5052 break;
5053 default: assert(false, "Unexpected basic type for target of vector cast int to subword: %s", type2name(to_elem_bt));
5054 }
5055 }
5056
/*
 * Algorithm for vector D2L and F2I conversions (used when AVX10.2 is not supported):
 * a) Perform vector D2L/F2I cast.
 * b) Choose the fast path if none of the result vector lanes contains the value 0x80000000.
 *    A lane holding 0x80000000 signifies that the source value could be one of the special
 *    floating point values (NaN, -Inf, Inf, Max, -Min).
 * c) Set the destination lane to zero if the corresponding source lane is NaN.
 * d) Replace 0x80000000 with MaxInt if the corresponding source lane contains a +ve value.
 */
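// Example of the fast path in step (b): for src = {1.0f, -2.5f, 8.0f, 100.0f} the raw
// cast yields {1, -2, 8, 100}. No lane equals 0x80000000, so the special-case helper
// finds an all-zero comparison mask and branches straight to its 'done' label without
// modifying the result.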
5066
5067 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5068 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
5069 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5070 int to_elem_sz = type2aelembytes(to_elem_bt);
5071 assert(to_elem_sz <= 4, "");
5072 vcvttps2dq(dst, src, vec_enc);
5073 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc);
5074 if (to_elem_sz < 4) {
5075 vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
5076 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc);
5077 }
5078 }
5079
5080 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5081 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
5082 Register rscratch, int vec_enc) {
5083 int to_elem_sz = type2aelembytes(to_elem_bt);
5084 assert(to_elem_sz <= 4, "");
5085 vcvttps2dq(dst, src, vec_enc);
5086 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc);
5087 switch(to_elem_bt) {
5088 case T_INT:
5089 break;
5090 case T_SHORT:
5091 evpmovdw(dst, dst, vec_enc);
5092 break;
5093 case T_BYTE:
5094 evpmovdb(dst, dst, vec_enc);
5095 break;
5096 default: assert(false, "Unexpected basic type for target of vector castF2X EVEX: %s", type2name(to_elem_bt));
5097 }
5098 }
5099
5100 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
5101 KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
5102 Register rscratch, int vec_enc) {
5103 evcvttps2qq(dst, src, vec_enc);
5104 vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc);
5105 }
5106
5107 // Handling for downcasting from double to integer or sub-word types on AVX2.
5108 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5109 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5,
5110 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5111 int to_elem_sz = type2aelembytes(to_elem_bt);
5112 assert(to_elem_sz < 8, "");
5113 vcvttpd2dq(dst, src, vec_enc);
5114 vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch,
5115 float_sign_flip, vec_enc);
5116 if (to_elem_sz < 4) {
5117 // xtmp4 holds all zero lanes.
5118 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit);
5119 }
5120 }
5121
5122 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src,
5123 XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1,
5124 KRegister ktmp2, AddressLiteral sign_flip,
5125 Register rscratch, int vec_enc) {
5126 if (VM_Version::supports_avx512dq()) {
5127 evcvttpd2qq(dst, src, vec_enc);
5128 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5129 switch(to_elem_bt) {
5130 case T_LONG:
5131 break;
5132 case T_INT:
5133 evpmovsqd(dst, dst, vec_enc);
5134 break;
5135 case T_SHORT:
5136 evpmovsqd(dst, dst, vec_enc);
5137 evpmovdw(dst, dst, vec_enc);
5138 break;
5139 case T_BYTE:
5140 evpmovsqd(dst, dst, vec_enc);
5141 evpmovdb(dst, dst, vec_enc);
5142 break;
5143 default: assert(false, "Unexpected basic type for target of vector castD2X AVX512DQ EVEX: %s", type2name(to_elem_bt));
5144 }
5145 } else {
5146 assert(type2aelembytes(to_elem_bt) <= 4, "");
5147 vcvttpd2dq(dst, src, vec_enc);
5148 vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5149 switch(to_elem_bt) {
5150 case T_INT:
5151 break;
5152 case T_SHORT:
5153 evpmovdw(dst, dst, vec_enc);
5154 break;
5155 case T_BYTE:
5156 evpmovdb(dst, dst, vec_enc);
5157 break;
5158 default: assert(false, "Unexpected basic type for target of vector castD2X EVEX: %s", type2name(to_elem_bt));
5159 }
5160 }
5161 }
5162
5163 void C2_MacroAssembler::vector_castF2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vec_enc) {
5164 switch(to_elem_bt) {
5165 case T_LONG:
5166 evcvttps2qqs(dst, src, vec_enc);
5167 break;
5168 case T_INT:
5169 evcvttps2dqs(dst, src, vec_enc);
5170 break;
5171 case T_SHORT:
5172 evcvttps2dqs(dst, src, vec_enc);
5173 evpmovdw(dst, dst, vec_enc);
5174 break;
5175 case T_BYTE:
5176 evcvttps2dqs(dst, src, vec_enc);
5177 evpmovdb(dst, dst, vec_enc);
5178 break;
5179 default: assert(false, "Unexpected basic type for target of vector castF2X AVX10 (reg src): %s", type2name(to_elem_bt));
5180 }
5181 }
5182
5183 void C2_MacroAssembler::vector_castF2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, Address src, int vec_enc) {
5184 switch(to_elem_bt) {
5185 case T_LONG:
5186 evcvttps2qqs(dst, src, vec_enc);
5187 break;
5188 case T_INT:
5189 evcvttps2dqs(dst, src, vec_enc);
5190 break;
5191 case T_SHORT:
5192 evcvttps2dqs(dst, src, vec_enc);
5193 evpmovdw(dst, dst, vec_enc);
5194 break;
5195 case T_BYTE:
5196 evcvttps2dqs(dst, src, vec_enc);
5197 evpmovdb(dst, dst, vec_enc);
5198 break;
5199 default: assert(false, "Unexpected basic type for target of vector castF2X AVX10 (mem src): %s", type2name(to_elem_bt));
5200 }
5201 }
5202
5203 void C2_MacroAssembler::vector_castD2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vec_enc) {
5204 switch(to_elem_bt) {
5205 case T_LONG:
5206 evcvttpd2qqs(dst, src, vec_enc);
5207 break;
5208 case T_INT:
5209 evcvttpd2dqs(dst, src, vec_enc);
5210 break;
5211 case T_SHORT:
5212 evcvttpd2dqs(dst, src, vec_enc);
5213 evpmovdw(dst, dst, vec_enc);
5214 break;
5215 case T_BYTE:
5216 evcvttpd2dqs(dst, src, vec_enc);
5217 evpmovdb(dst, dst, vec_enc);
5218 break;
5219 default: assert(false, "Unexpected basic type for target of vector castD2X AVX10 (reg src): %s", type2name(to_elem_bt));
5220 }
5221 }
5222
5223 void C2_MacroAssembler::vector_castD2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, Address src, int vec_enc) {
5224 switch(to_elem_bt) {
5225 case T_LONG:
5226 evcvttpd2qqs(dst, src, vec_enc);
5227 break;
5228 case T_INT:
5229 evcvttpd2dqs(dst, src, vec_enc);
5230 break;
5231 case T_SHORT:
5232 evcvttpd2dqs(dst, src, vec_enc);
5233 evpmovdw(dst, dst, vec_enc);
5234 break;
5235 case T_BYTE:
5236 evcvttpd2dqs(dst, src, vec_enc);
5237 evpmovdb(dst, dst, vec_enc);
5238 break;
5239 default: assert(false, "Unexpected basic type for target of vector castD2X AVX10 (mem src): %s", type2name(to_elem_bt));
5240 }
5241 }
5242
5243 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src,
5244 AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5245 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
  // Perform the floor(val+0.5) operation under the influence of MXCSR.RC mode rounding towards -inf,
  // and restore the original MXCSR.RC mode after that.
5248 ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5249
5250 mov64(tmp, julong_cast(0.5L));
5251 evpbroadcastq(xtmp1, tmp, vec_enc);
5252 vaddpd(xtmp1, src , xtmp1, vec_enc);
5253 evcvtpd2qq(dst, xtmp1, vec_enc);
5254 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
                                                 double_sign_flip, vec_enc);
5256
5257 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5258 }
5259
5260 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src,
5261 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5262 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
  // Perform the floor(val+0.5) operation under the influence of MXCSR.RC mode rounding towards -inf,
  // and restore the original MXCSR.RC mode after that.
5265 ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5266
5267 movl(tmp, jint_cast(0.5));
5268 movq(xtmp1, tmp);
5269 vbroadcastss(xtmp1, xtmp1, vec_enc);
5270 vaddps(xtmp1, src , xtmp1, vec_enc);
5271 vcvtps2dq(dst, xtmp1, vec_enc);
5272 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5273 float_sign_flip, vec_enc);
5274
5275 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5276 }
5277
5278 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src,
5279 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5280 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) {
  // Perform the floor(val+0.5) operation under the influence of MXCSR.RC mode rounding towards -inf,
  // and restore the original MXCSR.RC mode after that.
5283 ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5284
5285 movl(tmp, jint_cast(0.5));
5286 movq(xtmp1, tmp);
5287 vbroadcastss(xtmp1, xtmp1, vec_enc);
5288 vaddps(xtmp1, src , xtmp1, vec_enc);
5289 vcvtps2dq(dst, xtmp1, vec_enc);
5290 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc);
5291
5292 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5293 }
5294
5295 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5296 BasicType from_elem_bt, BasicType to_elem_bt) {
5297 switch (from_elem_bt) {
5298 case T_BYTE:
5299 switch (to_elem_bt) {
5300 case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break;
5301 case T_INT: vpmovzxbd(dst, src, vlen_enc); break;
5302 case T_LONG: vpmovzxbq(dst, src, vlen_enc); break;
5303 default: ShouldNotReachHere();
5304 }
5305 break;
5306 case T_SHORT:
5307 switch (to_elem_bt) {
5308 case T_INT: vpmovzxwd(dst, src, vlen_enc); break;
5309 case T_LONG: vpmovzxwq(dst, src, vlen_enc); break;
5310 default: ShouldNotReachHere();
5311 }
5312 break;
5313 case T_INT:
5314 assert(to_elem_bt == T_LONG, "");
5315 vpmovzxdq(dst, src, vlen_enc);
5316 break;
5317 default:
5318 ShouldNotReachHere();
5319 }
5320 }
5321
5322 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5323 BasicType from_elem_bt, BasicType to_elem_bt) {
5324 switch (from_elem_bt) {
5325 case T_BYTE:
5326 switch (to_elem_bt) {
5327 case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break;
5328 case T_INT: vpmovsxbd(dst, src, vlen_enc); break;
5329 case T_LONG: vpmovsxbq(dst, src, vlen_enc); break;
5330 default: ShouldNotReachHere();
5331 }
5332 break;
5333 case T_SHORT:
5334 switch (to_elem_bt) {
5335 case T_INT: vpmovsxwd(dst, src, vlen_enc); break;
5336 case T_LONG: vpmovsxwq(dst, src, vlen_enc); break;
5337 default: ShouldNotReachHere();
5338 }
5339 break;
5340 case T_INT:
5341 assert(to_elem_bt == T_LONG, "");
5342 vpmovsxdq(dst, src, vlen_enc);
5343 break;
5344 default:
5345 ShouldNotReachHere();
5346 }
5347 }
5348
5349 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src,
5350 BasicType dst_bt, BasicType src_bt, int vlen) {
5351 int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen);
5352 assert(vlen_enc != AVX_512bit, "");
5353
5354 int dst_bt_size = type2aelembytes(dst_bt);
5355 int src_bt_size = type2aelembytes(src_bt);
5356 if (dst_bt_size > src_bt_size) {
5357 switch (dst_bt_size / src_bt_size) {
5358 case 2: vpmovsxbw(dst, src, vlen_enc); break;
5359 case 4: vpmovsxbd(dst, src, vlen_enc); break;
5360 case 8: vpmovsxbq(dst, src, vlen_enc); break;
5361 default: ShouldNotReachHere();
5362 }
5363 } else {
5364 assert(dst_bt_size < src_bt_size, "");
5365 switch (src_bt_size / dst_bt_size) {
5366 case 2: {
5367 if (vlen_enc == AVX_128bit) {
5368 vpacksswb(dst, src, src, vlen_enc);
5369 } else {
5370 vpacksswb(dst, src, src, vlen_enc);
5371 vpermq(dst, dst, 0x08, vlen_enc);
5372 }
5373 break;
5374 }
5375 case 4: {
5376 if (vlen_enc == AVX_128bit) {
5377 vpackssdw(dst, src, src, vlen_enc);
5378 vpacksswb(dst, dst, dst, vlen_enc);
5379 } else {
5380 vpackssdw(dst, src, src, vlen_enc);
5381 vpermq(dst, dst, 0x08, vlen_enc);
5382 vpacksswb(dst, dst, dst, AVX_128bit);
5383 }
5384 break;
5385 }
5386 case 8: {
5387 if (vlen_enc == AVX_128bit) {
5388 vpshufd(dst, src, 0x08, vlen_enc);
5389 vpackssdw(dst, dst, dst, vlen_enc);
5390 vpacksswb(dst, dst, dst, vlen_enc);
5391 } else {
5392 vpshufd(dst, src, 0x08, vlen_enc);
5393 vpermq(dst, dst, 0x08, vlen_enc);
5394 vpackssdw(dst, dst, dst, AVX_128bit);
5395 vpacksswb(dst, dst, dst, AVX_128bit);
5396 }
5397 break;
5398 }
5399 default: ShouldNotReachHere();
5400 }
5401 }
5402 }
5403
5404 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
5405 bool merge, BasicType bt, int vlen_enc) {
5406 if (bt == T_INT) {
5407 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5408 } else {
5409 assert(bt == T_LONG, "");
5410 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5411 }
5412 }
5413
5414 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
5415 bool merge, BasicType bt, int vlen_enc) {
5416 if (bt == T_INT) {
5417 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5418 } else {
5419 assert(bt == T_LONG, "");
5420 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5421 }
5422 }
5423
5424 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
5425 Register rtmp2, XMMRegister xtmp, int mask_len,
5426 int vec_enc) {
5427 int index = 0;
5428 int vindex = 0;
5429 mov64(rtmp1, 0x0101010101010101L);
5430 pdepq(rtmp1, src, rtmp1);
5431 if (mask_len > 8) {
5432 movq(rtmp2, src);
5433 vpxor(xtmp, xtmp, xtmp, vec_enc);
5434 movq(xtmp, rtmp1);
5435 }
5436 movq(dst, rtmp1);
5437
5438 mask_len -= 8;
5439 while (mask_len > 0) {
5440 assert ((mask_len & 0x7) == 0, "mask must be multiple of 8");
5441 index++;
5442 if ((index % 2) == 0) {
5443 pxor(xtmp, xtmp);
5444 }
5445 mov64(rtmp1, 0x0101010101010101L);
5446 shrq(rtmp2, 8);
5447 pdepq(rtmp1, rtmp2, rtmp1);
5448 pinsrq(xtmp, rtmp1, index % 2);
5449 vindex = index / 2;
5450 if (vindex) {
      // Write the entire 16 byte vector once both 64 bit
      // lanes are updated, to save redundant instructions.
5453 if (index % 2) {
5454 vinsertf128(dst, dst, xtmp, vindex);
5455 }
5456 } else {
5457 vmovdqu(dst, xtmp);
5458 }
5459 mask_len -= 8;
5460 }
5461 }
5462
5463 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
5464 switch(opc) {
5465 case Op_VectorMaskTrueCount:
5466 popcntq(dst, tmp);
5467 break;
5468 case Op_VectorMaskLastTrue:
5469 if (VM_Version::supports_lzcnt()) {
5470 lzcntq(tmp, tmp);
5471 movl(dst, 63);
5472 subl(dst, tmp);
5473 } else {
5474 movl(dst, -1);
5475 bsrq(tmp, tmp);
5476 cmov32(Assembler::notZero, dst, tmp);
5477 }
5478 break;
5479 case Op_VectorMaskFirstTrue:
5480 if (VM_Version::supports_bmi1()) {
5481 if (masklen < 32) {
5482 orl(tmp, 1 << masklen);
5483 tzcntl(dst, tmp);
5484 } else if (masklen == 32) {
5485 tzcntl(dst, tmp);
5486 } else {
5487 assert(masklen == 64, "");
5488 tzcntq(dst, tmp);
5489 }
5490 } else {
5491 if (masklen < 32) {
5492 orl(tmp, 1 << masklen);
5493 bsfl(dst, tmp);
5494 } else {
5495 assert(masklen == 32 || masklen == 64, "");
5496 movl(dst, masklen);
5497 if (masklen == 32) {
5498 bsfl(tmp, tmp);
5499 } else {
5500 bsfq(tmp, tmp);
5501 }
5502 cmov32(Assembler::notZero, dst, tmp);
5503 }
5504 }
5505 break;
5506 case Op_VectorMaskToLong:
5507 assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
5508 break;
5509 default: assert(false, "Unhandled mask operation");
5510 }
5511 }
5512
5513 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
5514 int masklen, int masksize, int vec_enc) {
5515 assert(VM_Version::supports_popcnt(), "");
5516
  if (VM_Version::supports_avx512bw()) {
5518 kmovql(tmp, mask);
5519 } else {
5520 assert(masklen <= 16, "");
5521 kmovwl(tmp, mask);
5522 }
5523
5524 // Mask generated out of partial vector comparisons/replicate/mask manipulation
5525 // operations needs to be clipped.
5526 if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
5527 andq(tmp, (1 << masklen) - 1);
5528 }
5529
5530 vector_mask_operation_helper(opc, dst, tmp, masklen);
5531 }
5532
5533 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
5534 Register tmp, int masklen, BasicType bt, int vec_enc) {
5535 assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) ||
5536 (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), "");
5537 assert(VM_Version::supports_popcnt(), "");
5538
5539 bool need_clip = false;
5540 switch(bt) {
5541 case T_BOOLEAN:
      // While masks of other types contain 0 or -1, boolean masks contain lane values of 0 or 1.
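      // (e.g. a lane value of 1 becomes 0 - 1 = -1 = 0xFF after the subtraction below,
      // so its sign bit is set and vpmovmskb can collect it.)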
5543 vpxor(xtmp, xtmp, xtmp, vec_enc);
5544 vpsubb(xtmp, xtmp, mask, vec_enc);
5545 vpmovmskb(tmp, xtmp, vec_enc);
5546 need_clip = masklen < 16;
5547 break;
5548 case T_BYTE:
5549 vpmovmskb(tmp, mask, vec_enc);
5550 need_clip = masklen < 16;
5551 break;
5552 case T_SHORT:
5553 vpacksswb(xtmp, mask, mask, vec_enc);
5554 if (masklen >= 16) {
5555 vpermpd(xtmp, xtmp, 8, vec_enc);
5556 }
5557 vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
5558 need_clip = masklen < 16;
5559 break;
5560 case T_INT:
5561 case T_FLOAT:
5562 vmovmskps(tmp, mask, vec_enc);
5563 need_clip = masklen < 4;
5564 break;
5565 case T_LONG:
5566 case T_DOUBLE:
5567 vmovmskpd(tmp, mask, vec_enc);
5568 need_clip = masklen < 2;
5569 break;
5570 default: assert(false, "Unhandled type, %s", type2name(bt));
5571 }
5572
5573 // Mask generated out of partial vector comparisons/replicate/mask manipulation
5574 // operations needs to be clipped.
5575 if (need_clip && opc != Op_VectorMaskFirstTrue) {
5576 // need_clip implies masklen < 32
5577 andq(tmp, (1 << masklen) - 1);
5578 }
5579
5580 vector_mask_operation_helper(opc, dst, tmp, masklen);
5581 }
5582
5583 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
5584 Register rtmp2, int mask_len) {
5585 kmov(rtmp1, src);
5586 andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));
5587 mov64(rtmp2, -1L);
5588 pextq(rtmp2, rtmp2, rtmp1);
5589 kmov(dst, rtmp2);
5590 }
5591
5592 void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src,
5593 XMMRegister mask, Register rtmp, Register rscratch,
5594 XMMRegister permv, XMMRegister xtmp, BasicType bt,
5595 int vec_enc) {
5596 assert(type2aelembytes(bt) >= 4, "");
5597 assert(opcode == Op_CompressV || opcode == Op_ExpandV, "");
5598 address compress_perm_table = nullptr;
5599 address expand_perm_table = nullptr;
5600 if (type2aelembytes(bt) == 8) {
5601 compress_perm_table = StubRoutines::x86::compress_perm_table64();
5602 expand_perm_table = StubRoutines::x86::expand_perm_table64();
5603 vmovmskpd(rtmp, mask, vec_enc);
5604 } else {
5605 compress_perm_table = StubRoutines::x86::compress_perm_table32();
5606 expand_perm_table = StubRoutines::x86::expand_perm_table32();
5607 vmovmskps(rtmp, mask, vec_enc);
5608 }
5609 shlq(rtmp, 5); // for 32 byte permute row.
5610 if (opcode == Op_CompressV) {
5611 lea(rscratch, ExternalAddress(compress_perm_table));
5612 } else {
5613 lea(rscratch, ExternalAddress(expand_perm_table));
5614 }
5615 addptr(rtmp, rscratch);
5616 vmovdqu(permv, Address(rtmp));
5617 vpermps(dst, permv, src, Assembler::AVX_256bit);
5618 vpxor(xtmp, xtmp, xtmp, vec_enc);
  // Blend the result with a zero vector using the permute mask. Each column entry
  // in a permute table row contains either a valid permute index or a -1 (default)
  // value, so the same row can potentially also serve as a blending mask after
  // compressing/expanding the source vector lanes.
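  // Illustrative example (the exact row layout is an assumption based on the comment
  // above): for an 8-lane 32-bit compress with mask lanes {-1, 0, -1, 0, 0, 0, 0, 0},
  // vmovmskps yields 0b00000101 and the selected row would hold {0, 2, -1, -1, -1, -1, -1, -1};
  // vpermps then gathers source lanes 0 and 2 into positions 0 and 1, and the blend below
  // zeroes the remaining positions because their entries are -1 (sign bit set).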
5623 vblendvps(dst, dst, xtmp, permv, vec_enc, true, permv);
5624 }
5625
5626 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
5627 bool merge, BasicType bt, int vec_enc) {
5628 if (opcode == Op_CompressV) {
5629 switch(bt) {
5630 case T_BYTE:
5631 evpcompressb(dst, mask, src, merge, vec_enc);
5632 break;
5633 case T_CHAR:
5634 case T_SHORT:
5635 evpcompressw(dst, mask, src, merge, vec_enc);
5636 break;
5637 case T_INT:
5638 evpcompressd(dst, mask, src, merge, vec_enc);
5639 break;
5640 case T_FLOAT:
5641 evcompressps(dst, mask, src, merge, vec_enc);
5642 break;
5643 case T_LONG:
5644 evpcompressq(dst, mask, src, merge, vec_enc);
5645 break;
5646 case T_DOUBLE:
5647 evcompresspd(dst, mask, src, merge, vec_enc);
5648 break;
5649 default:
5650 fatal("Unsupported type %s", type2name(bt));
5651 break;
5652 }
5653 } else {
5654 assert(opcode == Op_ExpandV, "");
5655 switch(bt) {
5656 case T_BYTE:
5657 evpexpandb(dst, mask, src, merge, vec_enc);
5658 break;
5659 case T_CHAR:
5660 case T_SHORT:
5661 evpexpandw(dst, mask, src, merge, vec_enc);
5662 break;
5663 case T_INT:
5664 evpexpandd(dst, mask, src, merge, vec_enc);
5665 break;
5666 case T_FLOAT:
5667 evexpandps(dst, mask, src, merge, vec_enc);
5668 break;
5669 case T_LONG:
5670 evpexpandq(dst, mask, src, merge, vec_enc);
5671 break;
5672 case T_DOUBLE:
5673 evexpandpd(dst, mask, src, merge, vec_enc);
5674 break;
5675 default:
5676 fatal("Unsupported type %s", type2name(bt));
5677 break;
5678 }
5679 }
5680 }
5681
5682 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5683 KRegister ktmp1, int vec_enc) {
5684 if (opcode == Op_SignumVD) {
5685 vsubpd(dst, zero, one, vec_enc);
5686 // if src < 0 ? -1 : 1
5687 evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5688 evblendmpd(dst, ktmp1, one, dst, true, vec_enc);
    // if src is NaN, -0.0 or 0.0, return src.
5690 evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5691 evblendmpd(dst, ktmp1, dst, src, true, vec_enc);
5692 } else {
5693 assert(opcode == Op_SignumVF, "");
5694 vsubps(dst, zero, one, vec_enc);
5695 // if src < 0 ? -1 : 1
5696 evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5697 evblendmps(dst, ktmp1, one, dst, true, vec_enc);
    // if src is NaN, -0.0 or 0.0, return src.
5699 evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5700 evblendmps(dst, ktmp1, dst, src, true, vec_enc);
5701 }
5702 }
5703
5704 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5705 XMMRegister xtmp1, int vec_enc) {
5706 if (opcode == Op_SignumVD) {
5707 vsubpd(dst, zero, one, vec_enc);
5708 // if src < 0 ? -1 : 1
5709 vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1);
    // if src is NaN, -0.0 or 0.0, return src.
5711 vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5712 vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5713 } else {
5714 assert(opcode == Op_SignumVF, "");
5715 vsubps(dst, zero, one, vec_enc);
5716 // if src < 0 ? -1 : 1
5717 vblendvps(dst, one, dst, src, vec_enc, true, xtmp1);
    // if src is NaN, -0.0 or 0.0, return src.
5719 vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5720 vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5721 }
5722 }
5723
5724 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
5725 if (VM_Version::supports_avx512bw()) {
5726 if (mask_len > 32) {
5727 kmovql(dst, src);
5728 } else {
5729 kmovdl(dst, src);
5730 if (mask_len != 32) {
5731 kshiftrdl(dst, dst, 32 - mask_len);
5732 }
5733 }
5734 } else {
5735 assert(mask_len <= 16, "");
5736 kmovwl(dst, src);
5737 if (mask_len != 16) {
5738 kshiftrwl(dst, dst, 16 - mask_len);
5739 }
5740 }
5741 }
5742
5743 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
5744 int lane_size = type2aelembytes(bt);
5745 if ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
5746 (is_subword_type(bt) && VM_Version::supports_avx512vlbw())) {
5747 movptr(rtmp, imm32);
5748 switch(lane_size) {
5749 case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
5750 case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
5751 case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
5752 case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
      default : fatal("Unsupported lane size %d", lane_size);
                break;
5755 }
5756 } else {
5757 movptr(rtmp, imm32);
5758 movq(dst, rtmp);
5759 switch(lane_size) {
5760 case 1 : vpbroadcastb(dst, dst, vec_enc); break;
5761 case 2 : vpbroadcastw(dst, dst, vec_enc); break;
5762 case 4 : vpbroadcastd(dst, dst, vec_enc); break;
5763 case 8 : vpbroadcastq(dst, dst, vec_enc); break;
      default : fatal("Unsupported lane size %d", lane_size);
                break;
5766 }
5767 }
5768 }
5769
5770 //
// The following is a lookup table based popcount computation algorithm:
5772 // Index Bit set count
5773 // [ 0000 -> 0,
5774 // 0001 -> 1,
5775 // 0010 -> 1,
5776 // 0011 -> 2,
5777 // 0100 -> 1,
5778 // 0101 -> 2,
5779 // 0110 -> 2,
5780 // 0111 -> 3,
5781 // 1000 -> 1,
5782 // 1001 -> 2,
5783 // 1010 -> 3,
5784 // 1011 -> 3,
5785 // 1100 -> 2,
//    1101 -> 3,
//    1110 -> 3,
//    1111 -> 4 ]
// a. Count the number of 1s in the 4 LSB bits of each byte. These bits are used as
//    shuffle indices for lookup table access.
// b. Right shift each byte of the vector lane by 4 positions.
// c. Count the number of 1s in the 4 MSB bits of each byte. These bits are used as
//    shuffle indices for lookup table access.
5793 // d. Add the bitset count of upper and lower 4 bits of each byte.
5794 // e. Unpack double words to quad words and compute sum of absolute difference of bitset
5795 // count of all the bytes of a quadword.
5796 // f. Perform step e. for upper 128bit vector lane.
5797 // g. Pack the bitset count of quadwords back to double word.
5798 // h. Unpacking and packing operations are not needed for 64bit vector lane.
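//
// Worked example (illustrative): for the byte 0xB5 = 0b10110101, the lower nibble
// 0101 looks up 2 and the upper nibble 1011 looks up 3, so step d. yields the
// byte popcount 2 + 3 = 5.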
5799
5800 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5801 XMMRegister xtmp2, Register rtmp, int vec_enc) {
5802 assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
5803 vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);
5804 vpsrlw(dst, src, 4, vec_enc);
5805 vpand(dst, dst, xtmp1, vec_enc);
5806 vpand(xtmp1, src, xtmp1, vec_enc);
5807 vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg);
5808 vpshufb(xtmp1, xtmp2, xtmp1, vec_enc);
5809 vpshufb(dst, xtmp2, dst, vec_enc);
5810 vpaddb(dst, dst, xtmp1, vec_enc);
5811 }
5812
5813 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5814 XMMRegister xtmp2, Register rtmp, int vec_enc) {
5815 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5816 // Following code is as per steps e,f,g and h of above algorithm.
5817 vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5818 vpunpckhdq(dst, xtmp1, xtmp2, vec_enc);
5819 vpsadbw(dst, dst, xtmp2, vec_enc);
5820 vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc);
5821 vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc);
5822 vpackuswb(dst, xtmp1, dst, vec_enc);
5823 }
5824
5825 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5826 XMMRegister xtmp2, Register rtmp, int vec_enc) {
5827 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5828 // Add the popcount of upper and lower bytes of word.
5829 vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc);
5830 vpsrlw(dst, xtmp1, 8, vec_enc);
5831 vpand(xtmp1, xtmp1, xtmp2, vec_enc);
5832 vpaddw(dst, dst, xtmp1, vec_enc);
5833 }
5834
5835 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5836 XMMRegister xtmp2, Register rtmp, int vec_enc) {
5837 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5838 vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5839 vpsadbw(dst, xtmp1, xtmp2, vec_enc);
5840 }
5841
5842 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5843 XMMRegister xtmp2, Register rtmp, int vec_enc) {
5844 switch(bt) {
5845 case T_LONG:
5846 vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5847 break;
5848 case T_INT:
5849 vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5850 break;
5851 case T_CHAR:
5852 case T_SHORT:
5853 vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5854 break;
5855 case T_BYTE:
5856 case T_BOOLEAN:
5857 vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5858 break;
5859 default:
5860 fatal("Unsupported type %s", type2name(bt));
5861 break;
5862 }
5863 }
5864
5865 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5866 KRegister mask, bool merge, int vec_enc) {
5867 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5868 switch(bt) {
5869 case T_LONG:
5870 assert(VM_Version::supports_avx512_vpopcntdq(), "");
5871 evpopcntq(dst, mask, src, merge, vec_enc);
5872 break;
5873 case T_INT:
5874 assert(VM_Version::supports_avx512_vpopcntdq(), "");
5875 evpopcntd(dst, mask, src, merge, vec_enc);
5876 break;
5877 case T_CHAR:
5878 case T_SHORT:
5879 assert(VM_Version::supports_avx512_bitalg(), "");
5880 evpopcntw(dst, mask, src, merge, vec_enc);
5881 break;
5882 case T_BYTE:
5883 case T_BOOLEAN:
5884 assert(VM_Version::supports_avx512_bitalg(), "");
5885 evpopcntb(dst, mask, src, merge, vec_enc);
5886 break;
5887 default:
5888 fatal("Unsupported type %s", type2name(bt));
5889 break;
5890 }
5891 }
5892
// The bit reversal algorithm first reverses the bits of each byte, followed by
// a byte level reversal for multi-byte primitive types (short/int/long).
// The algorithm performs a lookup table access to get the reversed bit sequence
// corresponding to a 4 bit value. Thus the reversed bit sequence for a byte
// is obtained by swapping the reversed bit sequences of the upper and lower
// nibbles of the byte.
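// Worked example (illustrative): for the byte 0b11010010, the lower nibble 0010
// reverses to 0100 and the upper nibble 1101 reverses to 1011; swapping the two
// reversed nibbles gives 0b01001011, the bit-reversed byte.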
5899 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5900 XMMRegister xtmp2, Register rtmp, int vec_enc) {
5901 if (VM_Version::supports_avx512vlbw()) {
5902
5903 // Get the reverse bit sequence of lower nibble of each byte.
5904 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
5905 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5906 evpandq(dst, xtmp2, src, vec_enc);
5907 vpshufb(dst, xtmp1, dst, vec_enc);
5908 vpsllq(dst, dst, 4, vec_enc);
5909
5910 // Get the reverse bit sequence of upper nibble of each byte.
5911 vpandn(xtmp2, xtmp2, src, vec_enc);
5912 vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5913 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5914
5915 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
5916 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
5917 evporq(xtmp2, dst, xtmp2, vec_enc);
5918 vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5919
  } else if (vec_enc == Assembler::AVX_512bit) {
5921 // Shift based bit reversal.
5922 assert(bt == T_LONG || bt == T_INT, "");
5923
5924 // Swap lower and upper nibble of each byte.
5925 vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc);
5926
5927 // Swap two least and most significant bits of each nibble.
5928 vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc);
5929
5930 // Swap adjacent pair of bits.
5931 evmovdqul(xtmp1, k0, dst, true, vec_enc);
5932 vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc);
5933
5934 evmovdqul(xtmp1, k0, dst, true, vec_enc);
5935 vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);
5936 } else {
5937 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp);
5938 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5939
5940 // Get the reverse bit sequence of lower nibble of each byte.
5941 vpand(dst, xtmp2, src, vec_enc);
5942 vpshufb(dst, xtmp1, dst, vec_enc);
5943 vpsllq(dst, dst, 4, vec_enc);
5944
5945 // Get the reverse bit sequence of upper nibble of each byte.
5946 vpandn(xtmp2, xtmp2, src, vec_enc);
5947 vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5948 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5949
5950 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
5951 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
5952 vpor(xtmp2, dst, xtmp2, vec_enc);
5953 vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5954 }
5955 }
5956
5957 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc,
5958 XMMRegister xtmp, Register rscratch) {
5959 assert(VM_Version::supports_gfni(), "");
5960 assert(rscratch != noreg || always_reachable(mask), "missing");
5961
5962 // Galois field instruction based bit reversal based on following algorithm.
5963 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
5964 vpbroadcastq(xtmp, mask, vec_enc, rscratch);
5965 vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
5966 vector_reverse_byte(bt, dst, xtmp, vec_enc);
5967 }
5968
5969 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
5970 XMMRegister xtmp1, Register rtmp, int vec_enc) {
5971 vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
5972 evpandq(dst, xtmp1, src, vec_enc);
5973 vpsllq(dst, dst, nbits, vec_enc);
5974 vpandn(xtmp1, xtmp1, src, vec_enc);
5975 vpsrlq(xtmp1, xtmp1, nbits, vec_enc);
5976 evporq(dst, dst, xtmp1, vec_enc);
5977 }
5978
5979 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5980 XMMRegister xtmp2, Register rtmp, int vec_enc) {
5981 // Shift based bit reversal.
5982 assert(VM_Version::supports_evex(), "");
5983 switch(bt) {
5984 case T_LONG:
5985 // Swap upper and lower double word of each quad word.
5986 evprorq(xtmp1, k0, src, 32, true, vec_enc);
5987 evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);
5988 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
5989 break;
5990 case T_INT:
5991 // Swap upper and lower word of each double word.
5992 evprord(xtmp1, k0, src, 16, true, vec_enc);
5993 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
5994 break;
5995 case T_CHAR:
5996 case T_SHORT:
5997 // Swap upper and lower byte of each word.
5998 vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc);
5999 break;
6000 case T_BYTE:
6001 evmovdquq(dst, k0, src, true, vec_enc);
6002 break;
6003 default:
6004 fatal("Unsupported type %s", type2name(bt));
6005 break;
6006 }
6007 }
6008
6009 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) {
6010 if (bt == T_BYTE) {
6011 if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
6012 evmovdquq(dst, k0, src, true, vec_enc);
6013 } else {
6014 vmovdqu(dst, src);
6015 }
6016 return;
6017 }
6018 // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
6019 // pre-computed shuffle indices.
6020 switch(bt) {
6021 case T_LONG:
6022 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg);
6023 break;
6024 case T_INT:
6025 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg);
6026 break;
6027 case T_CHAR:
6028 case T_SHORT:
6029 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg);
6030 break;
6031 default:
6032 fatal("Unsupported type %s", type2name(bt));
6033 break;
6034 }
6035 vpshufb(dst, src, dst, vec_enc);
6036 }
6037
6038 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6039 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6040 KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
6041 assert(is_integral_type(bt), "");
6042 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
6043 assert(VM_Version::supports_avx512cd(), "");
6044 switch(bt) {
6045 case T_LONG:
6046 evplzcntq(dst, ktmp, src, merge, vec_enc);
6047 break;
6048 case T_INT:
6049 evplzcntd(dst, ktmp, src, merge, vec_enc);
6050 break;
6051 case T_SHORT:
6052 vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);
6053 vpunpcklwd(xtmp2, xtmp1, src, vec_enc);
6054 evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
6055 vpunpckhwd(dst, xtmp1, src, vec_enc);
6056 evplzcntd(dst, ktmp, dst, merge, vec_enc);
6057 vpackusdw(dst, xtmp2, dst, vec_enc);
6058 break;
6059 case T_BYTE:
6060 // T1 = Compute leading zero counts of 4 LSB bits of each byte by
6061 // accessing the lookup table.
6062 // T2 = Compute leading zero counts of 4 MSB bits of each byte by
6063 // accessing the lookup table.
6064 // Add T1 to T2 if 4 MSB bits of byte are all zeros.
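      // Worked example (illustrative, assuming the table maps each nibble value to its
      // 4-bit leading zero count): for the byte 0x05 = 0b00000101, T2 = 4 (the upper
      // nibble is all zeros) and T1 = 1, so the merged add below produces 4 + 1 = 5,
      // the leading zero count of the byte; for 0x50 the upper nibble is non-zero, so
      // the result stays at T2 = 1.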
6065 assert(VM_Version::supports_avx512bw(), "");
6066 evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
6067 vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
6068 vpand(xtmp2, dst, src, vec_enc);
6069 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
6070 vpsrlw(xtmp3, src, 4, vec_enc);
6071 vpand(xtmp3, dst, xtmp3, vec_enc);
6072 vpshufb(dst, xtmp1, xtmp3, vec_enc);
6073 vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
6074 evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);
6075 evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);
6076 break;
6077 default:
6078 fatal("Unsupported type %s", type2name(bt));
6079 break;
6080 }
6081 }
6082
6083 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6084 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6085 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
6086 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
6087 // T1 = Compute leading zero counts of 4 LSB bits of each byte by
6088 // accessing the lookup table.
6089 vpand(dst, xtmp2, src, vec_enc);
6090 vpshufb(dst, xtmp1, dst, vec_enc);
6091 // T2 = Compute leading zero counts of 4 MSB bits of each byte by
6092 // accessing the lookup table.
6093 vpsrlw(xtmp3, src, 4, vec_enc);
6094 vpand(xtmp3, xtmp2, xtmp3, vec_enc);
6095 vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
6096 // Add T1 to T2 if 4 MSB bits of byte are all zeros.
6097 vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
6098 vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
6099 vpaddb(dst, dst, xtmp2, vec_enc);
6100 vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
6101 }
6102
6103 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6104 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6105 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6106 // Add zero counts of lower byte and upper byte of a word if
6107 // upper byte holds a zero value.
6108 vpsrlw(xtmp3, src, 8, vec_enc);
6109 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
6110 vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
6111 vpsllw(xtmp2, dst, 8, vec_enc);
6112 vpaddw(xtmp2, xtmp2, dst, vec_enc);
6113 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6114 vpsrlw(dst, dst, 8, vec_enc);
6115 }
6116
6117 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6118 XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
6119 // By converting the integer to a float, we can obtain the number of leading zeros based on the exponent of the float.
6120 // As the float exponent contains a bias of 127 for nonzero values, the bias must be removed before interpreting the
6121 // exponent as the leading zero count.
6122
6123 // Remove the bit to the right of the highest set bit ensuring that the conversion to float cannot round up to a higher
6124 // power of 2, which has a higher exponent than the input. This transformation is valid as only the highest set bit
6125 // contributes to the leading number of zeros.
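  // Worked example (illustrative): for src = 0x00001234 the highest set bit is bit 12,
  // so after the conversion the biased exponent field holds 12 + 127 = 139; removing the
  // bias gives 12 and the final 31 - 12 = 19 matches the leading zero count of the value.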
6126 vpsrld(dst, src, 1, vec_enc);
6127 vpandn(dst, dst, src, vec_enc);
6128
6129 vcvtdq2ps(dst, dst, vec_enc);
6130
6131 // By comparing the register to itself, all the bits in the destination are set.
6132 vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);
6133
6134 // Move the biased exponent to the low end of the lane and mask with 0xFF to discard the sign bit.
6135 vpsrld(xtmp2, xtmp1, 24, vec_enc);
6136 vpsrld(dst, dst, 23, vec_enc);
6137 vpand(dst, xtmp2, dst, vec_enc);
6138
6139 // Subtract 127 from the exponent, which removes the bias from the exponent.
6140 vpsrld(xtmp2, xtmp1, 25, vec_enc);
6141 vpsubd(dst, dst, xtmp2, vec_enc);
6142
6143 vpsrld(xtmp2, xtmp1, 27, vec_enc);
6144
6145 // If the original value is 0 the exponent would not have bias, so the subtraction creates a negative number. If this
6146 // is found in any of the lanes, replace the lane with -1 from xtmp1.
6147 vblendvps(dst, dst, xtmp1, dst, vec_enc, true, xtmp3);
6148
6149 // If the original value is negative, replace the lane with 31.
6150 vblendvps(dst, dst, xtmp2, src, vec_enc, true, xtmp3);
6151
6152 // Subtract the exponent from 31, giving the final result. For 0, the result is 32 as the exponent was replaced with -1,
6153 // and for negative numbers the result is 0 as the exponent was replaced with 31.
6154 vpsubd(dst, xtmp2, dst, vec_enc);
6155 }
6156
6157 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6158 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
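  // Illustrative scalar sketch of the per-element computation, where hi/lo are the upper and
  // lower 32-bit halves of a 64-bit lane:
  //   clz64 = (clz32(hi) == 32) ? 32 + clz32(lo) : clz32(hi);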
6159 // Find the leading zeros of the top and bottom halves of the long individually.
6160 vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
6161
6162 // Move the top half result to the bottom half of xtmp1, setting the top half to 0.
6163 vpsrlq(xtmp1, dst, 32, vec_enc);
  // By shifting the top half result right by 6 bits, if the top half was empty (i.e. 32 was
  // returned), its single set bit lands in the most significant position of the bottom half.
6166 vpsrlq(xtmp2, dst, 6, vec_enc);
6167
6168 // In the bottom half, add the top half and bottom half results.
6169 vpaddq(dst, xtmp1, dst, vec_enc);
6170
6171 // For the bottom half, choose between the values using the most significant bit of xtmp2.
  // If the MSB is set, then bottom+top in dst is the resulting value. If the top half result is less than 32, xtmp1 is chosen,
6173 // which contains only the top half result.
6174 // In the top half the MSB is always zero, so the value in xtmp1 is always chosen. This value is always 0, which clears
6175 // the lane as required.
6176 vblendvps(dst, xtmp1, dst, xtmp2, vec_enc, true, xtmp3);
6177 }
6178
6179 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
6180 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6181 Register rtmp, int vec_enc) {
6182 assert(is_integral_type(bt), "unexpected type");
6183 assert(vec_enc < Assembler::AVX_512bit, "");
6184 switch(bt) {
6185 case T_LONG:
6186 vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6187 break;
6188 case T_INT:
6189 vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
6190 break;
6191 case T_SHORT:
6192 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6193 break;
6194 case T_BYTE:
6195 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6196 break;
6197 default:
6198 fatal("Unsupported type %s", type2name(bt));
6199 break;
6200 }
6201 }
6202
6203 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
6204 switch(bt) {
6205 case T_BYTE:
6206 vpsubb(dst, src1, src2, vec_enc);
6207 break;
6208 case T_SHORT:
6209 vpsubw(dst, src1, src2, vec_enc);
6210 break;
6211 case T_INT:
6212 vpsubd(dst, src1, src2, vec_enc);
6213 break;
6214 case T_LONG:
6215 vpsubq(dst, src1, src2, vec_enc);
6216 break;
6217 default:
6218 fatal("Unsupported type %s", type2name(bt));
6219 break;
6220 }
6221 }
6222
// Trailing zero count computation is based on the leading zero count operation, as per
// the following equation. All AVX3 targets support the AVX512CD feature, which offers a
// direct vector instruction to compute the leading zero count.
// CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
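// Illustrative scalar sketch for a 32-bit element (lzcnt = leading zero count):
//   uint32_t m = (x - 1) & ~x;    // ones strictly below the lowest set bit of x
//   int    ctz = 32 - lzcnt(m);   // 32 when x == 0, since m is then all ones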
6227 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6228 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6229 XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
6230 assert(is_integral_type(bt), "");
6231 // xtmp = -1
6232 vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
6233 // xtmp = xtmp + src
6234 vpadd(bt, xtmp4, xtmp4, src, vec_enc);
6235 // xtmp = xtmp & ~src
6236 vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
6237 vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
6238 vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
6239 vpsub(bt, dst, xtmp4, dst, vec_enc);
6240 }
6241
// Trailing zero count computation for AVX2 targets is based on the popcount operation, as per the following equation
// CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
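// Illustrative scalar sketch for a 32-bit element:
//   uint32_t m = x | (0 - x);     // bits at and above the lowest set bit of x (0 when x == 0)
//   int    ctz = 32 - popcount(m);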
6244 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6245 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6246 assert(is_integral_type(bt), "");
6247 // xtmp = 0
  vpxor(xtmp3, xtmp3, xtmp3, vec_enc);
6249 // xtmp = 0 - src
6250 vpsub(bt, xtmp3, xtmp3, src, vec_enc);
6251 // xtmp = xtmp | src
6252 vpor(xtmp3, xtmp3, src, vec_enc);
6253 vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
6254 vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
6255 vpsub(bt, dst, xtmp1, dst, vec_enc);
6256 }
6257
6258 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
6259 Label done;
6260 Label neg_divisor_fastpath;
6261 cmpl(divisor, 0);
6262 jccb(Assembler::less, neg_divisor_fastpath);
6263 xorl(rdx, rdx);
6264 divl(divisor);
6265 jmpb(done);
6266 bind(neg_divisor_fastpath);
6267 // Fastpath for divisor < 0:
6268 // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6269 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
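  // Since the divisor has its sign bit set, the unsigned quotient can only be 0 or 1; in scalar
  // form the branch-free sequence below computes
  //   quotient = ((unsigned)dividend >= (unsigned)divisor) ? 1 : 0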
6270 movl(rdx, rax);
6271 subl(rdx, divisor);
6272 if (VM_Version::supports_bmi1()) {
6273 andnl(rax, rdx, rax);
6274 } else {
6275 notl(rdx);
6276 andl(rax, rdx);
6277 }
6278 shrl(rax, 31);
6279 bind(done);
6280 }
6281
6282 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
6283 Label done;
6284 Label neg_divisor_fastpath;
6285 cmpl(divisor, 0);
6286 jccb(Assembler::less, neg_divisor_fastpath);
6287 xorl(rdx, rdx);
6288 divl(divisor);
6289 jmpb(done);
6290 bind(neg_divisor_fastpath);
6291 // Fastpath when divisor < 0:
6292 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6293 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
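  // Since the divisor has its sign bit set, the unsigned quotient is 0 or 1, so the remainder is
  // either the dividend itself or dividend - divisor; in scalar form:
  //   remainder = ((unsigned)dividend >= (unsigned)divisor) ? dividend - divisor : dividend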
6294 movl(rdx, rax);
6295 subl(rax, divisor);
6296 if (VM_Version::supports_bmi1()) {
6297 andnl(rax, rax, rdx);
6298 } else {
6299 notl(rax);
6300 andl(rax, rdx);
6301 }
6302 sarl(rax, 31);
6303 andl(rax, divisor);
6304 subl(rdx, rax);
6305 bind(done);
6306 }
6307
6308 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
6309 Label done;
6310 Label neg_divisor_fastpath;
6311
6312 cmpl(divisor, 0);
6313 jccb(Assembler::less, neg_divisor_fastpath);
6314 xorl(rdx, rdx);
6315 divl(divisor);
6316 jmpb(done);
6317 bind(neg_divisor_fastpath);
6318 // Fastpath for divisor < 0:
6319 // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6320 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6321 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6322 // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6323 movl(rdx, rax);
6324 subl(rax, divisor);
6325 if (VM_Version::supports_bmi1()) {
6326 andnl(rax, rax, rdx);
6327 } else {
6328 notl(rax);
6329 andl(rax, rdx);
6330 }
6331 movl(tmp, rax);
6332 shrl(rax, 31); // quotient
6333 sarl(tmp, 31);
6334 andl(tmp, divisor);
6335 subl(rdx, tmp); // remainder
6336 bind(done);
6337 }
6338
6339 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
6340 XMMRegister xtmp2, Register rtmp) {
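  // Illustrative scalar sketch of the non-GFNI path below (x treated as uint32_t): reverse
  // the bits inside each byte by swapping progressively larger bit groups, then reverse the
  // byte order.
  //   x = ((x & 0x55555555) << 1) | ((x & 0xAAAAAAAA) >> 1);   // swap adjacent bits
  //   x = ((x & 0x33333333) << 2) | ((x & 0xCCCCCCCC) >> 2);   // swap 2-bit pairs
  //   x = ((x & 0x0F0F0F0F) << 4) | ((x & 0xF0F0F0F0) >> 4);   // swap nibbles
  //   x = bswap(x);                                            // reverse byte order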
  if (VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal, following the algorithm described at
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6344 mov64(rtmp, 0x8040201008040201L);
6345 movq(xtmp1, src);
6346 movq(xtmp2, rtmp);
6347 gf2p8affineqb(xtmp1, xtmp2, 0);
6348 movq(dst, xtmp1);
6349 } else {
6350 // Swap even and odd numbered bits.
6351 movl(rtmp, src);
6352 andl(rtmp, 0x55555555);
6353 shll(rtmp, 1);
6354 movl(dst, src);
6355 andl(dst, 0xAAAAAAAA);
6356 shrl(dst, 1);
6357 orl(dst, rtmp);
6358
6359 // Swap LSB and MSB 2 bits of each nibble.
6360 movl(rtmp, dst);
6361 andl(rtmp, 0x33333333);
6362 shll(rtmp, 2);
6363 andl(dst, 0xCCCCCCCC);
6364 shrl(dst, 2);
6365 orl(dst, rtmp);
6366
6367 // Swap LSB and MSB 4 bits of each byte.
6368 movl(rtmp, dst);
6369 andl(rtmp, 0x0F0F0F0F);
6370 shll(rtmp, 4);
6371 andl(dst, 0xF0F0F0F0);
6372 shrl(dst, 4);
6373 orl(dst, rtmp);
6374 }
6375 bswapl(dst);
6376 }
6377
6378 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
6379 XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
  if (VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal, following the algorithm described at
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6383 mov64(rtmp1, 0x8040201008040201L);
6384 movq(xtmp1, src);
6385 movq(xtmp2, rtmp1);
6386 gf2p8affineqb(xtmp1, xtmp2, 0);
6387 movq(dst, xtmp1);
6388 } else {
6389 // Swap even and odd numbered bits.
6390 movq(rtmp1, src);
6391 mov64(rtmp2, 0x5555555555555555L);
6392 andq(rtmp1, rtmp2);
6393 shlq(rtmp1, 1);
6394 movq(dst, src);
6395 notq(rtmp2);
6396 andq(dst, rtmp2);
6397 shrq(dst, 1);
6398 orq(dst, rtmp1);
6399
6400 // Swap LSB and MSB 2 bits of each nibble.
6401 movq(rtmp1, dst);
6402 mov64(rtmp2, 0x3333333333333333L);
6403 andq(rtmp1, rtmp2);
6404 shlq(rtmp1, 2);
6405 notq(rtmp2);
6406 andq(dst, rtmp2);
6407 shrq(dst, 2);
6408 orq(dst, rtmp1);
6409
6410 // Swap LSB and MSB 4 bits of each byte.
6411 movq(rtmp1, dst);
6412 mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
6413 andq(rtmp1, rtmp2);
6414 shlq(rtmp1, 4);
6415 notq(rtmp2);
6416 andq(dst, rtmp2);
6417 shrq(dst, 4);
6418 orq(dst, rtmp1);
6419 }
6420 bswapq(dst);
6421 }
6422
6423 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
6424 Label done;
6425 Label neg_divisor_fastpath;
6426 cmpq(divisor, 0);
6427 jccb(Assembler::less, neg_divisor_fastpath);
6428 xorl(rdx, rdx);
6429 divq(divisor);
6430 jmpb(done);
6431 bind(neg_divisor_fastpath);
6432 // Fastpath for divisor < 0:
6433 // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6434 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
6435 movq(rdx, rax);
6436 subq(rdx, divisor);
6437 if (VM_Version::supports_bmi1()) {
6438 andnq(rax, rdx, rax);
6439 } else {
6440 notq(rdx);
6441 andq(rax, rdx);
6442 }
6443 shrq(rax, 63);
6444 bind(done);
6445 }
6446
6447 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
6448 Label done;
6449 Label neg_divisor_fastpath;
6450 cmpq(divisor, 0);
6451 jccb(Assembler::less, neg_divisor_fastpath);
6452 xorq(rdx, rdx);
6453 divq(divisor);
6454 jmp(done);
6455 bind(neg_divisor_fastpath);
6456 // Fastpath when divisor < 0:
6457 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6458 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6459 movq(rdx, rax);
6460 subq(rax, divisor);
6461 if (VM_Version::supports_bmi1()) {
6462 andnq(rax, rax, rdx);
6463 } else {
6464 notq(rax);
6465 andq(rax, rdx);
6466 }
6467 sarq(rax, 63);
6468 andq(rax, divisor);
6469 subq(rdx, rax);
6470 bind(done);
6471 }
6472
6473 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
6474 Label done;
6475 Label neg_divisor_fastpath;
6476 cmpq(divisor, 0);
6477 jccb(Assembler::less, neg_divisor_fastpath);
6478 xorq(rdx, rdx);
6479 divq(divisor);
6480 jmp(done);
6481 bind(neg_divisor_fastpath);
6482 // Fastpath for divisor < 0:
6483 // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6484 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6485 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6486 // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6487 movq(rdx, rax);
6488 subq(rax, divisor);
6489 if (VM_Version::supports_bmi1()) {
6490 andnq(rax, rax, rdx);
6491 } else {
6492 notq(rax);
6493 andq(rax, rdx);
6494 }
6495 movq(tmp, rax);
6496 shrq(rax, 63); // quotient
6497 sarq(tmp, 63);
6498 andq(tmp, divisor);
6499 subq(rdx, tmp); // remainder
6500 bind(done);
6501 }
6502
6503 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
6504 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
6505 int vlen_enc) {
6506 assert(VM_Version::supports_avx512bw(), "");
  // Byte shuffles are in-lane operations and indices are determined using the
  // lower 4 bits of each shuffle lane, thus all shuffle indices are
  // normalized to the index range 0-15. This works because indices that differ
  // by a multiple of 16 address the same relative position within a 128 bit
  // lane, i.e. the elements addressed by shuffle indices 16, 32 and 48 each
  // occupy the first position of their respective 128 bit lanes.
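  // Conceptually, for in-range shuffle indices this computes dst[i] = src[shuffle[i]] one
  // 128 bit source lane at a time: lane k of src is broadcast across the vector and selected
  // into the positions whose shuffle index lies in [16*k, 16*k + 16), using the normalized
  // in-lane index for the byte shuffle.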
6513 movl(rtmp, 16);
6514 evpbroadcastb(xtmp1, rtmp, vlen_enc);
6515
  // Compute a mask for the shuffle vector by comparing indices with the expression INDEX < 16,
  // broadcast the first 128 bit lane across the entire vector, shuffle the vector lanes using
  // the original shuffle indices, and move the shuffled lanes corresponding to a true
  // mask into the destination vector.
6520 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
6521 evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
6522 evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);
6523
  // Repeat the above steps with the lane comparison expression INDEX >= 16 && INDEX < 32,
  // broadcasting the second 128 bit lane.
6526 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
6527 vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
6528 evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
6529 evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
6530 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6531
  // Repeat the above steps with the lane comparison expression INDEX >= 32 && INDEX < 48,
  // broadcasting the third 128 bit lane.
6534 evpcmpb(ktmp, k0, shuffle, xtmp2, Assembler::nlt, true, vlen_enc);
6535 vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
6536 evpcmpb(ktmp, ktmp, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
6537 evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
6538 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6539
  // Repeat the above steps with the lane comparison expression INDEX >= 48 && INDEX < 64,
  // broadcasting the fourth 128 bit lane.
6542 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
6543 vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
6544 evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
6545 evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
6546 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6547 }
6548
6549 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
6550 XMMRegister shuffle, XMMRegister src, int vlen_enc) {
6551 if (vlen_enc == AVX_128bit) {
6552 vpermilps(dst, src, shuffle, vlen_enc);
6553 } else if (bt == T_INT) {
6554 vpermd(dst, shuffle, src, vlen_enc);
6555 } else {
6556 assert(bt == T_FLOAT, "");
6557 vpermps(dst, shuffle, src, vlen_enc);
6558 }
6559 }
6560
6561 void C2_MacroAssembler::efp16sh(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
6562 switch(opcode) {
6563 case Op_AddHF: vaddsh(dst, src1, src2); break;
6564 case Op_SubHF: vsubsh(dst, src1, src2); break;
6565 case Op_MulHF: vmulsh(dst, src1, src2); break;
6566 case Op_DivHF: vdivsh(dst, src1, src2); break;
6567 default: assert(false, "%s", NodeClassNames[opcode]); break;
6568 }
6569 }
6570
6571 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6572 switch(elem_bt) {
6573 case T_BYTE:
6574 if (ideal_opc == Op_SaturatingAddV) {
6575 vpaddsb(dst, src1, src2, vlen_enc);
6576 } else {
6577 assert(ideal_opc == Op_SaturatingSubV, "");
6578 vpsubsb(dst, src1, src2, vlen_enc);
6579 }
6580 break;
6581 case T_SHORT:
6582 if (ideal_opc == Op_SaturatingAddV) {
6583 vpaddsw(dst, src1, src2, vlen_enc);
6584 } else {
6585 assert(ideal_opc == Op_SaturatingSubV, "");
6586 vpsubsw(dst, src1, src2, vlen_enc);
6587 }
6588 break;
6589 default:
6590 fatal("Unsupported type %s", type2name(elem_bt));
6591 break;
6592 }
6593 }
6594
6595 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6596 switch(elem_bt) {
6597 case T_BYTE:
6598 if (ideal_opc == Op_SaturatingAddV) {
6599 vpaddusb(dst, src1, src2, vlen_enc);
6600 } else {
6601 assert(ideal_opc == Op_SaturatingSubV, "");
6602 vpsubusb(dst, src1, src2, vlen_enc);
6603 }
6604 break;
6605 case T_SHORT:
6606 if (ideal_opc == Op_SaturatingAddV) {
6607 vpaddusw(dst, src1, src2, vlen_enc);
6608 } else {
6609 assert(ideal_opc == Op_SaturatingSubV, "");
6610 vpsubusw(dst, src1, src2, vlen_enc);
6611 }
6612 break;
6613 default:
6614 fatal("Unsupported type %s", type2name(elem_bt));
6615 break;
6616 }
6617 }
6618
6619 void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6620 XMMRegister src2, KRegister ktmp, int vlen_enc) {
  // For unsigned subtraction, overflow happens when the magnitude of the second input is greater than that of the first input.
6622 // overflow_mask = Inp1 <u Inp2
6623 evpcmpu(elem_bt, ktmp, src2, src1, Assembler::lt, vlen_enc);
6624 // Res = overflow_mask ? Zero : INP1 - INP2 (non-commutative and non-associative)
6625 evmasked_op(elem_bt == T_INT ? Op_SubVI : Op_SubVL, elem_bt, ktmp, dst, src1, src2, false, vlen_enc, false);
6626 }
6627
6628 void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6629 XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
6630 // Emulate unsigned comparison using signed comparison
6631 // Mask = Inp1 <u Inp2 => Inp1 + MIN_VALUE < Inp2 + MIN_VALUE
6632 vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc, true);
6633 vpadd(elem_bt, xtmp2, src1, xtmp1, vlen_enc);
6634 vpadd(elem_bt, xtmp1, src2, xtmp1, vlen_enc);
6635
6636 vpcmpgt(elem_bt, xtmp2, xtmp1, xtmp2, vlen_enc);
6637
6638 // Res = INP1 - INP2 (non-commutative and non-associative)
6639 vpsub(elem_bt, dst, src1, src2, vlen_enc);
6640 // Res = Mask ? Zero : Res
6641 vpxor(xtmp1, xtmp1, xtmp1, vlen_enc);
6642 vpblendvb(dst, dst, xtmp1, xtmp2, vlen_enc);
6643 }
6644
6645 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6646 XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp, int vlen_enc) {
  // Unsigned value ranges comprise only non-negative numbers, thus only upper bound saturation exists.
6648 // overflow_mask = (SRC1 + SRC2) <u (SRC1 | SRC2)
6649 // Res = Signed Add INP1, INP2
6650 vpadd(elem_bt, dst, src1, src2, vlen_enc);
6651 // T1 = SRC1 | SRC2
6652 vpor(xtmp1, src1, src2, vlen_enc);
6653 // Max_Unsigned = -1
6654 vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6655 // Unsigned compare: Mask = Res <u T1
6656 evpcmpu(elem_bt, ktmp, dst, xtmp1, Assembler::lt, vlen_enc);
6657 // res = Mask ? Max_Unsigned : Res
6658 evpblend(elem_bt, dst, ktmp, dst, xtmp2, true, vlen_enc);
6659 }
6660
6661 //
6662 // Section 2-13 Hacker's Delight list following overflow detection check for saturating
6663 // unsigned addition operation.
6664 // overflow_mask = ((a & b) | ((a | b) & ~( a + b))) >>> 31 == 1
6665 //
6666 // We empirically determined its semantic equivalence to following reduced expression
6667 // overflow_mask = (a + b) <u (a | b)
6668 //
6669 // and also verified it though Alive2 solver.
6670 // (https://alive2.llvm.org/ce/z/XDQ7dY)
6671 //
6672
6673 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6674 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, int vlen_enc) {
6675 // Res = Signed Add INP1, INP2
6676 vpadd(elem_bt, dst, src1, src2, vlen_enc);
6677 // Compute T1 = INP1 | INP2
6678 vpor(xtmp3, src1, src2, vlen_enc);
  // xtmp2 = MIN_VALUE (minimum signed value).
  vpgenmin_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
  // Bias T1 for signed comparison, T1 = T1 + MIN_VALUE
  vpadd(elem_bt, xtmp3, xtmp3, xtmp2, vlen_enc);
  // Bias Res for signed comparison, Res<s> = Res + MIN_VALUE
  vpadd(elem_bt, xtmp2, xtmp2, dst, vlen_enc);
  // Compute overflow detection mask = Res<s> <s T1
6686 if (elem_bt == T_INT) {
6687 vpcmpgtd(xtmp3, xtmp3, xtmp2, vlen_enc);
6688 } else {
6689 assert(elem_bt == T_LONG, "");
6690 vpcmpgtq(xtmp3, xtmp3, xtmp2, vlen_enc);
6691 }
6692 vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
6693 }
6694
6695 void C2_MacroAssembler::evpmovq2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
6696 int vlen_enc, bool xtmp2_hold_M1) {
6697 if (VM_Version::supports_avx512dq()) {
6698 evpmovq2m(ktmp, src, vlen_enc);
6699 } else {
6700 assert(VM_Version::supports_evex(), "");
6701 if (!xtmp2_hold_M1) {
6702 vpternlogq(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6703 }
6704 evpsraq(xtmp1, src, 63, vlen_enc);
6705 evpcmpeqq(ktmp, k0, xtmp1, xtmp2, vlen_enc);
6706 }
6707 }
6708
6709 void C2_MacroAssembler::evpmovd2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
6710 int vlen_enc, bool xtmp2_hold_M1) {
6711 if (VM_Version::supports_avx512dq()) {
6712 evpmovd2m(ktmp, src, vlen_enc);
6713 } else {
6714 assert(VM_Version::supports_evex(), "");
6715 if (!xtmp2_hold_M1) {
6716 vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6717 }
6718 vpsrad(xtmp1, src, 31, vlen_enc);
6719 Assembler::evpcmpeqd(ktmp, k0, xtmp1, xtmp2, vlen_enc);
6720 }
6721 }
6722
6723
6724 void C2_MacroAssembler::vpsign_extend_dq(BasicType elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
6725 if (elem_bt == T_LONG) {
6726 if (VM_Version::supports_evex()) {
6727 evpsraq(dst, src, 63, vlen_enc);
6728 } else {
6729 vpsrad(dst, src, 31, vlen_enc);
6730 vpshufd(dst, dst, 0xF5, vlen_enc);
6731 }
6732 } else {
6733 assert(elem_bt == T_INT, "");
6734 vpsrad(dst, src, 31, vlen_enc);
6735 }
6736 }
6737
6738 void C2_MacroAssembler::vpgenmax_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
6739 if (compute_allones) {
6740 if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
6741 vpternlogd(allones, 0xff, allones, allones, vlen_enc);
6742 } else {
6743 vpcmpeqq(allones, allones, allones, vlen_enc);
6744 }
6745 }
6746 if (elem_bt == T_LONG) {
6747 vpsrlq(dst, allones, 1, vlen_enc);
6748 } else {
6749 assert(elem_bt == T_INT, "");
6750 vpsrld(dst, allones, 1, vlen_enc);
6751 }
6752 }
6753
6754 void C2_MacroAssembler::vpgenmin_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
6755 if (compute_allones) {
6756 if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
6757 vpternlogd(allones, 0xff, allones, allones, vlen_enc);
6758 } else {
6759 vpcmpeqq(allones, allones, allones, vlen_enc);
6760 }
6761 }
6762 if (elem_bt == T_LONG) {
6763 vpsllq(dst, allones, 63, vlen_enc);
6764 } else {
6765 assert(elem_bt == T_INT, "");
6766 vpslld(dst, allones, 31, vlen_enc);
6767 }
6768 }
6769
6770 void C2_MacroAssembler::evpcmpu(BasicType elem_bt, KRegister kmask, XMMRegister src1, XMMRegister src2,
6771 Assembler::ComparisonPredicate cond, int vlen_enc) {
6772 switch(elem_bt) {
6773 case T_LONG: evpcmpuq(kmask, src1, src2, cond, vlen_enc); break;
6774 case T_INT: evpcmpud(kmask, src1, src2, cond, vlen_enc); break;
6775 case T_SHORT: evpcmpuw(kmask, src1, src2, cond, vlen_enc); break;
6776 case T_BYTE: evpcmpub(kmask, src1, src2, cond, vlen_enc); break;
6777 default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6778 }
6779 }
6780
6781 void C2_MacroAssembler::vpcmpgt(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6782 switch(elem_bt) {
6783 case T_LONG: vpcmpgtq(dst, src1, src2, vlen_enc); break;
6784 case T_INT: vpcmpgtd(dst, src1, src2, vlen_enc); break;
6785 case T_SHORT: vpcmpgtw(dst, src1, src2, vlen_enc); break;
6786 case T_BYTE: vpcmpgtb(dst, src1, src2, vlen_enc); break;
6787 default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6788 }
6789 }
6790
6791 void C2_MacroAssembler::evpmov_vec_to_mask(BasicType elem_bt, KRegister ktmp, XMMRegister src, XMMRegister xtmp1,
6792 XMMRegister xtmp2, int vlen_enc, bool xtmp2_hold_M1) {
6793 if (elem_bt == T_LONG) {
6794 evpmovq2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6795 } else {
6796 assert(elem_bt == T_INT, "");
6797 evpmovd2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6798 }
6799 }
6800
6801 void C2_MacroAssembler::vector_addsub_dq_saturating_evex(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6802 XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
6803 KRegister ktmp1, KRegister ktmp2, int vlen_enc) {
6804 assert(elem_bt == T_INT || elem_bt == T_LONG, "");
  // Addition/Subtraction happens over the two's complement representation of numbers and is agnostic to signedness.
  // Overflow detection is based on Hacker's Delight, section 2-13.
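  // Illustrative scalar sketch (shown for T_INT; T_LONG is analogous with 64-bit MIN/MAX):
  //   int res  = add ? (a + b) : (a - b);                      // wrapping two's complement arithmetic
  //   bool ovf = add ? (((res ^ a) & (res ^ b)) < 0)
  //                  : (((a ^ b) & (res ^ a)) < 0);
  //   if (ovf) res = (a < 0) ? INT_MIN : INT_MAX;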
6807 if (ideal_opc == Op_SaturatingAddV) {
6808 // res = src1 + src2
6809 vpadd(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs if both inputs have the same sign and the result's sign differs from it.
    // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
6812 vpxor(xtmp1, dst, src1, vlen_enc);
6813 vpxor(xtmp2, dst, src2, vlen_enc);
6814 vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6815 } else {
6816 assert(ideal_opc == Op_SaturatingSubV, "");
6817 // res = src1 - src2
6818 vpsub(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs when the inputs have opposite signs and
    // the result's sign differs from the first input's sign.
    // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1;
6822 vpxor(xtmp1, src1, src2, vlen_enc);
6823 vpxor(xtmp2, dst, src1, vlen_enc);
6824 vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6825 }
6826
6827 // Compute overflow detection mask.
6828 evpmov_vec_to_mask(elem_bt, ktmp1, xtmp2, xtmp2, xtmp1, vlen_enc);
  // Note: xtmp1 holds -1 in all its lanes after the above call.
6830
6831 // Compute mask based on first input polarity.
6832 evpmov_vec_to_mask(elem_bt, ktmp2, src1, xtmp2, xtmp1, vlen_enc, true);
6833
6834 vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
6835 vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);
6836
  // Compose a vector of saturating (MAX/MIN) values, where lanes corresponding to
  // set bits in the first input polarity mask hold the min value.
6839 evpblend(elem_bt, xtmp2, ktmp2, xtmp2, xtmp1, true, vlen_enc);
6840 // Blend destination lanes with saturated values using overflow detection mask.
6841 evpblend(elem_bt, dst, ktmp1, dst, xtmp2, true, vlen_enc);
6842 }
6843
6844
6845 void C2_MacroAssembler::vector_addsub_dq_saturating_avx(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6846 XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
6847 XMMRegister xtmp3, XMMRegister xtmp4, int vlen_enc) {
6848 assert(elem_bt == T_INT || elem_bt == T_LONG, "");
  // Addition/Subtraction happens over the two's complement representation of numbers and is agnostic to signedness.
  // Overflow detection is based on Hacker's Delight, section 2-13.
6851 if (ideal_opc == Op_SaturatingAddV) {
6852 // res = src1 + src2
6853 vpadd(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs if both inputs have the same sign and the result's sign differs from it.
    // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
6856 vpxor(xtmp1, dst, src1, vlen_enc);
6857 vpxor(xtmp2, dst, src2, vlen_enc);
6858 vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6859 } else {
6860 assert(ideal_opc == Op_SaturatingSubV, "");
6861 // res = src1 - src2
6862 vpsub(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs when the inputs have opposite signs and
    // the result's sign differs from the first input's sign.
    // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1;
6866 vpxor(xtmp1, src1, src2, vlen_enc);
6867 vpxor(xtmp2, dst, src1, vlen_enc);
6868 vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6869 }
6870
6871 // Sign-extend to compute overflow detection mask.
6872 vpsign_extend_dq(elem_bt, xtmp3, xtmp2, vlen_enc);
6873
6874 vpcmpeqd(xtmp1, xtmp1, xtmp1, vlen_enc);
6875 vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc);
6876 vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);
6877
6878 // Compose saturating min/max vector using first input polarity mask.
6879 vpsign_extend_dq(elem_bt, xtmp4, src1, vlen_enc);
6880 vpblendvb(xtmp1, xtmp2, xtmp1, xtmp4, vlen_enc);
6881
6882 // Blend result with saturating vector using overflow detection mask.
6883 vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
6884 }
6885
6886 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
6887 switch(elem_bt) {
6888 case T_BYTE:
6889 if (ideal_opc == Op_SaturatingAddV) {
6890 vpaddsb(dst, src1, src2, vlen_enc);
6891 } else {
6892 assert(ideal_opc == Op_SaturatingSubV, "");
6893 vpsubsb(dst, src1, src2, vlen_enc);
6894 }
6895 break;
6896 case T_SHORT:
6897 if (ideal_opc == Op_SaturatingAddV) {
6898 vpaddsw(dst, src1, src2, vlen_enc);
6899 } else {
6900 assert(ideal_opc == Op_SaturatingSubV, "");
6901 vpsubsw(dst, src1, src2, vlen_enc);
6902 }
6903 break;
6904 default:
6905 fatal("Unsupported type %s", type2name(elem_bt));
6906 break;
6907 }
6908 }
6909
6910 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
6911 switch(elem_bt) {
6912 case T_BYTE:
6913 if (ideal_opc == Op_SaturatingAddV) {
6914 vpaddusb(dst, src1, src2, vlen_enc);
6915 } else {
6916 assert(ideal_opc == Op_SaturatingSubV, "");
6917 vpsubusb(dst, src1, src2, vlen_enc);
6918 }
6919 break;
6920 case T_SHORT:
6921 if (ideal_opc == Op_SaturatingAddV) {
6922 vpaddusw(dst, src1, src2, vlen_enc);
6923 } else {
6924 assert(ideal_opc == Op_SaturatingSubV, "");
6925 vpsubusw(dst, src1, src2, vlen_enc);
6926 }
6927 break;
6928 default:
6929 fatal("Unsupported type %s", type2name(elem_bt));
6930 break;
6931 }
6932 }
6933
6934 void C2_MacroAssembler::select_from_two_vectors_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6935 XMMRegister src2, int vlen_enc) {
6936 switch(elem_bt) {
6937 case T_BYTE:
6938 evpermi2b(dst, src1, src2, vlen_enc);
6939 break;
6940 case T_SHORT:
6941 evpermi2w(dst, src1, src2, vlen_enc);
6942 break;
6943 case T_INT:
6944 evpermi2d(dst, src1, src2, vlen_enc);
6945 break;
6946 case T_LONG:
6947 evpermi2q(dst, src1, src2, vlen_enc);
6948 break;
6949 case T_FLOAT:
6950 evpermi2ps(dst, src1, src2, vlen_enc);
6951 break;
6952 case T_DOUBLE:
6953 evpermi2pd(dst, src1, src2, vlen_enc);
6954 break;
6955 default:
6956 fatal("Unsupported type %s", type2name(elem_bt));
6957 break;
6958 }
6959 }
6960
6961 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, bool is_unsigned, int vlen_enc) {
6962 if (is_unsigned) {
6963 vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
6964 } else {
6965 vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
6966 }
6967 }
6968
6969 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, bool is_unsigned, int vlen_enc) {
6970 if (is_unsigned) {
6971 vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
6972 } else {
6973 vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
6974 }
6975 }
6976
6977 void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6978 switch(opcode) {
6979 case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
6980 case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
6981 case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
6982 case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
6983 default: assert(false, "%s", NodeClassNames[opcode]); break;
6984 }
6985 }
6986
6987 void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
6988 switch(opcode) {
6989 case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
6990 case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
6991 case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
6992 case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
6993 default: assert(false, "%s", NodeClassNames[opcode]); break;
6994 }
6995 }
6996
6997 void C2_MacroAssembler::scalar_max_min_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6998 KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2) {
6999 vector_max_min_fp16(opcode, dst, src1, src2, ktmp, xtmp1, xtmp2, Assembler::AVX_128bit);
7000 }
7001
7002 void C2_MacroAssembler::vector_max_min_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
7003 KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
7004 if (opcode == Op_MaxVHF || opcode == Op_MaxHF) {
7005 // Move sign bits of src2 to mask register.
7006 evpmovw2m(ktmp, src2, vlen_enc);
7007 // xtmp1 = src2 < 0 ? src2 : src1
7008 evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
    // xtmp2 = src2 < 0 ? src1 : src2
7010 evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
    // The idea behind the above swapping is to make the second source operand a positive value.
    // As per the instruction semantics, if the values being compared are both 0.0 (of either sign), the value in
    // the second source operand is returned. If only one value is a NaN (SNaN or QNaN) for this instruction,
    // the second source operand, either a NaN or a valid floating-point value, is returned.
7015 // dst = max(xtmp1, xtmp2)
7016 evmaxph(dst, xtmp1, xtmp2, vlen_enc);
7017 // isNaN = is_unordered_quiet(xtmp1)
7018 evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
    // The final result is the same as the first source if it is a NaN value;
    // in case the second operand holds a NaN value then, as per the above semantics,
    // the result is the same as the second operand.
7022 Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
7023 } else {
7024 assert(opcode == Op_MinVHF || opcode == Op_MinHF, "");
7025 // Move sign bits of src1 to mask register.
7026 evpmovw2m(ktmp, src1, vlen_enc);
7027 // xtmp1 = src1 < 0 ? src2 : src1
7028 evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
7029 // xtmp2 = src1 < 0 ? src1 : src2
7030 evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
    // The idea behind the above swapping is to make the second source operand a negative value.
    // As per the instruction semantics, if the values being compared are both 0.0 (of either sign), the value in
    // the second source operand is returned.
    // If only one value is a NaN (SNaN or QNaN) for this instruction, the second source operand, either a NaN
    // or a valid floating-point value, is written to the result.
7036 // dst = min(xtmp1, xtmp2)
7037 evminph(dst, xtmp1, xtmp2, vlen_enc);
7038 // isNaN = is_unordered_quiet(xtmp1)
7039 evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
    // The final result is the same as the first source if it is a NaN value;
    // in case the second operand holds a NaN value then, as per the above semantics,
    // the result is the same as the second operand.
7043 Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
7044 }
7045 }