1 /*
2 * Copyright (c) 2020, 2026, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
24
25 #include "asm/assembler.hpp"
26 #include "asm/assembler.inline.hpp"
27 #include "gc/shared/barrierSet.hpp"
28 #include "gc/shared/barrierSetAssembler.hpp"
29 #include "oops/methodData.hpp"
30 #include "opto/c2_MacroAssembler.hpp"
31 #include "opto/intrinsicnode.hpp"
32 #include "opto/output.hpp"
33 #include "opto/opcodes.hpp"
34 #include "opto/subnode.hpp"
35 #include "runtime/globals.hpp"
36 #include "runtime/objectMonitor.hpp"
37 #include "runtime/stubRoutines.hpp"
38 #include "utilities/checkedCast.hpp"
39 #include "utilities/globalDefinitions.hpp"
40 #include "utilities/powerOfTwo.hpp"
41 #include "utilities/sizes.hpp"
42
43 #ifdef PRODUCT
44 #define BLOCK_COMMENT(str) /* nothing */
45 #define STOP(error) stop(error)
46 #else
47 #define BLOCK_COMMENT(str) block_comment(str)
48 #define STOP(error) block_comment(error); stop(error)
49 #endif
50
// C2 compiled method's prolog code.
// Emits the frame setup for a C2-compiled method: optional stack bang,
// save of rbp, allocation of the fixed frame, optional stack-depth and
// alignment checks, and (unless compiling a stub) the nmethod entry
// barrier. Both 'framesize' and 'stack_bang_size' arrive including the
// return-address slot, which is subtracted up front.
// NOTE(review): 'fp_mode_24b' is unused in this body — presumably kept
// for signature compatibility with other platforms; confirm.
void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {
  assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");

  assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  // Remove word for return addr
  framesize -= wordSize;
  stack_bang_size -= wordSize;

  // Calls to C2R adapters often do not accept exceptional returns.
  // We require that their callers must bang for them. But be careful, because
  // some VM calls (such as call site linkage) can use several kilobytes of
  // stack. But the stack safety zone should account for that.
  // See bugs 4446381, 4468289, 4497237.
  if (stack_bang_size > 0) {
    generate_stack_overflow_check(stack_bang_size);

    // We always push rbp, so that on return to interpreter rbp, will be
    // restored correctly and we can correct the stack.
    push(rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      mov(rbp, rsp);
    }
    // Remove word for ebp
    framesize -= wordSize;

    // Create frame
    if (framesize) {
      subptr(rsp, framesize);
    }
  } else {
    // No stack bang: allocate the whole frame first, then store rbp into
    // its slot at the top of the allocated frame.
    subptr(rsp, framesize);

    // Save RBP register now.
    framesize -= wordSize;
    movptr(Address(rsp, framesize), rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      movptr(rbp, rsp);
      if (framesize > 0) {
        addptr(rbp, framesize);
      }
    }
  }

  if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
    framesize -= wordSize;
    movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
  }

#ifdef ASSERT
  if (VerifyStackAtCalls) {
    // Verify rsp is StackAlignmentInBytes-aligned modulo the return-address
    // word; rax is preserved around the check via push/pop.
    Label L;
    push(rax);
    mov(rax, rsp);
    andptr(rax, StackAlignmentInBytes-1);
    cmpptr(rax, StackAlignmentInBytes-wordSize);
    pop(rax);
    jcc(Assembler::equal, L);
    STOP("Stack is not properly aligned!");
    bind(L);
  }
#endif

  if (!is_stub) {
    BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
    // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
    Label dummy_slow_path;
    Label dummy_continuation;
    Label* slow_path = &dummy_slow_path;
    Label* continuation = &dummy_continuation;
    if (!Compile::current()->output()->in_scratch_emit_size()) {
      // Use real labels from actual stub when not emitting code for the purpose of measuring its size
      C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
      Compile::current()->output()->add_stub(stub);
      slow_path = &stub->entry();
      continuation = &stub->continuation();
    }
    bs->nmethod_entry_barrier(this, slow_path, continuation);
  }
}
133
134 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
135 switch (vlen_in_bytes) {
136 case 4: // fall-through
137 case 8: // fall-through
138 case 16: return Assembler::AVX_128bit;
139 case 32: return Assembler::AVX_256bit;
140 case 64: return Assembler::AVX_512bit;
141
142 default: {
143 ShouldNotReachHere();
144 return Assembler::AVX_NoVec;
145 }
146 }
147 }
148
149 // fast_lock and fast_unlock used by C2
150
151 // Because the transitions from emitted code to the runtime
152 // monitorenter/exit helper stubs are so slow it's critical that
153 // we inline both the stack-locking fast path and the inflated fast path.
154 //
155 // See also: cmpFastLock and cmpFastUnlock.
156 //
157 // What follows is a specialized inline transliteration of the code
158 // in enter() and exit(). If we're concerned about I$ bloat another
159 // option would be to emit TrySlowEnter and TrySlowExit methods
160 // at startup-time. These methods would accept arguments as
161 // (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
162 // indications in the icc.ZFlag. fast_lock and fast_unlock would simply
163 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
164 // In practice, however, the # of lock sites is bounded and is usually small.
165 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
169 //
170 // An even better approach would be write "MonitorEnter()" and "MonitorExit()"
171 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
172 // to those specialized methods. That'd give us a mostly platform-independent
173 // implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
175 // to park() or unpark() threads. We'd also need a few more unsafe operators
176 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
177 // (b) explicit barriers or fence operations.
178 //
179 // TODO:
180 //
181 // * Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
182 // This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
183 // Given TLAB allocation, Self is usually manifested in a register, so passing it into
184 // the lock operators would typically be faster than reifying Self.
185 //
186 // * Ideally I'd define the primitives as:
187 // fast_lock (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
188 // fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
189 // Unfortunately ADLC bugs prevent us from expressing the ideal form.
190 // Instead, we're stuck with a rather awkward and brittle register assignments below.
191 // Furthermore the register assignments are overconstrained, possibly resulting in
192 // sub-optimal code near the synchronization site.
193 //
194 // * Eliminate the sp-proximity tests and just use "== Self" tests instead.
195 // Alternately, use a better sp-proximity test.
196 //
197 // * Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
198 // Either one is sufficient to uniquely identify a thread.
199 // TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
200 //
201 // * Intrinsify notify() and notifyAll() for the common cases where the
202 // object is locked by the calling thread but the waitlist is empty.
203 // avoid the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
204 //
205 // * use jccb and jmpb instead of jcc and jmp to improve code density.
206 // But beware of excessive branch density on AMD Opterons.
207 //
208 // * Both fast_lock and fast_unlock set the ICC.ZF to indicate success
209 // or failure of the fast path. If the fast path fails then we pass
210 // control to the slow path, typically in C. In fast_lock and
211 // fast_unlock we often branch to DONE_LABEL, just to find that C2
212 // will emit a conditional branch immediately after the node.
213 // So we have branches to branches and lots of ICC.ZF games.
214 // Instead, it might be better to have C2 pass a "FailureLabel"
215 // into fast_lock and fast_unlock. In the case of success, control
216 // will drop through the node. ICC.ZF is undefined at exit.
217 // In the case of failure, the node will branch directly to the
218 // FailureLabel
219
220
// Emit the C2 fast-path monitor enter.
// On exit ZF == 1 iff the lock was acquired; ZF == 0 sends the caller to
// the slow path (see cmpFastLock in the .ad file).
//
// obj: object to lock
// box: on-stack box address -- KILLED
// rax: tmp -- KILLED (rax is required by cmpxchg)
// t : tmp -- KILLED
void C2_MacroAssembler::fast_lock(Register obj, Register box, Register rax_reg,
                                  Register t, Register thread) {
  assert(rax_reg == rax, "Used for CAS");
  assert_different_registers(obj, box, rax_reg, t, thread);

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. ZF value is irrelevant.
  Label locked;
  // Finish fast lock unsuccessfully. MUST jump with ZF == 0
  Label slow_path;

  if (UseObjectMonitorTable) {
    // Clear cache in case fast locking succeeds or we need to take the slow-path.
    movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), 0);
  }

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    // Value-based classes must not be synchronized on; divert to the slow
    // path so the runtime can diagnose/report it.
    load_klass(rax_reg, obj, t);
    testb(Address(rax_reg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
    jcc(Assembler::notZero, slow_path);
  }

  const Register mark = t;

  { // Fast Lock

    Label push;

    const Register top = UseObjectMonitorTable ? rax_reg : box;

    // Load the mark.
    movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));

    // Prefetch top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    // Check for monitor (0b10).
    testptr(mark, markWord::monitor_value);
    jcc(Assembler::notZero, inflated);

    // Check if lock-stack is full.
    cmpl(top, LockStack::end_offset() - 1);
    jcc(Assembler::greater, slow_path);

    // Check if recursive.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    jccb(Assembler::equal, push);

    // Try to lock. Transition lock bits 0b01 => 0b00
    // rax_reg = expected mark (unlocked bit set), mark = desired (bit cleared).
    movptr(rax_reg, mark);
    orptr(rax_reg, markWord::unlocked_value);
    andptr(mark, ~(int32_t)markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    jcc(Assembler::notEqual, slow_path);

    if (UseObjectMonitorTable) {
      // Need to reload top, clobbered by CAS.
      movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
    }
    bind(push);
    // After successful lock, push object on lock-stack.
    movptr(Address(thread, top), obj);
    addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
    jmpb(locked);
  }

  { // Handle inflated monitor.
    bind(inflated);

    const Register monitor = t;

    if (!UseObjectMonitorTable) {
      assert(mark == monitor, "should be the same here");
    } else {
      // Uses ObjectMonitorTable. Look for the monitor in the om_cache.
      // Fetch ObjectMonitor* from the cache or take the slow-path.
      Label monitor_found;

      // Load cache address
      lea(t, Address(thread, JavaThread::om_cache_oops_offset()));

      // Unroll the first few probes to avoid the loop overhead for the
      // common case of a hit near the front of the cache.
      const int num_unrolled = 2;
      for (int i = 0; i < num_unrolled; i++) {
        cmpptr(obj, Address(t));
        jccb(Assembler::equal, monitor_found);
        increment(t, in_bytes(OMCache::oop_to_oop_difference()));
      }

      Label loop;

      // Search for obj in cache.
      bind(loop);

      // Check for match.
      cmpptr(obj, Address(t));
      jccb(Assembler::equal, monitor_found);

      // Search until null encountered, guaranteed _null_sentinel at end.
      cmpptr(Address(t), 1);
      jcc(Assembler::below, slow_path); // 0 check, but with ZF=0 when *t == 0
      increment(t, in_bytes(OMCache::oop_to_oop_difference()));
      jmpb(loop);

      // Cache hit.
      bind(monitor_found);
      movptr(monitor, Address(t, OMCache::oop_to_monitor_difference()));
    }
    // Without the table, 'monitor' still carries the mark-word tag bits
    // (0b10), which are folded into the offsets below.
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address recursions_address(monitor, ObjectMonitor::recursions_offset() - monitor_tag);
    const Address owner_address(monitor, ObjectMonitor::owner_offset() - monitor_tag);

    Label monitor_locked;
    // Lock the monitor.

    if (UseObjectMonitorTable) {
      // Cache the monitor for unlock before trashing box. On failure to acquire
      // the lock, the slow path will reset the entry accordingly (see CacheSetter).
      movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), monitor);
    }

    // Try to CAS owner (no owner => current thread's _monitor_owner_id).
    xorptr(rax_reg, rax_reg);
    movptr(box, Address(thread, JavaThread::monitor_owner_id_offset()));
    lock(); cmpxchgptr(box, owner_address);
    jccb(Assembler::equal, monitor_locked);

    // Check if recursive.
    // On CAS failure rax_reg holds the current owner; equal to our id
    // means we already own the monitor.
    cmpptr(box, rax_reg);
    jccb(Assembler::notEqual, slow_path);

    // Recursive.
    increment(recursions_address);

    bind(monitor_locked);
  }

  bind(locked);
  // Set ZF = 1
  xorl(rax_reg, rax_reg);

#ifdef ASSERT
  // Check that locked label is reached with ZF set.
  Label zf_correct;
  Label zf_bad_zero;
  jcc(Assembler::zero, zf_correct);
  jmp(zf_bad_zero);
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with ZF not set.
  jcc(Assembler::notZero, zf_correct);
  stop("Fast Lock ZF != 0");
  bind(zf_bad_zero);
  stop("Fast Lock ZF != 1");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}
385
// obj: object to unlock
387 // rax: tmp -- KILLED
388 // t : tmp - cannot be obj nor rax -- KILLED
389 //
390 // Some commentary on balanced locking:
391 //
392 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
393 // Methods that don't have provably balanced locking are forced to run in the
394 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
395 // The interpreter provides two properties:
396 // I1: At return-time the interpreter automatically and quietly unlocks any
397 // objects acquired in the current activation (frame). Recall that the
398 // interpreter maintains an on-stack list of locks currently held by
399 // a frame.
400 // I2: If a method attempts to unlock an object that is not held by the
401 // frame the interpreter throws IMSX.
402 //
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
404 // B() doesn't have provably balanced locking so it runs in the interpreter.
405 // Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O
406 // is still locked by A().
407 //
408 // The only other source of unbalanced locking would be JNI. The "Java Native Interface
409 // Specification" states that an object locked by JNI's MonitorEnter should not be
410 // unlocked by "normal" java-level locking and vice-versa. The specification doesn't
411 // specify what will occur if a program engages in such mixed-mode locking, however.
412 // Arguably given that the spec legislates the JNI case as undefined our implementation
413 // could reasonably *avoid* checking owner in fast_unlock().
414 // In the interest of performance we elide m->Owner==Self check in unlock.
415 // A perfectly viable alternative is to elide the owner check except when
416 // Xcheck:jni is enabled.
417
// Emit the C2 fast-path monitor exit.
// On exit ZF == 1 iff the unlock succeeded; ZF == 0 sends the caller to
// the slow path (see cmpFastUnlock in the .ad file).
void C2_MacroAssembler::fast_unlock(Register obj, Register reg_rax, Register t, Register thread) {
  assert(reg_rax == rax, "Used for CAS");
  assert_different_registers(obj, reg_rax, t);

  // Handle inflated monitor.
  Label inflated, inflated_check_lock_stack;
  // Finish fast unlock successfully. MUST jump with ZF == 1
  Label unlocked, slow_path;

  // Register aliases: 't' serves as mark, monitor and (with the monitor
  // table) top at different points; reg_rax doubles as the box pointer.
  const Register mark = t;
  const Register monitor = t;
  const Register top = UseObjectMonitorTable ? t : reg_rax;
  const Register box = reg_rax;

  Label dummy;
  C2FastUnlockStub* stub = nullptr;

  if (!Compile::current()->output()->in_scratch_emit_size()) {
    // Out-of-line stub hosting the push-and-slow path; skipped when only
    // measuring code size.
    stub = new (Compile::current()->comp_arena()) C2FastUnlockStub(obj, mark, reg_rax, thread);
    Compile::current()->output()->add_stub(stub);
  }

  Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();

  { // Fast Unlock

    // Load top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    if (!UseObjectMonitorTable) {
      // Prefetch mark.
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }

    // Check if obj is top of lock-stack.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    // Top of lock stack was not obj. Must be monitor.
    jcc(Assembler::notEqual, inflated_check_lock_stack);

    // Pop lock-stack.
    DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
    subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);

    // Check if recursive.
    cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
    jcc(Assembler::equal, unlocked);

    // We elide the monitor check, let the CAS fail instead.

    if (UseObjectMonitorTable) {
      // Load mark.
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }

    // Try to unlock. Transition lock bits 0b00 => 0b01
    // reg_rax = expected mark (lock bits cleared), mark = desired (unlocked).
    movptr(reg_rax, mark);
    andptr(reg_rax, ~(int32_t)markWord::lock_mask);
    orptr(mark, markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    // CAS failure means the mark changed (e.g. inflated); the stub re-pushes
    // obj onto the lock-stack and takes the slow path.
    jcc(Assembler::notEqual, push_and_slow_path);
    jmp(unlocked);
  }


  { // Handle inflated monitor.
    bind(inflated_check_lock_stack);
#ifdef ASSERT
    // Debug-only: verify obj is not anywhere on the lock-stack when we
    // believe the lock is inflated.
    Label check_done;
    subl(top, oopSize);
    cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
    jcc(Assembler::below, check_done);
    cmpptr(obj, Address(thread, top));
    jccb(Assembler::notEqual, inflated_check_lock_stack);
    stop("Fast Unlock lock on stack");
    bind(check_done);
    if (UseObjectMonitorTable) {
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }
    testptr(mark, markWord::monitor_value);
    jccb(Assembler::notZero, inflated);
    stop("Fast Unlock not monitor");
#endif

    bind(inflated);

    if (!UseObjectMonitorTable) {
      assert(mark == monitor, "should be the same here");
    } else {
      // Uses ObjectMonitorTable. Look for the monitor in our BasicLock on the stack.
      movptr(monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
      // null check with ZF == 0, no valid pointer below alignof(ObjectMonitor*)
      cmpptr(monitor, alignof(ObjectMonitor*));
      jcc(Assembler::below, slow_path);
    }
    // Without the table, 'monitor' still carries the mark-word tag bits
    // (0b10), which are folded into the offsets below.
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag};
    const Address succ_address{monitor, ObjectMonitor::succ_offset() - monitor_tag};
    const Address entry_list_address{monitor, ObjectMonitor::entry_list_offset() - monitor_tag};
    const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag};

    Label recursive;

    // Check if recursive.
    cmpptr(recursions_address, 0);
    jccb(Assembler::notZero, recursive);

    // Set owner to null.
    // Release to satisfy the JMM
    movptr(owner_address, NULL_WORD);
    // We need a full fence after clearing owner to avoid stranding.
    // StoreLoad achieves this.
    membar(StoreLoad);

    // Check if the entry_list is empty.
    cmpptr(entry_list_address, NULL_WORD);
    jccb(Assembler::zero, unlocked); // If so we are done.

    // Check if there is a successor.
    cmpptr(succ_address, NULL_WORD);
    jccb(Assembler::notZero, unlocked); // If so we are done.

    // Save the monitor pointer in the current thread, so we can try to
    // reacquire the lock in SharedRuntime::monitor_exit_helper().
    if (!UseObjectMonitorTable) {
      // Strip the mark-word tag to store an untagged ObjectMonitor*.
      andptr(monitor, ~(int32_t)markWord::monitor_value);
    }
    movptr(Address(thread, JavaThread::unlocked_inflated_monitor_offset()), monitor);

    orl(t, 1); // Fast Unlock ZF = 0
    jmpb(slow_path);

    // Recursive unlock.
    bind(recursive);
    decrement(recursions_address);
  }

  bind(unlocked);
  xorl(t, t); // Fast Unlock ZF = 1

#ifdef ASSERT
  // Check that unlocked label is reached with ZF set.
  Label zf_correct;
  Label zf_bad_zero;
  jcc(Assembler::zero, zf_correct);
  jmp(zf_bad_zero);
#endif

  bind(slow_path);
  if (stub != nullptr) {
    bind(stub->slow_path_continuation());
  }
#ifdef ASSERT
  // Check that stub->continuation() label is reached with ZF not set.
  jcc(Assembler::notZero, zf_correct);
  stop("Fast Unlock ZF != 0");
  bind(zf_bad_zero);
  stop("Fast Unlock ZF != 1");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}
579
// Runtime helper called from verify_int_in_range() when a CastII value
// falls outside its type range; aborts the VM reporting the node index,
// the offending value and the expected [lo, hi] bounds.
static void abort_verify_int_in_range(uint idx, jint val, jint lo, jint hi) {
  fatal("Invalid CastII, idx: %u, val: %d, lo: %d, hi: %d", idx, val, lo, hi);
}
583
584 static void reconstruct_frame_pointer_helper(MacroAssembler* masm, Register dst) {
585 const int framesize = Compile::current()->output()->frame_size_in_bytes();
586 masm->movptr(dst, rsp);
587 if (framesize > 2 * wordSize) {
588 masm->addptr(dst, framesize - 2 * wordSize);
589 }
590 }
591
// Ensure rbp holds a valid frame pointer for the current compiled frame.
// With PreserveFramePointer rbp is already valid, so debug builds only
// verify it (using 'rtmp' as scratch); otherwise rbp is recomputed from
// rsp and the frame size.
void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) {
  if (PreserveFramePointer) {
    // frame pointer is valid
#ifdef ASSERT
    // Verify frame pointer value in rbp.
    reconstruct_frame_pointer_helper(this, rtmp);
    Label L_success;
    cmpq(rbp, rtmp);
    jccb(Assembler::equal, L_success);
    STOP("frame pointer mismatch");
    bind(L_success);
#endif // ASSERT
  } else {
    reconstruct_frame_pointer_helper(this, rbp);
  }
}
608
// Emit a runtime range check for a CastII node: verifies that 'val' lies
// within [t->_lo, t->_hi]. On violation the (never-returning) helper
// abort_verify_int_in_range() is called, followed by hlt().
// c_rarg0..c_rarg3 and rscratch1 are clobbered only on the failure path.
void C2_MacroAssembler::verify_int_in_range(uint idx, const TypeInt* t, Register val) {
  jint lo = t->_lo;
  jint hi = t->_hi;
  assert(lo < hi, "type should not be empty or constant, idx: %u, lo: %d, hi: %d", idx, lo, hi);
  if (t == TypeInt::INT) {
    // Full int range — nothing can be out of bounds.
    return;
  }

  BLOCK_COMMENT("CastII {");
  Label fail;
  Label succeed;

  // A bound equal to the type's extreme value can never fail; skip it.
  if (lo != min_jint) {
    cmpl(val, lo);
    jccb(Assembler::less, fail);
  }
  if (hi != max_jint) {
    cmpl(val, hi);
    jccb(Assembler::greater, fail);
  }
  jmpb(succeed);

  bind(fail);
  movl(c_rarg0, idx);
  movl(c_rarg1, val);
  movl(c_rarg2, lo);
  movl(c_rarg3, hi);
  // Make rbp valid before calling into the VM (presumably needed for
  // stack walking during error reporting — see reconstruct_frame_pointer).
  reconstruct_frame_pointer(rscratch1);
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_int_in_range)));
  hlt();
  bind(succeed);
  BLOCK_COMMENT("} // CastII");
}
642
// Runtime helper called from verify_long_in_range() when a CastLL value
// falls outside its type range; aborts the VM reporting the node index,
// the offending value and the expected [lo, hi] bounds.
static void abort_verify_long_in_range(uint idx, jlong val, jlong lo, jlong hi) {
  fatal("Invalid CastLL, idx: %u, val: " JLONG_FORMAT ", lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, val, lo, hi);
}
646
// Emit a runtime range check for a CastLL node: verifies that 'val' lies
// within [t->_lo, t->_hi]. On violation the (never-returning) helper
// abort_verify_long_in_range() is called, followed by hlt().
// 'tmp' is used only to materialize bounds that do not fit in simm32;
// c_rarg0..c_rarg3 and rscratch1 are clobbered only on the failure path.
void C2_MacroAssembler::verify_long_in_range(uint idx, const TypeLong* t, Register val, Register tmp) {
  jlong lo = t->_lo;
  jlong hi = t->_hi;
  assert(lo < hi, "type should not be empty or constant, idx: %u, lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, lo, hi);
  if (t == TypeLong::LONG) {
    // Full long range — nothing can be out of bounds.
    return;
  }

  BLOCK_COMMENT("CastLL {");
  Label fail;
  Label succeed;

  // Compare 'val' against a 64-bit bound: immediate form when the bound
  // fits in a sign-extended 32-bit immediate, otherwise via 'tmp'.
  auto cmp_val = [&](jlong bound) {
    if (is_simm32(bound)) {
      cmpq(val, checked_cast<int>(bound));
    } else {
      mov64(tmp, bound);
      cmpq(val, tmp);
    }
  };

  // A bound equal to the type's extreme value can never fail; skip it.
  if (lo != min_jlong) {
    cmp_val(lo);
    jccb(Assembler::less, fail);
  }
  if (hi != max_jlong) {
    cmp_val(hi);
    jccb(Assembler::greater, fail);
  }
  jmpb(succeed);

  bind(fail);
  movl(c_rarg0, idx);
  movq(c_rarg1, val);
  mov64(c_rarg2, lo);
  mov64(c_rarg3, hi);
  // Make rbp valid before calling into the VM (presumably needed for
  // stack walking during error reporting — see reconstruct_frame_pointer).
  reconstruct_frame_pointer(rscratch1);
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_long_in_range)));
  hlt();
  bind(succeed);
  BLOCK_COMMENT("} // CastLL");
}
689
690 //-------------------------------------------------------------------------------------------
691 // Generic instructions support for use in .ad files C2 code generation
692
693 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
694 if (dst != src) {
695 movdqu(dst, src);
696 }
697 if (opcode == Op_AbsVD) {
698 andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
699 } else {
700 assert((opcode == Op_NegVD),"opcode should be Op_NegD");
701 xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
702 }
703 }
704
705 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
706 if (opcode == Op_AbsVD) {
707 vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
708 } else {
709 assert((opcode == Op_NegVD),"opcode should be Op_NegD");
710 vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
711 }
712 }
713
714 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
715 if (dst != src) {
716 movdqu(dst, src);
717 }
718 if (opcode == Op_AbsVF) {
719 andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
720 } else {
721 assert((opcode == Op_NegVF),"opcode should be Op_NegF");
722 xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
723 }
724 }
725
726 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
727 if (opcode == Op_AbsVF) {
728 vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
729 } else {
730 assert((opcode == Op_NegVF),"opcode should be Op_NegF");
731 vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
732 }
733 }
734
735 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
736 assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
737 assert(tmp == xnoreg || elem_bt == T_LONG, "unused");
738
739 if (opcode == Op_MinV) {
740 if (elem_bt == T_BYTE) {
741 pminsb(dst, src);
742 } else if (elem_bt == T_SHORT) {
743 pminsw(dst, src);
744 } else if (elem_bt == T_INT) {
745 pminsd(dst, src);
746 } else {
747 assert(elem_bt == T_LONG, "required");
748 assert(tmp == xmm0, "required");
749 assert_different_registers(dst, src, tmp);
750 movdqu(xmm0, dst);
751 pcmpgtq(xmm0, src);
752 blendvpd(dst, src); // xmm0 as mask
753 }
754 } else { // opcode == Op_MaxV
755 if (elem_bt == T_BYTE) {
756 pmaxsb(dst, src);
757 } else if (elem_bt == T_SHORT) {
758 pmaxsw(dst, src);
759 } else if (elem_bt == T_INT) {
760 pmaxsd(dst, src);
761 } else {
762 assert(elem_bt == T_LONG, "required");
763 assert(tmp == xmm0, "required");
764 assert_different_registers(dst, src, tmp);
765 movdqu(xmm0, src);
766 pcmpgtq(xmm0, dst);
767 blendvpd(dst, src); // xmm0 as mask
768 }
769 }
770 }
771
772 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
773 XMMRegister src1, Address src2, int vlen_enc) {
774 assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
775 if (opcode == Op_UMinV) {
776 switch(elem_bt) {
777 case T_BYTE: vpminub(dst, src1, src2, vlen_enc); break;
778 case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
779 case T_INT: vpminud(dst, src1, src2, vlen_enc); break;
780 case T_LONG: evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
781 default: fatal("Unsupported type %s", type2name(elem_bt)); break;
782 }
783 } else {
784 assert(opcode == Op_UMaxV, "required");
785 switch(elem_bt) {
786 case T_BYTE: vpmaxub(dst, src1, src2, vlen_enc); break;
787 case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
788 case T_INT: vpmaxud(dst, src1, src2, vlen_enc); break;
789 case T_LONG: evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
790 default: fatal("Unsupported type %s", type2name(elem_bt)); break;
791 }
792 }
793 }
794
// Unsigned min/max over 64-bit lanes for targets without a direct
// unsigned 64-bit compare at the requested length. With EVEX but no
// AVX512VL, the full 512-bit evpminuq/evpmaxuq form is used; otherwise
// both operands are biased by 2^63 (top bit flipped via add) so that
// unsigned ordering coincides with signed ordering, letting the signed
// vpcmpgtq produce the selection mask for vpblendvb.
void C2_MacroAssembler::vpuminmaxq(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  // For optimality, leverage a full vector width of 512 bits
  // for operations over smaller vector sizes on AVX512 targets.
  if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
    if (opcode == Op_UMaxV) {
      evpmaxuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
    } else {
      assert(opcode == Op_UMinV, "required");
      evpminuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
    }
  } else {
    // T1 = -1
    vpcmpeqq(xtmp1, xtmp1, xtmp1, vlen_enc);
    // T1 = -1 << 63
    vpsllq(xtmp1, xtmp1, 63, vlen_enc);
    // Convert SRC2 to signed value i.e. T2 = T1 + SRC2
    vpaddq(xtmp2, xtmp1, src2, vlen_enc);
    // Convert SRC1 to signed value i.e. T1 = T1 + SRC1
    vpaddq(xtmp1, xtmp1, src1, vlen_enc);
    // Mask = T2 > T1
    vpcmpgtq(xtmp1, xtmp2, xtmp1, vlen_enc);
    if (opcode == Op_UMaxV) {
      // Res = Mask ? Src2 : Src1
      vpblendvb(dst, src1, src2, xtmp1, vlen_enc);
    } else {
      // Res = Mask ? Src1 : Src2
      vpblendvb(dst, src2, src1, xtmp1, vlen_enc);
    }
  }
}
825
826 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
827 XMMRegister src1, XMMRegister src2, int vlen_enc) {
828 assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
829 if (opcode == Op_UMinV) {
830 switch(elem_bt) {
831 case T_BYTE: vpminub(dst, src1, src2, vlen_enc); break;
832 case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
833 case T_INT: vpminud(dst, src1, src2, vlen_enc); break;
834 case T_LONG: evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
835 default: fatal("Unsupported type %s", type2name(elem_bt)); break;
836 }
837 } else {
838 assert(opcode == Op_UMaxV, "required");
839 switch(elem_bt) {
840 case T_BYTE: vpmaxub(dst, src1, src2, vlen_enc); break;
841 case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
842 case T_INT: vpmaxud(dst, src1, src2, vlen_enc); break;
843 case T_LONG: evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
844 default: fatal("Unsupported type %s", type2name(elem_bt)); break;
845 }
846 }
847 }
848
// Signed element-wise min/max for AVX vectors.
// For T_LONG, vpminsq/vpmaxsq require AVX-512 (full 512-bit length, or
// AVX512VL for shorter vectors); otherwise the result is selected via
// vpcmpgtq + vblendvpd, where dst first holds the comparison mask and
// therefore must be distinct from both sources.
void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
                                 XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                 int vlen_enc) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      vpminsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpminsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpminsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpminsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        // mask = (src1 > src2); min picks src2 where mask is set.
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src1, src2, dst, vlen_enc);
      }
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      vpmaxsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpmaxsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpmaxsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpmaxsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        // mask = (src1 > src2); max picks src1 where mask is set.
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src2, src1, dst, vlen_enc);
      }
    }
  }
}
890
891 // Float/Double min max
892
// Float/double vector min/max with Java semantics: min(-0.0, +0.0) is -0.0,
// max(-0.0, +0.0) is +0.0, and a NaN in either input propagates to the
// result. 'tmp', 'atmp' and 'btmp' are scratch registers.
void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
                                   XMMRegister dst, XMMRegister a, XMMRegister b,
                                   XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                   int vlen_enc) {
  assert(UseAVX > 0, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(a, tmp, atmp, btmp);
  assert_different_registers(b, tmp, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);

  /* Note on 'non-obvious' assembly sequence:
   *
   * While there are vminps/vmaxps instructions, there are two important differences between hardware
   * and Java on how they handle floats:
   *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
   *  b. NaN is not necessarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
   *
   * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
   *  a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
   *                (only useful when signs differ, noop otherwise)
   *  b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])
   *
   * Following pseudo code describes the algorithm for max[FD] (Min algorithm is on similar lines):
   *   btmp = (b < +0.0) ? a : b
   *   atmp = (b < +0.0) ? b : a
   *   Tmp  = Max_Float(atmp , btmp)
   *   Res  = (atmp == NaN) ? atmp : Tmp
   */

  // Select the float/double flavors of blend, min/max and compare up front so
  // a single emission sequence below covers all four opcode/type cases.
  void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
  void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
  void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
  // 'mask' is the input whose per-lane sign bit steers the biasing blends:
  // the first input for min, the second for max (see pseudo code above).
  XMMRegister mask;

  if (!is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vminps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (!is_double_word && !is_min) {
    mask = b;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vmaxps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vminpd;
    vcmp = &MacroAssembler::vcmppd;
  } else {
    assert(is_double_word && !is_min, "sanity");
    mask = b;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vmaxpd;
    vcmp = &MacroAssembler::vcmppd;
  }

  // Make sure EnableX86ECoreOpts isn't disabled on register overlaps
  XMMRegister maxmin, scratch;
  if (dst == btmp) {
    maxmin = btmp;
    scratch = tmp;
  } else {
    maxmin = tmp;
    scratch = btmp;
  }

  bool precompute_mask = EnableX86ECoreOpts && UseAVX>1;
  if (precompute_mask && !is_double_word) {
    // Spread each lane's sign bit across the whole 32-bit lane (arithmetic
    // right shift by >= lane width yields all-sign-bit lanes).
    vpsrad(tmp, mask, 32, vlen_enc);
    mask = tmp;
  } else if (precompute_mask && is_double_word) {
    // 0 > mask produces all-ones exactly in the lanes where mask is negative.
    vpxor(tmp, tmp, tmp, vlen_enc);
    vpcmpgtq(tmp, tmp, mask, vlen_enc);
    mask = tmp;
  }

  (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);    // atmp = sign(mask) ? b : a
  (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);     // btmp = sign(mask) ? a : b
  (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
  (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);       // scratch = lanes where atmp is NaN
  (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch);  // propagate NaN from atmp
}
980
// AVX-512 variant of vminmax_fp: same Java min/max semantics (signed-zero
// ordering and NaN propagation — see the algorithm note on vminmax_fp above),
// but the biasing and NaN fix-up use an opmask register (ktmp) and the
// masked blend/move forms instead of vector blends.
void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
                                    XMMRegister dst, XMMRegister a, XMMRegister b,
                                    KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
                                    int vlen_enc) {
  assert(UseAVX > 2, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(dst, a, atmp, btmp);
  assert_different_registers(dst, b, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);
  bool merge = true;

  if (!is_double_word && is_min) {
    evpmovd2m(ktmp, a, vlen_enc);                  // ktmp = per-lane sign bits of a
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc); // atmp = ktmp ? b : a
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc); // btmp = ktmp ? a : b
    vminps(dst, atmp, btmp, vlen_enc);
    // vminps does not propagate a NaN in its first operand; overwrite the
    // lanes where atmp is NaN with atmp itself.
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (!is_double_word && !is_min) {
    evpmovd2m(ktmp, b, vlen_enc);                  // for max, bias on b's sign
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (is_double_word && is_min) {
    evpmovq2m(ktmp, a, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vminpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  } else {
    assert(is_double_word && !is_min, "sanity");
    evpmovq2m(ktmp, b, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  }
}
1027
1028 void C2_MacroAssembler::vminmax_fp(int opc, BasicType elem_bt, XMMRegister dst, KRegister mask,
1029 XMMRegister src1, XMMRegister src2, int vlen_enc) {
1030 assert(opc == Op_MinV || opc == Op_MinReductionV ||
1031 opc == Op_MaxV || opc == Op_MaxReductionV, "sanity");
1032
1033 int imm8 = (opc == Op_MinV || opc == Op_MinReductionV) ? AVX10_2_MINMAX_MIN_COMPARE_SIGN
1034 : AVX10_2_MINMAX_MAX_COMPARE_SIGN;
1035 if (elem_bt == T_FLOAT) {
1036 evminmaxps(dst, mask, src1, src2, true, imm8, vlen_enc);
1037 } else {
1038 assert(elem_bt == T_DOUBLE, "");
1039 evminmaxpd(dst, mask, src1, src2, true, imm8, vlen_enc);
1040 }
1041 }
1042
1043 // Float/Double signum
// Scalar Math.signum for float/double. dst holds the input and receives the
// result: 1.0 for positive inputs, -1.0 for negative inputs, and the input
// itself (unchanged) for +0.0/-0.0/NaN. 'zero' and 'one' hold 0.0 and 1.0.
void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
  assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");

  Label DONE_LABEL;

  // Handle special cases +0.0/-0.0 and NaN, if argument is +0.0/-0.0 or NaN, return argument
  // If AVX10.2 (or newer) floating point comparison instructions used, SF=1 for equal and unordered cases
  // If other floating point comparison instructions used, ZF=1 for equal and unordered cases
  if (opcode == Op_SignumF) {
    if (VM_Version::supports_avx10_2()) {
      vucomxss(dst, zero);
      jcc(Assembler::negative, DONE_LABEL);
    } else {
      ucomiss(dst, zero);
      jcc(Assembler::equal, DONE_LABEL);
    }
    // The register move below does not modify EFLAGS, so the jcc after it
    // still consumes the result of the comparison against zero above.
    movflt(dst, one);
    jcc(Assembler::above, DONE_LABEL);   // positive input: result is 1.0
    // Negative input: flip the sign bit of 1.0 to produce -1.0.
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
  } else if (opcode == Op_SignumD) {
    if (VM_Version::supports_avx10_2()) {
      vucomxsd(dst, zero);
      jcc(Assembler::negative, DONE_LABEL);
    } else {
      ucomisd(dst, zero);
      jcc(Assembler::equal, DONE_LABEL);
    }
    // Flags from the comparison above survive the movdbl (see float case).
    movdbl(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
  }

  bind(DONE_LABEL);
}
1078
1079 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1080 if (sign) {
1081 pmovsxbw(dst, src);
1082 } else {
1083 pmovzxbw(dst, src);
1084 }
1085 }
1086
1087 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1088 if (sign) {
1089 vpmovsxbw(dst, src, vector_len);
1090 } else {
1091 vpmovzxbw(dst, src, vector_len);
1092 }
1093 }
1094
1095 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1096 if (sign) {
1097 vpmovsxbd(dst, src, vector_len);
1098 } else {
1099 vpmovzxbd(dst, src, vector_len);
1100 }
1101 }
1102
1103 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1104 if (sign) {
1105 vpmovsxwd(dst, src, vector_len);
1106 } else {
1107 vpmovzxwd(dst, src, vector_len);
1108 }
1109 }
1110
1111 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1112 int shift, int vector_len) {
1113 if (opcode == Op_RotateLeftV) {
1114 if (etype == T_INT) {
1115 evprold(dst, src, shift, vector_len);
1116 } else {
1117 assert(etype == T_LONG, "expected type T_LONG");
1118 evprolq(dst, src, shift, vector_len);
1119 }
1120 } else {
1121 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1122 if (etype == T_INT) {
1123 evprord(dst, src, shift, vector_len);
1124 } else {
1125 assert(etype == T_LONG, "expected type T_LONG");
1126 evprorq(dst, src, shift, vector_len);
1127 }
1128 }
1129 }
1130
1131 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1132 XMMRegister shift, int vector_len) {
1133 if (opcode == Op_RotateLeftV) {
1134 if (etype == T_INT) {
1135 evprolvd(dst, src, shift, vector_len);
1136 } else {
1137 assert(etype == T_LONG, "expected type T_LONG");
1138 evprolvq(dst, src, shift, vector_len);
1139 }
1140 } else {
1141 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1142 if (etype == T_INT) {
1143 evprorvd(dst, src, shift, vector_len);
1144 } else {
1145 assert(etype == T_LONG, "expected type T_LONG");
1146 evprorvq(dst, src, shift, vector_len);
1147 }
1148 }
1149 }
1150
1151 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1152 if (opcode == Op_RShiftVI) {
1153 psrad(dst, shift);
1154 } else if (opcode == Op_LShiftVI) {
1155 pslld(dst, shift);
1156 } else {
1157 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1158 psrld(dst, shift);
1159 }
1160 }
1161
1162 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1163 switch (opcode) {
1164 case Op_RShiftVI: psrad(dst, shift); break;
1165 case Op_LShiftVI: pslld(dst, shift); break;
1166 case Op_URShiftVI: psrld(dst, shift); break;
1167
1168 default: assert(false, "%s", NodeClassNames[opcode]);
1169 }
1170 }
1171
1172 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1173 if (opcode == Op_RShiftVI) {
1174 vpsrad(dst, nds, shift, vector_len);
1175 } else if (opcode == Op_LShiftVI) {
1176 vpslld(dst, nds, shift, vector_len);
1177 } else {
1178 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1179 vpsrld(dst, nds, shift, vector_len);
1180 }
1181 }
1182
1183 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1184 switch (opcode) {
1185 case Op_RShiftVI: vpsrad(dst, src, shift, vlen_enc); break;
1186 case Op_LShiftVI: vpslld(dst, src, shift, vlen_enc); break;
1187 case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1188
1189 default: assert(false, "%s", NodeClassNames[opcode]);
1190 }
1191 }
1192
1193 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1194 switch (opcode) {
1195 case Op_RShiftVB: // fall-through
1196 case Op_RShiftVS: psraw(dst, shift); break;
1197
1198 case Op_LShiftVB: // fall-through
1199 case Op_LShiftVS: psllw(dst, shift); break;
1200
1201 case Op_URShiftVS: // fall-through
1202 case Op_URShiftVB: psrlw(dst, shift); break;
1203
1204 default: assert(false, "%s", NodeClassNames[opcode]);
1205 }
1206 }
1207
1208 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1209 switch (opcode) {
1210 case Op_RShiftVB: // fall-through
1211 case Op_RShiftVS: vpsraw(dst, src, shift, vlen_enc); break;
1212
1213 case Op_LShiftVB: // fall-through
1214 case Op_LShiftVS: vpsllw(dst, src, shift, vlen_enc); break;
1215
1216 case Op_URShiftVS: // fall-through
1217 case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1218
1219 default: assert(false, "%s", NodeClassNames[opcode]);
1220 }
1221 }
1222
// Long vector shift by a scalar count held in an XMM register (SSE encodings).
void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    // NOTE(review): Op_RShiftVL emits a logical shift; per the comment this
    // substitutes srl for sra on pre-avx512 hardware — presumably the matcher
    // only selects this form when that is acceptable. Confirm against the .ad
    // file rules.
    case Op_RShiftVL: psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
    case Op_LShiftVL: psllq(dst, shift); break;
    case Op_URShiftVL: psrlq(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}
1232
1233 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1234 if (opcode == Op_RShiftVL) {
1235 psrlq(dst, shift); // using srl to implement sra on pre-avs512 systems
1236 } else if (opcode == Op_LShiftVL) {
1237 psllq(dst, shift);
1238 } else {
1239 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1240 psrlq(dst, shift);
1241 }
1242 }
1243
1244 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1245 switch (opcode) {
1246 case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1247 case Op_LShiftVL: vpsllq(dst, src, shift, vlen_enc); break;
1248 case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1249
1250 default: assert(false, "%s", NodeClassNames[opcode]);
1251 }
1252 }
1253
1254 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1255 if (opcode == Op_RShiftVL) {
1256 evpsraq(dst, nds, shift, vector_len);
1257 } else if (opcode == Op_LShiftVL) {
1258 vpsllq(dst, nds, shift, vector_len);
1259 } else {
1260 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1261 vpsrlq(dst, nds, shift, vector_len);
1262 }
1263 }
1264
1265 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1266 switch (opcode) {
1267 case Op_RShiftVB: // fall-through
1268 case Op_RShiftVS: // fall-through
1269 case Op_RShiftVI: vpsravd(dst, src, shift, vlen_enc); break;
1270
1271 case Op_LShiftVB: // fall-through
1272 case Op_LShiftVS: // fall-through
1273 case Op_LShiftVI: vpsllvd(dst, src, shift, vlen_enc); break;
1274
1275 case Op_URShiftVB: // fall-through
1276 case Op_URShiftVS: // fall-through
1277 case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1278
1279 default: assert(false, "%s", NodeClassNames[opcode]);
1280 }
1281 }
1282
1283 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1284 switch (opcode) {
1285 case Op_RShiftVB: // fall-through
1286 case Op_RShiftVS: evpsravw(dst, src, shift, vlen_enc); break;
1287
1288 case Op_LShiftVB: // fall-through
1289 case Op_LShiftVS: evpsllvw(dst, src, shift, vlen_enc); break;
1290
1291 case Op_URShiftVB: // fall-through
1292 case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1293
1294 default: assert(false, "%s", NodeClassNames[opcode]);
1295 }
1296 }
1297
// Per-lane variable shift on 64-bit lanes. AVX-512 provides the arithmetic
// right shift directly (evpsravq); on plain AVX2 it is emulated using 'tmp'
// as scratch. 'tmp' must be xnoreg in all other cases.
void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
  assert(UseAVX >= 2, "required");
  switch (opcode) {
    case Op_RShiftVL: {
      if (UseAVX > 2) {
        assert(tmp == xnoreg, "not used");
        if (!VM_Version::supports_avx512vl()) {
          // Without AVX512VL, EVEX-encoded evpsravq is only valid at 512-bit.
          vlen_enc = Assembler::AVX_512bit;
        }
        evpsravq(dst, src, shift, vlen_enc);
      } else {
        // AVX2 has no 64-bit arithmetic variable shift. Emulate it by
        // logically shifting both the value and a sign-bit mask, then
        // sign-extending:  m = sign_mask >>> s;  dst = ((src >>> s) ^ m) - m
        vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
        vpsrlvq(dst, src, shift, vlen_enc);
        vpsrlvq(tmp, tmp, shift, vlen_enc);
        vpxor(dst, dst, tmp, vlen_enc);
        vpsubq(dst, dst, tmp, vlen_enc);
      }
      break;
    }
    case Op_LShiftVL: {
      assert(tmp == xnoreg, "not used");
      vpsllvq(dst, src, shift, vlen_enc);
      break;
    }
    case Op_URShiftVL: {
      assert(tmp == xnoreg, "not used");
      vpsrlvq(dst, src, shift, vlen_enc);
      break;
    }
    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}
1330
// Byte-element variable shift, 128-bit input, producing a word result in dst.
// Bytes are widened to dwords so vpsravd/vpsllvd/vpsrlvd can be used, then
// truncated and packed back down. vtmp is scratch.
void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
  assert(opcode == Op_LShiftVB ||
         opcode == Op_RShiftVB ||
         opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
  bool sign = (opcode != Op_URShiftVB);  // only unsigned right shift zero-extends
  assert(vector_len == 0, "required");
  vextendbd(sign, dst, src, 1);          // widen bytes to dwords (256-bit)
  vpmovzxbd(vtmp, shift, 1);             // shift counts are unsigned bytes
  varshiftd(opcode, dst, dst, vtmp, 1);  // do the shift at dword granularity
  // Keep only the low byte of each dword before narrowing.
  vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
  vextracti128_high(vtmp, dst);
  vpackusdw(dst, dst, vtmp, 0);          // pack dwords down to words
}
1345
// Byte-element variable shift producing a byte result in dst (AVX-512BW
// path). Bytes are widened to words, shifted, masked back to byte range,
// and re-packed. vtmp is scratch.
void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
  assert(opcode == Op_LShiftVB ||
         opcode == Op_RShiftVB ||
         opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
  bool sign = (opcode != Op_URShiftVB);  // only unsigned right shift zero-extends
  int ext_vector_len = vector_len + 1;   // widening doubles the vector size
  vextendbw(sign, dst, src, ext_vector_len);        // bytes -> words
  vpmovzxbw(vtmp, shift, ext_vector_len);           // shift counts are unsigned bytes
  varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
  // Keep only the low byte of each word before narrowing.
  vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
  if (vector_len == 0) {
    vextracti128_high(vtmp, dst);
    vpackuswb(dst, dst, vtmp, vector_len);
  } else {
    vextracti64x4_high(vtmp, dst);
    vpackuswb(dst, dst, vtmp, vector_len);
    // vpackuswb works within 128-bit lanes; restore the element order.
    vpermq(dst, dst, 0xD8, vector_len);
  }
}
1366
1367 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1368 switch(typ) {
1369 case T_BYTE:
1370 pinsrb(dst, val, idx);
1371 break;
1372 case T_SHORT:
1373 pinsrw(dst, val, idx);
1374 break;
1375 case T_INT:
1376 pinsrd(dst, val, idx);
1377 break;
1378 case T_LONG:
1379 pinsrq(dst, val, idx);
1380 break;
1381 default:
1382 assert(false,"Should not reach here.");
1383 break;
1384 }
1385 }
1386
1387 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1388 switch(typ) {
1389 case T_BYTE:
1390 vpinsrb(dst, src, val, idx);
1391 break;
1392 case T_SHORT:
1393 vpinsrw(dst, src, val, idx);
1394 break;
1395 case T_INT:
1396 vpinsrd(dst, src, val, idx);
1397 break;
1398 case T_LONG:
1399 vpinsrq(dst, src, val, idx);
1400 break;
1401 default:
1402 assert(false,"Should not reach here.");
1403 break;
1404 }
1405 }
1406
// Gather one 64-bit slice (4 shorts or 8 bytes) under a bit mask held in a
// GPR: lane i is loaded from base[idx_base[i]] only when bit 'mask_idx' of
// 'mask' is set; masked-off lanes are left zero. mask_idx is incremented once
// per lane, taken or not, so consecutive slices can share one running index.
void C2_MacroAssembler::vgather8b_masked(BasicType elem_bt, XMMRegister dst,
                                         Register base, Register idx_base,
                                         Register mask, Register mask_idx,
                                         Register rtmp, int vlen_enc) {
  vpxor(dst, dst, dst, vlen_enc);  // zero dst so skipped lanes read as 0
  if (elem_bt == T_SHORT) {
    for (int i = 0; i < 4; i++) {
      // dst[i] = mask[i] ? src[idx_base[i]] : 0
      Label skip_load;
      btq(mask, mask_idx);                    // CF = selected mask bit
      jccb(Assembler::carryClear, skip_load);
      movl(rtmp, Address(idx_base, i * 4));   // indices are 32-bit ints
      pinsrw(dst, Address(base, rtmp, Address::times_2), i);
      bind(skip_load);
      incq(mask_idx);
    }
  } else {
    assert(elem_bt == T_BYTE, "");
    for (int i = 0; i < 8; i++) {
      // dst[i] = mask[i] ? src[idx_base[i]] : 0
      Label skip_load;
      btq(mask, mask_idx);
      jccb(Assembler::carryClear, skip_load);
      movl(rtmp, Address(idx_base, i * 4));
      pinsrb(dst, Address(base, rtmp), i);
      bind(skip_load);
      incq(mask_idx);
    }
  }
}
1437
1438 void C2_MacroAssembler::vgather8b(BasicType elem_bt, XMMRegister dst,
1439 Register base, Register idx_base,
1440 Register rtmp, int vlen_enc) {
1441 vpxor(dst, dst, dst, vlen_enc);
1442 if (elem_bt == T_SHORT) {
1443 for (int i = 0; i < 4; i++) {
1444 // dst[i] = src[idx_base[i]]
1445 movl(rtmp, Address(idx_base, i * 4));
1446 pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1447 }
1448 } else {
1449 assert(elem_bt == T_BYTE, "");
1450 for (int i = 0; i < 8; i++) {
1451 // dst[i] = src[idx_base[i]]
1452 movl(rtmp, Address(idx_base, i * 4));
1453 pinsrb(dst, Address(base, rtmp), i);
1454 }
1455 }
1456 }
1457
1458 /*
1459 * Gather using hybrid algorithm, first partially unroll scalar loop
1460 * to accumulate values from gather indices into a quad-word(64bit) slice.
1461 * A slice may hold 8 bytes or 4 short values. This is followed by a vector
1462 * permutation to place the slice into appropriate vector lane
1463 * locations in destination vector. Following pseudo code describes the
1464 * algorithm in detail:
1465 *
1466 * DST_VEC = ZERO_VEC
1467 * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..}
1468 * TWO_VEC = {2, 2, 2, 2, 2, 2, 2, 2, 2..}
1469 * FOREACH_ITER:
1470 * TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES
1471 * TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX
1472 * DST_VEC = DST_VEC OR TEMP_PERM_VEC
1473 * PERM_INDEX = PERM_INDEX - TWO_VEC
1474 *
1475 * With each iteration, doubleword permute indices (0,1) corresponding
1476 * to gathered quadword gets right shifted by two lane positions.
1477 *
1478 */
// See the algorithm description in the comment block above. Gathers
// 'vector_len' sub-word elements into dst, 8 bytes (one quad-word slice) per
// loop iteration, with optional per-element masking via 'mask'/'mask_idx'.
void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst,
                                        Register base, Register idx_base,
                                        Register mask, XMMRegister xtmp1,
                                        XMMRegister xtmp2, XMMRegister temp_dst,
                                        Register rtmp, Register mask_idx,
                                        Register length, int vector_len, int vlen_enc) {
  Label GATHER8_LOOP;
  assert(is_subword_type(elem_ty), "");
  movl(length, vector_len);              // remaining element count
  vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...}
  vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...}
  vallones(xtmp2, vlen_enc);                  // xtmp2 = {-1, ...}
  vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc);      // 0 - (-1): xtmp2 = {1, ...}
  vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...}
  load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...}

  bind(GATHER8_LOOP);
    // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES
    if (mask == noreg) {
      vgather8b(elem_ty, temp_dst, base, idx_base, rtmp, vlen_enc);
    } else {
      vgather8b_masked(elem_ty, temp_dst, base, idx_base, mask, mask_idx, rtmp, vlen_enc);
    }
    // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1)
    vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? vlen_enc : Assembler::AVX_256bit);
    // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2)
    vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc);
    // DST_VEC = DST_VEC OR TEMP_PERM_VEC
    vpor(dst, dst, temp_dst, vlen_enc);
    // Advance past one slice of 32-bit indices: 8 for bytes, 4 for shorts.
    addptr(idx_base,  32 >> (type2aelembytes(elem_ty) - 1));
    subl(length, 8 >> (type2aelembytes(elem_ty) - 1));
    jcc(Assembler::notEqual, GATHER8_LOOP);
}
1512
1513 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1514 switch(typ) {
1515 case T_INT:
1516 vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1517 break;
1518 case T_FLOAT:
1519 vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1520 break;
1521 case T_LONG:
1522 vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1523 break;
1524 case T_DOUBLE:
1525 vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1526 break;
1527 default:
1528 assert(false,"Should not reach here.");
1529 break;
1530 }
1531 }
1532
1533 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1534 switch(typ) {
1535 case T_INT:
1536 evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1537 break;
1538 case T_FLOAT:
1539 evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1540 break;
1541 case T_LONG:
1542 evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1543 break;
1544 case T_DOUBLE:
1545 evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1546 break;
1547 default:
1548 assert(false,"Should not reach here.");
1549 break;
1550 }
1551 }
1552
1553 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1554 switch(typ) {
1555 case T_INT:
1556 evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1557 break;
1558 case T_FLOAT:
1559 evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1560 break;
1561 case T_LONG:
1562 evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1563 break;
1564 case T_DOUBLE:
1565 evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1566 break;
1567 default:
1568 assert(false,"Should not reach here.");
1569 break;
1570 }
1571 }
1572
// Expand a boolean vector in src (one byte per element) into a full lane
// mask in dst: 0x00..0 / all-ones per lane, sign-extended to the element
// width. NOTE(review): relies on src bytes being 0 or 1 so that 0 - byte
// yields 0x00 or 0xFF — confirm against the mask producers.
void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
  if (vlen_in_bytes <= 16) {
    pxor (dst, dst);
    psubb(dst, src);  // 0 - 1 -> 0xFF, 0 - 0 -> 0x00 in each byte
    switch (elem_bt) {
      case T_BYTE: /* nothing to do */ break;
      case T_SHORT: pmovsxbw(dst, dst); break;
      case T_INT: pmovsxbd(dst, dst); break;
      case T_FLOAT: pmovsxbd(dst, dst); break;
      case T_LONG: pmovsxbq(dst, dst); break;
      case T_DOUBLE: pmovsxbq(dst, dst); break;

      default: assert(false, "%s", type2name(elem_bt));
    }
  } else {
    assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
    int vlen_enc = vector_length_encoding(vlen_in_bytes);

    vpxor (dst, dst, dst, vlen_enc);
    // NOTE(review): legacy (VEX) encoding caps the subtract at 256 bits;
    // presumably the source mask fits in 256 bits in that case (see assert
    // above) — confirm.
    vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);

    switch (elem_bt) {
      case T_BYTE: /* nothing to do */ break;
      case T_SHORT: vpmovsxbw(dst, dst, vlen_enc); break;
      case T_INT: vpmovsxbd(dst, dst, vlen_enc); break;
      case T_FLOAT: vpmovsxbd(dst, dst, vlen_enc); break;
      case T_LONG: vpmovsxbq(dst, dst, vlen_enc); break;
      case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;

      default: assert(false, "%s", type2name(elem_bt));
    }
  }
}
1606
// Turn a boolean vector in src (one byte per element) into an opmask
// register dst. 'novlbwdq' selects the fallback for CPUs without the
// AVX512BW/DQ+VL byte-mask instructions.
void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) {
  if (novlbwdq) {
    // Fallback: widen bytes to dwords and derive the opmask by comparing
    // against the expected mask bit pattern constant.
    vpmovsxbd(xtmp, src, vlen_enc);
    evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
            Assembler::eq, true, vlen_enc, noreg);
  } else {
    // 0 - byte turns 0/1 into 0x00/0xFF; evpmovb2m then collects each
    // byte's sign bit into the opmask.
    vpxor(xtmp, xtmp, xtmp, vlen_enc);
    vpsubb(xtmp, xtmp, src, vlen_enc);
    evpmovb2m(dst, xtmp, vlen_enc);
  }
}
1618
1619 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, Address src, int vlen_in_bytes) {
1620 if (is_integral_type(bt)) {
1621 switch (vlen_in_bytes) {
1622 case 4: movdl(dst, src); break;
1623 case 8: movq(dst, src); break;
1624 case 16: movdqu(dst, src); break;
1625 case 32: vmovdqu(dst, src); break;
1626 case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break;
1627 default: ShouldNotReachHere();
1628 }
1629 } else {
1630 switch (vlen_in_bytes) {
1631 case 4: movflt(dst, src); break;
1632 case 8: movdbl(dst, src); break;
1633 case 16: movups(dst, src); break;
1634 case 32: vmovups(dst, src, Assembler::AVX_256bit); break;
1635 case 64: vmovups(dst, src, Assembler::AVX_512bit); break;
1636 default: ShouldNotReachHere();
1637 }
1638 }
1639 }
1640
1641 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) {
1642 assert(rscratch != noreg || always_reachable(src), "missing");
1643
1644 if (reachable(src)) {
1645 load_vector(bt, dst, as_Address(src), vlen_in_bytes);
1646 } else {
1647 lea(rscratch, src);
1648 load_vector(bt, dst, Address(rscratch, 0), vlen_in_bytes);
1649 }
1650 }
1651
// Broadcast a constant scalar at 'src' into every lane of dst, picking the
// widest broadcast form the CPU supports and falling back to a plain vector
// load when no broadcast is available.
void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) {
  int vlen_enc = vector_length_encoding(vlen);
  if (VM_Version::supports_avx()) {
    if (bt == T_LONG) {
      // 64-bit integer broadcast (vpbroadcastq) needs AVX2; on AVX1
      // duplicate via the FP move instead.
      if (VM_Version::supports_avx2()) {
        vpbroadcastq(dst, src, vlen_enc);
      } else {
        vmovddup(dst, src, vlen_enc);
      }
    } else if (bt == T_DOUBLE) {
      // vbroadcastsd has no 128-bit form; use vmovddup at that width.
      if (vlen_enc != Assembler::AVX_128bit) {
        vbroadcastsd(dst, src, vlen_enc, noreg);
      } else {
        vmovddup(dst, src, vlen_enc);
      }
    } else {
      // 32-bit element: integer broadcast on AVX2, FP broadcast otherwise.
      if (VM_Version::supports_avx2() && is_integral_type(bt)) {
        vpbroadcastd(dst, src, vlen_enc);
      } else {
        vbroadcastss(dst, src, vlen_enc);
      }
    }
  } else if (VM_Version::supports_sse3()) {
    movddup(dst, src);
  } else {
    // No broadcast instruction available; falls back to a plain load.
    // NOTE(review): presumably the constant area holds the value already
    // replicated across 'vlen' bytes in this case — confirm with callers.
    load_vector(bt, dst, src, vlen);
  }
}
1680
1681 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) {
1682 // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64.
1683 int offset = exact_log2(type2aelembytes(bt)) << 6;
1684 if (is_floating_point_type(bt)) {
1685 offset += 128;
1686 }
1687 ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset);
1688 load_vector(T_BYTE, dst, addr, vlen_in_bytes);
1689 }
1690
1691 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
1692
// One 128-bit step of a vector reduction: dst = dst <op> src element-wise,
// with the operation selected by opcode and the element width by typ. The
// float/double add/mul forms use scalar instructions (addss/mulss etc.)
// because ordered FP reductions accumulate one lane at a time.
void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
  int vector_len = Assembler::AVX_128bit;

  switch (opcode) {
    case Op_AndReductionV: pand(dst, src); break;
    case Op_OrReductionV: por (dst, src); break;
    case Op_XorReductionV: pxor(dst, src); break;
    case Op_MinReductionV:
      switch (typ) {
        case T_BYTE: pminsb(dst, src); break;
        case T_SHORT: pminsw(dst, src); break;
        case T_INT: pminsd(dst, src); break;
        // Packed 64-bit signed min/max only exists in AVX-512.
        case T_LONG: assert(UseAVX > 2, "required");
                     vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
        default: assert(false, "wrong type");
      }
      break;
    case Op_MaxReductionV:
      switch (typ) {
        case T_BYTE: pmaxsb(dst, src); break;
        case T_SHORT: pmaxsw(dst, src); break;
        case T_INT: pmaxsd(dst, src); break;
        case T_LONG: assert(UseAVX > 2, "required");
                     vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
        default: assert(false, "wrong type");
      }
      break;
    case Op_AddReductionVF: addss(dst, src); break;
    case Op_AddReductionVD: addsd(dst, src); break;
    case Op_AddReductionVI:
      switch (typ) {
        case T_BYTE: paddb(dst, src); break;
        case T_SHORT: paddw(dst, src); break;
        case T_INT: paddd(dst, src); break;
        default: assert(false, "wrong type");
      }
      break;
    case Op_AddReductionVL: paddq(dst, src); break;
    case Op_MulReductionVF: mulss(dst, src); break;
    case Op_MulReductionVD: mulsd(dst, src); break;
    case Op_MulReductionVI:
      switch (typ) {
        case T_SHORT: pmullw(dst, src); break;
        case T_INT: pmulld(dst, src); break;
        default: assert(false, "wrong type");
      }
      break;
    // Packed 64-bit multiply is an AVX-512 (EVEX) instruction.
    case Op_MulReductionVL: assert(UseAVX > 2, "required");
                            evpmullq(dst, dst, src, vector_len); break;
    default: assert(false, "wrong opcode");
  }
}
1745
1746 void C2_MacroAssembler::unordered_reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1747 switch (opcode) {
1748 case Op_AddReductionVF: addps(dst, src); break;
1749 case Op_AddReductionVD: addpd(dst, src); break;
1750 case Op_MulReductionVF: mulps(dst, src); break;
1751 case Op_MulReductionVD: mulpd(dst, src); break;
1752 default: assert(false, "%s", NodeClassNames[opcode]);
1753 }
1754 }
1755
// Emit one combining step of a vector reduction on 256-bit operands:
// dst = src1 <op> src2, element-wise. Only the integral/logical opcodes are
// supported at this width; FP reductions fold through narrower helpers.
void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
  int vector_len = Assembler::AVX_256bit;

  switch (opcode) {
    case Op_AndReductionV: vpand(dst, src1, src2, vector_len); break;
    case Op_OrReductionV: vpor (dst, src1, src2, vector_len); break;
    case Op_XorReductionV: vpxor(dst, src1, src2, vector_len); break;
    case Op_MinReductionV:
      switch (typ) {
        case T_BYTE: vpminsb(dst, src1, src2, vector_len); break;
        case T_SHORT: vpminsw(dst, src1, src2, vector_len); break;
        case T_INT: vpminsd(dst, src1, src2, vector_len); break;
        // Signed 64-bit min is AVX-512 only.
        case T_LONG: assert(UseAVX > 2, "required");
                     vpminsq(dst, src1, src2, vector_len); break;
        default: assert(false, "wrong type");
      }
      break;
    case Op_MaxReductionV:
      switch (typ) {
        case T_BYTE: vpmaxsb(dst, src1, src2, vector_len); break;
        case T_SHORT: vpmaxsw(dst, src1, src2, vector_len); break;
        case T_INT: vpmaxsd(dst, src1, src2, vector_len); break;
        // Signed 64-bit max is AVX-512 only.
        case T_LONG: assert(UseAVX > 2, "required");
                     vpmaxsq(dst, src1, src2, vector_len); break;
        default: assert(false, "wrong type");
      }
      break;
    case Op_AddReductionVI:
      switch (typ) {
        case T_BYTE: vpaddb(dst, src1, src2, vector_len); break;
        case T_SHORT: vpaddw(dst, src1, src2, vector_len); break;
        case T_INT: vpaddd(dst, src1, src2, vector_len); break;
        default: assert(false, "wrong type");
      }
      break;
    case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
    case Op_MulReductionVI:
      switch (typ) {
        case T_SHORT: vpmullw(dst, src1, src2, vector_len); break;
        case T_INT: vpmulld(dst, src1, src2, vector_len); break;
        default: assert(false, "wrong type");
      }
      break;
    // 64-bit multiply only exists as an AVX-512 (EVEX) instruction.
    case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break;
    default: assert(false, "wrong opcode");
  }
}
1803
1804 void C2_MacroAssembler::unordered_reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
1805 int vector_len = Assembler::AVX_256bit;
1806
1807 switch (opcode) {
1808 case Op_AddReductionVF: vaddps(dst, src1, src2, vector_len); break;
1809 case Op_AddReductionVD: vaddpd(dst, src1, src2, vector_len); break;
1810 case Op_MulReductionVF: vmulps(dst, src1, src2, vector_len); break;
1811 case Op_MulReductionVD: vmulpd(dst, src1, src2, vector_len); break;
1812 default: assert(false, "%s", NodeClassNames[opcode]);
1813 }
1814 }
1815
1816 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1817 XMMRegister dst, XMMRegister src,
1818 XMMRegister vtmp1, XMMRegister vtmp2) {
1819 switch (opcode) {
1820 case Op_AddReductionVF:
1821 case Op_MulReductionVF:
1822 reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1823 break;
1824
1825 case Op_AddReductionVD:
1826 case Op_MulReductionVD:
1827 reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1828 break;
1829
1830 default: assert(false, "wrong opcode");
1831 }
1832 }
1833
1834 void C2_MacroAssembler::unordered_reduce_fp(int opcode, int vlen,
1835 XMMRegister dst, XMMRegister src,
1836 XMMRegister vtmp1, XMMRegister vtmp2) {
1837 switch (opcode) {
1838 case Op_AddReductionVF:
1839 case Op_MulReductionVF:
1840 unorderedReduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1841 break;
1842
1843 case Op_AddReductionVD:
1844 case Op_MulReductionVD:
1845 unorderedReduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1846 break;
1847
1848 default: assert(false, "%s", NodeClassNames[opcode]);
1849 }
1850 }
1851
1852 void C2_MacroAssembler::reduceB(int opcode, int vlen,
1853 Register dst, Register src1, XMMRegister src2,
1854 XMMRegister vtmp1, XMMRegister vtmp2) {
1855 switch (vlen) {
1856 case 8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1857 case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1858 case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1859 case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1860
1861 default: assert(false, "wrong vector length");
1862 }
1863 }
1864
1865 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
1866 Register dst, Register src1, XMMRegister src2,
1867 XMMRegister vtmp1, XMMRegister vtmp2) {
1868 switch (vlen) {
1869 case 8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1870 case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1871 case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1872 case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1873
1874 default: assert(false, "wrong vector length");
1875 }
1876 }
1877
1878 void C2_MacroAssembler::reduceS(int opcode, int vlen,
1879 Register dst, Register src1, XMMRegister src2,
1880 XMMRegister vtmp1, XMMRegister vtmp2) {
1881 switch (vlen) {
1882 case 4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1883 case 8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1884 case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1885 case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1886
1887 default: assert(false, "wrong vector length");
1888 }
1889 }
1890
1891 void C2_MacroAssembler::reduceI(int opcode, int vlen,
1892 Register dst, Register src1, XMMRegister src2,
1893 XMMRegister vtmp1, XMMRegister vtmp2) {
1894 switch (vlen) {
1895 case 2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1896 case 4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1897 case 8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1898 case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1899
1900 default: assert(false, "wrong vector length");
1901 }
1902 }
1903
1904 void C2_MacroAssembler::reduceL(int opcode, int vlen,
1905 Register dst, Register src1, XMMRegister src2,
1906 XMMRegister vtmp1, XMMRegister vtmp2) {
1907 switch (vlen) {
1908 case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1909 case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1910 case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1911
1912 default: assert(false, "wrong vector length");
1913 }
1914 }
1915
1916 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1917 switch (vlen) {
1918 case 2:
1919 assert(vtmp2 == xnoreg, "");
1920 reduce2F(opcode, dst, src, vtmp1);
1921 break;
1922 case 4:
1923 assert(vtmp2 == xnoreg, "");
1924 reduce4F(opcode, dst, src, vtmp1);
1925 break;
1926 case 8:
1927 reduce8F(opcode, dst, src, vtmp1, vtmp2);
1928 break;
1929 case 16:
1930 reduce16F(opcode, dst, src, vtmp1, vtmp2);
1931 break;
1932 default: assert(false, "wrong vector length");
1933 }
1934 }
1935
1936 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1937 switch (vlen) {
1938 case 2:
1939 assert(vtmp2 == xnoreg, "");
1940 reduce2D(opcode, dst, src, vtmp1);
1941 break;
1942 case 4:
1943 reduce4D(opcode, dst, src, vtmp1, vtmp2);
1944 break;
1945 case 8:
1946 reduce8D(opcode, dst, src, vtmp1, vtmp2);
1947 break;
1948 default: assert(false, "wrong vector length");
1949 }
1950 }
1951
1952 void C2_MacroAssembler::unorderedReduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1953 switch (vlen) {
1954 case 2:
1955 assert(vtmp1 == xnoreg, "");
1956 assert(vtmp2 == xnoreg, "");
1957 unorderedReduce2F(opcode, dst, src);
1958 break;
1959 case 4:
1960 assert(vtmp2 == xnoreg, "");
1961 unorderedReduce4F(opcode, dst, src, vtmp1);
1962 break;
1963 case 8:
1964 unorderedReduce8F(opcode, dst, src, vtmp1, vtmp2);
1965 break;
1966 case 16:
1967 unorderedReduce16F(opcode, dst, src, vtmp1, vtmp2);
1968 break;
1969 default: assert(false, "wrong vector length");
1970 }
1971 }
1972
1973 void C2_MacroAssembler::unorderedReduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1974 switch (vlen) {
1975 case 2:
1976 assert(vtmp1 == xnoreg, "");
1977 assert(vtmp2 == xnoreg, "");
1978 unorderedReduce2D(opcode, dst, src);
1979 break;
1980 case 4:
1981 assert(vtmp2 == xnoreg, "");
1982 unorderedReduce4D(opcode, dst, src, vtmp1);
1983 break;
1984 case 8:
1985 unorderedReduce8D(opcode, dst, src, vtmp1, vtmp2);
1986 break;
1987 default: assert(false, "wrong vector length");
1988 }
1989 }
1990
// Reduce two ints in src2 with the scalar accumulator src1; result in dst.
void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    // Horizontal add combines elements 0 and 1 in one instruction.
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    phaddd(vtmp1, vtmp1);
  } else {
    // Bring element 1 down to slot 0 and combine with element 0.
    pshufd(vtmp1, src2, 0x1);
    reduce_operation_128(T_INT, opcode, vtmp1, src2);
  }
  // Fold in the scalar accumulator and move the result to the GP register.
  movdl(vtmp2, src1);
  reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
  movdl(dst, vtmp1);
}
2005
// Reduce four ints in src2 with the scalar accumulator src1; result in dst.
// Halves the problem and delegates the final two elements to reduce2I.
void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    // Pairwise add leaves two partial sums in the low half.
    phaddd(vtmp1, src2);
    reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  } else {
    // Combine the high quadword (elements 2,3) with the low one (0,1).
    pshufd(vtmp2, src2, 0xE);
    reduce_operation_128(T_INT, opcode, vtmp2, src2);
    reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
  }
}
2019
// Reduce eight ints (256-bit src2) with the scalar accumulator src1.
void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    // Horizontal add within each 128-bit lane, then fold the high lane in.
    vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
    vextracti128_high(vtmp2, vtmp1);
    vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
    reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  } else {
    // Fold the upper 128 bits into the lower 128 bits, then recurse.
    vextracti128_high(vtmp1, src2);
    reduce_operation_128(T_INT, opcode, vtmp1, src2);
    reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  }
}
2032
// Reduce sixteen ints (512-bit src2): fold the upper 256 bits into the lower
// 256 bits, then finish with the 8-element routine.
void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti64x4_high(vtmp2, src2);
  reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
  reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
2038
// Reduce eight bytes in src2 with the scalar accumulator src1; result
// (sign-extended to int) in dst. Repeatedly halves the active width.
void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  // Combine bytes 4..7 with bytes 0..3.
  pshufd(vtmp2, src2, 0x1);
  reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
  // Combine the surviving 4 bytes down to 2 ...
  movdqu(vtmp1, vtmp2);
  psrldq(vtmp1, 2);
  reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
  // ... and 2 down to 1.
  movdqu(vtmp2, vtmp1);
  psrldq(vtmp2, 1);
  reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
  // Widen to int, fold in the scalar accumulator, then extract and sign-extend.
  movdl(vtmp2, src1);
  pmovsxbd(vtmp1, vtmp1);
  reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
  pextrb(dst, vtmp1, 0x0);
  movsbl(dst, dst);
}
2054
// Reduce sixteen bytes: fold the high quadword into the low one, then
// finish with the 8-byte routine.
void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  pshufd(vtmp1, src2, 0xE);
  reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
  reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}
2060
// Reduce thirty-two bytes: fold the upper 128 bits into the lower 128 bits,
// then finish with the 16-byte routine.
void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti128_high(vtmp2, src2);
  reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
  reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
2066
// Reduce sixty-four bytes: fold the upper 256 bits into the lower 256 bits,
// then finish with the 32-byte routine.
void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti64x4_high(vtmp1, src2);
  reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
  reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}
2072
// Multiply-reduce eight bytes: no byte multiply exists, so sign-extend the
// bytes to shorts and reuse the short reduction.
void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  pmovsxbw(vtmp2, src2);
  reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
2077
2078 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2079 if (UseAVX > 1) {
2080 int vector_len = Assembler::AVX_256bit;
2081 vpmovsxbw(vtmp1, src2, vector_len);
2082 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2083 } else {
2084 pmovsxbw(vtmp2, src2);
2085 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2086 pshufd(vtmp2, src2, 0x1);
2087 pmovsxbw(vtmp2, src2);
2088 reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2089 }
2090 }
2091
// Multiply-reduce thirty-two bytes via widening to shorts.
void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
    // Widen all 32 bytes to 32 shorts in one 512-bit register and reduce.
    int vector_len = Assembler::AVX_512bit;
    vpmovsxbw(vtmp1, src2, vector_len);
    reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  } else {
    // AVX2 fallback: reduce the low 128 bits, then the high 128 bits,
    // feeding the first result back in as the scalar accumulator.
    assert(UseAVX >= 2,"Should not reach here.");
    mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
    vextracti128_high(vtmp2, src2);
    mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
  }
}
2104
// Multiply-reduce sixty-four bytes: reduce the low 256 bits, then the high
// 256 bits, chaining through dst as the scalar accumulator.
void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
  vextracti64x4_high(vtmp2, src2);
  mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
}
2110
// Reduce four shorts in src2 with the scalar accumulator src1; result
// (sign-extended to int) in dst.
void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    // Two horizontal adds collapse four shorts into one.
    phaddw(vtmp1, vtmp1);
    phaddw(vtmp1, vtmp1);
  } else {
    // Combine shorts 2,3 with 0,1 ...
    pshufd(vtmp2, src2, 0x1);
    reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
    // ... then short 1 with short 0.
    movdqu(vtmp1, vtmp2);
    psrldq(vtmp1, 2);
    reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
  }
  // Widen to int, fold in the scalar accumulator, extract and sign-extend.
  movdl(vtmp2, src1);
  pmovsxwd(vtmp1, vtmp1);
  reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
  pextrw(dst, vtmp1, 0x0);
  movswl(dst, dst);
}
2131
// Reduce eight shorts: fold the high quadword into the low one, then finish
// with the 4-element routine.
void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    // Pairwise add leaves four partial sums in the low half.
    phaddw(vtmp1, src2);
  } else {
    pshufd(vtmp1, src2, 0xE);
    reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
  }
  reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}
2144
// Reduce sixteen shorts (256-bit src2) down through the 8-element routine.
void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    int vector_len = Assembler::AVX_256bit;
    // Horizontal add within each lane, then gather the partial sums into
    // the low 128 bits (vpermq 0xD8 interleaves the even quadwords).
    vphaddw(vtmp2, src2, src2, vector_len);
    vpermq(vtmp2, vtmp2, 0xD8, vector_len);
  } else {
    // Fold the upper 128 bits into the lower 128 bits.
    vextracti128_high(vtmp2, src2);
    reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
  }
  reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
2156
2157 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2158 int vector_len = Assembler::AVX_256bit;
2159 vextracti64x4_high(vtmp1, src2);
2160 reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
2161 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2162 }
2163
// Reduce two longs in src2 with the scalar accumulator src1; result in dst.
void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  // Bring the high quadword down and combine with the low one.
  pshufd(vtmp2, src2, 0xE);
  reduce_operation_128(T_LONG, opcode, vtmp2, src2);
  // Fold in the scalar accumulator and move the result out.
  movdq(vtmp1, src1);
  reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
  movdq(dst, vtmp1);
}
2171
// Reduce four longs: fold the upper 128 bits into the lower 128 bits, then
// finish with the 2-element routine.
void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti128_high(vtmp1, src2);
  reduce_operation_128(T_LONG, opcode, vtmp1, src2);
  reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}
2177
// Reduce eight longs: fold the upper 256 bits into the lower 256 bits, then
// finish with the 4-element routine.
void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti64x4_high(vtmp2, src2);
  reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
  reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
2183
// Build an opmask with the low 'len' bits set: temp = all-ones, bzhi clears
// bits at position >= len, then the result is moved into the mask register.
void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
  mov64(temp, -1L);
  bzhiq(temp, temp, len);
  kmovql(dst, temp);
}
2189
// Ordered reduction of two floats: dst = (dst op src[0]) op src[1].
// Scalar ops preserve the strict left-to-right evaluation order.
void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  reduce_operation_128(T_FLOAT, opcode, dst, src);
  pshufd(vtmp, src, 0x1);
  reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
}
2195
// Ordered reduction of four floats: combine elements 0..3 into dst in
// ascending index order (required for strict FP semantics).
void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  reduce2F(opcode, dst, src, vtmp);
  pshufd(vtmp, src, 0x2);
  reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
  pshufd(vtmp, src, 0x3);
  reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
}
2203
// Ordered reduction of eight floats: lower 128-bit lane first, then the
// upper lane, keeping ascending element order.
void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce4F(opcode, dst, src, vtmp2);
  vextractf128_high(vtmp2, src);
  reduce4F(opcode, dst, vtmp2, vtmp1);
}
2209
2210 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2211 reduce8F(opcode, dst, src, vtmp1, vtmp2);
2212 vextracti64x4_high(vtmp1, src);
2213 reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
2214 }
2215
// Unordered reduction of two floats: dst[0] = src[1] op src[0] (scalar op
// on the shuffled copy). Order of operands is irrelevant here.
void C2_MacroAssembler::unorderedReduce2F(int opcode, XMMRegister dst, XMMRegister src) {
  pshufd(dst, src, 0x1);
  reduce_operation_128(T_FLOAT, opcode, dst, src);
}
2220
// Unordered reduction of four floats: combine the high pair with the low
// pair lane-wise, then finish with the 2-element routine.
void C2_MacroAssembler::unorderedReduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  pshufd(vtmp, src, 0xE);
  unordered_reduce_operation_128(T_FLOAT, opcode, vtmp, src);
  unorderedReduce2F(opcode, dst, vtmp);
}
2226
// Unordered reduction of eight floats: fold the upper 128 bits into the
// lower 128 bits lane-wise, then finish with the 4-element routine.
void C2_MacroAssembler::unorderedReduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextractf128_high(vtmp1, src);
  unordered_reduce_operation_128(T_FLOAT, opcode, vtmp1, src);
  unorderedReduce4F(opcode, dst, vtmp1, vtmp2);
}
2232
// Unordered reduction of sixteen floats: fold the upper 256 bits into the
// lower 256 bits lane-wise, then finish with the 8-element routine.
void C2_MacroAssembler::unorderedReduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextractf64x4_high(vtmp2, src);
  unordered_reduce_operation_256(T_FLOAT, opcode, vtmp2, vtmp2, src);
  unorderedReduce8F(opcode, dst, vtmp2, vtmp1, vtmp2);
}
2238
// Ordered reduction of two doubles: dst = (dst op src[0]) op src[1].
void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  reduce_operation_128(T_DOUBLE, opcode, dst, src);
  pshufd(vtmp, src, 0xE);
  reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
}
2244
// Ordered reduction of four doubles: lower 128-bit lane first, then the
// upper lane, keeping ascending element order.
void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce2D(opcode, dst, src, vtmp2);
  vextractf128_high(vtmp2, src);
  reduce2D(opcode, dst, vtmp2, vtmp1);
}
2250
2251 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2252 reduce4D(opcode, dst, src, vtmp1, vtmp2);
2253 vextracti64x4_high(vtmp1, src);
2254 reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
2255 }
2256
// Unordered reduction of two doubles: dst[0] = src[1] op src[0] (scalar op
// on the shuffled copy).
void C2_MacroAssembler::unorderedReduce2D(int opcode, XMMRegister dst, XMMRegister src) {
  pshufd(dst, src, 0xE);
  reduce_operation_128(T_DOUBLE, opcode, dst, src);
}
2261
// Unordered reduction of four doubles: fold the upper 128 bits into the
// lower 128 bits lane-wise, then finish with the 2-element routine.
void C2_MacroAssembler::unorderedReduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  vextractf128_high(vtmp, src);
  unordered_reduce_operation_128(T_DOUBLE, opcode, vtmp, src);
  unorderedReduce2D(opcode, dst, vtmp);
}
2267
// Unordered reduction of eight doubles: fold the upper 256 bits into the
// lower 256 bits lane-wise, then finish with the 4-element routine.
void C2_MacroAssembler::unorderedReduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextractf64x4_high(vtmp2, src);
  unordered_reduce_operation_256(T_DOUBLE, opcode, vtmp2, vtmp2, src);
  unorderedReduce4D(opcode, dst, vtmp2, vtmp1);
}
2273
// Masked vector load (memory -> register); forwards to the shared
// MacroAssembler implementation.
void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
  MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
}
2277
// Masked vector store (register -> memory); forwards to the shared
// MacroAssembler implementation.
void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
  MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
}
2281
// Masked register-to-register vector move; forwards to the shared
// MacroAssembler implementation.
void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, XMMRegister src, bool merge, int vector_len) {
  MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
}
2285
2286 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask,
2287 int vec_enc) {
2288 switch(elem_bt) {
2289 case T_INT:
2290 case T_FLOAT:
2291 vmaskmovps(dst, src, mask, vec_enc);
2292 break;
2293 case T_LONG:
2294 case T_DOUBLE:
2295 vmaskmovpd(dst, src, mask, vec_enc);
2296 break;
2297 default:
2298 fatal("Unsupported type %s", type2name(elem_bt));
2299 break;
2300 }
2301 }
2302
2303 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask,
2304 int vec_enc) {
2305 switch(elem_bt) {
2306 case T_INT:
2307 case T_FLOAT:
2308 vmaskmovps(dst, src, mask, vec_enc);
2309 break;
2310 case T_LONG:
2311 case T_DOUBLE:
2312 vmaskmovpd(dst, src, mask, vec_enc);
2313 break;
2314 default:
2315 fatal("Unsupported type %s", type2name(elem_bt));
2316 break;
2317 }
2318 }
2319
// Min/max reduction over a float vector of 'vlen' elements (up to 16),
// honoring Java min/max NaN/-0.0 semantics via vminmax_fp. Each loop
// iteration halves the active width: extract (or shuffle) the upper half
// into wtmp and combine it with wsrc. If is_dst_valid, dst carries an
// incoming accumulator that is folded in at the end; otherwise the last
// iteration writes straight into dst.
void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
                                          XMMRegister dst, XMMRegister src,
                                          XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                          XMMRegister xmm_0, XMMRegister xmm_1) {
  // Shuffle immediates for the last two steps: 1 swaps adjacent singles,
  // 14 brings elements 2,3 down (within a 128-bit chunk).
  const int permconst[] = {1, 14};
  XMMRegister wsrc = src;
  XMMRegister wdst = xmm_0;
  XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;

  int vlen_enc = Assembler::AVX_128bit;
  if (vlen == 16) {
    vlen_enc = Assembler::AVX_256bit;
  }

  for (int i = log2(vlen) - 1; i >=0; i--) {
    // On the final step, write directly into dst when no accumulator
    // needs to be merged afterwards.
    if (i == 0 && !is_dst_valid) {
      wdst = dst;
    }
    if (i == 3) {
      // 16 -> 8 elements: upper 256 bits of a 512-bit source.
      vextracti64x4_high(wtmp, wsrc);
    } else if (i == 2) {
      // 8 -> 4 elements: upper 128 bits.
      vextracti128_high(wtmp, wsrc);
    } else { // i = [0,1]
      vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
    }

    if (VM_Version::supports_avx10_2()) {
      vminmax_fp(opcode, T_FLOAT, wdst, k0, wtmp, wsrc, vlen_enc);
    } else {
      vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
    }
    // The combined value becomes the next iteration's source; widths
    // beyond 128 bits are only needed on the first iteration.
    wsrc = wdst;
    vlen_enc = Assembler::AVX_128bit;
  }
  if (is_dst_valid) {
    // Merge the incoming accumulator in dst with the reduced value.
    if (VM_Version::supports_avx10_2()) {
      vminmax_fp(opcode, T_FLOAT, dst, k0, wdst, dst, Assembler::AVX_128bit);
    } else {
      vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
    }
  }
}
2362
// Min/max reduction over a double vector of 'vlen' elements (up to 8),
// honoring Java min/max NaN/-0.0 semantics via vminmax_fp. Mirrors
// reduceFloatMinMax: each iteration halves the active width, and an
// incoming accumulator in dst is folded in at the end when is_dst_valid.
void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                           XMMRegister xmm_0, XMMRegister xmm_1) {
  XMMRegister wsrc = src;
  XMMRegister wdst = xmm_0;
  XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
  int vlen_enc = Assembler::AVX_128bit;
  if (vlen == 8) {
    vlen_enc = Assembler::AVX_256bit;
  }
  for (int i = log2(vlen) - 1; i >=0; i--) {
    // On the final step, write directly into dst when no accumulator
    // needs to be merged afterwards.
    if (i == 0 && !is_dst_valid) {
      wdst = dst;
    }
    if (i == 1) {
      // 4 -> 2 elements: upper 128 bits.
      vextracti128_high(wtmp, wsrc);
    } else if (i == 2) {
      // 8 -> 4 elements: upper 256 bits of a 512-bit source.
      vextracti64x4_high(wtmp, wsrc);
    } else {
      // 2 -> 1 elements: swap the pair of doubles.
      assert(i == 0, "%d", i);
      vpermilpd(wtmp, wsrc, 1, vlen_enc);
    }

    if (VM_Version::supports_avx10_2()) {
      vminmax_fp(opcode, T_DOUBLE, wdst, k0, wtmp, wsrc, vlen_enc);
    } else {
      vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
    }

    // The combined value becomes the next iteration's source.
    wsrc = wdst;
    vlen_enc = Assembler::AVX_128bit;
  }

  if (is_dst_valid) {
    // Merge the incoming accumulator in dst with the reduced value.
    if (VM_Version::supports_avx10_2()) {
      vminmax_fp(opcode, T_DOUBLE, dst, k0, wdst, dst, Assembler::AVX_128bit);
    } else {
      vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
    }
  }
}
2404
2405 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2406 switch (bt) {
2407 case T_BYTE: pextrb(dst, src, idx); break;
2408 case T_SHORT: pextrw(dst, src, idx); break;
2409 case T_INT: pextrd(dst, src, idx); break;
2410 case T_LONG: pextrq(dst, src, idx); break;
2411
2412 default:
2413 assert(false,"Should not reach here.");
2414 break;
2415 }
2416 }
2417
2418 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2419 int esize = type2aelembytes(typ);
2420 int elem_per_lane = 16/esize;
2421 int lane = elemindex / elem_per_lane;
2422 int eindex = elemindex % elem_per_lane;
2423
2424 if (lane >= 2) {
2425 assert(UseAVX > 2, "required");
2426 vextractf32x4(dst, src, lane & 3);
2427 return dst;
2428 } else if (lane > 0) {
2429 assert(UseAVX > 0, "required");
2430 vextractf128(dst, src, lane);
2431 return dst;
2432 } else {
2433 return src;
2434 }
2435 }
2436
2437 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) {
2438 if (typ == T_BYTE) {
2439 movsbl(dst, dst);
2440 } else if (typ == T_SHORT) {
2441 movswl(dst, dst);
2442 }
2443 }
2444
// Move integral element 'elemindex' (within its 128-bit lane) of src into the
// GP register dst, sign-extended to int/long width. Callers are expected to
// have already positioned the right lane at the bottom (see get_lane).
void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
  int esize = type2aelembytes(typ);
  int elem_per_lane = 16/esize;
  int eindex = elemindex % elem_per_lane;  // index within the 128-bit lane
  assert(is_integral_type(typ),"required");

  if (eindex == 0) {
    // Element 0 can be moved directly; narrower types still need extension.
    if (typ == T_LONG) {
      movq(dst, src);
    } else {
      movdl(dst, src);
      movsxl(typ, dst);
    }
  } else {
    extract(typ, dst, src, eindex);
    movsxl(typ, dst);
  }
}
2463
// Move FP element 'elemindex' (within its 128-bit lane) of src into element 0
// of dst, with the upper bits of dst cleared. vtmp is only needed on the
// pre-AVX float path (for the mask constant).
void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) {
  int esize = type2aelembytes(typ);
  int elem_per_lane = 16/esize;
  int eindex = elemindex % elem_per_lane;  // index within the 128-bit lane
  assert((typ == T_FLOAT || typ == T_DOUBLE),"required");

  if (eindex == 0) {
    // movq also zeroes the upper bits of dst.
    movq(dst, src);
  } else {
    if (typ == T_FLOAT) {
      // Replicate the wanted single into every slot of dst.
      if (UseAVX == 0) {
        movdqu(dst, src);
        shufps(dst, dst, eindex);
      } else {
        vshufps(dst, src, src, eindex, Assembler::AVX_128bit);
      }
    } else {
      // Shift the wanted double down to element 0 ...
      if (UseAVX == 0) {
        movdqu(dst, src);
        psrldq(dst, eindex*esize);
      } else {
        vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
      }
      // ... and clear everything above it.
      movq(dst, dst);
    }
  }
  // Zero upper bits
  if (typ == T_FLOAT) {
    if (UseAVX == 0) {
      assert(vtmp != xnoreg, "required.");
      movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg);
      pand(dst, vtmp);
    } else {
      vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg);
    }
  }
}
2501
2502 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2503 switch(typ) {
2504 case T_BYTE:
2505 case T_BOOLEAN:
2506 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2507 break;
2508 case T_SHORT:
2509 case T_CHAR:
2510 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2511 break;
2512 case T_INT:
2513 case T_FLOAT:
2514 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2515 break;
2516 case T_LONG:
2517 case T_DOUBLE:
2518 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2519 break;
2520 default:
2521 assert(false,"Should not reach here.");
2522 break;
2523 }
2524 }
2525
// AVX-512 masked compare against a memory operand: kdmask = (src1 cmp src2)
// under ksmask. rscratch is required when src2 may not be RIP-reachable.
void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) {
  assert(rscratch != noreg || always_reachable(src2), "missing");

  switch(typ) {
    case T_BOOLEAN:
    case T_BYTE:
      evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
      break;
    case T_CHAR:
    case T_SHORT:
      evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
      break;
    case T_INT:
    case T_FLOAT:
      evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
      break;
    case T_LONG:
    case T_DOUBLE:
      evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}
2551
2552 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2553 switch(typ) {
2554 case T_BYTE:
2555 evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2556 break;
2557 case T_SHORT:
2558 evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2559 break;
2560 case T_INT:
2561 case T_FLOAT:
2562 evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2563 break;
2564 case T_LONG:
2565 case T_DOUBLE:
2566 evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2567 break;
2568 default:
2569 assert(false,"Should not reach here.");
2570 break;
2571 }
2572 }
2573
// Emit a vector test of src1 against src2 (sets ZF/CF for a following
// branch), choosing ptest/vptest/vtestps by element size and vector width.
// For sub-128-bit vectors the low part of src1 is replicated so the unused
// upper bytes cannot influence the result; vtmp is required in that case.
void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) {
  assert(vlen_in_bytes <= 32, "");
  int esize = type2aelembytes(bt);
  if (vlen_in_bytes == 32) {
    assert(vtmp == xnoreg, "required.");
    if (esize >= 4) {
      vtestps(src1, src2, AVX_256bit);
    } else {
      vptest(src1, src2, AVX_256bit);
    }
    return;
  }
  if (vlen_in_bytes < 16) {
    // Duplicate the lower part to fill the whole register,
    // Don't need to do so for src2
    assert(vtmp != xnoreg, "required");
    int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04;
    pshufd(vtmp, src1, shuffle_imm);
  } else {
    assert(vtmp == xnoreg, "required");
    vtmp = src1;
  }
  if (esize >= 4 && VM_Version::supports_avx()) {
    vtestps(vtmp, src2, AVX_128bit);
  } else {
    ptest(vtmp, src2);
  }
}
2602
// Element-wise vector add: dst = src1 + src2, instruction chosen by element
// type. The debug block checks that byte/short adds at 512 bits (or with
// high XMM registers) are not requested without AVX512BW support.
void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
#ifdef ASSERT
  bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
  bool is_bw_supported = VM_Version::supports_avx512bw();
  if (is_bw && !is_bw_supported) {
    assert(vlen_enc != Assembler::AVX_512bit, "required");
    assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16),
           "XMM register should be 0-15");
  }
#endif // ASSERT
  switch (elem_bt) {
    case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return;
    case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return;
    case T_INT: vpaddd(dst, src1, src2, vlen_enc); return;
    case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return;
    case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return;
    case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return;
    default: fatal("Unsupported type %s", type2name(elem_bt)); return;
  }
}
2623
// Broadcast a scalar GPR value into every lane of a vector register.
// Prefers the single-instruction EVEX GPR-source broadcast forms when the
// required AVX-512 features are present; otherwise falls back to moving
// the value into the XMM register first and broadcasting from there
// (AVX2 path, limited to XMM0-15).
void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) {
  assert(UseAVX >= 2, "required");
  // Byte/short EVEX broadcasts need AVX512BW; sub-512-bit EVEX encodings
  // need AVX512VL.
  bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
  bool is_vl = vlen_enc != Assembler::AVX_512bit;
  if ((UseAVX > 2) &&
      (!is_bw || VM_Version::supports_avx512bw()) &&
      (!is_vl || VM_Version::supports_avx512vl())) {
    // EVEX path: broadcast straight from the general-purpose register.
    switch (elem_bt) {
      case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return;
      case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
      case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
      case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
      default: fatal("Unsupported type %s", type2name(elem_bt)); return;
    }
  } else {
    // AVX2 path: GPR -> XMM move, then XMM-source broadcast.
    assert(vlen_enc != Assembler::AVX_512bit, "required");
    assert((dst->encoding() < 16),"XMM register should be 0-15");
    switch (elem_bt) {
      case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return;
      case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return;
      case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return;
      case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
      case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
      case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
      default: fatal("Unsupported type %s", type2name(elem_bt)); return;
    }
  }
}
2652
2653 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
2654 switch (to_elem_bt) {
2655 case T_SHORT:
2656 vpmovsxbw(dst, src, vlen_enc);
2657 break;
2658 case T_INT:
2659 vpmovsxbd(dst, src, vlen_enc);
2660 break;
2661 case T_FLOAT:
2662 vpmovsxbd(dst, src, vlen_enc);
2663 vcvtdq2ps(dst, dst, vlen_enc);
2664 break;
2665 case T_LONG:
2666 vpmovsxbq(dst, src, vlen_enc);
2667 break;
2668 case T_DOUBLE: {
2669 int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
2670 vpmovsxbd(dst, src, mid_vlen_enc);
2671 vcvtdq2pd(dst, dst, vlen_enc);
2672 break;
2673 }
2674 default:
2675 fatal("Unsupported type %s", type2name(to_elem_bt));
2676 break;
2677 }
2678 }
2679
2680 //-------------------------------------------------------------------------------------------
2681
2682 // IndexOf for constant substrings with size >= 8 chars
2683 // which don't need to be loaded through stack.
void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
                                         Register cnt1, Register cnt2,
                                         int int_cnt2, Register result,
                                         XMMRegister vec, Register tmp,
                                         int ae) {
  // str1/cnt1 - address and element count of the string being searched
  // str2      - address of the constant substring
  // int_cnt2  - compile-time substring length, always >= stride elements
  // result    - receives the element index of the match, or -1 if none
  // ae        - argument encodings (StrIntrinsicNode::LL/UU/UL)
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");

  // This method uses the pcmpestri instruction with bound registers
  //   inputs:
  //     xmm - substring
  //     rax - substring length (elements count)
  //     mem - scanned string
  //     rdx - string length (elements count)
  //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
  //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
  //   outputs:
  //     rcx - matched index in string
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
  int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
  int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
  // scale1 indexes the searched string, scale2 the substring; for UL the
  // substring is Latin-1 (byte-scaled) while the string is UTF-16.
  Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
  Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;

  Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
        RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
        MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;
  assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");

  // Load substring (UL: zero-extend Latin-1 bytes to UTF-16 shorts).
  if (ae == StrIntrinsicNode::UL) {
    pmovzxbw(vec, Address(str2, 0));
  } else {
    movdqu(vec, Address(str2, 0));
  }
  movl(cnt2, int_cnt2);
  movptr(result, str1); // string addr

  if (int_cnt2 > stride) {
    jmpb(SCAN_TO_SUBSTR);

    // Reload substr for rescan, this code
    // is executed only for large substrings (> 8 chars)
    bind(RELOAD_SUBSTR);
    if (ae == StrIntrinsicNode::UL) {
      pmovzxbw(vec, Address(str2, 0));
    } else {
      movdqu(vec, Address(str2, 0));
    }
    negptr(cnt2); // Jumped here with negative cnt2, convert to positive

    bind(RELOAD_STR);
    // We came here after the beginning of the substring was
    // matched but the rest of it was not so we need to search
    // again. Start from the next element after the previous match.

    // cnt2 is number of substring remaining elements and
    // cnt1 is number of string remaining elements when cmp failed.
    // Restored cnt1 = cnt1 - cnt2 + int_cnt2
    subl(cnt1, cnt2);
    addl(cnt1, int_cnt2);
    movl(cnt2, int_cnt2); // Now restore cnt2

    decrementl(cnt1);     // Shift to next element
    cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND); // Left less than substring

    addptr(result, (1<<scale1));

  } // (int_cnt2 > 8)

  // Scan string for start of substr in 16-byte vectors
  bind(SCAN_TO_SUBSTR);
  pcmpestri(vec, Address(result, 0), mode);
  jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1 (partial/full match)
  subl(cnt1, stride);
  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
  cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
  addptr(result, 16);
  jmpb(SCAN_TO_SUBSTR);

  // Found a potential substr
  bind(FOUND_CANDIDATE);
  // Matched whole vector if first element matched (tmp(rcx) == 0).
  if (int_cnt2 == stride) {
    jccb(Assembler::overflow, RET_FOUND);    // OF == 1
  } else { // int_cnt2 > 8
    jccb(Assembler::overflow, FOUND_SUBSTR);
  }
  // After pcmpestri tmp(rcx) contains matched element index
  // Compute start addr of substr
  lea(result, Address(result, tmp, scale1));

  // Make sure string is still long enough
  subl(cnt1, tmp);
  cmpl(cnt1, cnt2);
  if (int_cnt2 == stride) {
    jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
  } else { // int_cnt2 > 8
    jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
  }
  // Left less than substring.

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmp(EXIT);

  if (int_cnt2 > stride) {
    // This code is optimized for the case when whole substring
    // is matched if its head is matched.
    bind(MATCH_SUBSTR_HEAD);
    pcmpestri(vec, Address(result, 0), mode);
    // Reload only string if does not match
    jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0

    Label CONT_SCAN_SUBSTR;
    // Compare the rest of substring (> 8 chars).
    bind(FOUND_SUBSTR);
    // First 8 chars are already matched.
    negptr(cnt2);
    addptr(cnt2, stride);

    bind(SCAN_SUBSTR);
    subl(cnt1, stride);
    cmpl(cnt2, -stride); // Do not read beyond substring
    jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
    // Back-up strings to avoid reading beyond substring:
    // cnt1 = cnt1 - cnt2 + 8
    addl(cnt1, cnt2); // cnt2 is negative
    addl(cnt1, stride);
    movl(cnt2, stride); negptr(cnt2);
    bind(CONT_SCAN_SUBSTR);
    if (int_cnt2 < (int)G) {
      // Small enough that int_cnt2 scaled by the element size cannot
      // overflow an int: fold the tail offset into the displacement.
      int tail_off1 = int_cnt2<<scale1;
      int tail_off2 = int_cnt2<<scale2;
      if (ae == StrIntrinsicNode::UL) {
        pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
      } else {
        movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
      }
      pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
    } else {
      // calculate index in register to avoid integer overflow (int_cnt2*2)
      movl(tmp, int_cnt2);
      addptr(tmp, cnt2);
      if (ae == StrIntrinsicNode::UL) {
        pmovzxbw(vec, Address(str2, tmp, scale2, 0));
      } else {
        movdqu(vec, Address(str2, tmp, scale2, 0));
      }
      pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
    }
    // Need to reload strings pointers if not matched whole vector
    jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
    addptr(cnt2, stride);
    jcc(Assembler::negative, SCAN_SUBSTR);
    // Fall through if found full substring

  } // (int_cnt2 > 8)

  bind(RET_FOUND);
  // Found result if we matched full small substring.
  // Compute substr offset
  subptr(result, str1);
  if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
    shrl(result, 1); // bytes -> chars index
  }
  bind(EXIT);

} // string_indexofC8
2860
2861 // Small strings are loaded through stack if they cross page boundary.
void C2_MacroAssembler::string_indexof(Register str1, Register str2,
                                       Register cnt1, Register cnt2,
                                       int int_cnt2, Register result,
                                       XMMRegister vec, Register tmp,
                                       int ae) {
  // str1/cnt1 - address and element count of the string being searched
  // str2/cnt2 - address and element count of the substring
  // int_cnt2  - compile-time substring length (< stride), or -1 when the
  //             length is only known at runtime (then it is in cnt2)
  // result    - receives the element index of the match, or -1 if none
  // ae        - argument encodings (StrIntrinsicNode::LL/UU/UL)
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");

  //
  // int_cnt2 is length of small (< 8 chars) constant substring
  // or (-1) for non constant substring in which case its length
  // is in cnt2 register.
  //
  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;
  //
  int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
  assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
  // This method uses the pcmpestri instruction with bound registers
  //   inputs:
  //     xmm - substring
  //     rax - substring length (elements count)
  //     mem - scanned string
  //     rdx - string length (elements count)
  //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
  //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
  //   outputs:
  //     rcx - matched index in string
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
  int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
  Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
  Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;

  Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
        RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
        FOUND_CANDIDATE;

  { //========================================================
    // We don't know where these strings are located
    // and we can't read beyond them. Load them through stack.
    Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;

    movptr(tmp, rsp); // save old SP

    if (int_cnt2 > 0) { // small (< 8 chars) constant substring
      // Pick the widest safe load for the known substring length.
      if (int_cnt2 == (1>>scale2)) { // One byte
        assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
        load_unsigned_byte(result, Address(str2, 0));
        movdl(vec, result); // move 32 bits
      } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) { // Three bytes
        // Not enough header space in 32-bit VM: 12+3 = 15.
        movl(result, Address(str2, -1));
        shrl(result, 8);
        movdl(vec, result); // move 32 bits
      } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) { // One char
        load_unsigned_short(result, Address(str2, 0));
        movdl(vec, result); // move 32 bits
      } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
        movdl(vec, Address(str2, 0)); // move 32 bits
      } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
        movq(vec, Address(str2, 0)); // move 64 bits
      } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7})
        // Array header size is 12 bytes in 32-bit VM
        // + 6 bytes for 3 chars == 18 bytes,
        // enough space to load vec and shift.
        assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
        if (ae == StrIntrinsicNode::UL) {
          int tail_off = int_cnt2-8;
          pmovzxbw(vec, Address(str2, tail_off));
          psrldq(vec, -2*tail_off);
        }
        else {
          // Load 16 bytes ending at the substring's tail and shift the
          // unwanted leading bytes out.
          int tail_off = int_cnt2*(1<<scale2);
          movdqu(vec, Address(str2, tail_off-16));
          psrldq(vec, 16-tail_off);
        }
      }
    } else { // not constant substring
      cmpl(cnt2, stride);
      jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough

      // We can read beyond string if srt+16 does not cross page boundary
      // since heaps are aligned and mapped by pages.
      assert(os::vm_page_size() < (int)G, "default page should be small");
      movl(result, str2); // We need only low 32 bits
      andl(result, ((int)os::vm_page_size()-1));
      cmpl(result, ((int)os::vm_page_size()-16));
      jccb(Assembler::belowEqual, CHECK_STR);

      // Move small strings to stack to allow load 16 bytes into vec.
      subptr(rsp, 16);
      int stk_offset = wordSize-(1<<scale2);
      push(cnt2);

      // Copy the substring to the stack, back to front.
      bind(COPY_SUBSTR);
      if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
        load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
        movb(Address(rsp, cnt2, scale2, stk_offset), result);
      } else if (ae == StrIntrinsicNode::UU) {
        load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
        movw(Address(rsp, cnt2, scale2, stk_offset), result);
      }
      decrement(cnt2);
      jccb(Assembler::notZero, COPY_SUBSTR);

      pop(cnt2);
      movptr(str2, rsp); // New substring address
    } // non constant

    bind(CHECK_STR);
    cmpl(cnt1, stride);
    jccb(Assembler::aboveEqual, BIG_STRINGS);

    // Check cross page boundary.
    movl(result, str1); // We need only low 32 bits
    andl(result, ((int)os::vm_page_size()-1));
    cmpl(result, ((int)os::vm_page_size()-16));
    jccb(Assembler::belowEqual, BIG_STRINGS);

    subptr(rsp, 16);
    int stk_offset = -(1<<scale1);
    if (int_cnt2 < 0) { // not constant
      push(cnt2);
      stk_offset += wordSize;
    }
    movl(cnt2, cnt1);

    // Copy the string to the stack, back to front.
    bind(COPY_STR);
    if (ae == StrIntrinsicNode::LL) {
      load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
      movb(Address(rsp, cnt2, scale1, stk_offset), result);
    } else {
      load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
      movw(Address(rsp, cnt2, scale1, stk_offset), result);
    }
    decrement(cnt2);
    jccb(Assembler::notZero, COPY_STR);

    if (int_cnt2 < 0) { // not constant
      pop(cnt2);
    }
    movptr(str1, rsp); // New string address

    bind(BIG_STRINGS);
    // Load substring.
    if (int_cnt2 < 0) { // -1
      if (ae == StrIntrinsicNode::UL) {
        pmovzxbw(vec, Address(str2, 0));
      } else {
        movdqu(vec, Address(str2, 0));
      }
      push(cnt2); // substr count
      push(str2); // substr addr
      push(str1); // string addr
    } else {
      // Small (< 8 chars) constant substrings are loaded already.
      movl(cnt2, int_cnt2);
    }
    push(tmp); // original SP

  } // Finished loading

  //========================================================
  // Start search
  //

  movptr(result, str1); // string addr

  if (int_cnt2 < 0) { // Only for non constant substring
    jmpb(SCAN_TO_SUBSTR);

    // SP saved at sp+0
    // String saved at sp+1*wordSize
    // Substr saved at sp+2*wordSize
    // Substr count saved at sp+3*wordSize

    // Reload substr for rescan, this code
    // is executed only for large substrings (> 8 chars)
    bind(RELOAD_SUBSTR);
    movptr(str2, Address(rsp, 2*wordSize));
    movl(cnt2, Address(rsp, 3*wordSize));
    if (ae == StrIntrinsicNode::UL) {
      pmovzxbw(vec, Address(str2, 0));
    } else {
      movdqu(vec, Address(str2, 0));
    }
    // We came here after the beginning of the substring was
    // matched but the rest of it was not so we need to search
    // again. Start from the next element after the previous match.
    subptr(str1, result); // Restore counter
    if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
      shrl(str1, 1);
    }
    addl(cnt1, str1);
    decrementl(cnt1);   // Shift to next element
    cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND); // Left less than substring

    addptr(result, (1<<scale1));
  } // non constant

  // Scan string for start of substr in 16-byte vectors
  bind(SCAN_TO_SUBSTR);
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
  pcmpestri(vec, Address(result, 0), mode);
  jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
  subl(cnt1, stride);
  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
  cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
  addptr(result, 16);

  bind(ADJUST_STR);
  cmpl(cnt1, stride); // Do not read beyond string
  jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
  // Back-up string to avoid reading beyond string.
  lea(result, Address(result, cnt1, scale1, -16));
  movl(cnt1, stride);
  jmpb(SCAN_TO_SUBSTR);

  // Found a potential substr
  bind(FOUND_CANDIDATE);
  // After pcmpestri tmp(rcx) contains matched element index

  // Make sure string is still long enough
  subl(cnt1, tmp);
  cmpl(cnt1, cnt2);
  jccb(Assembler::greaterEqual, FOUND_SUBSTR);
  // Left less than substring.

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmp(CLEANUP);

  bind(FOUND_SUBSTR);
  // Compute start addr of substr
  lea(result, Address(result, tmp, scale1));
  if (int_cnt2 > 0) { // Constant substring
    // Repeat search for small substring (< 8 chars)
    // from new point without reloading substring.
    // Have to check that we don't read beyond string.
    cmpl(tmp, stride-int_cnt2);
    jccb(Assembler::greater, ADJUST_STR);
    // Fall through if matched whole substring.
  } else { // non constant
    assert(int_cnt2 == -1, "should be != 0");

    addl(tmp, cnt2);
    // Found result if we matched whole substring.
    cmpl(tmp, stride);
    jcc(Assembler::lessEqual, RET_FOUND);

    // Repeat search for small substring (<= 8 chars)
    // from new point 'str1' without reloading substring.
    cmpl(cnt2, stride);
    // Have to check that we don't read beyond string.
    jccb(Assembler::lessEqual, ADJUST_STR);

    Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
    // Compare the rest of substring (> 8 chars).
    movptr(str1, result);

    cmpl(tmp, cnt2);
    // First 8 chars are already matched.
    jccb(Assembler::equal, CHECK_NEXT);

    bind(SCAN_SUBSTR);
    pcmpestri(vec, Address(str1, 0), mode);
    // Need to reload strings pointers if not matched whole vector
    jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0

    bind(CHECK_NEXT);
    subl(cnt2, stride);
    jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
    addptr(str1, 16);
    // UL: the Latin-1 substring advances 8 bytes per 8-char chunk.
    if (ae == StrIntrinsicNode::UL) {
      addptr(str2, 8);
    } else {
      addptr(str2, 16);
    }
    subl(cnt1, stride);
    cmpl(cnt2, stride); // Do not read beyond substring
    jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
    // Back-up strings to avoid reading beyond substring.

    if (ae == StrIntrinsicNode::UL) {
      lea(str2, Address(str2, cnt2, scale2, -8));
      lea(str1, Address(str1, cnt2, scale1, -16));
    } else {
      lea(str2, Address(str2, cnt2, scale2, -16));
      lea(str1, Address(str1, cnt2, scale1, -16));
    }
    subl(cnt1, cnt2);
    movl(cnt2, stride);
    addl(cnt1, stride);
    bind(CONT_SCAN_SUBSTR);
    if (ae == StrIntrinsicNode::UL) {
      pmovzxbw(vec, Address(str2, 0));
    } else {
      movdqu(vec, Address(str2, 0));
    }
    jmp(SCAN_SUBSTR);

    bind(RET_FOUND_LONG);
    // str1 was clobbered as a cursor; reload the original string address
    // saved on the stack above.
    movptr(str1, Address(rsp, wordSize));
  } // non constant

  bind(RET_FOUND);
  // Compute substr offset
  subptr(result, str1);
  if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
    shrl(result, 1); // bytes -> chars index
  }
  bind(CLEANUP);
  pop(rsp); // restore SP

} // string_indexof
3181
void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
                                            XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
  // Find the first occurrence of the UTF-16 char 'ch' in str1 (cnt1
  // elements). result receives the char index, or -1 if not found.
  // Strategy: 16-chars-at-a-time AVX2 loop, then 8-chars SSE loop,
  // then a scalar tail.
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");

  int stride = 8;

  Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
        SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
        RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
        FOUND_SEQ_CHAR, DONE_LABEL;

  movptr(result, str1);
  if (UseAVX >= 2) {
    cmpl(cnt1, stride);
    jcc(Assembler::less, SCAN_TO_CHAR);
    cmpl(cnt1, 2*stride);
    jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
    // Broadcast ch to all 16 word lanes of vec1; vec2 stays zero for the
    // vptest-based "any lane matched" check below.
    movdl(vec1, ch);
    vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
    vpxor(vec2, vec2);
    movl(tmp, cnt1);
    andl(tmp, 0xFFFFFFF0); //vector count (in chars)
    andl(cnt1,0x0000000F); //tail count (in chars)

    bind(SCAN_TO_16_CHAR_LOOP);
    vmovdqu(vec3, Address(result, 0));
    vpcmpeqw(vec3, vec3, vec1, 1);
    vptest(vec2, vec3);
    jcc(Assembler::carryClear, FOUND_CHAR); // CF clear => some lane matched
    addptr(result, 32);
    subl(tmp, 2*stride);
    jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
    jmp(SCAN_TO_8_CHAR);
    bind(SCAN_TO_8_CHAR_INIT);
    // SSE setup: replicate ch into all 8 word lanes of vec1.
    movdl(vec1, ch);
    pshuflw(vec1, vec1, 0x00);
    pshufd(vec1, vec1, 0);
    pxor(vec2, vec2);
  }
  bind(SCAN_TO_8_CHAR);
  cmpl(cnt1, stride);
  jcc(Assembler::less, SCAN_TO_CHAR);
  if (UseAVX < 2) {
    // Same SSE setup as above; only reached when AVX2 skipped it.
    movdl(vec1, ch);
    pshuflw(vec1, vec1, 0x00);
    pshufd(vec1, vec1, 0);
    pxor(vec2, vec2);
  }
  movl(tmp, cnt1);
  andl(tmp, 0xFFFFFFF8); //vector count (in chars)
  andl(cnt1,0x00000007); //tail count (in chars)

  bind(SCAN_TO_8_CHAR_LOOP);
  movdqu(vec3, Address(result, 0));
  pcmpeqw(vec3, vec1);
  ptest(vec2, vec3);
  jcc(Assembler::carryClear, FOUND_CHAR);
  addptr(result, 16);
  subl(tmp, stride);
  jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
  // Scalar tail: compare one char at a time.
  bind(SCAN_TO_CHAR);
  testl(cnt1, cnt1);
  jcc(Assembler::zero, RET_NOT_FOUND);
  bind(SCAN_TO_CHAR_LOOP);
  load_unsigned_short(tmp, Address(result, 0));
  cmpl(ch, tmp);
  jccb(Assembler::equal, FOUND_SEQ_CHAR);
  addptr(result, 2);
  subl(cnt1, 1);
  jccb(Assembler::zero, RET_NOT_FOUND);
  jmp(SCAN_TO_CHAR_LOOP);

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmpb(DONE_LABEL);

  bind(FOUND_CHAR);
  // Locate the matching lane: byte mask -> lowest set bit -> byte offset.
  if (UseAVX >= 2) {
    vpmovmskb(tmp, vec3);
  } else {
    pmovmskb(tmp, vec3);
  }
  bsfl(ch, tmp);
  addptr(result, ch);

  bind(FOUND_SEQ_CHAR);
  subptr(result, str1);
  shrl(result, 1); // byte offset -> char index

  bind(DONE_LABEL);
} // string_indexof_char
3274
void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
  // Latin-1 variant of string_indexof_char: find the first occurrence of
  // the byte 'ch' in str1 (cnt1 bytes). result receives the byte index,
  // or -1 if not found. Strategy: 32-bytes-at-a-time AVX2 loop, then
  // 16-bytes SSE loop, then a scalar tail.
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");

  int stride = 16;

  Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
        SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
        RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
        FOUND_SEQ_CHAR, DONE_LABEL;

  movptr(result, str1);
  if (UseAVX >= 2) {
    cmpl(cnt1, stride);
    jcc(Assembler::less, SCAN_TO_CHAR_INIT);
    cmpl(cnt1, stride*2);
    jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
    // Broadcast ch to all 32 byte lanes of vec1; vec2 stays zero for the
    // vptest-based "any lane matched" check below.
    movdl(vec1, ch);
    vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
    vpxor(vec2, vec2);
    movl(tmp, cnt1);
    andl(tmp, 0xFFFFFFE0); //vector count (in chars)
    andl(cnt1,0x0000001F); //tail count (in chars)

    bind(SCAN_TO_32_CHAR_LOOP);
    vmovdqu(vec3, Address(result, 0));
    vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
    vptest(vec2, vec3);
    jcc(Assembler::carryClear, FOUND_CHAR); // CF clear => some lane matched
    addptr(result, 32);
    subl(tmp, stride*2);
    jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
    jmp(SCAN_TO_16_CHAR);

    bind(SCAN_TO_16_CHAR_INIT);
    // SSE setup: pshufb with an all-zero mask replicates byte 0 of vec1
    // into every lane.
    movdl(vec1, ch);
    pxor(vec2, vec2);
    pshufb(vec1, vec2);
  }

  bind(SCAN_TO_16_CHAR);
  cmpl(cnt1, stride);
  jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left
  if (UseAVX < 2) {
    // Same SSE setup as above; only reached when AVX2 skipped it.
    movdl(vec1, ch);
    pxor(vec2, vec2);
    pshufb(vec1, vec2);
  }
  movl(tmp, cnt1);
  andl(tmp, 0xFFFFFFF0); //vector count (in bytes)
  andl(cnt1,0x0000000F); //tail count (in bytes)

  bind(SCAN_TO_16_CHAR_LOOP);
  movdqu(vec3, Address(result, 0));
  pcmpeqb(vec3, vec1);
  ptest(vec2, vec3);
  jcc(Assembler::carryClear, FOUND_CHAR);
  addptr(result, 16);
  subl(tmp, stride);
  jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items...

  // Scalar tail: compare one byte at a time.
  bind(SCAN_TO_CHAR_INIT);
  testl(cnt1, cnt1);
  jcc(Assembler::zero, RET_NOT_FOUND);
  bind(SCAN_TO_CHAR_LOOP);
  load_unsigned_byte(tmp, Address(result, 0));
  cmpl(ch, tmp);
  jccb(Assembler::equal, FOUND_SEQ_CHAR);
  addptr(result, 1);
  subl(cnt1, 1);
  jccb(Assembler::zero, RET_NOT_FOUND);
  jmp(SCAN_TO_CHAR_LOOP);

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmpb(DONE_LABEL);

  bind(FOUND_CHAR);
  // Locate the matching lane: byte mask -> lowest set bit -> byte offset.
  if (UseAVX >= 2) {
    vpmovmskb(tmp, vec3);
  } else {
    pmovmskb(tmp, vec3);
  }
  bsfl(ch, tmp);
  addptr(result, ch);

  bind(FOUND_SEQ_CHAR);
  subptr(result, str1);

  bind(DONE_LABEL);
} // stringL_indexof_char
3367
3368 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
3369 switch (eltype) {
3370 case T_BOOLEAN: return sizeof(jboolean);
3371 case T_BYTE: return sizeof(jbyte);
3372 case T_SHORT: return sizeof(jshort);
3373 case T_CHAR: return sizeof(jchar);
3374 case T_INT: return sizeof(jint);
3375 default:
3376 ShouldNotReachHere();
3377 return -1;
3378 }
3379 }
3380
3381 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
3382 switch (eltype) {
3383 // T_BOOLEAN used as surrogate for unsigned byte
3384 case T_BOOLEAN: movzbl(dst, src); break;
3385 case T_BYTE: movsbl(dst, src); break;
3386 case T_SHORT: movswl(dst, src); break;
3387 case T_CHAR: movzwl(dst, src); break;
3388 case T_INT: movl(dst, src); break;
3389 default:
3390 ShouldNotReachHere();
3391 }
3392 }
3393
3394 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) {
3395 load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
3396 }
3397
3398 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) {
3399 load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
3400 }
3401
3402 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) {
3403 const int vlen = Assembler::AVX_256bit;
3404 switch (eltype) {
3405 case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT); break;
3406 case T_BYTE: vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT); break;
3407 case T_SHORT: vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3408 case T_CHAR: vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3409 case T_INT:
3410 // do nothing
3411 break;
3412 default:
3413 ShouldNotReachHere();
3414 }
3415 }
3416
// Vectorized Arrays.hashCode / String.hashCode: computes
// result = result*31^cnt1 + sum(ary1[i]*31^(cnt1-1-i)) using four
// 256-bit accumulators (32 elements per iteration), then an unrolled
// scalar loop for the remainder.
void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result,
                                        Register index, Register tmp2, Register tmp3, XMMRegister vnext,
                                        XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3,
                                        XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3,
                                        XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3,
                                        BasicType eltype) {
  ShortBranchVerifier sbv(this);
  assert(UseAVX >= 2, "AVX2 intrinsics are required");
  assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3);
  assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3);

  Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN,
        SHORT_UNROLLED_LOOP_EXIT,
        UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME,
        UNROLLED_VECTOR_LOOP_BEGIN,
        END;
  switch (eltype) {
  case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
  case T_CHAR:    BLOCK_COMMENT("arrays_hashcode(char) {");          break;
  case T_BYTE:    BLOCK_COMMENT("arrays_hashcode(byte) {");          break;
  case T_SHORT:   BLOCK_COMMENT("arrays_hashcode(short) {");         break;
  case T_INT:     BLOCK_COMMENT("arrays_hashcode(int) {");           break;
  default:        BLOCK_COMMENT("arrays_hashcode {");                break;
  }

  // For "renaming" for readability of the code
  const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 },
                    vresult[] = { vresult0, vresult1, vresult2, vresult3 },
                    vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 };

  const int elsize = arrays_hashcode_elsize(eltype);

  /*
    if (cnt1 >= 2) {
      if (cnt1 >= 32) {
        UNROLLED VECTOR LOOP
      }
      UNROLLED SCALAR LOOP
    }
    SINGLE SCALAR
   */

  cmpl(cnt1, 32);
  jcc(Assembler::less, SHORT_UNROLLED_BEGIN);

  // cnt1 >= 32 && generate_vectorized_loop
  xorl(index, index);

  // vresult = IntVector.zero(I256);
  for (int idx = 0; idx < 4; idx++) {
    vpxor(vresult[idx], vresult[idx]);
  }
  // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]);
  Register bound = tmp2;
  Register next = tmp3;
  lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint))));
  movl(next, Address(tmp2, 0));
  movdl(vnext, next);
  vpbroadcastd(vnext, vnext, Assembler::AVX_256bit);

  // index = 0;
  // bound = cnt1 & ~(32 - 1);
  movl(bound, cnt1);
  andl(bound, ~(32 - 1));
  // for (; index < bound; index += 32) {
  bind(UNROLLED_VECTOR_LOOP_BEGIN);
  // result *= next;
  imull(result, next);
  // loop fission to upfront the cost of fetching from memory, OOO execution
  // can then hopefully do a better job of prefetching
  for (int idx = 0; idx < 4; idx++) {
    arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype);
  }
  // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7];
  for (int idx = 0; idx < 4; idx++) {
    vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit);
    arrays_hashcode_elvcast(vtmp[idx], eltype);
    vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit);
  }
  // index += 32;
  addl(index, 32);
  // index < bound;
  cmpl(index, bound);
  jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN);
  // }

  // Advance ary1 past the vectorized prefix and fall into the scalar loop
  // with the remaining element count.
  lea(ary1, Address(ary1, bound, Address::times(elsize)));
  subl(cnt1, bound);
  // release bound

  // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1);
  for (int idx = 0; idx < 4; idx++) {
    lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint))));
    arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT);
    vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit);
  }
  // result += vresult.reduceLanes(ADD);
  for (int idx = 0; idx < 4; idx++) {
    reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]);
  }

  // } else if (cnt1 < 32) {

  bind(SHORT_UNROLLED_BEGIN);
  // int i = 1;
  movl(index, 1);
  cmpl(index, cnt1);
  jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT);

  // for (; i < cnt1 ; i += 2) {
  bind(SHORT_UNROLLED_LOOP_BEGIN);
  // Two elements per iteration: result = result*31*31 + a[i-1]*31 + a[i],
  // with 961 == 31*31 and (x << 5) - x == x*31.
  movl(tmp3, 961);
  imull(result, tmp3);
  arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype);
  movl(tmp3, tmp2);
  shll(tmp3, 5);
  subl(tmp3, tmp2);
  addl(result, tmp3);
  arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype);
  addl(result, tmp3);
  addl(index, 2);
  cmpl(index, cnt1);
  jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN);

  // }
  // if (i >= cnt1) {
  bind(SHORT_UNROLLED_LOOP_EXIT);
  // Flags are still set from the preceding cmpl(index, cnt1):
  // index > cnt1 means the pairwise loop consumed the last element;
  // index == cnt1 leaves one final element to fold in below.
  jccb(Assembler::greater, END);
  movl(tmp2, result);
  shll(result, 5);
  subl(result, tmp2); // result *= 31
  arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype);
  addl(result, tmp3);
  // }
  bind(END);

  BLOCK_COMMENT("} // arrays_hashcode");

} // arrays_hashcode
3556
3557 // helper function for string_compare
3558 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3559 Address::ScaleFactor scale, Address::ScaleFactor scale1,
3560 Address::ScaleFactor scale2, Register index, int ae) {
3561 if (ae == StrIntrinsicNode::LL) {
3562 load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3563 load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3564 } else if (ae == StrIntrinsicNode::UU) {
3565 load_unsigned_short(elem1, Address(str1, index, scale, 0));
3566 load_unsigned_short(elem2, Address(str2, index, scale, 0));
3567 } else {
3568 load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3569 load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3570 }
3571 }
3572
// Compare strings, used for char[] and byte[].
// Emits code that performs a three-way comparison of two strings:
// on exit 'result' holds the difference of the first pair of mismatching
// elements, or - when one string is a prefix of the other - the difference
// of the string lengths (halved to a char count for UU, see LENGTH_DIFF_LABEL).
// 'ae' selects the encoding pair (StrIntrinsicNode::LL/UU/LU/UL).
// Clobbers str1, str2, cnt1, cnt2, vec1 (and 'mask' on the AVX512 path).
void C2_MacroAssembler::string_compare(Register str1, Register str2,
                                       Register cnt1, Register cnt2, Register result,
                                       XMMRegister vec1, int ae, KRegister mask) {
  ShortBranchVerifier sbv(this);
  Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
  Label COMPARE_WIDE_VECTORS_LOOP_FAILED; // used only AVX3
  int stride, stride2, adr_stride, adr_stride1, adr_stride2;
  int stride2x2 = 0x40; // bytes consumed per iteration of the AVX512 loop
  Address::ScaleFactor scale = Address::no_scale;
  Address::ScaleFactor scale1 = Address::no_scale;
  Address::ScaleFactor scale2 = Address::no_scale;

  if (ae != StrIntrinsicNode::LL) {
    stride2x2 = 0x20; // 32 elements per AVX512 iteration when chars are involved
  }

  if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
    shrl(cnt2, 1); // NOTE(review): cnt2 appears to arrive as a byte length for
                   // the UTF16 operand here - confirm against the intrinsic caller
  }
  // Compute the minimum of the string lengths and the
  // difference of the string lengths (stack).
  // Do the conditional move stuff
  movl(result, cnt1);
  subl(cnt1, cnt2);
  push(cnt1); // length difference, consumed at LENGTH_DIFF_LABEL or discarded at POP_LABEL
  cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2)

  // Is the minimum length zero?
  testl(cnt2, cnt2);
  jcc(Assembler::zero, LENGTH_DIFF_LABEL);
  if (ae == StrIntrinsicNode::LL) {
    // Load first bytes
    load_unsigned_byte(result, Address(str1, 0)); // result = str1[0]
    load_unsigned_byte(cnt1, Address(str2, 0)); // cnt1 = str2[0]
  } else if (ae == StrIntrinsicNode::UU) {
    // Load first characters
    load_unsigned_short(result, Address(str1, 0));
    load_unsigned_short(cnt1, Address(str2, 0));
  } else {
    load_unsigned_byte(result, Address(str1, 0));
    load_unsigned_short(cnt1, Address(str2, 0));
  }
  subl(result, cnt1); // result = str1[0] - str2[0]; nonzero means the answer is found
  jcc(Assembler::notZero, POP_LABEL);

  if (ae == StrIntrinsicNode::UU) {
    // Divide length by 2 to get number of chars
    shrl(cnt2, 1);
  }
  cmpl(cnt2, 1);
  jcc(Assembler::equal, LENGTH_DIFF_LABEL); // the single element was already compared above

  // Check if the strings start at the same location and setup scale and stride
  if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
    cmpptr(str1, str2);
    jcc(Assembler::equal, LENGTH_DIFF_LABEL); // same backing data: only lengths can differ
    if (ae == StrIntrinsicNode::LL) {
      scale = Address::times_1;
      stride = 16; // 16 byte elements per pcmpestri
    } else {
      scale = Address::times_2;
      stride = 8; // 8 char elements per pcmpestri
    }
  } else {
    scale1 = Address::times_1; // str1 holds byte elements
    scale2 = Address::times_2; // str2 holds char elements
    // scale not used
    stride = 8;
  }

  if (UseAVX >= 2 && UseSSE42Intrinsics) {
    Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
    Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
    Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
    Label COMPARE_TAIL_LONG;
    Label COMPARE_WIDE_VECTORS_LOOP_AVX3; // used only AVX3

    // pcmpestri imm8 0x19 = 11001b: equal-each aggregation with negated
    // result, unsigned word elements; clearing bit 0 selects unsigned bytes.
    int pcmpmask = 0x19;
    if (ae == StrIntrinsicNode::LL) {
      pcmpmask &= ~0x01;
    }

    // Setup to compare 16-chars (32-bytes) vectors,
    // start from first character again because it has aligned address.
    if (ae == StrIntrinsicNode::LL) {
      stride2 = 32;
    } else {
      stride2 = 16;
    }
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      adr_stride = stride << scale;
    } else {
      adr_stride1 = 8; //stride << scale1;
      adr_stride2 = 16; //stride << scale2;
    }

    assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
    // rax and rdx are used by pcmpestri as elements counters
    movl(result, cnt2);
    andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count
    jcc(Assembler::zero, COMPARE_TAIL_LONG);

    // fast path : compare first 2 8-char vectors.
    bind(COMPARE_16_CHARS);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, 0));
    } else {
      pmovzxbw(vec1, Address(str1, 0)); // widen str1's bytes to chars for the compare
    }
    pcmpestri(vec1, Address(str2, 0), pcmpmask);
    jccb(Assembler::below, COMPARE_INDEX_CHAR); // CF==1: mismatch, index in rcx (cnt1)

    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, adr_stride));
      pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
    } else {
      pmovzxbw(vec1, Address(str1, adr_stride1));
      pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
    }
    jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
    addl(cnt1, stride); // mismatch was in the second vector: bias the index by one stride

    // Compare the characters at index in cnt1
    bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
    load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
    subl(result, cnt2);
    jmp(POP_LABEL);

    // Setup the registers to start vector comparison loop
    bind(COMPARE_WIDE_VECTORS);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    subl(result, stride2);
    subl(cnt2, stride2);
    jcc(Assembler::zero, COMPARE_WIDE_TAIL);
    negptr(result); // result is now a negative offset from the end of the compared region

    // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
    bind(COMPARE_WIDE_VECTORS_LOOP);

    if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
      cmpl(cnt2, stride2x2);
      jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
      testl(cnt2, stride2x2-1); // cnt2 holds the vector count
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2); // means we cannot subtract by 0x40

      bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
      if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
        evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
        evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
      } else {
        vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
        evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
      }
      kortestql(mask, mask);
      jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED); // miscompare
      addptr(result, stride2x2); // update since we already compared at this addr
      subl(cnt2, stride2x2); // and sub the size too
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);

      vpxor(vec1, vec1);
      jmpb(COMPARE_WIDE_TAIL);
    }//if (VM_Version::supports_avx512vlbw())

    bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      vmovdqu(vec1, Address(str1, result, scale));
      vpxor(vec1, Address(str2, result, scale));
    } else {
      vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
      vpxor(vec1, Address(str2, result, scale2));
    }
    vptest(vec1, vec1); // any set bit means the 32-byte chunks differ
    jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
    addptr(result, stride2);
    subl(cnt2, stride2);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);

    // compare wide vectors tail
    bind(COMPARE_WIDE_TAIL);
    testptr(result, result);
    jcc(Assembler::zero, LENGTH_DIFF_LABEL);

    movl(result, stride2);
    movl(cnt2, result);
    negptr(result); // run the AVX2 loop once more over the final (overlapping) stride2 elements
    jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);

    // Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors.
    bind(VECTOR_NOT_EQUAL);
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    jmp(COMPARE_16_CHARS); // redo the mismatching chunk with pcmpestri to get the exact index

    // Compare tail chars, length between 1 to 15 chars
    bind(COMPARE_TAIL_LONG);
    movl(cnt2, result);
    cmpl(cnt2, stride);
    jcc(Assembler::less, COMPARE_SMALL_STR);

    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, 0));
    } else {
      pmovzxbw(vec1, Address(str1, 0));
    }
    pcmpestri(vec1, Address(str2, 0), pcmpmask);
    jcc(Assembler::below, COMPARE_INDEX_CHAR);
    subptr(cnt2, stride);
    jcc(Assembler::zero, LENGTH_DIFF_LABEL);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    negptr(cnt2);
    jmpb(WHILE_HEAD_LABEL);

    bind(COMPARE_SMALL_STR);
  } else if (UseSSE42Intrinsics) {
    Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
    int pcmpmask = 0x19;
    // Setup to compare 8-char (16-byte) vectors,
    // start from first character again because it has aligned address.
    movl(result, cnt2);
    andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count
    if (ae == StrIntrinsicNode::LL) {
      pcmpmask &= ~0x01; // switch element size to unsigned bytes
    }
    jcc(Assembler::zero, COMPARE_TAIL);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    negptr(result);

    // pcmpestri
    // inputs:
    // vec1- substring
    // rax - negative string length (elements count)
    // mem - scanned string
    // rdx - string length (elements count)
    // pcmpmask - cmp mode: 11000 (string compare with negated result)
    // + 00 (unsigned bytes) or + 01 (unsigned shorts)
    // outputs:
    // rcx - first mismatched element index
    assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");

    bind(COMPARE_WIDE_VECTORS);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, result, scale));
      pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
    } else {
      pmovzxbw(vec1, Address(str1, result, scale1));
      pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
    }
    // After pcmpestri cnt1(rcx) contains mismatched element index

    jccb(Assembler::below, VECTOR_NOT_EQUAL); // CF==1
    addptr(result, stride);
    subptr(cnt2, stride);
    jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);

    // compare wide vectors tail
    testptr(result, result);
    jcc(Assembler::zero, LENGTH_DIFF_LABEL);

    // Re-run the compare once over the final (possibly overlapping) stride elements
    movl(cnt2, stride);
    movl(result, stride);
    negptr(result);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, result, scale));
      pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
    } else {
      pmovzxbw(vec1, Address(str1, result, scale1));
      pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
    }
    jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);

    // Mismatched characters in the vectors
    bind(VECTOR_NOT_EQUAL);
    addptr(cnt1, result); // convert the in-vector index to an absolute element offset
    load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
    subl(result, cnt2);
    jmpb(POP_LABEL);

    bind(COMPARE_TAIL); // limit is zero
    movl(cnt2, result);
    // Fallthru to tail compare
  }
  // Shift str2 and str1 to the end of the arrays, negate min
  if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
    lea(str1, Address(str1, cnt2, scale));
    lea(str2, Address(str2, cnt2, scale));
  } else {
    lea(str1, Address(str1, cnt2, scale1));
    lea(str2, Address(str2, cnt2, scale2));
  }
  decrementl(cnt2); // first character was compared already
  negptr(cnt2);

  // Compare the rest of the elements
  bind(WHILE_HEAD_LABEL);
  load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
  subl(result, cnt1);
  jccb(Assembler::notZero, POP_LABEL);
  increment(cnt2);
  jccb(Assembler::notZero, WHILE_HEAD_LABEL);

  // Strings are equal up to min length. Return the length difference.
  bind(LENGTH_DIFF_LABEL);
  pop(result); // the length difference pushed at entry
  if (ae == StrIntrinsicNode::UU) {
    // Divide diff by 2 to get number of chars
    sarl(result, 1);
  }
  jmpb(DONE_LABEL);

  if (VM_Version::supports_avx512vlbw()) {

    bind(COMPARE_WIDE_VECTORS_LOOP_FAILED); // reached only from the AVX3 64-byte loop above

    kmovql(cnt1, mask);
    notq(cnt1); // mask had 1s where bytes were equal; now 1s mark mismatches
    bsfq(cnt2, cnt1); // index of the first mismatching byte within the chunk
    if (ae != StrIntrinsicNode::LL) {
      // Divide diff by 2 to get number of chars
      sarl(cnt2, 1);
    }
    addq(result, cnt2);
    if (ae == StrIntrinsicNode::LL) {
      load_unsigned_byte(cnt1, Address(str2, result));
      load_unsigned_byte(result, Address(str1, result));
    } else if (ae == StrIntrinsicNode::UU) {
      load_unsigned_short(cnt1, Address(str2, result, scale));
      load_unsigned_short(result, Address(str1, result, scale));
    } else {
      load_unsigned_short(cnt1, Address(str2, result, scale2));
      load_unsigned_byte(result, Address(str1, result, scale1));
    }
    subl(result, cnt1);
    jmpb(POP_LABEL);
  }//if (VM_Version::supports_avx512vlbw())

  // Discard the stored length difference
  bind(POP_LABEL);
  pop(cnt1);

  // That's it
  bind(DONE_LABEL);
  if(ae == StrIntrinsicNode::UL) {
    negl(result); // NOTE(review): UL results are negated here - presumably the
                  // operands were compared in swapped order; confirm with the
                  // intrinsic expansion in the matcher
  }

}
3947
3948 // Search for Non-ASCII character (Negative byte value) in a byte array,
3949 // return the index of the first such character, otherwise the length
3950 // of the array segment searched.
3951 // ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
3952 // @IntrinsicCandidate
3953 // public static int countPositives(byte[] ba, int off, int len) {
3954 // for (int i = off; i < off + len; i++) {
3955 // if (ba[i] < 0) {
3956 // return i - off;
3957 // }
3958 // }
3959 // return len;
3960 // }
void C2_MacroAssembler::count_positives(Register ary1, Register len,
                                        Register result, Register tmp1,
                                        XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
  // On exit 'result' holds the number of leading non-negative bytes, i.e. the
  // index of the first negative byte, or 'len' if none is found (see the
  // countPositives() pseudo-code above). ary1 and len are clobbered.
  // rsi: byte array
  // rcx: len
  // rax: result
  ShortBranchVerifier sbv(this);
  assert_different_registers(ary1, len, result, tmp1);
  assert_different_registers(vec1, vec2);
  Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;

  movl(result, len); // copy
  // len == 0
  testl(len, len);
  jcc(Assembler::zero, DONE);

  if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
      VM_Version::supports_avx512vlbw() &&
      VM_Version::supports_bmi2()) {

    Label test_64_loop, test_tail, BREAK_LOOP;
    movl(tmp1, len);
    // vec2 = all-zero comparand for the signed-greater-than compares below
    vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);

    andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F
    andl(len, 0xffffffc0); // vector count (in chars)
    jccb(Assembler::zero, test_tail);

    lea(ary1, Address(ary1, len, Address::times_1));
    negptr(len); // loop index runs from -len up to zero

    bind(test_64_loop);
    // Check whether our 64 elements of size byte contain negatives
    evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
    kortestql(mask1, mask1);
    jcc(Assembler::notZero, BREAK_LOOP);

    addptr(len, 64);
    jccb(Assembler::notZero, test_64_loop);

    bind(test_tail);
    // bail out when there is nothing to be done
    testl(tmp1, -1); // tmp1 == 0?
    jcc(Assembler::zero, DONE);


    // check the tail for absence of negatives
    // ~(~0 << len) applied up to two times (for 32-bit scenario)
    {
      Register tmp3_aliased = len;
      mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
      shlxq(tmp3_aliased, tmp3_aliased, tmp1);
      notq(tmp3_aliased);
      // mask2 selects exactly the tmp1 low byte lanes belonging to the tail
      kmovql(mask2, tmp3_aliased);
    }

    evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
    ktestq(mask1, mask2);
    jcc(Assembler::zero, DONE);

    // do a full check for negative registers in the tail
    movl(len, tmp1); // tmp1 holds low 6-bit from original len;
    // ary1 already pointing to the right place
    jmpb(TAIL_START);

    bind(BREAK_LOOP);
    // At least one byte in the last 64 byte block was negative.
    // Set up to look at the last 64 bytes as if they were a tail
    lea(ary1, Address(ary1, len, Address::times_1));
    addptr(result, len);
    // Ignore the very last byte: if all others are positive,
    // it must be negative, so we can skip right to the 2+1 byte
    // end comparison at this point
    orl(result, 63);
    movl(len, 63);
    // Fallthru to tail compare
  } else {

    if (UseAVX >= 2) {
      // With AVX2, use 32-byte vector compare
      Label COMPARE_WIDE_VECTORS, BREAK_LOOP;

      // Compare 32-byte vectors
      testl(len, 0xffffffe0); // vector count (in bytes)
      jccb(Assembler::zero, TAIL_START);

      andl(len, 0xffffffe0);
      lea(ary1, Address(ary1, len, Address::times_1));
      negptr(len);

      movl(tmp1, 0x80808080); // create mask to test for Unicode chars in vector
      movdl(vec2, tmp1);
      vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);

      bind(COMPARE_WIDE_VECTORS);
      vmovdqu(vec1, Address(ary1, len, Address::times_1));
      vptest(vec1, vec2); // checks the sign bit of every byte lane
      jccb(Assembler::notZero, BREAK_LOOP);
      addptr(len, 32);
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);

      testl(result, 0x0000001f); // any bytes remaining?
      jcc(Assembler::zero, DONE);

      // Quick test using the already prepared vector mask
      movl(len, result);
      andl(len, 0x0000001f);
      vmovdqu(vec1, Address(ary1, len, Address::times_1, -32)); // re-reads already-checked bytes; safe
      vptest(vec1, vec2);
      jcc(Assembler::zero, DONE);
      // There are zeros, jump to the tail to determine exactly where
      jmpb(TAIL_START);

      bind(BREAK_LOOP);
      // At least one byte in the last 32-byte vector is negative.
      // Set up to look at the last 32 bytes as if they were a tail
      lea(ary1, Address(ary1, len, Address::times_1));
      addptr(result, len);
      // Ignore the very last byte: if all others are positive,
      // it must be negative, so we can skip right to the 2+1 byte
      // end comparison at this point
      orl(result, 31);
      movl(len, 31);
      // Fallthru to tail compare
    } else if (UseSSE42Intrinsics) {
      // With SSE4.2, use double quad vector compare
      Label COMPARE_WIDE_VECTORS, BREAK_LOOP;

      // Compare 16-byte vectors
      testl(len, 0xfffffff0); // vector count (in bytes)
      jcc(Assembler::zero, TAIL_START);

      andl(len, 0xfffffff0);
      lea(ary1, Address(ary1, len, Address::times_1));
      negptr(len);

      movl(tmp1, 0x80808080); // sign-bit mask, one bit per byte lane
      movdl(vec2, tmp1);
      pshufd(vec2, vec2, 0);

      bind(COMPARE_WIDE_VECTORS);
      movdqu(vec1, Address(ary1, len, Address::times_1));
      ptest(vec1, vec2);
      jccb(Assembler::notZero, BREAK_LOOP);
      addptr(len, 16);
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);

      testl(result, 0x0000000f); // len is zero, any bytes remaining?
      jcc(Assembler::zero, DONE);

      // Quick test using the already prepared vector mask
      movl(len, result);
      andl(len, 0x0000000f); // tail count (in bytes)
      movdqu(vec1, Address(ary1, len, Address::times_1, -16));
      ptest(vec1, vec2);
      jcc(Assembler::zero, DONE);
      jmpb(TAIL_START);

      bind(BREAK_LOOP);
      // At least one byte in the last 16-byte vector is negative.
      // Set up and look at the last 16 bytes as if they were a tail
      lea(ary1, Address(ary1, len, Address::times_1));
      addptr(result, len);
      // Ignore the very last byte: if all others are positive,
      // it must be negative, so we can skip right to the 2+1 byte
      // end comparison at this point
      orl(result, 15);
      movl(len, 15);
      // Fallthru to tail compare
    }
  }

  bind(TAIL_START);
  // Compare 4-byte vectors
  andl(len, 0xfffffffc); // vector count (in bytes)
  jccb(Assembler::zero, COMPARE_CHAR);

  lea(ary1, Address(ary1, len, Address::times_1));
  negptr(len);

  bind(COMPARE_VECTORS);
  movl(tmp1, Address(ary1, len, Address::times_1));
  andl(tmp1, 0x80808080); // keep only the sign bits of the four bytes
  jccb(Assembler::notZero, TAIL_ADJUST);
  addptr(len, 4);
  jccb(Assembler::notZero, COMPARE_VECTORS);

  // Compare trailing char (final 2-3 bytes), if any
  bind(COMPARE_CHAR);

  testl(result, 0x2); // tail char
  jccb(Assembler::zero, COMPARE_BYTE);
  load_unsigned_short(tmp1, Address(ary1, 0));
  andl(tmp1, 0x00008080); // sign bits of the two tail-char bytes
  jccb(Assembler::notZero, CHAR_ADJUST);
  lea(ary1, Address(ary1, 2));

  bind(COMPARE_BYTE);
  testl(result, 0x1); // tail byte
  jccb(Assembler::zero, DONE);
  load_unsigned_byte(tmp1, Address(ary1, 0));
  testl(tmp1, 0x00000080);
  jccb(Assembler::zero, DONE);
  subptr(result, 1); // the last byte is negative: exclude it from the count
  jmpb(DONE);

  bind(TAIL_ADJUST);
  // there are negative bits in the last 4 byte block.
  // Adjust result and check the next three bytes
  addptr(result, len);
  orl(result, 3);
  lea(ary1, Address(ary1, len, Address::times_1));
  jmpb(COMPARE_CHAR);

  bind(CHAR_ADJUST);
  // We are looking at a char + optional byte tail, and found that one
  // of the bytes in the char is negative. Adjust the result, check the
  // first byte and readjust if needed.
  andl(result, 0xfffffffc);
  testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
  jccb(Assembler::notZero, DONE);
  addptr(result, 1);

  // That's it
  bind(DONE);
  if (UseAVX >= 2) {
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    vpxor(vec2, vec2);
  }
}
4192
4193 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
                                      Register limit, Register result, Register chr,
                                      XMMRegister vec1, XMMRegister vec2, bool is_char,
                                      KRegister mask, bool expand_ary2) {
  // Sets 'result' to 1 when the contents compare equal, 0 otherwise.
  //   is_array_equ - compare whole arrays: adds null/identity/length checks
  //                  and skips the array headers; otherwise 'limit' is the
  //                  caller-provided element count.
  //   is_char      - elements are 16-bit chars (limit is scaled to bytes below).
  //   expand_ary2  - ary2 holds bytes that are zero-extended and compared
  //                  against ary1's 16-bit elements (AVX2 only, see assert).
  // for expand_ary2, limit is the (smaller) size of the second array.
  ShortBranchVerifier sbv(this);
  Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;

  assert((!expand_ary2) || ((expand_ary2) && (UseAVX == 2)),
         "Expansion only implemented for AVX2");

  int length_offset = arrayOopDesc::length_offset_in_bytes();
  int base_offset = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);

  // When expanding ary2, ary1 advances two bytes per element while 'limit'
  // counts ary2's single bytes, hence the different scale and increment.
  Address::ScaleFactor scaleFactor = expand_ary2 ? Address::times_2 : Address::times_1;
  int scaleIncr = expand_ary2 ? 8 : 16;

  if (is_array_equ) {
    // Check the input args
    cmpoop(ary1, ary2); // same oop (including both null) is trivially equal
    jcc(Assembler::equal, TRUE_LABEL);

    // Need additional checks for arrays_equals.
    testptr(ary1, ary1);
    jcc(Assembler::zero, FALSE_LABEL);
    testptr(ary2, ary2);
    jcc(Assembler::zero, FALSE_LABEL);

    // Check the lengths
    movl(limit, Address(ary1, length_offset));
    cmpl(limit, Address(ary2, length_offset));
    jcc(Assembler::notEqual, FALSE_LABEL);
  }

  // count == 0
  testl(limit, limit);
  jcc(Assembler::zero, TRUE_LABEL);

  if (is_array_equ) {
    // Load array address
    lea(ary1, Address(ary1, base_offset));
    lea(ary2, Address(ary2, base_offset));
  }

  if (is_array_equ && is_char) {
    // arrays_equals when used for char[].
    shll(limit, 1); // byte count != 0
  }
  movl(result, limit); // copy

  if (UseAVX >= 2) {
    // With AVX2, use 32-byte vector compare
    Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16;

    // Compare 32-byte vectors
    if (expand_ary2) {
      andl(result, 0x0000000f); // tail count (in bytes)
      andl(limit, 0xfffffff0); // vector count (in bytes)
      jcc(Assembler::zero, COMPARE_TAIL);
    } else {
      andl(result, 0x0000001f); // tail count (in bytes)
      andl(limit, 0xffffffe0); // vector count (in bytes)
      jcc(Assembler::zero, COMPARE_TAIL_16);
    }

    lea(ary1, Address(ary1, limit, scaleFactor));
    lea(ary2, Address(ary2, limit, Address::times_1));
    negptr(limit); // loop index runs from -count up to zero

    if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
      Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;

      cmpl(limit, -64);
      jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);

      bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop

      evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
      evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
      kortestql(mask, mask);
      jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare
      addptr(limit, 64); // update since we already compared at this addr
      cmpl(limit, -64);
      jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);

      // At this point we may still need to compare -limit+result bytes.
      // We could execute the next two instruction and just continue via non-wide path:
      // cmpl(limit, 0);
      // jcc(Assembler::equal, COMPARE_TAIL); // true
      // But since we stopped at the points ary{1,2}+limit which are
      // not farther than 64 bytes from the ends of arrays ary{1,2}+result
      // (|limit| <= 32 and result < 32),
      // we may just compare the last 64 bytes.
      //
      addptr(result, -64); // it is safe, bc we just came from this area
      evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
      evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
      kortestql(mask, mask);
      jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare

      jmp(TRUE_LABEL);

      bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);

    }//if (VM_Version::supports_avx512vlbw())

    bind(COMPARE_WIDE_VECTORS);
    vmovdqu(vec1, Address(ary1, limit, scaleFactor));
    if (expand_ary2) {
      vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit);
    } else {
      vmovdqu(vec2, Address(ary2, limit, Address::times_1));
    }
    vpxor(vec1, vec2); // vec1 becomes all-zero iff the chunks are identical

    vptest(vec1, vec1);
    jcc(Assembler::notZero, FALSE_LABEL);
    addptr(limit, scaleIncr * 2);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);

    testl(result, result);
    jcc(Assembler::zero, TRUE_LABEL);

    // Cover the tail with one (possibly overlapping) compare of the last 32 bytes
    vmovdqu(vec1, Address(ary1, result, scaleFactor, -32));
    if (expand_ary2) {
      vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit);
    } else {
      vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
    }
    vpxor(vec1, vec2);

    vptest(vec1, vec1);
    jcc(Assembler::notZero, FALSE_LABEL);
    jmp(TRUE_LABEL);

    bind(COMPARE_TAIL_16); // limit is zero
    movl(limit, result);

    // Compare 16-byte chunks
    andl(result, 0x0000000f); // tail count (in bytes)
    andl(limit, 0xfffffff0); // vector count (in bytes)
    jcc(Assembler::zero, COMPARE_TAIL);

    lea(ary1, Address(ary1, limit, scaleFactor));
    lea(ary2, Address(ary2, limit, Address::times_1));
    negptr(limit);

    bind(COMPARE_WIDE_VECTORS_16);
    movdqu(vec1, Address(ary1, limit, scaleFactor));
    if (expand_ary2) {
      vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit);
    } else {
      movdqu(vec2, Address(ary2, limit, Address::times_1));
    }
    pxor(vec1, vec2);

    ptest(vec1, vec1);
    jcc(Assembler::notZero, FALSE_LABEL);
    addptr(limit, scaleIncr);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16);

    bind(COMPARE_TAIL); // limit is zero
    movl(limit, result);
    // Fallthru to tail compare
  } else if (UseSSE42Intrinsics) {
    // With SSE4.2, use double quad vector compare
    Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;

    // Compare 16-byte vectors
    andl(result, 0x0000000f); // tail count (in bytes)
    andl(limit, 0xfffffff0); // vector count (in bytes)
    jcc(Assembler::zero, COMPARE_TAIL);

    lea(ary1, Address(ary1, limit, Address::times_1));
    lea(ary2, Address(ary2, limit, Address::times_1));
    negptr(limit);

    bind(COMPARE_WIDE_VECTORS);
    movdqu(vec1, Address(ary1, limit, Address::times_1));
    movdqu(vec2, Address(ary2, limit, Address::times_1));
    pxor(vec1, vec2);

    ptest(vec1, vec1);
    jcc(Assembler::notZero, FALSE_LABEL);
    addptr(limit, 16);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);

    testl(result, result);
    jcc(Assembler::zero, TRUE_LABEL);

    // One final (possibly overlapping) 16-byte compare covers the tail
    movdqu(vec1, Address(ary1, result, Address::times_1, -16));
    movdqu(vec2, Address(ary2, result, Address::times_1, -16));
    pxor(vec1, vec2);

    ptest(vec1, vec1);
    jccb(Assembler::notZero, FALSE_LABEL);
    jmpb(TRUE_LABEL);

    bind(COMPARE_TAIL); // limit is zero
    movl(limit, result);
    // Fallthru to tail compare
  }

  // Compare 4-byte vectors
  if (expand_ary2) {
    testl(result, result);
    jccb(Assembler::zero, TRUE_LABEL);
  } else {
    andl(limit, 0xfffffffc); // vector count (in bytes)
    jccb(Assembler::zero, COMPARE_CHAR);
  }

  lea(ary1, Address(ary1, limit, scaleFactor));
  lea(ary2, Address(ary2, limit, Address::times_1));
  negptr(limit);

  bind(COMPARE_VECTORS);
  if (expand_ary2) {
    // There are no "vector" operations for bytes to shorts
    movzbl(chr, Address(ary2, limit, Address::times_1));
    cmpw(Address(ary1, limit, Address::times_2), chr);
    jccb(Assembler::notEqual, FALSE_LABEL);
    addptr(limit, 1);
    jcc(Assembler::notZero, COMPARE_VECTORS);
    jmp(TRUE_LABEL); // every remaining element was compared; no char/byte tail
  } else {
    movl(chr, Address(ary1, limit, Address::times_1));
    cmpl(chr, Address(ary2, limit, Address::times_1));
    jccb(Assembler::notEqual, FALSE_LABEL);
    addptr(limit, 4);
    jcc(Assembler::notZero, COMPARE_VECTORS);
  }

  // Compare trailing char (final 2 bytes), if any
  bind(COMPARE_CHAR);
  testl(result, 0x2); // tail char
  jccb(Assembler::zero, COMPARE_BYTE);
  load_unsigned_short(chr, Address(ary1, 0));
  load_unsigned_short(limit, Address(ary2, 0));
  cmpl(chr, limit);
  jccb(Assembler::notEqual, FALSE_LABEL);

  if (is_array_equ && is_char) {
    bind(COMPARE_BYTE); // char arrays have an even byte count (limit was doubled) - no byte tail
  } else {
    lea(ary1, Address(ary1, 2));
    lea(ary2, Address(ary2, 2));

    bind(COMPARE_BYTE);
    testl(result, 0x1); // tail byte
    jccb(Assembler::zero, TRUE_LABEL);
    load_unsigned_byte(chr, Address(ary1, 0));
    load_unsigned_byte(limit, Address(ary2, 0));
    cmpl(chr, limit);
    jccb(Assembler::notEqual, FALSE_LABEL);
  }
  bind(TRUE_LABEL);
  movl(result, 1); // return true
  jmpb(DONE);

  bind(FALSE_LABEL);
  xorl(result, result); // return false

  // That's it
  bind(DONE);
  if (UseAVX >= 2) {
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    vpxor(vec2, vec2);
  }
}
4465
// Out-of-line slow path for C2_MacroAssembler::convertF2I. It is reached when
// the fast-path truncating conversion produced the sentinel value, meaning the
// source was NaN, infinite or out of range for the target integer type. The
// FP source is spilled to the stack and the matching fixup stub ('target',
// one of StubRoutines::x86::{f2i,d2i,f2l,d2l}_fixup) is called to compute the
// corrected result.
static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) {
#define __ masm.
  Register dst = stub.data<0>();
  XMMRegister src = stub.data<1>();
  address target = stub.data<2>();
  __ bind(stub.entry());
  __ subptr(rsp, 8);                // reserve an 8-byte stack slot for the argument
  __ movdbl(Address(rsp), src);     // pass the FP value in that slot
  __ call(RuntimeAddress(target));
  // APX REX2 encoding for pop(dst) increases the stub size by 1 byte.
  // NOTE(review): assumes the fixup stub writes its result back into the
  // stack slot so the pop below picks it up — confirm against the stubs.
  __ pop(dst);
  __ jmp(stub.continuation());
#undef __
}
4480
// Emit a float/double -> int/long conversion with Java semantics.
// Fast path: a truncating SSE conversion (cvtt*). On special inputs
// (NaN/Inf/out-of-range) the hardware produces the "sign flip" sentinel
// (0x80000000 for int, 0x8000000000000000 for long); the compare below
// detects exactly that value and branches to a C2 code stub that calls the
// matching fixup routine (see convertF2I_slowpath).
void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) {
  assert(dst_bt == T_INT || dst_bt == T_LONG, "");
  assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, "");

  address slowpath_target;
  if (dst_bt == T_INT) {
    if (src_bt == T_FLOAT) {
      cvttss2sil(dst, src);
      cmpl(dst, 0x80000000);                   // sentinel for special float inputs
      slowpath_target = StubRoutines::x86::f2i_fixup();
    } else {
      cvttsd2sil(dst, src);
      cmpl(dst, 0x80000000);                   // sentinel for special double inputs
      slowpath_target = StubRoutines::x86::d2i_fixup();
    }
  } else {
    if (src_bt == T_FLOAT) {
      cvttss2siq(dst, src);
      // double_sign_flip holds the 64-bit sentinel (Long.MIN_VALUE pattern).
      cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
      slowpath_target = StubRoutines::x86::f2l_fixup();
    } else {
      cvttsd2siq(dst, src);
      cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
      slowpath_target = StubRoutines::x86::d2l_fixup();
    }
  }

  // Using the APX extended general purpose registers increases the instruction encoding size by 1 byte.
  int max_size = 23 + (UseAPX ? 1 : 0);
  auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, max_size, convertF2I_slowpath);
  jcc(Assembler::equal, stub->entry());        // sentinel seen -> fix up out of line
  bind(stub->continuation());
}
4514
// Emit an AVX-512 masked vector shift/rotate with an immediate count.
// 'mask' selects the lanes to update; 'merge' is forwarded to the encoder to
// pick merge- vs zero-masking of the unselected lanes. 'eType' is consulted
// only by the rotate helpers, which dispatch on element type internally.
void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
                                    XMMRegister src1, int imm8, bool merge, int vlen_enc) {
  switch(ideal_opc) {
    // Left shifts (short/int/long lanes).
    case Op_LShiftVS:
      Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_LShiftVI:
      Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_LShiftVL:
      Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
    // Arithmetic (sign-propagating) right shifts.
    case Op_RShiftVS:
      Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_RShiftVI:
      Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_RShiftVL:
      Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
    // Logical (zero-filling) right shifts.
    case Op_URShiftVS:
      Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_URShiftVI:
      Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_URShiftVL:
      Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
    // Rotates (element-type-aware helpers).
    case Op_RotateRightV:
      evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_RotateLeftV:
      evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
    default:
      fatal("Unsupported operation %s", NodeClassNames[ideal_opc]);
      break;
  }
}
4545
4546 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4547 XMMRegister src2, bool is_unsigned, bool merge, int vlen_enc) {
4548 if (is_unsigned) {
4549 evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4550 } else {
4551 evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4552 }
4553 }
4554
4555 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4556 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
4557 switch (elem_bt) {
4558 case T_BYTE:
4559 if (ideal_opc == Op_SaturatingAddV) {
4560 evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
4561 } else {
4562 assert(ideal_opc == Op_SaturatingSubV, "");
4563 evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
4564 }
4565 break;
4566 case T_SHORT:
4567 if (ideal_opc == Op_SaturatingAddV) {
4568 evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
4569 } else {
4570 assert(ideal_opc == Op_SaturatingSubV, "");
4571 evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
4572 }
4573 break;
4574 default:
4575 fatal("Unsupported type %s", type2name(elem_bt));
4576 break;
4577 }
4578 }
4579
4580 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4581 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
4582 switch (elem_bt) {
4583 case T_BYTE:
4584 if (ideal_opc == Op_SaturatingAddV) {
4585 evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
4586 } else {
4587 assert(ideal_opc == Op_SaturatingSubV, "");
4588 evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
4589 }
4590 break;
4591 case T_SHORT:
4592 if (ideal_opc == Op_SaturatingAddV) {
4593 evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
4594 } else {
4595 assert(ideal_opc == Op_SaturatingSubV, "");
4596 evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
4597 }
4598 break;
4599 default:
4600 fatal("Unsupported type %s", type2name(elem_bt));
4601 break;
4602 }
4603 }
4604
4605 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4606 Address src2, bool is_unsigned, bool merge, int vlen_enc) {
4607 if (is_unsigned) {
4608 evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4609 } else {
4610 evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4611 }
4612 }
4613
4614 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4615 XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4616 switch (elem_bt) {
4617 case T_BYTE:
4618 if (ideal_opc == Op_SaturatingAddV) {
4619 evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
4620 } else {
4621 assert(ideal_opc == Op_SaturatingSubV, "");
4622 evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
4623 }
4624 break;
4625 case T_SHORT:
4626 if (ideal_opc == Op_SaturatingAddV) {
4627 evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
4628 } else {
4629 assert(ideal_opc == Op_SaturatingSubV, "");
4630 evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
4631 }
4632 break;
4633 default:
4634 fatal("Unsupported type %s", type2name(elem_bt));
4635 break;
4636 }
4637 }
4638
4639 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4640 XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4641 switch (elem_bt) {
4642 case T_BYTE:
4643 if (ideal_opc == Op_SaturatingAddV) {
4644 evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
4645 } else {
4646 assert(ideal_opc == Op_SaturatingSubV, "");
4647 evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
4648 }
4649 break;
4650 case T_SHORT:
4651 if (ideal_opc == Op_SaturatingAddV) {
4652 evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
4653 } else {
4654 assert(ideal_opc == Op_SaturatingSubV, "");
4655 evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
4656 }
4657 break;
4658 default:
4659 fatal("Unsupported type %s", type2name(elem_bt));
4660 break;
4661 }
4662 }
4663
// Emit an AVX-512 masked vector operation with a vector-register second
// operand. 'mask' selects the lanes to update and 'merge' is forwarded to the
// encoder to pick merge- vs zero-masking. 'eType' is consulted only by the
// helpers that dispatch on element type (rearrange, rotate, min/max, logical).
// 'is_varshift' selects the per-lane variable-count shift forms.
void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
                                    XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc,
                                    bool is_varshift) {
  switch (ideal_opc) {
    // Integer and FP add.
    case Op_AddVB:
      evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVS:
      evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVI:
      evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVL:
      evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVF:
      evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVD:
      evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
    // Integer and FP subtract.
    case Op_SubVB:
      evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVS:
      evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVI:
      evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVL:
      evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVF:
      evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVD:
      evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
    // Multiply / divide / sqrt.
    case Op_MulVS:
      evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVI:
      evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVL:
      evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVF:
      evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVD:
      evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_DivVF:
      evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_DivVD:
      evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SqrtVF:
      evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SqrtVD:
      evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break;
    // Unary abs uses only src2 as its input operand.
    case Op_AbsVB:
      evpabsb(dst, mask, src2, merge, vlen_enc); break;
    case Op_AbsVS:
      evpabsw(dst, mask, src2, merge, vlen_enc); break;
    case Op_AbsVI:
      evpabsd(dst, mask, src2, merge, vlen_enc); break;
    case Op_AbsVL:
      evpabsq(dst, mask, src2, merge, vlen_enc); break;
    // Fused multiply-add (213 form: dst = src1 * dst + src2 lane-wise order
    // per the encoder helper).
    case Op_FmaVF:
      evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_FmaVD:
      evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
    // Note: operand order is intentionally swapped for rearrange — src2 is
    // the data vector, src1 the shuffle.
    case Op_VectorRearrange:
      evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break;
    // Shifts; 'is_varshift' picks the per-lane variable-count encoding.
    case Op_LShiftVS:
      evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_LShiftVI:
      evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_LShiftVL:
      evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_RShiftVS:
      evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_RShiftVI:
      evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_RShiftVL:
      evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_URShiftVS:
      evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_URShiftVI:
      evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_URShiftVL:
      evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    // Rotates and min/max dispatch on eType inside the helper.
    case Op_RotateLeftV:
      evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_RotateRightV:
      evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MaxV:
      evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MinV:
      evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_UMinV:
      evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_UMaxV:
      evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    // Bitwise logical ops.
    case Op_XorV:
      evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_OrV:
      evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AndV:
      evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    default:
      fatal("Unsupported operation %s", NodeClassNames[ideal_opc]);
      break;
  }
}
4765
// Emit an AVX-512 masked vector operation with a memory second operand.
// Mirror of the register-register overload above, restricted to the subset of
// operations that accept a memory source. 'mask'/'merge' semantics are the
// same; 'eType' is consulted only by the min/max and logical helpers.
void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
                                    XMMRegister src1, Address src2, bool merge, int vlen_enc) {
  switch (ideal_opc) {
    // Integer and FP add.
    case Op_AddVB:
      evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVS:
      evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVI:
      evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVL:
      evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVF:
      evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVD:
      evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
    // Integer and FP subtract.
    case Op_SubVB:
      evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVS:
      evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVI:
      evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVL:
      evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVF:
      evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVD:
      evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
    // Multiply / divide.
    case Op_MulVS:
      evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVI:
      evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVL:
      evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVF:
      evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVD:
      evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_DivVF:
      evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_DivVD:
      evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
    // Fused multiply-add.
    case Op_FmaVF:
      evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_FmaVD:
      evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
    // Min/max (signed and unsigned) dispatch on eType inside the helper.
    case Op_MaxV:
      evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MinV:
      evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_UMaxV:
      evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_UMinV:
      evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    // Bitwise logical ops.
    case Op_XorV:
      evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_OrV:
      evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AndV:
      evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    default:
      fatal("Unsupported operation %s", NodeClassNames[ideal_opc]);
      break;
  }
}
4830
4831 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
4832 KRegister src1, KRegister src2) {
4833 BasicType etype = T_ILLEGAL;
4834 switch(mask_len) {
4835 case 2:
4836 case 4:
4837 case 8: etype = T_BYTE; break;
4838 case 16: etype = T_SHORT; break;
4839 case 32: etype = T_INT; break;
4840 case 64: etype = T_LONG; break;
4841 default: fatal("Unsupported type"); break;
4842 }
4843 assert(etype != T_ILLEGAL, "");
4844 switch(ideal_opc) {
4845 case Op_AndVMask:
4846 kand(etype, dst, src1, src2); break;
4847 case Op_OrVMask:
4848 kor(etype, dst, src1, src2); break;
4849 case Op_XorVMask:
4850 kxor(etype, dst, src1, src2); break;
4851 default:
4852 fatal("Unsupported masked operation"); break;
4853 }
4854 }
4855
4856 /*
4857 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation.
4858 * If src is NaN, the result is 0.
4859 * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE,
4860 * the result is equal to the value of Integer.MIN_VALUE.
4861 * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE,
4862 * the result is equal to the value of Integer.MAX_VALUE.
4863 */
// AVX fixup pass run after vcvttps2dq: repairs destination lanes holding the
// 0x80000000 sentinel that the conversion produces for special float inputs
// (see the comment block above for the required Java semantics).
void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                   XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
                                                                   Register rscratch, AddressLiteral float_sign_flip,
                                                                   int vec_enc) {
  assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
  Label done;
  // Fast path: if no destination lane equals the sentinel there is nothing to
  // fix up.
  vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch);
  vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
  vptest(xtmp2, xtmp2, vec_enc);
  jccb(Assembler::equal, done);

  // xtmp1 = ~float_sign_flip per lane, i.e. 0x7fffffff (Integer.MAX_VALUE).
  vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc);
  vpxor(xtmp1, xtmp1, xtmp4, vec_enc);

  // Zero the destination lanes whose source is NaN (unordered self-compare).
  vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
  vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc);
  vblendvps(dst, dst, xtmp4, xtmp3, vec_enc);

  // Recompute the mask for remaining special value.
  vpxor(xtmp2, xtmp2, xtmp3, vec_enc);
  // Extract SRC values corresponding to TRUE mask lanes.
  vpand(xtmp4, xtmp2, src, vec_enc);
  // Flip mask bits so that MSB bit of MASK lanes corresponding to +ve special
  // values are set.
  vpxor(xtmp3, xtmp2, xtmp4, vec_enc);

  // Non-negative special lanes get Integer.MAX_VALUE; negative ones keep the
  // sentinel (== Integer.MIN_VALUE) from the fast-path conversion.
  vblendvps(dst, dst, xtmp1, xtmp3, vec_enc);
  bind(done);
}
4893
// EVEX (opmask-based) fixup pass run after vcvttps2dq: repairs destination
// lanes holding the 0x80000000 sentinel produced for special float inputs.
void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                    XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
                                                                    Register rscratch, AddressLiteral float_sign_flip,
                                                                    int vec_enc) {
  assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
  Label done;
  // Fast path: skip the fixup if no destination lane equals the sentinel.
  evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
  Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
  kortestwl(ktmp1, ktmp1);
  jccb(Assembler::equal, done);

  // Zero the destination lanes whose source is NaN (unordered self-compare).
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);

  // ktmp1 = special-but-not-NaN lanes; of those, keep the ones whose source
  // is not-less-than zero (+ve specials). vpternlogd imm 0x11 with both
  // source operands == xtmp1 computes ~xtmp1, i.e. 0x7fffffff
  // (Integer.MAX_VALUE), which is merged into those lanes.
  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}
4915
// EVEX fixup pass run after evcvttps2qq (float -> long): repairs destination
// lanes holding the 64-bit sentinel (double_sign_flip) produced for special
// float inputs. Source compares use the float (ps) forms since src holds
// floats while dst holds longs.
void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
                                                                     Register rscratch, AddressLiteral double_sign_flip,
                                                                     int vec_enc) {
  assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");

  Label done;
  // Fast path: skip the fixup if no destination lane equals the sentinel.
  evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
  Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc);
  kortestwl(ktmp1, ktmp1);
  jccb(Assembler::equal, done);

  // Zero the destination lanes whose source is NaN (unordered self-compare).
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);

  // +ve special lanes get ~double_sign_flip == Long.MAX_VALUE (formed by
  // vpternlogq imm 0x11, a bitwise NOT of xtmp1); negative ones keep the
  // sentinel (== Long.MIN_VALUE).
  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}
4938
// EVEX fixup pass run after vcvttpd2dq (double -> int): repairs destination
// lanes holding the 0x80000000 sentinel produced for special double inputs.
// Source compares use the double (pd) forms; destination moves use dword
// granularity since dst holds ints.
void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
                                                                     Register rscratch, AddressLiteral float_sign_flip,
                                                                     int vec_enc) {
  assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
  Label done;
  // Fast path: skip the fixup if no destination lane equals the sentinel.
  evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
  Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
  kortestwl(ktmp1, ktmp1);
  jccb(Assembler::equal, done);

  // Zero the destination lanes whose source is NaN (unordered self-compare).
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);

  // +ve special lanes get ~float_sign_flip == 0x7fffffff (Integer.MAX_VALUE,
  // formed by vpternlogq imm 0x11, a bitwise NOT of xtmp1).
  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}
4960
4961 /*
4962 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation.
4963 * If src is NaN, the result is 0.
4964 * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE,
4965 * the result is equal to the value of Long.MIN_VALUE.
4966 * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
4967 * the result is equal to the value of Long.MAX_VALUE.
4968 */
// EVEX fixup pass run after evcvttpd2qq (double -> long): repairs destination
// lanes holding the 64-bit sentinel (double_sign_flip) produced for special
// double inputs (see the comment block above for the required semantics).
void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
                                                                      Register rscratch, AddressLiteral double_sign_flip,
                                                                      int vec_enc) {
  assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");

  Label done;
  // Fast path: skip the fixup if no destination lane equals the sentinel.
  // (The dword-granularity load is a full-register move here: k0 + no merge.)
  evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
  evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
  kortestwl(ktmp1, ktmp1);
  jccb(Assembler::equal, done);

  // Zero the destination lanes whose source is NaN (unordered self-compare).
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);

  // +ve special lanes get ~double_sign_flip == Long.MAX_VALUE (formed by
  // vpternlogq imm 0x11, a bitwise NOT of xtmp1); negative ones keep the
  // sentinel (== Long.MIN_VALUE).
  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}
4991
4992 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
4993 XMMRegister xtmp, int index, int vec_enc) {
4994 assert(vec_enc < Assembler::AVX_512bit, "");
4995 if (vec_enc == Assembler::AVX_256bit) {
4996 vextractf128_high(xtmp, src);
4997 vshufps(dst, src, xtmp, index, vec_enc);
4998 } else {
4999 vshufps(dst, src, zero, index, vec_enc);
5000 }
5001 }
5002
// AVX (non-EVEX) fixup pass run after vcvttpd2dq (double -> int). The source
// vector is 'src_vec_enc' wide (doubles) while the packed int result always
// fits in 128 bits, so the double-wide masks are packed down with
// vector_crosslane_doubleword_pack_avx before blending.
void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                                                    XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
                                                                    AddressLiteral float_sign_flip, int src_vec_enc) {
  assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");

  Label done;
  // Compare the destination lanes with float_sign_flip
  // value to get mask for all special values.
  movdqu(xtmp1, float_sign_flip, rscratch);
  vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
  ptest(xtmp2, xtmp2);
  jccb(Assembler::equal, done);

  // Flip float_sign_flip to get max integer value.
  vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
  pxor(xtmp1, xtmp4);

  // Set destination lanes corresponding to unordered source lanes as zero.
  vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
  vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);

  // Shuffle mask vector and pack lower doubleword from each quadword lane.
  vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
  vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);

  // Recompute the mask for remaining special value.
  pxor(xtmp2, xtmp3);
  // Extract mask corresponding to non-negative source lanes.
  vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);

  // Shuffle mask vector and pack lower doubleword from each quadword lane.
  vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
  pand(xtmp3, xtmp2);

  // Replace destination lanes holding special value(0x80000000) with max int
  // if corresponding source lane holds a +ve value.
  vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
  bind(done);
}
5042
5043
// Narrow the int lanes in 'dst' down to short or byte lanes. 'zero' must hold
// an all-zero vector (second pack operand). Each value is first masked to its
// low sub-word bits so the unsigned saturating packs pass it through
// unchanged, then packed; 256-bit vectors additionally need a cross-lane
// doubleword pack because vpackusdw/vpackuswb operate per 128-bit lane.
void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
                                                   XMMRegister xtmp, Register rscratch, int vec_enc) {
  switch(to_elem_bt) {
    case T_SHORT:
      assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
      vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
      vpackusdw(dst, dst, zero, vec_enc);
      if (vec_enc == Assembler::AVX_256bit) {
        vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
      }
      break;
    case T_BYTE:
      assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
      vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
      vpackusdw(dst, dst, zero, vec_enc);
      if (vec_enc == Assembler::AVX_256bit) {
        vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
      }
      // Second pack: words down to bytes.
      vpackuswb(dst, dst, zero, vec_enc);
      break;
    default: assert(false, "Unexpected basic type for target of vector cast int to subword: %s", type2name(to_elem_bt));
  }
}
5067
5068 /*
5069 * Algorithm for vector D2L and F2I conversions (AVX 10.2 unsupported):-
5070 * a) Perform vector D2L/F2I cast.
5071 * b) Choose fast path if none of the result vector lane contains 0x80000000 value.
5072 * It signifies that source value could be any of the special floating point
5073 * values(NaN,-Inf,Inf,Max,-Min).
5074 * c) Set destination to zero if source is NaN value.
5075 * d) Replace 0x80000000 with MaxInt if source lane contains a +ve value.
5076 */
5077
// AVX float -> int/short/byte vector cast: truncating convert, special-value
// fixup (see algorithm comment above), then optional narrowing to sub-word.
void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                           XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
                                           AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
  int to_elem_sz = type2aelembytes(to_elem_bt);
  assert(to_elem_sz <= 4, "");
  vcvttps2dq(dst, src, vec_enc);
  vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc);
  if (to_elem_sz < 4) {
    // Narrowing needs an all-zero vector as the pack operand.
    vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
    vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc);
  }
}
5090
// EVEX float -> int/short/byte vector cast: truncating convert, special-value
// fixup, then narrowing via the evpmov down-converts where needed.
void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                            XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
                                            Register rscratch, int vec_enc) {
  int to_elem_sz = type2aelembytes(to_elem_bt);
  assert(to_elem_sz <= 4, "");
  vcvttps2dq(dst, src, vec_enc);
  vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc);
  switch(to_elem_bt) {
    case T_INT:
      break;                       // already int lanes
    case T_SHORT:
      evpmovdw(dst, dst, vec_enc); // narrow dwords to words
      break;
    case T_BYTE:
      evpmovdb(dst, dst, vec_enc); // narrow dwords to bytes
      break;
    default: assert(false, "Unexpected basic type for target of vector castF2X EVEX: %s", type2name(to_elem_bt));
  }
}
5110
// EVEX float -> long vector cast: truncating convert followed by the
// special-value fixup pass (sentinel is the 64-bit double_sign_flip).
void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                            KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
                                            Register rscratch, int vec_enc) {
  evcvttps2qq(dst, src, vec_enc);
  vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc);
}
5117
5118 // Handling for downcasting from double to integer or sub-word types on AVX2.
// Handling for downcasting from double to integer or sub-word types on AVX2.
// Converts, fixes up special values, then narrows. The packed int result of a
// double source always fits in 128 bits, hence AVX_128bit for the narrowing.
void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                           XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5,
                                           AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
  int to_elem_sz = type2aelembytes(to_elem_bt);
  assert(to_elem_sz < 8, "");
  vcvttpd2dq(dst, src, vec_enc);
  vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch,
                                              float_sign_flip, vec_enc);
  if (to_elem_sz < 4) {
    // xtmp4 holds all zero lanes.
    vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit);
  }
}
5132
// EVEX double -> long/int/short/byte vector cast. With AVX512DQ the
// conversion goes through longs (evcvttpd2qq + long special-case fixup) and
// is then narrowed with signed-saturating down-converts; without DQ it falls
// back to the dword conversion (vcvttpd2dq) plus the int special-case fixup.
void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src,
                                            XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1,
                                            KRegister ktmp2, AddressLiteral sign_flip,
                                            Register rscratch, int vec_enc) {
  if (VM_Version::supports_avx512dq()) {
    evcvttpd2qq(dst, src, vec_enc);
    vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
    switch(to_elem_bt) {
      case T_LONG:
        break;                        // already long lanes
      case T_INT:
        evpmovsqd(dst, dst, vec_enc); // narrow qwords to dwords
        break;
      case T_SHORT:
        evpmovsqd(dst, dst, vec_enc);
        evpmovdw(dst, dst, vec_enc);  // then dwords to words
        break;
      case T_BYTE:
        evpmovsqd(dst, dst, vec_enc);
        evpmovdb(dst, dst, vec_enc);  // then dwords to bytes
        break;
      default: assert(false, "Unexpected basic type for target of vector castD2X AVX512DQ EVEX: %s", type2name(to_elem_bt));
    }
  } else {
    assert(type2aelembytes(to_elem_bt) <= 4, "");
    vcvttpd2dq(dst, src, vec_enc);
    vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
    switch(to_elem_bt) {
      case T_INT:
        break;
      case T_SHORT:
        evpmovdw(dst, dst, vec_enc);
        break;
      case T_BYTE:
        evpmovdb(dst, dst, vec_enc);
        break;
      default: assert(false, "Unexpected basic type for target of vector castD2X EVEX: %s", type2name(to_elem_bt));
    }
  }
}
5173
5174 void C2_MacroAssembler::vector_castF2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vec_enc) {
5175 switch(to_elem_bt) {
5176 case T_LONG:
5177 evcvttps2qqs(dst, src, vec_enc);
5178 break;
5179 case T_INT:
5180 evcvttps2dqs(dst, src, vec_enc);
5181 break;
5182 case T_SHORT:
5183 evcvttps2dqs(dst, src, vec_enc);
5184 evpmovdw(dst, dst, vec_enc);
5185 break;
5186 case T_BYTE:
5187 evcvttps2dqs(dst, src, vec_enc);
5188 evpmovdb(dst, dst, vec_enc);
5189 break;
5190 default: assert(false, "Unexpected basic type for target of vector castF2X AVX10 (reg src): %s", type2name(to_elem_bt));
5191 }
5192 }
5193
// Memory-source flavor of the AVX10.2 float -> integral vector cast above:
// same saturating converts, but the float input comes from an Address.
void C2_MacroAssembler::vector_castF2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, Address src, int vec_enc) {
  switch(to_elem_bt) {
    case T_LONG:
      evcvttps2qqs(dst, src, vec_enc);
      break;
    case T_INT:
      evcvttps2dqs(dst, src, vec_enc);
      break;
    case T_SHORT:
      // Saturating convert to int lanes, then narrow int -> short.
      evcvttps2dqs(dst, src, vec_enc);
      evpmovdw(dst, dst, vec_enc);
      break;
    case T_BYTE:
      // Saturating convert to int lanes, then narrow int -> byte.
      evcvttps2dqs(dst, src, vec_enc);
      evpmovdb(dst, dst, vec_enc);
      break;
    default: assert(false, "Unexpected basic type for target of vector castF2X AVX10 (mem src): %s", type2name(to_elem_bt));
  }
}
5213
// Cast a double vector (register source) to an integral vector using the
// AVX10.2 saturating truncating converts; NaN / out-of-range lanes are
// clamped by the instruction itself, so no fixup pass is emitted.
void C2_MacroAssembler::vector_castD2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vec_enc) {
  switch(to_elem_bt) {
    case T_LONG:
      evcvttpd2qqs(dst, src, vec_enc);
      break;
    case T_INT:
      evcvttpd2dqs(dst, src, vec_enc);
      break;
    case T_SHORT:
      // Saturating convert to int lanes, then narrow int -> short.
      evcvttpd2dqs(dst, src, vec_enc);
      evpmovdw(dst, dst, vec_enc);
      break;
    case T_BYTE:
      // Saturating convert to int lanes, then narrow int -> byte.
      evcvttpd2dqs(dst, src, vec_enc);
      evpmovdb(dst, dst, vec_enc);
      break;
    default: assert(false, "Unexpected basic type for target of vector castD2X AVX10 (reg src): %s", type2name(to_elem_bt));
  }
}
5233
// Memory-source flavor of the AVX10.2 double -> integral vector cast above.
void C2_MacroAssembler::vector_castD2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, Address src, int vec_enc) {
  switch(to_elem_bt) {
    case T_LONG:
      evcvttpd2qqs(dst, src, vec_enc);
      break;
    case T_INT:
      evcvttpd2dqs(dst, src, vec_enc);
      break;
    case T_SHORT:
      // Saturating convert to int lanes, then narrow int -> short.
      evcvttpd2dqs(dst, src, vec_enc);
      evpmovdw(dst, dst, vec_enc);
      break;
    case T_BYTE:
      // Saturating convert to int lanes, then narrow int -> byte.
      evcvttpd2dqs(dst, src, vec_enc);
      evpmovdb(dst, dst, vec_enc);
      break;
    default: assert(false, "Unexpected basic type for target of vector castD2X AVX10 (mem src): %s", type2name(to_elem_bt));
  }
}
5253
5254 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src,
5255 AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5256 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
5257 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf.
5258 // and re-instantiate original MXCSR.RC mode after that.
5259 ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5260
5261 mov64(tmp, julong_cast(0.5L));
5262 evpbroadcastq(xtmp1, tmp, vec_enc);
5263 vaddpd(xtmp1, src , xtmp1, vec_enc);
5264 evcvtpd2qq(dst, xtmp1, vec_enc);
5265 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5266 double_sign_flip, vec_enc);;
5267
5268 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5269 }
5270
// Vector Math.round for floats on EVEX targets: floor(src + 0.5f) computed
// with MXCSR.RC temporarily set to round-towards -inf (new_mxcsr), then the
// standard MXCSR is restored. Special lanes (NaN, out of int range) are
// patched by vector_cast_float_to_int_special_cases_evex.
void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src,
                                                AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
                                                Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
  // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf.
  // and re-instantiate original MXCSR.RC mode after that.
  ldmxcsr(new_mxcsr, tmp /*rscratch*/);

  // Materialize 0.5f in a GPR, move it to the low lane, broadcast to all lanes.
  movl(tmp, jint_cast(0.5));
  movq(xtmp1, tmp);
  vbroadcastss(xtmp1, xtmp1, vec_enc);
  vaddps(xtmp1, src , xtmp1, vec_enc);
  vcvtps2dq(dst, xtmp1, vec_enc);
  // Fix up NaN and out-of-range lanes produced by the conversion.
  vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
                                              float_sign_flip, vec_enc);

  ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
}
5288
// AVX (non-EVEX) variant of vector Math.round for floats: same
// floor(src + 0.5f) under round-towards -inf strategy, but the special-case
// fixup uses four XMM temporaries instead of mask registers.
void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src,
                                               AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
                                               Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) {
  // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf.
  // and re-instantiate original MXCSR.RC mode after that.
  ldmxcsr(new_mxcsr, tmp /*rscratch*/);

  // Materialize 0.5f in a GPR, move it to the low lane, broadcast to all lanes.
  movl(tmp, jint_cast(0.5));
  movq(xtmp1, tmp);
  vbroadcastss(xtmp1, xtmp1, vec_enc);
  vaddps(xtmp1, src , xtmp1, vec_enc);
  vcvtps2dq(dst, xtmp1, vec_enc);
  // Fix up NaN and out-of-range lanes produced by the conversion.
  vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc);

  ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
}
5305
// Widening zero-extending cast between integral vector element types.
// Only widening combinations (byte->short/int/long, short->int/long,
// int->long) are supported; anything else is a caller error.
void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
                                             BasicType from_elem_bt, BasicType to_elem_bt) {
  switch (from_elem_bt) {
    case T_BYTE:
      switch (to_elem_bt) {
        case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break;
        case T_INT: vpmovzxbd(dst, src, vlen_enc); break;
        case T_LONG: vpmovzxbq(dst, src, vlen_enc); break;
        default: ShouldNotReachHere();
      }
      break;
    case T_SHORT:
      switch (to_elem_bt) {
        case T_INT: vpmovzxwd(dst, src, vlen_enc); break;
        case T_LONG: vpmovzxwq(dst, src, vlen_enc); break;
        default: ShouldNotReachHere();
      }
      break;
    case T_INT:
      assert(to_elem_bt == T_LONG, "");
      vpmovzxdq(dst, src, vlen_enc);
      break;
    default:
      ShouldNotReachHere();
  }
}
5332
// Widening sign-extending cast between integral vector element types.
// Mirror of vector_unsigned_cast using the vpmovsx* family.
void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
                                           BasicType from_elem_bt, BasicType to_elem_bt) {
  switch (from_elem_bt) {
    case T_BYTE:
      switch (to_elem_bt) {
        case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break;
        case T_INT: vpmovsxbd(dst, src, vlen_enc); break;
        case T_LONG: vpmovsxbq(dst, src, vlen_enc); break;
        default: ShouldNotReachHere();
      }
      break;
    case T_SHORT:
      switch (to_elem_bt) {
        case T_INT: vpmovsxwd(dst, src, vlen_enc); break;
        case T_LONG: vpmovsxwq(dst, src, vlen_enc); break;
        default: ShouldNotReachHere();
      }
      break;
    case T_INT:
      assert(to_elem_bt == T_LONG, "");
      vpmovsxdq(dst, src, vlen_enc);
      break;
    default:
      ShouldNotReachHere();
  }
}
5359
// Cast a boolean-style vector mask (lanes are all-0 / all-1) between element
// sizes without mask registers (non-AVX512 path; AVX_512bit is asserted out).
// Widening uses sign extension (preserves the all-ones lanes); narrowing uses
// signed saturating packs, which map 0/-1 lanes to 0/-1 lanes. For 256-bit
// vectors the packs operate per 128-bit lane, so a vpermq with selector 0x08
// is needed to compact the valid quadwords into the low half.
void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src,
                                         BasicType dst_bt, BasicType src_bt, int vlen) {
  int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen);
  assert(vlen_enc != AVX_512bit, "");

  int dst_bt_size = type2aelembytes(dst_bt);
  int src_bt_size = type2aelembytes(src_bt);
  if (dst_bt_size > src_bt_size) {
    // Widening: sign-extend by the size ratio (2x/4x/8x).
    switch (dst_bt_size / src_bt_size) {
      case 2: vpmovsxbw(dst, src, vlen_enc); break;
      case 4: vpmovsxbd(dst, src, vlen_enc); break;
      case 8: vpmovsxbq(dst, src, vlen_enc); break;
      default: ShouldNotReachHere();
    }
  } else {
    assert(dst_bt_size < src_bt_size, "");
    // Narrowing: saturating pack once per factor of two, compacting 256-bit
    // results with vpermq after the first lane-local pack.
    switch (src_bt_size / dst_bt_size) {
      case 2: {
        if (vlen_enc == AVX_128bit) {
          vpacksswb(dst, src, src, vlen_enc);
        } else {
          vpacksswb(dst, src, src, vlen_enc);
          vpermq(dst, dst, 0x08, vlen_enc);
        }
        break;
      }
      case 4: {
        if (vlen_enc == AVX_128bit) {
          vpackssdw(dst, src, src, vlen_enc);
          vpacksswb(dst, dst, dst, vlen_enc);
        } else {
          vpackssdw(dst, src, src, vlen_enc);
          vpermq(dst, dst, 0x08, vlen_enc);
          vpacksswb(dst, dst, dst, AVX_128bit);
        }
        break;
      }
      case 8: {
        if (vlen_enc == AVX_128bit) {
          // vpshufd 0x08 pre-compacts the even dwords of each qword lane.
          vpshufd(dst, src, 0x08, vlen_enc);
          vpackssdw(dst, dst, dst, vlen_enc);
          vpacksswb(dst, dst, dst, vlen_enc);
        } else {
          vpshufd(dst, src, 0x08, vlen_enc);
          vpermq(dst, dst, 0x08, vlen_enc);
          vpackssdw(dst, dst, dst, AVX_128bit);
          vpacksswb(dst, dst, dst, AVX_128bit);
        }
        break;
      }
      default: ShouldNotReachHere();
    }
  }
}
5414
5415 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
5416 bool merge, BasicType bt, int vlen_enc) {
5417 if (bt == T_INT) {
5418 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5419 } else {
5420 assert(bt == T_LONG, "");
5421 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5422 }
5423 }
5424
5425 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
5426 bool merge, BasicType bt, int vlen_enc) {
5427 if (bt == T_INT) {
5428 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5429 } else {
5430 assert(bt == T_LONG, "");
5431 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5432 }
5433 }
5434
// Expand a bitmask held in GPR 'src' into a byte-per-lane vector mask in
// 'dst': bit i of src becomes byte i of dst (0x00 or 0x01), 8 bits at a time
// via PDEP against the 0x0101010101010101 selector. mask_len must be a
// multiple of 8; masks wider than 64 bytes are built 128-bit chunk by chunk
// through xtmp and inserted with vinsertf128.
void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
                                               Register rtmp2, XMMRegister xtmp, int mask_len,
                                               int vec_enc) {
  int index = 0;
  int vindex = 0;
  // Deposit the low 8 mask bits into 8 bytes (one bit per byte).
  mov64(rtmp1, 0x0101010101010101L);
  pdepq(rtmp1, src, rtmp1);
  if (mask_len > 8) {
    // Keep a shiftable copy of the source bits for the remaining chunks.
    movq(rtmp2, src);
    vpxor(xtmp, xtmp, xtmp, vec_enc);
    movq(xtmp, rtmp1);
  }
  movq(dst, rtmp1);

  mask_len -= 8;
  while (mask_len > 0) {
    assert ((mask_len & 0x7) == 0, "mask must be multiple of 8");
    index++;
    if ((index % 2) == 0) {
      // Starting a fresh 128-bit chunk; clear the staging register.
      pxor(xtmp, xtmp);
    }
    mov64(rtmp1, 0x0101010101010101L);
    shrq(rtmp2, 8);
    pdepq(rtmp1, rtmp2, rtmp1);
    pinsrq(xtmp, rtmp1, index % 2);
    vindex = index / 2;
    if (vindex) {
      // Write entire 16 byte vector when both 64 bit
      // lanes are update to save redundant instructions.
      if (index % 2) {
        vinsertf128(dst, dst, xtmp, vindex);
      }
    } else {
      vmovdqu(dst, xtmp);
    }
    mask_len -= 8;
  }
}
5473
// Reduce a scalar mask (bitmask in 'tmp', one bit per lane, masklen lanes)
// according to the VectorMask operation 'opc', leaving the result in 'dst'.
// Clobbers 'tmp'.
void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
  switch(opc) {
    case Op_VectorMaskTrueCount:
      // Number of set lanes == population count of the bitmask.
      popcntq(dst, tmp);
      break;
    case Op_VectorMaskLastTrue:
      // Index of highest set bit, or -1 if the mask is empty.
      if (VM_Version::supports_lzcnt()) {
        lzcntq(tmp, tmp);
        movl(dst, 63);
        subl(dst, tmp);   // 63 - lzcnt == bit index; empty mask yields 63-64 = -1
      } else {
        movl(dst, -1);
        bsrq(tmp, tmp);   // BSR leaves dst's -1 intact when the source is zero
        cmov32(Assembler::notZero, dst, tmp);
      }
      break;
    case Op_VectorMaskFirstTrue:
      // Index of lowest set bit; an empty mask must return masklen, which is
      // arranged either by planting a sentinel bit at position masklen
      // (masklen < 32) or by relying on tzcnt/bsf zero-source behavior.
      if (VM_Version::supports_bmi1()) {
        if (masklen < 32) {
          orl(tmp, 1 << masklen);   // sentinel: empty mask -> masklen
          tzcntl(dst, tmp);
        } else if (masklen == 32) {
          tzcntl(dst, tmp);         // tzcnt of zero is 32 == masklen
        } else {
          assert(masklen == 64, "");
          tzcntq(dst, tmp);         // tzcnt of zero is 64 == masklen
        }
      } else {
        if (masklen < 32) {
          orl(tmp, 1 << masklen);
          bsfl(dst, tmp);
        } else {
          assert(masklen == 32 || masklen == 64, "");
          movl(dst, masklen);       // default for the all-zero case
          if (masklen == 32) {
            bsfl(tmp, tmp);
          } else {
            bsfq(tmp, tmp);
          }
          cmov32(Assembler::notZero, dst, tmp);
        }
      }
      break;
    case Op_VectorMaskToLong:
      // The bitmask itself is the result; caller must have routed it to dst.
      assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
      break;
    default: assert(false, "Unhandled mask operation");
  }
}
5523
// VectorMask reduction for an AVX512 opmask source: moves the opmask bits
// into GPR 'tmp', clips stray high bits for partial masks, and delegates to
// vector_mask_operation_helper.
void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
                                              int masklen, int masksize, int vec_enc) {
  assert(VM_Version::supports_popcnt(), "");

  if(VM_Version::supports_avx512bw()) {
    kmovql(tmp, mask);
  } else {
    // Without AVX512BW only 16-bit opmask moves are available.
    assert(masklen <= 16, "");
    kmovwl(tmp, mask);
  }

  // Mask generated out of partial vector comparisons/replicate/mask manipulation
  // operations needs to be clipped.
  if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
    andq(tmp, (1 << masklen) - 1);
  }

  vector_mask_operation_helper(opc, dst, tmp, masklen);
}
5543
// VectorMask reduction for a vector-register mask source (non-opmask path):
// extracts one bit per lane into GPR 'tmp' with the movmsk family, clips
// stray bits for partial masks, then delegates to the scalar helper.
void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
                                              Register tmp, int masklen, BasicType bt, int vec_enc) {
  assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) ||
         (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), "");
  assert(VM_Version::supports_popcnt(), "");

  bool need_clip = false;
  switch(bt) {
    case T_BOOLEAN:
      // While masks of other types contain 0, -1; boolean masks contain lane values of 0, 1
      // Negate (0 - lane) to turn 0/1 lanes into 0/-1 so vpmovmskb sees the sign bit.
      vpxor(xtmp, xtmp, xtmp, vec_enc);
      vpsubb(xtmp, xtmp, mask, vec_enc);
      vpmovmskb(tmp, xtmp, vec_enc);
      need_clip = masklen < 16;
      break;
    case T_BYTE:
      vpmovmskb(tmp, mask, vec_enc);
      need_clip = masklen < 16;
      break;
    case T_SHORT:
      // Narrow short lanes to bytes first so vpmovmskb yields one bit per lane.
      vpacksswb(xtmp, mask, mask, vec_enc);
      if (masklen >= 16) {
        // 256-bit pack is lane-local; compact valid quadwords to the low 128 bits.
        vpermpd(xtmp, xtmp, 8, vec_enc);
      }
      vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
      need_clip = masklen < 16;
      break;
    case T_INT:
    case T_FLOAT:
      vmovmskps(tmp, mask, vec_enc);
      need_clip = masklen < 4;
      break;
    case T_LONG:
    case T_DOUBLE:
      vmovmskpd(tmp, mask, vec_enc);
      need_clip = masklen < 2;
      break;
    default: assert(false, "Unhandled type, %s", type2name(bt));
  }

  // Mask generated out of partial vector comparisons/replicate/mask manipulation
  // operations needs to be clipped.
  if (need_clip && opc != Op_VectorMaskFirstTrue) {
    // need_clip implies masklen < 32
    andq(tmp, (1 << masklen) - 1);
  }

  vector_mask_operation_helper(opc, dst, tmp, masklen);
}
5593
// Compress an opmask: produce a mask with popcount(src) low bits set.
// Implemented by clipping src to mask_len bits and PEXT-ing an all-ones
// pattern through it, which gathers one result bit per set source bit.
void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
                                             Register rtmp2, int mask_len) {
  kmov(rtmp1, src);
  // Discard bits above mask_len (mask_len in [1,64]; shift of 0 is fine).
  andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));
  mov64(rtmp2, -1L);
  pextq(rtmp2, rtmp2, rtmp1);
  kmov(dst, rtmp2);
}
5602
// AVX2 (no AVX512 compress/expand instructions) implementation of CompressV /
// ExpandV for 4- and 8-byte lanes. The lane mask is turned into a movmsk
// bitmask which indexes a 32-byte row of a precomputed permute table; the row
// both permutes the lanes into place and (via its -1 default entries used as
// a blend mask) zeroes the unused lanes.
void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src,
                                                    XMMRegister mask, Register rtmp, Register rscratch,
                                                    XMMRegister permv, XMMRegister xtmp, BasicType bt,
                                                    int vec_enc) {
  assert(type2aelembytes(bt) >= 4, "");
  assert(opcode == Op_CompressV || opcode == Op_ExpandV, "");
  address compress_perm_table = nullptr;
  address expand_perm_table = nullptr;
  if (type2aelembytes(bt) == 8) {
    compress_perm_table = StubRoutines::x86::compress_perm_table64();
    expand_perm_table = StubRoutines::x86::expand_perm_table64();
    vmovmskpd(rtmp, mask, vec_enc);
  } else {
    compress_perm_table = StubRoutines::x86::compress_perm_table32();
    expand_perm_table = StubRoutines::x86::expand_perm_table32();
    vmovmskps(rtmp, mask, vec_enc);
  }
  shlq(rtmp, 5); // for 32 byte permute row.
  if (opcode == Op_CompressV) {
    lea(rscratch, ExternalAddress(compress_perm_table));
  } else {
    lea(rscratch, ExternalAddress(expand_perm_table));
  }
  addptr(rtmp, rscratch);
  vmovdqu(permv, Address(rtmp));
  vpermps(dst, permv, src, Assembler::AVX_256bit);
  vpxor(xtmp, xtmp, xtmp, vec_enc);
  // Blend the result with zero vector using permute mask, each column entry
  // in a permute table row contains either a valid permute index or a -1 (default)
  // value, this can potentially be used as a blending mask after
  // compressing/expanding the source vector lanes.
  vblendvps(dst, dst, xtmp, permv, vec_enc, true, permv);
}
5636
// AVX512 CompressV/ExpandV: dispatch to the type-specific evpcompress*/
// evpexpand* (or ps/pd) instruction under opmask 'mask', with merge/zero
// semantics selected by 'merge'.
void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
                                               bool merge, BasicType bt, int vec_enc) {
  if (opcode == Op_CompressV) {
    switch(bt) {
    case T_BYTE:
      evpcompressb(dst, mask, src, merge, vec_enc);
      break;
    case T_CHAR:
    case T_SHORT:
      evpcompressw(dst, mask, src, merge, vec_enc);
      break;
    case T_INT:
      evpcompressd(dst, mask, src, merge, vec_enc);
      break;
    case T_FLOAT:
      evcompressps(dst, mask, src, merge, vec_enc);
      break;
    case T_LONG:
      evpcompressq(dst, mask, src, merge, vec_enc);
      break;
    case T_DOUBLE:
      evcompresspd(dst, mask, src, merge, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
    }
  } else {
    assert(opcode == Op_ExpandV, "");
    switch(bt) {
    case T_BYTE:
      evpexpandb(dst, mask, src, merge, vec_enc);
      break;
    case T_CHAR:
    case T_SHORT:
      evpexpandw(dst, mask, src, merge, vec_enc);
      break;
    case T_INT:
      evpexpandd(dst, mask, src, merge, vec_enc);
      break;
    case T_FLOAT:
      evexpandps(dst, mask, src, merge, vec_enc);
      break;
    case T_LONG:
      evpexpandq(dst, mask, src, merge, vec_enc);
      break;
    case T_DOUBLE:
      evexpandpd(dst, mask, src, merge, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
    }
  }
}
5692
// Vector Math.signum for float/double on EVEX targets: produces -1.0 for
// negative lanes, 1.0 for positive lanes, and passes NaN, -0.0 and 0.0
// through unchanged. 'zero' and 'one' hold broadcast constants; ktmp1 is a
// scratch opmask.
void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
                                           KRegister ktmp1, int vec_enc) {
  if (opcode == Op_SignumVD) {
    vsubpd(dst, zero, one, vec_enc);   // dst = -1.0 in every lane
    // if src < 0 ? -1 : 1
    evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
    evblendmpd(dst, ktmp1, one, dst, true, vec_enc);
    // if src == NaN, -0.0 or 0.0 return src.
    evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
    evblendmpd(dst, ktmp1, dst, src, true, vec_enc);
  } else {
    assert(opcode == Op_SignumVF, "");
    vsubps(dst, zero, one, vec_enc);   // dst = -1.0f in every lane
    // if src < 0 ? -1 : 1
    evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
    evblendmps(dst, ktmp1, one, dst, true, vec_enc);
    // if src == NaN, -0.0 or 0.0 return src.
    evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
    evblendmps(dst, ktmp1, dst, src, true, vec_enc);
  }
}
5714
// AVX (non-EVEX) variant of vector Math.signum: same -1/1/pass-through
// semantics as vector_signum_evex, built from sign-bit driven vblendv
// instead of opmask blends.
void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
                                          XMMRegister xtmp1, int vec_enc) {
  if (opcode == Op_SignumVD) {
    vsubpd(dst, zero, one, vec_enc);   // dst = -1.0 in every lane
    // if src < 0 ? -1 : 1
    vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1);   // selects by src's sign bit
    // if src == NaN, -0.0 or 0.0 return src.
    vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
    vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
  } else {
    assert(opcode == Op_SignumVF, "");
    vsubps(dst, zero, one, vec_enc);   // dst = -1.0f in every lane
    // if src < 0 ? -1 : 1
    vblendvps(dst, one, dst, src, vec_enc, true, xtmp1);   // selects by src's sign bit
    // if src == NaN, -0.0 or 0.0 return src.
    vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
    vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
  }
}
5734
// Build an opmask with the low mask_len bits taken from GPR 'src' and all
// higher bits cleared: move the widest supported chunk, then right-shift the
// opmask to drop the excess bits.
void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
  if (VM_Version::supports_avx512bw()) {
    if (mask_len > 32) {
      kmovql(dst, src);
    } else {
      kmovdl(dst, src);
      if (mask_len != 32) {
        kshiftrdl(dst, dst, 32 - mask_len);
      }
    }
  } else {
    // Without AVX512BW only 16-bit opmask moves/shifts exist.
    assert(mask_len <= 16, "");
    kmovwl(dst, src);
    if (mask_len != 16) {
      kshiftrwl(dst, dst, 16 - mask_len);
    }
  }
}
5753
5754 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
5755 int lane_size = type2aelembytes(bt);
5756 if ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
5757 (is_subword_type(bt) && VM_Version::supports_avx512vlbw())) {
5758 movptr(rtmp, imm32);
5759 switch(lane_size) {
5760 case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
5761 case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
5762 case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
5763 case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
5764 fatal("Unsupported lane size %d", lane_size);
5765 break;
5766 }
5767 } else {
5768 movptr(rtmp, imm32);
5769 movq(dst, rtmp);
5770 switch(lane_size) {
5771 case 1 : vpbroadcastb(dst, dst, vec_enc); break;
5772 case 2 : vpbroadcastw(dst, dst, vec_enc); break;
5773 case 4 : vpbroadcastd(dst, dst, vec_enc); break;
5774 case 8 : vpbroadcastq(dst, dst, vec_enc); break;
5775 fatal("Unsupported lane size %d", lane_size);
5776 break;
5777 }
5778 }
5779 }
5780
5781 //
5782 // Following is lookup table based popcount computation algorithm:-
5783 // Index Bit set count
5784 // [ 0000 -> 0,
5785 // 0001 -> 1,
5786 // 0010 -> 1,
5787 // 0011 -> 2,
5788 // 0100 -> 1,
5789 // 0101 -> 2,
5790 // 0110 -> 2,
5791 // 0111 -> 3,
5792 // 1000 -> 1,
5793 // 1001 -> 2,
5794 // 1010 -> 3,
5795 // 1011 -> 3,
5796 // 1100 -> 2,
5797 // 1101 -> 3,
5798 // 1111 -> 4 ]
5799 // a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as
5800 // shuffle indices for lookup table access.
5801 // b. Right shift each byte of vector lane by 4 positions.
5802 // c. Count the number of 1s in 4 MSB bits each byte. These bits are used as
5803 // shuffle indices for lookup table access.
5804 // d. Add the bitset count of upper and lower 4 bits of each byte.
5805 // e. Unpack double words to quad words and compute sum of absolute difference of bitset
5806 // count of all the bytes of a quadword.
5807 // f. Perform step e. for upper 128bit vector lane.
5808 // g. Pack the bitset count of quadwords back to double word.
5809 // h. Unpacking and packing operations are not needed for 64bit vector lane.
5810
// Per-byte population count via the nibble lookup table described above:
// popcount(byte) = LUT[low nibble] + LUT[high nibble].
// Clobbers xtmp1, xtmp2 and rtmp.
void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
  assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
  vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);
  // dst = high nibble of each byte, xtmp1 = low nibble of each byte.
  vpsrlw(dst, src, 4, vec_enc);
  vpand(dst, dst, xtmp1, vec_enc);
  vpand(xtmp1, src, xtmp1, vec_enc);
  // Table lookup (vpshufb) of the bit counts for both nibbles, then sum.
  vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg);
  vpshufb(xtmp1, xtmp2, xtmp1, vec_enc);
  vpshufb(dst, xtmp2, dst, vec_enc);
  vpaddb(dst, dst, xtmp1, vec_enc);
}
5823
// Per-int population count: byte-wise popcount, then horizontal sums of the
// four bytes of each dword via unpack + vpsadbw, repacked to dwords
// (steps e-h of the algorithm comment above).
void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                            XMMRegister xtmp2, Register rtmp, int vec_enc) {
  vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
  // Following code is as per steps e,f,g and h of above algorithm.
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  vpunpckhdq(dst, xtmp1, xtmp2, vec_enc);
  vpsadbw(dst, dst, xtmp2, vec_enc);       // sum bytes of each high qword half
  vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc);
  vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc);   // sum bytes of each low qword half
  vpackuswb(dst, xtmp1, dst, vec_enc);     // pack qword sums back to dwords
}
5835
// Per-short population count: byte-wise popcount, then add each word's upper
// byte count into its lower byte.
void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
  vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
  // Add the popcount of upper and lower bytes of word.
  vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc);
  vpsrlw(dst, xtmp1, 8, vec_enc);          // upper byte counts, shifted down
  vpand(xtmp1, xtmp1, xtmp2, vec_enc);     // isolate lower byte counts
  vpaddw(dst, dst, xtmp1, vec_enc);
}
5845
// Per-long population count: byte-wise popcount followed by vpsadbw against
// zero, which sums the eight byte counts of each quadword in one step.
void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
  vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  vpsadbw(dst, xtmp1, xtmp2, vec_enc);
}
5852
5853 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5854 XMMRegister xtmp2, Register rtmp, int vec_enc) {
5855 switch(bt) {
5856 case T_LONG:
5857 vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5858 break;
5859 case T_INT:
5860 vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5861 break;
5862 case T_CHAR:
5863 case T_SHORT:
5864 vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5865 break;
5866 case T_BYTE:
5867 case T_BOOLEAN:
5868 vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5869 break;
5870 default:
5871 fatal("Unsupported type %s", type2name(bt));
5872 break;
5873 }
5874 }
5875
// Hardware popcount dispatch for AVX512 targets: uses VPOPCNTD/Q
// (AVX512_VPOPCNTDQ) for int/long lanes and VPOPCNTB/W (AVX512_BITALG) for
// byte/short lanes, under opmask 'mask' with merge/zero semantics.
void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
                                                      KRegister mask, bool merge, int vec_enc) {
  assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
  switch(bt) {
    case T_LONG:
      assert(VM_Version::supports_avx512_vpopcntdq(), "");
      evpopcntq(dst, mask, src, merge, vec_enc);
      break;
    case T_INT:
      assert(VM_Version::supports_avx512_vpopcntdq(), "");
      evpopcntd(dst, mask, src, merge, vec_enc);
      break;
    case T_CHAR:
    case T_SHORT:
      assert(VM_Version::supports_avx512_bitalg(), "");
      evpopcntw(dst, mask, src, merge, vec_enc);
      break;
    case T_BYTE:
    case T_BOOLEAN:
      assert(VM_Version::supports_avx512_bitalg(), "");
      evpopcntb(dst, mask, src, merge, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
}
5903
5904 // Bit reversal algorithm first reverses the bits of each byte followed by
5905 // a byte level reversal for multi-byte primitive types (short/int/long).
5906 // Algorithm performs a lookup table access to get reverse bit sequence
5907 // corresponding to a 4 bit value. Thus a reverse bit sequence for a byte
5908 // is obtained by swapping the reverse bit sequences of upper and lower
5909 // nibble of a byte.
// Reverse the bits of each element (see algorithm comment above). Three
// strategies by CPU feature: LUT-based with EVEX logic ops (AVX512VL+BW),
// shift-based swaps for plain 512-bit EVEX, and LUT-based with VEX logic ops
// otherwise. Clobbers xtmp1, xtmp2, rtmp.
void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                           XMMRegister xtmp2, Register rtmp, int vec_enc) {
  if (VM_Version::supports_avx512vlbw()) {

    // Get the reverse bit sequence of lower nibble of each byte.
    vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
    vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
    evpandq(dst, xtmp2, src, vec_enc);
    vpshufb(dst, xtmp1, dst, vec_enc);
    vpsllq(dst, dst, 4, vec_enc);

    // Get the reverse bit sequence of upper nibble of each byte.
    vpandn(xtmp2, xtmp2, src, vec_enc);
    vpsrlq(xtmp2, xtmp2, 4, vec_enc);
    vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);

    // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
    // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
    evporq(xtmp2, dst, xtmp2, vec_enc);
    vector_reverse_byte(bt, dst, xtmp2, vec_enc);

  } else if(vec_enc == Assembler::AVX_512bit) {
    // Shift based bit reversal.
    assert(bt == T_LONG || bt == T_INT, "");

    // Swap lower and upper nibble of each byte.
    vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc);

    // Swap two least and most significant bits of each nibble.
    vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc);

    // Swap adjacent pair of bits.
    evmovdqul(xtmp1, k0, dst, true, vec_enc);
    vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc);

    evmovdqul(xtmp1, k0, dst, true, vec_enc);
    vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);
  } else {
    vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp);
    vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);

    // Get the reverse bit sequence of lower nibble of each byte.
    vpand(dst, xtmp2, src, vec_enc);
    vpshufb(dst, xtmp1, dst, vec_enc);
    vpsllq(dst, dst, 4, vec_enc);

    // Get the reverse bit sequence of upper nibble of each byte.
    vpandn(xtmp2, xtmp2, src, vec_enc);
    vpsrlq(xtmp2, xtmp2, 4, vec_enc);
    vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);

    // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
    // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
    vpor(xtmp2, dst, xtmp2, vec_enc);
    vector_reverse_byte(bt, dst, xtmp2, vec_enc);
  }
}
5967
// Per-byte bit reversal in a single GF2P8AFFINEQB against the broadcast
// bit-reversal matrix in 'mask', followed by the element-size byte shuffle.
void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc,
                                                XMMRegister xtmp, Register rscratch) {
  assert(VM_Version::supports_gfni(), "");
  assert(rscratch != noreg || always_reachable(mask), "missing");

  // Galois field instruction based bit reversal based on following algorithm.
  // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
  vpbroadcastq(xtmp, mask, vec_enc, rscratch);
  vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
  vector_reverse_byte(bt, dst, xtmp, vec_enc);
}
5979
// Swap adjacent nbits-wide bit groups in every qword lane:
// dst = ((src & bitmask) << nbits) | ((src & ~bitmask) >> nbits).
// bitmask selects the low group of each pair (e.g. 0x55555555 for nbits=1).
void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
                                          XMMRegister xtmp1, Register rtmp, int vec_enc) {
  vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
  evpandq(dst, xtmp1, src, vec_enc);
  vpsllq(dst, dst, nbits, vec_enc);
  vpandn(xtmp1, xtmp1, src, vec_enc);
  vpsrlq(xtmp1, xtmp1, nbits, vec_enc);
  evporq(dst, dst, xtmp1, vec_enc);
}
5989
// Byte-reverse each element using rotates and shift-based swaps (no shuffle
// table); used by the 512-bit non-BW path of vector_reverse_bit. T_BYTE is a
// plain copy since single bytes need no reordering.
void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
  // Shift based bit reversal.
  assert(VM_Version::supports_evex(), "");
  switch(bt) {
    case T_LONG:
      // Swap upper and lower double word of each quad word.
      evprorq(xtmp1, k0, src, 32, true, vec_enc);
      evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);
      vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
      break;
    case T_INT:
      // Swap upper and lower word of each double word.
      evprord(xtmp1, k0, src, 16, true, vec_enc);
      vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
      break;
    case T_CHAR:
    case T_SHORT:
      // Swap upper and lower byte of each word.
      vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc);
      break;
    case T_BYTE:
      evmovdquq(dst, k0, src, true, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
}
6019
// Byte-reverse each element via vpshufb with a precomputed per-type
// permutation mask. T_BYTE is a plain copy (no reordering needed).
void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) {
  if (bt == T_BYTE) {
    if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
      evmovdquq(dst, k0, src, true, vec_enc);
    } else {
      vmovdqu(dst, src);
    }
    return;
  }
  // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
  // pre-computed shuffle indices.
  switch(bt) {
    case T_LONG:
      vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg);
      break;
    case T_INT:
      vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg);
      break;
    case T_CHAR:
    case T_SHORT:
      vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
  vpshufb(dst, src, dst, vec_enc);
}
6048
// Count leading zeros per element on AVX512CD targets. int/long use VPLZCNT
// directly; shorts are widened to dwords interleaved with all-ones (so the
// dword lzcnt equals the short lzcnt) and repacked; bytes use a nibble
// lookup table with a masked add to combine the two nibble counts.
void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
                                                        KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
  assert(is_integral_type(bt), "");
  assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
  assert(VM_Version::supports_avx512cd(), "");
  switch(bt) {
    case T_LONG:
      evplzcntq(dst, ktmp, src, merge, vec_enc);
      break;
    case T_INT:
      evplzcntd(dst, ktmp, src, merge, vec_enc);
      break;
    case T_SHORT:
      // xtmp1 = all ones; interleave with src so each short occupies the
      // high half of a dword whose low half is all ones, then lzcnt dwords.
      vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);
      vpunpcklwd(xtmp2, xtmp1, src, vec_enc);
      evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
      vpunpckhwd(dst, xtmp1, src, vec_enc);
      evplzcntd(dst, ktmp, dst, merge, vec_enc);
      vpackusdw(dst, xtmp2, dst, vec_enc);
      break;
    case T_BYTE:
      // T1 = Compute leading zero counts of 4 LSB bits of each byte by
      // accessing the lookup table.
      // T2 = Compute leading zero counts of 4 MSB bits of each byte by
      // accessing the lookup table.
      // Add T1 to T2 if 4 MSB bits of byte are all zeros.
      assert(VM_Version::supports_avx512bw(), "");
      evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
      vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
      vpand(xtmp2, dst, src, vec_enc);
      vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
      vpsrlw(xtmp3, src, 4, vec_enc);
      vpand(xtmp3, dst, xtmp3, vec_enc);
      vpshufb(dst, xtmp1, xtmp3, vec_enc);
      vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
      evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);
      evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
}
6093
// Per-byte leading zero count on AVX/AVX2 (no mask registers): two 4-bit
// lookup-table probes combined with an explicit byte blend.
// xtmp1-3 are scratch; xtmp1 is left holding all zeros on exit (callers such
// as vector_count_leading_zeros_short_avx rely on this).
void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                            XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  // xtmp1 = 16-entry lzcnt-of-nibble lookup table; xtmp2 = 0x0F byte mask.
  vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
  vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
  // T1 = Compute leading zero counts of 4 LSB bits of each byte by
  // accessing the lookup table.
  vpand(dst, xtmp2, src, vec_enc);
  vpshufb(dst, xtmp1, dst, vec_enc);
  // T2 = Compute leading zero counts of 4 MSB bits of each byte by
  // accessing the lookup table.
  vpsrlw(xtmp3, src, 4, vec_enc);
  vpand(xtmp3, xtmp2, xtmp3, vec_enc);
  vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
  // Add T1 to T2 if 4 MSB bits of byte are all zeros.
  vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
  vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);   // xtmp3 = (high nibble == 0) byte mask
  vpaddb(dst, dst, xtmp2, vec_enc);         // dst = T1 + T2
  // Bytes with a non-zero high nibble take T2 alone; others take T1 + T2.
  vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
}
6113
// Per-word (16-bit) leading zero count on AVX/AVX2, built on top of the
// per-byte count: if the upper byte of a word is zero, its count (8) is added
// to the lower byte's count; otherwise the upper byte's count stands alone.
void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
  // Add zero counts of lower byte and upper byte of a word if
  // upper byte holds a zero value.
  vpsrlw(xtmp3, src, 8, vec_enc);
  // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
  vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);   // xtmp3 = (upper byte of word == 0)
  // xtmp2 high byte = low-byte count + high-byte count.
  vpsllw(xtmp2, dst, 8, vec_enc);
  vpaddw(xtmp2, xtmp2, dst, vec_enc);
  vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
  // The word's count resides in the high byte of the blended value.
  vpsrlw(dst, dst, 8, vec_enc);
}
6127
// Per-dword leading zero count on AVX/AVX2 via int->float conversion:
// CLZ is recovered from the float's biased exponent. Special lanes:
// input 0 yields 32, negative inputs yield 0.
void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                           XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
  // By converting the integer to a float, we can obtain the number of leading zeros based on the exponent of the float.
  // As the float exponent contains a bias of 127 for nonzero values, the bias must be removed before interpreting the
  // exponent as the leading zero count.

  // Remove the bit to the right of the highest set bit ensuring that the conversion to float cannot round up to a higher
  // power of 2, which has a higher exponent than the input. This transformation is valid as only the highest set bit
  // contributes to the leading number of zeros.
  vpsrld(dst, src, 1, vec_enc);
  vpandn(dst, dst, src, vec_enc);

  vcvtdq2ps(dst, dst, vec_enc);

  // By comparing the register to itself, all the bits in the destination are set.
  vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);

  // Move the biased exponent to the low end of the lane and mask with 0xFF to discard the sign bit.
  // (-1 >>> 24 == 0xFF per lane.)
  vpsrld(xtmp2, xtmp1, 24, vec_enc);
  vpsrld(dst, dst, 23, vec_enc);
  vpand(dst, xtmp2, dst, vec_enc);

  // Subtract 127 from the exponent, which removes the bias from the exponent.
  // (-1 >>> 25 == 127 per lane.)
  vpsrld(xtmp2, xtmp1, 25, vec_enc);
  vpsubd(dst, dst, xtmp2, vec_enc);

  // xtmp2 = 31 in every lane (-1 >>> 27).
  vpsrld(xtmp2, xtmp1, 27, vec_enc);

  // If the original value is 0 the exponent would not have bias, so the subtraction creates a negative number. If this
  // is found in any of the lanes, replace the lane with -1 from xtmp1.
  vblendvps(dst, dst, xtmp1, dst, vec_enc, true, xtmp3);

  // If the original value is negative, replace the lane with 31.
  vblendvps(dst, dst, xtmp2, src, vec_enc, true, xtmp3);

  // Subtract the exponent from 31, giving the final result. For 0, the result is 32 as the exponent was replaced with -1,
  // and for negative numbers the result is 0 as the exponent was replaced with 31.
  vpsubd(dst, xtmp2, dst, vec_enc);
}
6167
// Per-qword leading zero count on AVX/AVX2, combining the dword counts of the
// two halves: result = clz(top) when top != 0, else 32 + clz(bottom).
// Note: rtmp is unused here; kept for signature uniformity with the other
// lzcnt helpers dispatched from vector_count_leading_zeros_avx.
void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                            XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  // Find the leading zeros of the top and bottom halves of the long individually.
  vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);

  // Move the top half result to the bottom half of xtmp1, setting the top half to 0.
  vpsrlq(xtmp1, dst, 32, vec_enc);
  // By moving the top half result to the right by 6 bits, if the top half was empty (i.e. 32 is returned) the result bit will
  // be in the most significant position of the bottom half.
  vpsrlq(xtmp2, dst, 6, vec_enc);

  // In the bottom half, add the top half and bottom half results.
  vpaddq(dst, xtmp1, dst, vec_enc);

  // For the bottom half, choose between the values using the most significant bit of xtmp2.
  // If the MSB is set, then bottom+top in dst is the resulting value. If the top half is less than 32 xtmp1 is chosen,
  // which contains only the top half result.
  // In the top half the MSB is always zero, so the value in xtmp1 is always chosen. This value is always 0, which clears
  // the lane as required.
  vblendvps(dst, xtmp1, dst, xtmp2, vec_enc, true, xtmp3);
}
6189
6190 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
6191 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6192 Register rtmp, int vec_enc) {
6193 assert(is_integral_type(bt), "unexpected type");
6194 assert(vec_enc < Assembler::AVX_512bit, "");
6195 switch(bt) {
6196 case T_LONG:
6197 vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6198 break;
6199 case T_INT:
6200 vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
6201 break;
6202 case T_SHORT:
6203 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6204 break;
6205 case T_BYTE:
6206 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6207 break;
6208 default:
6209 fatal("Unsupported type %s", type2name(bt));
6210 break;
6211 }
6212 }
6213
6214 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
6215 switch(bt) {
6216 case T_BYTE:
6217 vpsubb(dst, src1, src2, vec_enc);
6218 break;
6219 case T_SHORT:
6220 vpsubw(dst, src1, src2, vec_enc);
6221 break;
6222 case T_INT:
6223 vpsubd(dst, src1, src2, vec_enc);
6224 break;
6225 case T_LONG:
6226 vpsubq(dst, src1, src2, vec_enc);
6227 break;
6228 default:
6229 fatal("Unsupported type %s", type2name(bt));
6230 break;
6231 }
6232 }
6233
// Trailing zero count computation is based on leading zero count operation as per
// following equation. All AVX3 targets support AVX512CD feature which offers
// direct vector instruction to compute leading zero count.
// CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
// ((x - 1) & ~x) isolates the trailing zeros of x as a run of trailing ones
// (and yields all-ones for x == 0, making CTZ the full lane width).
void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
                                                         XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "");
  // xtmp = -1
  vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
  // xtmp = xtmp + src  (i.e. src - 1)
  vpadd(bt, xtmp4, xtmp4, src, vec_enc);
  // xtmp = xtmp & ~src  (ternary truth table 0x40 computes A & B & ~C with A == B == xtmp4)
  vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
  vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
  // xtmp4 = lane width in bits; dst = width - clz.
  vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
  vpsub(bt, dst, xtmp4, dst, vec_enc);
}
6252
// Trailing zero count computation for AVX2 targets is based on popcount operation as per following equation
// CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
// (x | -x) keeps the lowest set bit and everything above it set, so its
// popcount is width - ctz; for x == 0 it is zero and CTZ is the full width.
void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                        XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "");
  // xtmp = 0
  vpxor(xtmp3 , xtmp3, xtmp3, vec_enc);
  // xtmp = 0 - src
  vpsub(bt, xtmp3, xtmp3, src, vec_enc);
  // xtmp = xtmp | src
  vpor(xtmp3, xtmp3, src, vec_enc);
  vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
  // xtmp1 = lane width in bits; dst = width - popcount.
  vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
  vpsub(bt, dst, xtmp1, dst, vec_enc);
}
6268
// 32-bit unsigned division: rax = rax u/ divisor. rdx is clobbered.
// For divisors without the sign bit set, hardware DIV is used directly.
// For divisors with the sign bit set (u>= 2^31) the quotient can only be
// 0 or 1, so it is computed branch-free without DIV.
void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // Zero-extend the dividend into rdx:rax and divide.
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
  movl(rdx, rax);
  subl(rdx, divisor);
  if (VM_Version::supports_bmi1()) {
    // rax = ~(dividend - divisor) & dividend in one instruction.
    andnl(rax, rdx, rax);
  } else {
    notl(rdx);
    andl(rax, rdx);
  }
  // Quotient is the sign bit of the expression above: 0 or 1.
  shrl(rax, 31);
  bind(done);
}
6292
// 32-bit unsigned modulo: rdx = rax u% divisor. rax is clobbered.
// Mirrors udivI: DIV for small divisors; a branch-free 0-or-1 quotient
// correction for divisors with the sign bit set.
void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // Zero-extend the dividend into rdx:rax; DIV leaves the remainder in rdx.
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath when divisor < 0:
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
  movl(rdx, rax);          // rdx holds the dividend (remainder candidate)
  subl(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rax, rdx);  // rax = ~(dividend - divisor) & dividend
  } else {
    notl(rax);
    andl(rax, rdx);
  }
  // Arithmetic shift turns the sign bit into an all-ones/zero mask, which
  // selects whether divisor is subtracted once.
  sarl(rax, 31);
  andl(rax, divisor);
  subl(rdx, rax);
  bind(done);
}
6318
// 32-bit unsigned division and modulo combined:
// rax = rax u/ divisor, rdx = dividend u% divisor. tmp is clobbered.
void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
  Label done;
  Label neg_divisor_fastpath;

  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // Zero-extend dividend and divide; DIV produces quotient in rax, remainder in rdx.
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
  // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
  movl(rdx, rax);          // rdx holds the dividend
  subl(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rax, rdx);  // rax = ~(dividend - divisor) & dividend
  } else {
    notl(rax);
    andl(rax, rdx);
  }
  // Derive both results from the same expression's sign bit.
  movl(tmp, rax);
  shrl(rax, 31); // quotient
  sarl(tmp, 31);
  andl(tmp, divisor);
  subl(rdx, tmp); // remainder
  bind(done);
}
6349
// Reverse the bit order of a 32-bit value: dst = bit-reverse(src).
// Both paths first reverse the bits within each byte and then reverse the
// byte order with BSWAP. xtmp1/xtmp2/rtmp are scratch.
void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
                                 XMMRegister xtmp2, Register rtmp) {
  if(VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal based on following algorithm.
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
    // The affine matrix 0x8040201008040201 reverses the bits of each byte.
    mov64(rtmp, 0x8040201008040201L);
    movq(xtmp1, src);
    movq(xtmp2, rtmp);
    gf2p8affineqb(xtmp1, xtmp2, 0);
    movq(dst, xtmp1);
  } else {
    // SWAR fallback: exchange progressively wider bit groups.
    // Swap even and odd numbered bits.
    movl(rtmp, src);
    andl(rtmp, 0x55555555);
    shll(rtmp, 1);
    movl(dst, src);
    andl(dst, 0xAAAAAAAA);
    shrl(dst, 1);
    orl(dst, rtmp);

    // Swap LSB and MSB 2 bits of each nibble.
    movl(rtmp, dst);
    andl(rtmp, 0x33333333);
    shll(rtmp, 2);
    andl(dst, 0xCCCCCCCC);
    shrl(dst, 2);
    orl(dst, rtmp);

    // Swap LSB and MSB 4 bits of each byte.
    movl(rtmp, dst);
    andl(rtmp, 0x0F0F0F0F);
    shll(rtmp, 4);
    andl(dst, 0xF0F0F0F0);
    shrl(dst, 4);
    orl(dst, rtmp);
  }
  // Bits within each byte are now reversed; reverse the byte order to finish.
  bswapl(dst);
}
6388
// Reverse the bit order of a 64-bit value: dst = bit-reverse(src).
// Same structure as reverseI, with 64-bit masks materialized through rtmp2
// (x86-64 has no 64-bit immediate AND).
void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
                                 XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
  if(VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal based on following algorithm.
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
    // The affine matrix 0x8040201008040201 reverses the bits of each byte.
    mov64(rtmp1, 0x8040201008040201L);
    movq(xtmp1, src);
    movq(xtmp2, rtmp1);
    gf2p8affineqb(xtmp1, xtmp2, 0);
    movq(dst, xtmp1);
  } else {
    // SWAR fallback; the complementary mask is obtained by NOT-ing the
    // previous one instead of loading another 64-bit immediate.
    // Swap even and odd numbered bits.
    movq(rtmp1, src);
    mov64(rtmp2, 0x5555555555555555L);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 1);
    movq(dst, src);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 1);
    orq(dst, rtmp1);

    // Swap LSB and MSB 2 bits of each nibble.
    movq(rtmp1, dst);
    mov64(rtmp2, 0x3333333333333333L);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 2);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 2);
    orq(dst, rtmp1);

    // Swap LSB and MSB 4 bits of each byte.
    movq(rtmp1, dst);
    mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 4);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 4);
    orq(dst, rtmp1);
  }
  // Bits within each byte are now reversed; reverse the byte order to finish.
  bswapq(dst);
}
6433
// 64-bit unsigned division: rax = rax u/ divisor. rdx is clobbered.
// Structure mirrors udivI: hardware DIV for divisors without the sign bit;
// branch-free 0-or-1 quotient for divisors u>= 2^63.
void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // 32-bit xor zero-extends, clearing all of rdx.
  xorl(rdx, rdx);
  divq(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
  movq(rdx, rax);
  subq(rdx, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rdx, rax);   // rax = ~(dividend - divisor) & dividend
  } else {
    notq(rdx);
    andq(rax, rdx);
  }
  // Quotient is the sign bit: 0 or 1.
  shrq(rax, 63);
  bind(done);
}
6457
// 64-bit unsigned modulo: rdx = rax u% divisor. rax is clobbered.
// 64-bit counterpart of umodI.
void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // Zero-extend dividend into rdx:rax; DIV leaves the remainder in rdx.
  xorq(rdx, rdx);
  divq(divisor);
  jmp(done);
  bind(neg_divisor_fastpath);
  // Fastpath when divisor < 0:
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
  movq(rdx, rax);          // rdx holds the dividend
  subq(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rax, rdx);  // rax = ~(dividend - divisor) & dividend
  } else {
    notq(rax);
    andq(rax, rdx);
  }
  // Sign bit -> all-ones/zero mask; subtract divisor once when quotient is 1.
  sarq(rax, 63);
  andq(rax, divisor);
  subq(rdx, rax);
  bind(done);
}
6483
// 64-bit unsigned division and modulo combined:
// rax = rax u/ divisor, rdx = dividend u% divisor. tmp is clobbered.
void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // DIV produces quotient in rax and remainder in rdx.
  xorq(rdx, rdx);
  divq(divisor);
  jmp(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
  // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
  movq(rdx, rax);          // rdx holds the dividend
  subq(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rax, rdx);  // rax = ~(dividend - divisor) & dividend
  } else {
    notq(rax);
    andq(rax, rdx);
  }
  // Derive both results from the same expression's sign bit.
  movq(tmp, rax);
  shrq(rax, 63); // quotient
  sarq(tmp, 63);
  andq(tmp, divisor);
  subq(rdx, tmp); // remainder
  bind(done);
}
6513
// Full-width byte rearrange: dst[i] = src[shuffle[i] & 63].
// VPSHUFB only shuffles within 128-bit lanes using the low 4 index bits, so
// the rearrange is built from four rounds: each round broadcasts one 128-bit
// lane of src across the vector and merges (mask-selects) the bytes whose
// shuffle index falls into that lane's 16-element window.
void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
                                        XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
                                        int vlen_enc) {
  assert(VM_Version::supports_avx512bw(), "");
  // Byte shuffles are inlane operations and indices are determined using
  // lower 4 bit of each shuffle lane, thus all shuffle indices are
  // normalized to index range 0-15. This makes sure that all the multiples
  // of an index value are placed at same relative position in 128 bit
  // lane i.e. elements corresponding to shuffle indices 16, 32 and 64
  // will be 16th element in their respective 128 bit lanes.
  movl(rtmp, 16);
  evpbroadcastb(xtmp1, rtmp, vlen_enc);

  // Compute a mask for shuffle vector by comparing indices with expression INDEX < 16,
  // Broadcast first 128 bit lane across entire vector, shuffle the vector lanes using
  // original shuffle indices and move the shuffled lanes corresponding to true
  // mask to destination vector.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
  evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);

  // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
  // and broadcasting second 128 bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
  // Double the per-byte constant 16 -> 32 (quadword shift by 1 cannot carry
  // across bytes since 16 < 0x80).
  vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
  // and broadcasting third 128 bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp2, Assembler::nlt, true, vlen_enc);
  // xtmp1 = 16 + 32 = 48 per byte.
  vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
  // and broadcasting fourth 128 bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
  // xtmp2 = 32 * 2 = 64 per byte.
  vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
}
6559
6560 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
6561 XMMRegister shuffle, XMMRegister src, int vlen_enc) {
6562 if (vlen_enc == AVX_128bit) {
6563 vpermilps(dst, src, shuffle, vlen_enc);
6564 } else if (bt == T_INT) {
6565 vpermd(dst, shuffle, src, vlen_enc);
6566 } else {
6567 assert(bt == T_FLOAT, "");
6568 vpermps(dst, shuffle, src, vlen_enc);
6569 }
6570 }
6571
6572 void C2_MacroAssembler::efp16sh(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
6573 switch(opcode) {
6574 case Op_AddHF: vaddsh(dst, src1, src2); break;
6575 case Op_SubHF: vsubsh(dst, src1, src2); break;
6576 case Op_MulHF: vmulsh(dst, src1, src2); break;
6577 case Op_DivHF: vdivsh(dst, src1, src2); break;
6578 default: assert(false, "%s", NodeClassNames[opcode]); break;
6579 }
6580 }
6581
6582 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6583 switch(elem_bt) {
6584 case T_BYTE:
6585 if (ideal_opc == Op_SaturatingAddV) {
6586 vpaddsb(dst, src1, src2, vlen_enc);
6587 } else {
6588 assert(ideal_opc == Op_SaturatingSubV, "");
6589 vpsubsb(dst, src1, src2, vlen_enc);
6590 }
6591 break;
6592 case T_SHORT:
6593 if (ideal_opc == Op_SaturatingAddV) {
6594 vpaddsw(dst, src1, src2, vlen_enc);
6595 } else {
6596 assert(ideal_opc == Op_SaturatingSubV, "");
6597 vpsubsw(dst, src1, src2, vlen_enc);
6598 }
6599 break;
6600 default:
6601 fatal("Unsupported type %s", type2name(elem_bt));
6602 break;
6603 }
6604 }
6605
6606 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6607 switch(elem_bt) {
6608 case T_BYTE:
6609 if (ideal_opc == Op_SaturatingAddV) {
6610 vpaddusb(dst, src1, src2, vlen_enc);
6611 } else {
6612 assert(ideal_opc == Op_SaturatingSubV, "");
6613 vpsubusb(dst, src1, src2, vlen_enc);
6614 }
6615 break;
6616 case T_SHORT:
6617 if (ideal_opc == Op_SaturatingAddV) {
6618 vpaddusw(dst, src1, src2, vlen_enc);
6619 } else {
6620 assert(ideal_opc == Op_SaturatingSubV, "");
6621 vpsubusw(dst, src1, src2, vlen_enc);
6622 }
6623 break;
6624 default:
6625 fatal("Unsupported type %s", type2name(elem_bt));
6626 break;
6627 }
6628 }
6629
// Saturating unsigned subtraction for int/long lanes on EVEX targets:
// per lane, dst = (src1 u>= src2) ? src1 - src2 : 0.
void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                               XMMRegister src2, KRegister ktmp, int vlen_enc) {
  // For unsigned subtraction, overflow happens when magnitude of second input is greater than first input.
  // overflow_mask = Inp1 <u Inp2
  // NOTE(review): with this operand order ktmp appears to hold src2 <u src1,
  // i.e. the lanes where the subtraction does NOT underflow (equal lanes
  // compute to zero either way) — confirm against evmasked_op's mask polarity.
  evpcmpu(elem_bt, ktmp, src2, src1, Assembler::lt, vlen_enc);
  // Res = overflow_mask ? Zero : INP1 - INP2 (non-commutative and non-associative)
  evmasked_op(elem_bt == T_INT ? Op_SubVI : Op_SubVL, elem_bt, ktmp, dst, src1, src2, false, vlen_enc, false);
}
6638
// Saturating unsigned subtraction for int/long lanes on AVX/AVX2 targets
// (no mask registers): per lane, dst = (src1 u>= src2) ? src1 - src2 : 0.
void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                                              XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  // Emulate unsigned comparison using signed comparison
  // Mask = Inp1 <u Inp2 => Inp1 + MIN_VALUE < Inp2 + MIN_VALUE
  // xtmp1 = MIN_VALUE per lane; bias both inputs by it.
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc, true);
  vpadd(elem_bt, xtmp2, src1, xtmp1, vlen_enc);
  vpadd(elem_bt, xtmp1, src2, xtmp1, vlen_enc);

  // xtmp2 = signed (biased src2 > biased src1) == (src1 <u src2), the underflow mask.
  vpcmpgt(elem_bt, xtmp2, xtmp1, xtmp2, vlen_enc);

  // Res = INP1 - INP2 (non-commutative and non-associative)
  vpsub(elem_bt, dst, src1, src2, vlen_enc);
  // Res = Mask ? Zero : Res
  vpxor(xtmp1, xtmp1, xtmp1, vlen_enc);
  vpblendvb(dst, dst, xtmp1, xtmp2, vlen_enc);
}
6655
// Saturating unsigned addition for int/long lanes on EVEX targets: lanes that
// overflow saturate to all-ones (the unsigned maximum).
void C2_MacroAssembler::vector_add_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                                               XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp, int vlen_enc) {
  // Unsigned values ranges comprise of only +ve numbers, thus there exist only an upper bound saturation.
  // overflow_mask = (SRC1 + SRC2) <u (SRC1 | SRC2)
  // (See the derivation note above vector_add_dq_saturating_unsigned_avx.)
  // Res = Signed Add INP1, INP2
  vpadd(elem_bt, dst, src1, src2, vlen_enc);
  // T1 = SRC1 | SRC2
  vpor(xtmp1, src1, src2, vlen_enc);
  // Max_Unsigned = -1
  vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
  // Unsigned compare: Mask = Res <u T1
  evpcmpu(elem_bt, ktmp, dst, xtmp1, Assembler::lt, vlen_enc);
  // res = Mask ? Max_Unsigned : Res
  evpblend(elem_bt, dst, ktmp, dst, xtmp2, true, vlen_enc);
}
6671
6672 //
6673 // Section 2-13 Hacker's Delight list following overflow detection check for saturating
6674 // unsigned addition operation.
6675 // overflow_mask = ((a & b) | ((a | b) & ~( a + b))) >>> 31 == 1
6676 //
6677 // We empirically determined its semantic equivalence to following reduced expression
6678 // overflow_mask = (a + b) <u (a | b)
6679 //
6680 // and also verified it though Alive2 solver.
6681 // (https://alive2.llvm.org/ce/z/XDQ7dY)
6682 //
6683
6684 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6685 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, int vlen_enc) {
6686 // Res = Signed Add INP1, INP2
6687 vpadd(elem_bt, dst, src1, src2, vlen_enc);
6688 // Compute T1 = INP1 | INP2
6689 vpor(xtmp3, src1, src2, vlen_enc);
6690 // T1 = Minimum signed value.
6691 vpgenmin_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
6692 // Convert T1 to signed value, T1 = T1 + MIN_VALUE
6693 vpadd(elem_bt, xtmp3, xtmp3, xtmp2, vlen_enc);
6694 // Convert Res to signed value, Res<s> = Res + MIN_VALUE
6695 vpadd(elem_bt, xtmp2, xtmp2, dst, vlen_enc);
6696 // Compute overflow detection mask = Res<1> <s T1
6697 if (elem_bt == T_INT) {
6698 vpcmpgtd(xtmp3, xtmp3, xtmp2, vlen_enc);
6699 } else {
6700 assert(elem_bt == T_LONG, "");
6701 vpcmpgtq(xtmp3, xtmp3, xtmp2, vlen_enc);
6702 }
6703 vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
6704 }
6705
// Sets ktmp to the per-lane sign bits of the quadword vector src
// (VPMOVQ2M semantics). AVX512DQ targets use the instruction directly; other
// EVEX targets materialize the sign via arithmetic shift and compare with -1.
// If xtmp2_hold_M1 is true the caller guarantees xtmp2 already holds all-ones,
// saving the vpternlogq.
void C2_MacroAssembler::evpmovq2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                      int vlen_enc, bool xtmp2_hold_M1) {
  if (VM_Version::supports_avx512dq()) {
    evpmovq2m(ktmp, src, vlen_enc);
  } else {
    assert(VM_Version::supports_evex(), "");
    if (!xtmp2_hold_M1) {
      // xtmp2 = -1 in every lane.
      vpternlogq(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
    }
    // xtmp1 = all-ones for negative lanes, zero otherwise.
    evpsraq(xtmp1, src, 63, vlen_enc);
    evpcmpeqq(ktmp, k0, xtmp1, xtmp2, vlen_enc);
  }
}
6719
// Sets ktmp to the per-lane sign bits of the dword vector src
// (VPMOVD2M semantics). Same emulation strategy as evpmovq2m_emu for EVEX
// targets without AVX512DQ.
void C2_MacroAssembler::evpmovd2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                      int vlen_enc, bool xtmp2_hold_M1) {
  if (VM_Version::supports_avx512dq()) {
    evpmovd2m(ktmp, src, vlen_enc);
  } else {
    assert(VM_Version::supports_evex(), "");
    if (!xtmp2_hold_M1) {
      // xtmp2 = -1 in every lane.
      vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
    }
    // xtmp1 = all-ones for negative lanes, zero otherwise.
    vpsrad(xtmp1, src, 31, vlen_enc);
    Assembler::evpcmpeqd(ktmp, k0, xtmp1, xtmp2, vlen_enc);
  }
}
6733
6734
6735 void C2_MacroAssembler::vpsign_extend_dq(BasicType elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
6736 if (elem_bt == T_LONG) {
6737 if (VM_Version::supports_evex()) {
6738 evpsraq(dst, src, 63, vlen_enc);
6739 } else {
6740 vpsrad(dst, src, 31, vlen_enc);
6741 vpshufd(dst, dst, 0xF5, vlen_enc);
6742 }
6743 } else {
6744 assert(elem_bt == T_INT, "");
6745 vpsrad(dst, src, 31, vlen_enc);
6746 }
6747 }
6748
6749 void C2_MacroAssembler::vpgenmax_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
6750 if (compute_allones) {
6751 if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
6752 vpternlogd(allones, 0xff, allones, allones, vlen_enc);
6753 } else {
6754 vpcmpeqq(allones, allones, allones, vlen_enc);
6755 }
6756 }
6757 if (elem_bt == T_LONG) {
6758 vpsrlq(dst, allones, 1, vlen_enc);
6759 } else {
6760 assert(elem_bt == T_INT, "");
6761 vpsrld(dst, allones, 1, vlen_enc);
6762 }
6763 }
6764
6765 void C2_MacroAssembler::vpgenmin_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
6766 if (compute_allones) {
6767 if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
6768 vpternlogd(allones, 0xff, allones, allones, vlen_enc);
6769 } else {
6770 vpcmpeqq(allones, allones, allones, vlen_enc);
6771 }
6772 }
6773 if (elem_bt == T_LONG) {
6774 vpsllq(dst, allones, 63, vlen_enc);
6775 } else {
6776 assert(elem_bt == T_INT, "");
6777 vpslld(dst, allones, 31, vlen_enc);
6778 }
6779 }
6780
6781 void C2_MacroAssembler::evpcmpu(BasicType elem_bt, KRegister kmask, XMMRegister src1, XMMRegister src2,
6782 Assembler::ComparisonPredicate cond, int vlen_enc) {
6783 switch(elem_bt) {
6784 case T_LONG: evpcmpuq(kmask, src1, src2, cond, vlen_enc); break;
6785 case T_INT: evpcmpud(kmask, src1, src2, cond, vlen_enc); break;
6786 case T_SHORT: evpcmpuw(kmask, src1, src2, cond, vlen_enc); break;
6787 case T_BYTE: evpcmpub(kmask, src1, src2, cond, vlen_enc); break;
6788 default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6789 }
6790 }
6791
6792 void C2_MacroAssembler::vpcmpgt(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6793 switch(elem_bt) {
6794 case T_LONG: vpcmpgtq(dst, src1, src2, vlen_enc); break;
6795 case T_INT: vpcmpgtd(dst, src1, src2, vlen_enc); break;
6796 case T_SHORT: vpcmpgtw(dst, src1, src2, vlen_enc); break;
6797 case T_BYTE: vpcmpgtb(dst, src1, src2, vlen_enc); break;
6798 default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6799 }
6800 }
6801
6802 void C2_MacroAssembler::evpmov_vec_to_mask(BasicType elem_bt, KRegister ktmp, XMMRegister src, XMMRegister xtmp1,
6803 XMMRegister xtmp2, int vlen_enc, bool xtmp2_hold_M1) {
6804 if (elem_bt == T_LONG) {
6805 evpmovq2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6806 } else {
6807 assert(elem_bt == T_INT, "");
6808 evpmovd2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6809 }
6810 }
6811
// Saturating signed add/sub for int/long lanes on EVEX targets. Overflowing
// lanes are replaced by MIN_VALUE or MAX_VALUE, selected by the sign of the
// first input (per the Hacker's Delight section 2-13 overflow analysis).
void C2_MacroAssembler::vector_addsub_dq_saturating_evex(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                         XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
                                                         KRegister ktmp1, KRegister ktmp2, int vlen_enc) {
  assert(elem_bt == T_INT || elem_bt == T_LONG, "");
  // Addition/Subtraction happens over two's compliment representation of numbers and is agnostic to signed'ness.
  // Overflow detection based on Hacker's delight section 2-13.
  if (ideal_opc == Op_SaturatingAddV) {
    // res = src1 + src2
    vpadd(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs if result polarity does not comply with equivalent polarity inputs.
    // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
    vpxor(xtmp1, dst, src1, vlen_enc);
    vpxor(xtmp2, dst, src2, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  } else {
    assert(ideal_opc == Op_SaturatingSubV, "");
    // res = src1 - src2
    vpsub(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs when both inputs have opposite polarity and
    // result polarity does not comply with first input polarity.
    // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1;
    vpxor(xtmp1, src1, src2, vlen_enc);
    vpxor(xtmp2, dst, src1, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  }

  // Compute overflow detection mask.
  evpmov_vec_to_mask(elem_bt, ktmp1, xtmp2, xtmp2, xtmp1, vlen_enc);
  // Note: xtmp1 hold -1 in all its lanes after above call.

  // Compute mask based on first input polarity.
  evpmov_vec_to_mask(elem_bt, ktmp2, src1, xtmp2, xtmp1, vlen_enc, true);

  vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);

  // Compose a vector of saturating (MAX/MIN) values, where lanes corresponding to
  // set bits in first input polarity mask holds a min value.
  evpblend(elem_bt, xtmp2, ktmp2, xtmp2, xtmp1, true, vlen_enc);
  // Blend destination lanes with saturated values using overflow detection mask.
  evpblend(elem_bt, dst, ktmp1, dst, xtmp2, true, vlen_enc);
}
6854
6855
// Saturating signed add/sub for int/long lanes on AVX/AVX2 targets (no mask
// registers). Same overflow analysis as the EVEX variant, with sign masks
// materialized as vectors and combined via byte blends.
void C2_MacroAssembler::vector_addsub_dq_saturating_avx(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                        XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
                                                        XMMRegister xtmp3, XMMRegister xtmp4, int vlen_enc) {
  assert(elem_bt == T_INT || elem_bt == T_LONG, "");
  // Addition/Subtraction happens over two's compliment representation of numbers and is agnostic to signed'ness.
  // Overflow detection based on Hacker's delight section 2-13.
  if (ideal_opc == Op_SaturatingAddV) {
    // res = src1 + src2
    vpadd(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs if result polarity does not comply with equivalent polarity inputs.
    // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
    vpxor(xtmp1, dst, src1, vlen_enc);
    vpxor(xtmp2, dst, src2, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  } else {
    assert(ideal_opc == Op_SaturatingSubV, "");
    // res = src1 - src2
    vpsub(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs when both inputs have opposite polarity and
    // result polarity does not comply with first input polarity.
    // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1;
    vpxor(xtmp1, src1, src2, vlen_enc);
    vpxor(xtmp2, dst, src1, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  }

  // Sign-extend to compute overflow detection mask.
  vpsign_extend_dq(elem_bt, xtmp3, xtmp2, vlen_enc);

  // xtmp1 = all-ones, consumed by the MAX/MIN generators below.
  vpcmpeqd(xtmp1, xtmp1, xtmp1, vlen_enc);
  vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc);
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);

  // Compose saturating min/max vector using first input polarity mask.
  vpsign_extend_dq(elem_bt, xtmp4, src1, vlen_enc);
  vpblendvb(xtmp1, xtmp2, xtmp1, xtmp4, vlen_enc);

  // Blend result with saturating vector using overflow detection mask.
  vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
}
6896
6897 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
6898 switch(elem_bt) {
6899 case T_BYTE:
6900 if (ideal_opc == Op_SaturatingAddV) {
6901 vpaddsb(dst, src1, src2, vlen_enc);
6902 } else {
6903 assert(ideal_opc == Op_SaturatingSubV, "");
6904 vpsubsb(dst, src1, src2, vlen_enc);
6905 }
6906 break;
6907 case T_SHORT:
6908 if (ideal_opc == Op_SaturatingAddV) {
6909 vpaddsw(dst, src1, src2, vlen_enc);
6910 } else {
6911 assert(ideal_opc == Op_SaturatingSubV, "");
6912 vpsubsw(dst, src1, src2, vlen_enc);
6913 }
6914 break;
6915 default:
6916 fatal("Unsupported type %s", type2name(elem_bt));
6917 break;
6918 }
6919 }
6920
6921 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
6922 switch(elem_bt) {
6923 case T_BYTE:
6924 if (ideal_opc == Op_SaturatingAddV) {
6925 vpaddusb(dst, src1, src2, vlen_enc);
6926 } else {
6927 assert(ideal_opc == Op_SaturatingSubV, "");
6928 vpsubusb(dst, src1, src2, vlen_enc);
6929 }
6930 break;
6931 case T_SHORT:
6932 if (ideal_opc == Op_SaturatingAddV) {
6933 vpaddusw(dst, src1, src2, vlen_enc);
6934 } else {
6935 assert(ideal_opc == Op_SaturatingSubV, "");
6936 vpsubusw(dst, src1, src2, vlen_enc);
6937 }
6938 break;
6939 default:
6940 fatal("Unsupported type %s", type2name(elem_bt));
6941 break;
6942 }
6943 }
6944
6945 void C2_MacroAssembler::select_from_two_vectors_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6946 XMMRegister src2, int vlen_enc) {
6947 switch(elem_bt) {
6948 case T_BYTE:
6949 evpermi2b(dst, src1, src2, vlen_enc);
6950 break;
6951 case T_SHORT:
6952 evpermi2w(dst, src1, src2, vlen_enc);
6953 break;
6954 case T_INT:
6955 evpermi2d(dst, src1, src2, vlen_enc);
6956 break;
6957 case T_LONG:
6958 evpermi2q(dst, src1, src2, vlen_enc);
6959 break;
6960 case T_FLOAT:
6961 evpermi2ps(dst, src1, src2, vlen_enc);
6962 break;
6963 case T_DOUBLE:
6964 evpermi2pd(dst, src1, src2, vlen_enc);
6965 break;
6966 default:
6967 fatal("Unsupported type %s", type2name(elem_bt));
6968 break;
6969 }
6970 }
6971
6972 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, bool is_unsigned, int vlen_enc) {
6973 if (is_unsigned) {
6974 vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
6975 } else {
6976 vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
6977 }
6978 }
6979
6980 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, bool is_unsigned, int vlen_enc) {
6981 if (is_unsigned) {
6982 vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
6983 } else {
6984 vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
6985 }
6986 }
6987
6988 void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6989 switch(opcode) {
6990 case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
6991 case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
6992 case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
6993 case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
6994 default: assert(false, "%s", NodeClassNames[opcode]); break;
6995 }
6996 }
6997
6998 void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
6999 switch(opcode) {
7000 case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
7001 case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
7002 case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
7003 case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
7004 default: assert(false, "%s", NodeClassNames[opcode]); break;
7005 }
7006 }
7007
// Scalar FP16 max/min: delegates to the vector implementation at 128-bit width.
// NOTE(review): presumably only lane 0 of dst is consumed by scalar callers
// (Op_MaxHF/Op_MinHF) — the remaining lanes carry the vectorized result.
void C2_MacroAssembler::scalar_max_min_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                            KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2) {
  vector_max_min_fp16(opcode, dst, src1, src2, ktmp, xtmp1, xtmp2, Assembler::AVX_128bit);
}
7012
// Vector FP16 max/min with Java semantics for signed zeros and NaNs.
// Raw VMAXPH/VMINPH follow x86 semantics (second operand wins on equal-zero
// or NaN inputs), so the operands are first conditionally swapped, based on
// the sign bits captured via VPMOVW2M, to steer those tie-break rules toward
// the Java-required result; a final UNORD_Q compare patches NaN lanes.
void C2_MacroAssembler::vector_max_min_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                            KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  if (opcode == Op_MaxVHF || opcode == Op_MaxHF) {
    // Move sign bits of src2 to mask register.
    evpmovw2m(ktmp, src2, vlen_enc);
    // xtmp1 = src2 < 0 ? src2 : src1
    evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
    // xtmp2 = src2 < 0 ? src1 : src2
    evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
    // Idea behind above swapping is to make the second source operand a +ve value.
    // As per instruction semantics, if the values being compared are both 0.0s (of either sign), the value in
    // the second source operand is returned. If only one value is a NaN (SNaN or QNaN) for this instruction,
    // the second source operand, either a NaN or a valid floating-point value, is returned
    // dst = max(xtmp1, xtmp2)
    evmaxph(dst, xtmp1, xtmp2, vlen_enc);
    // isNaN = is_unordered_quiet(xtmp1)
    evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
    // Final result is same as first source if its a NaN value,
    // in case second operand holds a NaN value then as per above semantics
    // result is same as second operand.
    Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
  } else {
    assert(opcode == Op_MinVHF || opcode == Op_MinHF, "");
    // Move sign bits of src1 to mask register.
    evpmovw2m(ktmp, src1, vlen_enc);
    // xtmp1 = src1 < 0 ? src2 : src1
    evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
    // xtmp2 = src1 < 0 ? src1 : src2
    evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
    // Idea behind above swapping is to make the second source operand a -ve value.
    // As per instruction semantics, if the values being compared are both 0.0s (of either sign), the value in
    // the second source operand is returned.
    // If only one value is a NaN (SNaN or QNaN) for this instruction, the second source operand, either a NaN
    // or a valid floating-point value, is written to the result.
    // dst = min(xtmp1, xtmp2)
    evminph(dst, xtmp1, xtmp2, vlen_enc);
    // isNaN = is_unordered_quiet(xtmp1)
    evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
    // Final result is same as first source if its a NaN value,
    // in case second operand holds a NaN value then as per above semantics
    // result is same as second operand.
    Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
  }
}