1 /*
2 * Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
24
25 #include "asm/assembler.hpp"
26 #include "asm/assembler.inline.hpp"
27 #include "gc/shared/barrierSet.hpp"
28 #include "gc/shared/barrierSetAssembler.hpp"
29 #include "oops/methodData.hpp"
30 #include "opto/c2_MacroAssembler.hpp"
31 #include "opto/intrinsicnode.hpp"
32 #include "opto/output.hpp"
33 #include "opto/opcodes.hpp"
34 #include "opto/subnode.hpp"
35 #include "runtime/globals.hpp"
36 #include "runtime/objectMonitor.hpp"
37 #include "runtime/stubRoutines.hpp"
38 #include "utilities/checkedCast.hpp"
39 #include "utilities/globalDefinitions.hpp"
40 #include "utilities/powerOfTwo.hpp"
41 #include "utilities/sizes.hpp"
42
43 #ifdef PRODUCT
44 #define BLOCK_COMMENT(str) /* nothing */
45 #define STOP(error) stop(error)
46 #else
47 #define BLOCK_COMMENT(str) block_comment(str)
48 #define STOP(error) block_comment(error); stop(error)
49 #endif
50
51 // C2 compiled method's prolog code.
52 void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {
53 assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
54
55 assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
56 // Remove word for return addr
57 framesize -= wordSize;
58 stack_bang_size -= wordSize;
59
60 // Calls to C2R adapters often do not accept exceptional returns.
61 // We require that their callers bang the stack for them. But be careful,
62 // because some VM calls (such as call site linkage) can use several kilobytes
63 // of stack; the stack safety zone should account for that.
64 // See bugs 4446381, 4468289, 4497237.
65 if (stack_bang_size > 0) {
66 generate_stack_overflow_check(stack_bang_size);
67
68 // We always push rbp, so that on return to the interpreter rbp will be
69 // restored correctly and we can correct the stack.
70 push(rbp);
71 // Save caller's stack pointer into RBP if the frame pointer is preserved.
72 if (PreserveFramePointer) {
73 mov(rbp, rsp);
74 }
75 // Remove word for rbp
76 framesize -= wordSize;
77
78 // Create frame
79 if (framesize) {
80 subptr(rsp, framesize);
81 }
82 } else {
83 subptr(rsp, framesize);
84
85 // Save RBP register now.
86 framesize -= wordSize;
87 movptr(Address(rsp, framesize), rbp);
88 // Save caller's stack pointer into RBP if the frame pointer is preserved.
89 if (PreserveFramePointer) {
90 movptr(rbp, rsp);
91 if (framesize > 0) {
92 addptr(rbp, framesize);
93 }
94 }
95 }
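// Informal sketch of the frame laid out above (both branches are equivalent;
// slot assignment within the frame is decided by C2's frame layout):
//   [ return address ]   <- rsp on entry
//   [ saved rbp      ]
//   [ spills/locals  ]   <- rsp after the prolog
// With PreserveFramePointer, rbp points at the saved-rbp slot.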
96
97 if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
98 framesize -= wordSize;
99 movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
100 }
101
102 #ifdef ASSERT
103 if (VerifyStackAtCalls) {
104 Label L;
105 push(rax);
106 mov(rax, rsp);
107 andptr(rax, StackAlignmentInBytes-1);
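// rax was pushed above, so a properly aligned frame leaves a remainder of
// StackAlignmentInBytes - wordSize here.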
108 cmpptr(rax, StackAlignmentInBytes-wordSize);
109 pop(rax);
110 jcc(Assembler::equal, L);
111 STOP("Stack is not properly aligned!");
112 bind(L);
113 }
114 #endif
115
116 if (!is_stub) {
117 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
118 // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
119 Label dummy_slow_path;
120 Label dummy_continuation;
121 Label* slow_path = &dummy_slow_path;
122 Label* continuation = &dummy_continuation;
123 if (!Compile::current()->output()->in_scratch_emit_size()) {
124 // Use real labels from actual stub when not emitting code for the purpose of measuring its size
125 C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
126 Compile::current()->output()->add_stub(stub);
127 slow_path = &stub->entry();
128 continuation = &stub->continuation();
129 }
130 bs->nmethod_entry_barrier(this, slow_path, continuation);
131 }
132 }
133
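// For example, a 32-byte vector (8 floats) maps to AVX_256bit; 4- and 8-byte
// vectors are emitted with the 128-bit encoding.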
134 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
135 switch (vlen_in_bytes) {
136 case 4: // fall-through
137 case 8: // fall-through
138 case 16: return Assembler::AVX_128bit;
139 case 32: return Assembler::AVX_256bit;
140 case 64: return Assembler::AVX_512bit;
141
142 default: {
143 ShouldNotReachHere();
144 return Assembler::AVX_NoVec;
145 }
146 }
147 }
148
149 // fast_lock and fast_unlock used by C2
150
151 // Because the transitions from emitted code to the runtime
152 // monitorenter/exit helper stubs are so slow it's critical that
153 // we inline both the stack-locking fast path and the inflated fast path.
154 //
155 // See also: cmpFastLock and cmpFastUnlock.
156 //
157 // What follows is a specialized inline transliteration of the code
158 // in enter() and exit(). If we're concerned about I$ bloat another
159 // option would be to emit TrySlowEnter and TrySlowExit methods
160 // at startup-time. These methods would accept arguments as
161 // (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
162 // indications in the icc.ZFlag. fast_lock and fast_unlock would simply
163 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
164 // In practice, however, the # of lock sites is bounded and is usually small.
165 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
166 // if the processor uses simple bimodal branch predictors keyed by EIP,
167 // since the helper routines would be called from multiple synchronization
168 // sites.
169 //
170 // An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
171 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
172 // to those specialized methods. That'd give us a mostly platform-independent
173 // implementation that the JITs could optimize and inline at their pleasure.
174 // Done correctly, the only time we'd need to cross to native code would be
175 // to park() or unpark() threads. We'd also need a few more unsafe operators
176 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
177 // (b) issue explicit barriers or fence operations.
178 //
179 // TODO:
180 //
181 // * Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
182 // This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
183 // Given TLAB allocation, Self is usually manifested in a register, so passing it into
184 // the lock operators would typically be faster than reifying Self.
185 //
186 // * Ideally I'd define the primitives as:
187 // fast_lock (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
188 // fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED.
189 // Unfortunately ADLC bugs prevent us from expressing the ideal form.
190 // Instead, we're stuck with the rather awkward and brittle register assignments below.
191 // Furthermore, the register assignments are overconstrained, possibly resulting in
192 // sub-optimal code near the synchronization site.
193 //
194 // * Eliminate the sp-proximity tests and just use "== Self" tests instead.
195 // Alternately, use a better sp-proximity test.
196 //
197 // * Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
198 // Either one is sufficient to uniquely identify a thread.
199 // TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
200 //
201 // * Intrinsify notify() and notifyAll() for the common cases where the
202 // object is locked by the calling thread but the waitlist is empty.
203 // This avoids the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
204 //
205 // * use jccb and jmpb instead of jcc and jmp to improve code density.
206 // But beware of excessive branch density on AMD Opterons.
207 //
208 // * Both fast_lock and fast_unlock set the ICC.ZF to indicate success
209 // or failure of the fast path. If the fast path fails then we pass
210 // control to the slow path, typically in C. In fast_lock and
211 // fast_unlock we often branch to DONE_LABEL, just to find that C2
212 // will emit a conditional branch immediately after the node.
213 // So we have branches to branches and lots of ICC.ZF games.
214 // Instead, it might be better to have C2 pass a "FailureLabel"
215 // into fast_lock and fast_unlock. In the case of success, control
216 // will drop through the node. ICC.ZF is undefined at exit.
217 // In the case of failure, the node will branch directly to the
218 // FailureLabel.
219
220
221 // obj: object to lock
222 // box: on-stack box address -- KILLED
223 // rax: tmp -- KILLED
224 // t : tmp -- KILLED
225 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register rax_reg,
226 Register t, Register thread) {
227 assert(rax_reg == rax, "Used for CAS");
228 assert_different_registers(obj, box, rax_reg, t, thread);
229
230 // Handle inflated monitor.
231 Label inflated;
232 // Finish fast lock successfully. ZF value is irrelevant.
233 Label locked;
234 // Finish fast lock unsuccessfully. MUST jump with ZF == 0
235 Label slow_path;
236
237 if (UseObjectMonitorTable) {
238 // Clear cache in case fast locking succeeds or we need to take the slow-path.
239 movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), 0);
240 }
241
242 if (DiagnoseSyncOnValueBasedClasses != 0) {
243 load_klass(rax_reg, obj, t);
244 testb(Address(rax_reg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
245 jcc(Assembler::notZero, slow_path);
246 }
247
248 const Register mark = t;
249
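// Mark word lock bits (low two bits): 0b01 = unlocked, 0b00 = fast-locked
// (object is on some thread's lock-stack), 0b10 = inflated (tagged ObjectMonitor*).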
250 { // Lightweight Lock
251
252 Label push;
253
254 const Register top = UseObjectMonitorTable ? rax_reg : box;
255
256 // Load the mark.
257 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
258
259 // Prefetch top.
260 movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
261
262 // Check for monitor (0b10).
263 testptr(mark, markWord::monitor_value);
264 jcc(Assembler::notZero, inflated);
265
266 // Check if lock-stack is full.
267 cmpl(top, LockStack::end_offset() - 1);
268 jcc(Assembler::greater, slow_path);
269
270 // Check if recursive.
271 cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
272 jccb(Assembler::equal, push);
273
274 // Try to lock. Transition lock bits 0b01 => 0b00
275 movptr(rax_reg, mark);
276 orptr(rax_reg, markWord::unlocked_value);
277 andptr(mark, ~(int32_t)markWord::unlocked_value);
278 lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
279 jcc(Assembler::notEqual, slow_path);
280
281 if (UseObjectMonitorTable) {
282 // Need to reload top, clobbered by CAS.
283 movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
284 }
285 bind(push);
286 // After successful lock, push object on lock-stack.
287 movptr(Address(thread, top), obj);
288 addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
289 jmpb(locked);
290 }
291
292 { // Handle inflated monitor.
293 bind(inflated);
294
295 const Register monitor = t;
296
297 if (!UseObjectMonitorTable) {
298 assert(mark == monitor, "should be the same here");
299 } else {
300 // Uses ObjectMonitorTable. Look for the monitor in the om_cache.
301 // Fetch ObjectMonitor* from the cache or take the slow-path.
302 Label monitor_found;
303
304 // Load cache address
305 lea(t, Address(thread, JavaThread::om_cache_oops_offset()));
306
307 const int num_unrolled = 2;
308 for (int i = 0; i < num_unrolled; i++) {
309 cmpptr(obj, Address(t));
310 jccb(Assembler::equal, monitor_found);
311 increment(t, in_bytes(OMCache::oop_to_oop_difference()));
312 }
313
314 Label loop;
315
316 // Search for obj in cache.
317 bind(loop);
318
319 // Check for match.
320 cmpptr(obj, Address(t));
321 jccb(Assembler::equal, monitor_found);
322
323 // Search until null encountered, guaranteed _null_sentinel at end.
324 cmpptr(Address(t), 1);
325 jcc(Assembler::below, slow_path); // 0 check, but with ZF=0 when *t == 0
326 increment(t, in_bytes(OMCache::oop_to_oop_difference()));
327 jmpb(loop);
328
329 // Cache hit.
330 bind(monitor_found);
331 movptr(monitor, Address(t, OMCache::oop_to_monitor_difference()));
332 }
333 const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
334 const Address recursions_address(monitor, ObjectMonitor::recursions_offset() - monitor_tag);
335 const Address owner_address(monitor, ObjectMonitor::owner_offset() - monitor_tag);
336
337 Label monitor_locked;
338 // Lock the monitor.
339
340 if (UseObjectMonitorTable) {
341 // Cache the monitor for unlock before trashing box. On failure to acquire
342 // the lock, the slow path will reset the entry accordingly (see CacheSetter).
343 movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), monitor);
344 }
345
346 // Try to CAS owner (no owner => current thread's _monitor_owner_id).
347 xorptr(rax_reg, rax_reg);
348 movptr(box, Address(thread, JavaThread::monitor_owner_id_offset()));
349 lock(); cmpxchgptr(box, owner_address);
350 jccb(Assembler::equal, monitor_locked);
351
352 // Check if recursive.
353 cmpptr(box, rax_reg);
354 jccb(Assembler::notEqual, slow_path);
355
356 // Recursive.
357 increment(recursions_address);
358
359 bind(monitor_locked);
360 }
361
362 bind(locked);
363 // Set ZF = 1
364 xorl(rax_reg, rax_reg);
365
366 #ifdef ASSERT
367 // Check that locked label is reached with ZF set.
368 Label zf_correct;
369 Label zf_bad_zero;
370 jcc(Assembler::zero, zf_correct);
371 jmp(zf_bad_zero);
372 #endif
373
374 bind(slow_path);
375 #ifdef ASSERT
376 // Check that slow_path label is reached with ZF not set.
377 jcc(Assembler::notZero, zf_correct);
378 stop("Fast Lock ZF != 0");
379 bind(zf_bad_zero);
380 stop("Fast Lock ZF != 1");
381 bind(zf_correct);
382 #endif
383 // C2 uses the value of ZF to determine the continuation.
384 }
385
386 // obj: object to lock
387 // rax: tmp -- KILLED
388 // t : tmp - cannot be obj nor rax -- KILLED
389 //
390 // Some commentary on balanced locking:
391 //
392 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
393 // Methods that don't have provably balanced locking are forced to run in the
394 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
395 // The interpreter provides two properties:
396 // I1: At return-time the interpreter automatically and quietly unlocks any
397 // objects acquired in the current activation (frame). Recall that the
398 // interpreter maintains an on-stack list of locks currently held by
399 // a frame.
400 // I2: If a method attempts to unlock an object that is not held by the
401 // frame the interpreter throws IMSX.
402 //
403 // Let's say A(), which has provably balanced locking, acquires O and then calls B().
404 // B() doesn't have provably balanced locking so it runs in the interpreter.
405 // Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O
406 // is still locked by A().
407 //
408 // The only other source of unbalanced locking would be JNI. The "Java Native Interface
409 // Specification" states that an object locked by JNI's MonitorEnter should not be
410 // unlocked by "normal" java-level locking and vice-versa. The specification doesn't
411 // specify what will occur if a program engages in such mixed-mode locking, however.
412 // Arguably, given that the spec legislates the JNI case as undefined, our implementation
413 // could reasonably *avoid* checking the owner in fast_unlock().
414 // In the interest of performance we elide the m->Owner==Self check in unlock.
415 // A perfectly viable alternative is to elide the owner check except when
416 // Xcheck:jni is enabled.
417
418 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register reg_rax, Register t, Register thread) {
419 assert(reg_rax == rax, "Used for CAS");
420 assert_different_registers(obj, reg_rax, t);
421
422 // Handle inflated monitor.
423 Label inflated, inflated_check_lock_stack;
424 // Finish fast unlock successfully. MUST jump with ZF == 1
425 Label unlocked, slow_path;
426
427 const Register mark = t;
428 const Register monitor = t;
429 const Register top = UseObjectMonitorTable ? t : reg_rax;
430 const Register box = reg_rax;
431
432 Label dummy;
433 C2FastUnlockLightweightStub* stub = nullptr;
434
435 if (!Compile::current()->output()->in_scratch_emit_size()) {
436 stub = new (Compile::current()->comp_arena()) C2FastUnlockLightweightStub(obj, mark, reg_rax, thread);
437 Compile::current()->output()->add_stub(stub);
438 }
439
440 Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();
441
442 { // Lightweight Unlock
443
444 // Load top.
445 movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
446
447 if (!UseObjectMonitorTable) {
448 // Prefetch mark.
449 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
450 }
451
452 // Check if obj is top of lock-stack.
453 cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
454 // Top of lock stack was not obj. Must be monitor.
455 jcc(Assembler::notEqual, inflated_check_lock_stack);
456
457 // Pop lock-stack.
458 DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
459 subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
460
461 // Check if recursive.
462 cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
463 jcc(Assembler::equal, unlocked);
464
465 // We elide the monitor check, let the CAS fail instead.
466
467 if (UseObjectMonitorTable) {
468 // Load mark.
469 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
470 }
471
472 // Try to unlock. Transition lock bits 0b00 => 0b01
473 movptr(reg_rax, mark);
474 andptr(reg_rax, ~(int32_t)markWord::lock_mask);
475 orptr(mark, markWord::unlocked_value);
476 lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
477 jcc(Assembler::notEqual, push_and_slow_path);
478 jmp(unlocked);
479 }
480
481
482 { // Handle inflated monitor.
483 bind(inflated_check_lock_stack);
484 #ifdef ASSERT
485 Label check_done;
486 subl(top, oopSize);
487 cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
488 jcc(Assembler::below, check_done);
489 cmpptr(obj, Address(thread, top));
490 jccb(Assembler::notEqual, inflated_check_lock_stack);
491 stop("Fast Unlock lock on stack");
492 bind(check_done);
493 if (UseObjectMonitorTable) {
494 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
495 }
496 testptr(mark, markWord::monitor_value);
497 jccb(Assembler::notZero, inflated);
498 stop("Fast Unlock not monitor");
499 #endif
500
501 bind(inflated);
502
503 if (!UseObjectMonitorTable) {
504 assert(mark == monitor, "should be the same here");
505 } else {
506 // Uses ObjectMonitorTable. Look for the monitor in our BasicLock on the stack.
507 movptr(monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
508 // null check with ZF == 0, no valid pointer below alignof(ObjectMonitor*)
509 cmpptr(monitor, alignof(ObjectMonitor*));
510 jcc(Assembler::below, slow_path);
511 }
512 const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
513 const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag};
514 const Address succ_address{monitor, ObjectMonitor::succ_offset() - monitor_tag};
515 const Address entry_list_address{monitor, ObjectMonitor::entry_list_offset() - monitor_tag};
516 const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag};
517
518 Label recursive;
519
520 // Check if recursive.
521 cmpptr(recursions_address, 0);
522 jccb(Assembler::notZero, recursive);
523
524 // Set owner to null.
525 // Release to satisfy the JMM
526 movptr(owner_address, NULL_WORD);
527 // We need a full fence after clearing owner to avoid stranding.
528 // StoreLoad achieves this.
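// Concretely: without the fence the owner = null store could be ordered after the
// entry_list/succ loads below, and a thread that enqueued itself in that window
// would never be woken (it would be stranded).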
529 membar(StoreLoad);
530
531 // Check if the entry_list is empty.
532 cmpptr(entry_list_address, NULL_WORD);
533 jccb(Assembler::zero, unlocked); // If so we are done.
534
535 // Check if there is a successor.
536 cmpptr(succ_address, NULL_WORD);
537 jccb(Assembler::notZero, unlocked); // If so we are done.
538
539 // Save the monitor pointer in the current thread, so we can try to
540 // reacquire the lock in SharedRuntime::monitor_exit_helper().
541 if (!UseObjectMonitorTable) {
542 andptr(monitor, ~(int32_t)markWord::monitor_value);
543 }
544 movptr(Address(thread, JavaThread::unlocked_inflated_monitor_offset()), monitor);
545
546 orl(t, 1); // Fast Unlock ZF = 0
547 jmpb(slow_path);
548
549 // Recursive unlock.
550 bind(recursive);
551 decrement(recursions_address);
552 }
553
554 bind(unlocked);
555 xorl(t, t); // Fast Unlock ZF = 1
556
557 #ifdef ASSERT
558 // Check that unlocked label is reached with ZF set.
559 Label zf_correct;
560 Label zf_bad_zero;
561 jcc(Assembler::zero, zf_correct);
562 jmp(zf_bad_zero);
563 #endif
564
565 bind(slow_path);
566 if (stub != nullptr) {
567 bind(stub->slow_path_continuation());
568 }
569 #ifdef ASSERT
570 // Check that the slow_path / stub->slow_path_continuation() label is reached with ZF not set.
571 jcc(Assembler::notZero, zf_correct);
572 stop("Fast Unlock ZF != 0");
573 bind(zf_bad_zero);
574 stop("Fast Unlock ZF != 1");
575 bind(zf_correct);
576 #endif
577 // C2 uses the value of ZF to determine the continuation.
578 }
579
580 static void abort_verify_int_in_range(uint idx, jint val, jint lo, jint hi) {
581 fatal("Invalid CastII, idx: %u, val: %d, lo: %d, hi: %d", idx, val, lo, hi);
582 }
583
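// Compute the value rbp would hold if the frame pointer were preserved: the address
// of the saved-rbp slot, i.e. rsp plus the frame size minus the two words occupied
// by the saved rbp and the return address.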
584 static void reconstruct_frame_pointer_helper(MacroAssembler* masm, Register dst) {
585 const int framesize = Compile::current()->output()->frame_size_in_bytes();
586 masm->movptr(dst, rsp);
587 if (framesize > 2 * wordSize) {
588 masm->addptr(dst, framesize - 2 * wordSize);
589 }
590 }
591
592 void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) {
593 if (PreserveFramePointer) {
594 // frame pointer is valid
595 #ifdef ASSERT
596 // Verify frame pointer value in rbp.
597 reconstruct_frame_pointer_helper(this, rtmp);
598 Label L_success;
599 cmpq(rbp, rtmp);
600 jccb(Assembler::equal, L_success);
601 STOP("frame pointer mismatch");
602 bind(L_success);
603 #endif // ASSERT
604 } else {
605 reconstruct_frame_pointer_helper(this, rbp);
606 }
607 }
608
609 void C2_MacroAssembler::verify_int_in_range(uint idx, const TypeInt* t, Register val) {
610 jint lo = t->_lo;
611 jint hi = t->_hi;
612 assert(lo < hi, "type should not be empty or constant, idx: %u, lo: %d, hi: %d", idx, lo, hi);
613 if (t == TypeInt::INT) {
614 return;
615 }
616
617 BLOCK_COMMENT("CastII {");
618 Label fail;
619 Label succeed;
620 if (hi == max_jint) {
621 cmpl(val, lo);
622 jccb(Assembler::greaterEqual, succeed);
623 } else {
624 if (lo != min_jint) {
625 cmpl(val, lo);
626 jccb(Assembler::less, fail);
627 }
628 cmpl(val, hi);
629 jccb(Assembler::lessEqual, succeed);
630 }
631
632 bind(fail);
633 movl(c_rarg0, idx);
634 movl(c_rarg1, val);
635 movl(c_rarg2, lo);
636 movl(c_rarg3, hi);
637 reconstruct_frame_pointer(rscratch1);
638 call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_int_in_range)));
639 hlt();
640 bind(succeed);
641 BLOCK_COMMENT("} // CastII");
642 }
643
644 static void abort_verify_long_in_range(uint idx, jlong val, jlong lo, jlong hi) {
645 fatal("Invalid CastLL, idx: %u, val: " JLONG_FORMAT ", lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, val, lo, hi);
646 }
647
648 void C2_MacroAssembler::verify_long_in_range(uint idx, const TypeLong* t, Register val, Register tmp) {
649 jlong lo = t->_lo;
650 jlong hi = t->_hi;
651 assert(lo < hi, "type should not be empty or constant, idx: %u, lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, lo, hi);
652 if (t == TypeLong::LONG) {
653 return;
654 }
655
656 BLOCK_COMMENT("CastLL {");
657 Label fail;
658 Label succeed;
659
660 auto cmp_val = [&](jlong bound) {
661 if (is_simm32(bound)) {
662 cmpq(val, checked_cast<int>(bound));
663 } else {
664 mov64(tmp, bound);
665 cmpq(val, tmp);
666 }
667 };
668
669 if (hi == max_jlong) {
670 cmp_val(lo);
671 jccb(Assembler::greaterEqual, succeed);
672 } else {
673 if (lo != min_jlong) {
674 cmp_val(lo);
675 jccb(Assembler::less, fail);
676 }
677 cmp_val(hi);
678 jccb(Assembler::lessEqual, succeed);
679 }
680
681 bind(fail);
682 movl(c_rarg0, idx);
683 movq(c_rarg1, val);
684 mov64(c_rarg2, lo);
685 mov64(c_rarg3, hi);
686 reconstruct_frame_pointer(rscratch1);
687 call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_long_in_range)));
688 hlt();
689 bind(succeed);
690 BLOCK_COMMENT("} // CastLL");
691 }
692
693 //-------------------------------------------------------------------------------------------
694 // Generic instructions support for use in .ad files C2 code generation
695
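// Abs clears the sign bit by AND-ing with a mask that has every bit except the sign
// bit set; Neg flips the sign bit by XOR-ing with a mask that has only the sign bit set.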
696 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
697 if (dst != src) {
698 movdqu(dst, src);
699 }
700 if (opcode == Op_AbsVD) {
701 andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
702 } else {
703 assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
704 xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
705 }
706 }
707
708 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
709 if (opcode == Op_AbsVD) {
710 vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
711 } else {
712 assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
713 vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
714 }
715 }
716
717 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
718 if (dst != src) {
719 movdqu(dst, src);
720 }
721 if (opcode == Op_AbsVF) {
722 andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
723 } else {
724 assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
725 xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
726 }
727 }
728
729 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
730 if (opcode == Op_AbsVF) {
731 vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
732 } else {
733 assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
734 vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
735 }
736 }
737
738 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
739 assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
740 assert(tmp == xnoreg || elem_bt == T_LONG, "unused");
741
742 if (opcode == Op_MinV) {
743 if (elem_bt == T_BYTE) {
744 pminsb(dst, src);
745 } else if (elem_bt == T_SHORT) {
746 pminsw(dst, src);
747 } else if (elem_bt == T_INT) {
748 pminsd(dst, src);
749 } else {
750 assert(elem_bt == T_LONG, "required");
751 assert(tmp == xmm0, "required");
752 assert_different_registers(dst, src, tmp);
753 movdqu(xmm0, dst);
754 pcmpgtq(xmm0, src);
755 blendvpd(dst, src); // xmm0 as mask
756 }
757 } else { // opcode == Op_MaxV
758 if (elem_bt == T_BYTE) {
759 pmaxsb(dst, src);
760 } else if (elem_bt == T_SHORT) {
761 pmaxsw(dst, src);
762 } else if (elem_bt == T_INT) {
763 pmaxsd(dst, src);
764 } else {
765 assert(elem_bt == T_LONG, "required");
766 assert(tmp == xmm0, "required");
767 assert_different_registers(dst, src, tmp);
768 movdqu(xmm0, src);
769 pcmpgtq(xmm0, dst);
770 blendvpd(dst, src); // xmm0 as mask
771 }
772 }
773 }
774
775 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
776 XMMRegister src1, Address src2, int vlen_enc) {
777 assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
778 if (opcode == Op_UMinV) {
779 switch(elem_bt) {
780 case T_BYTE: vpminub(dst, src1, src2, vlen_enc); break;
781 case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
782 case T_INT: vpminud(dst, src1, src2, vlen_enc); break;
783 case T_LONG: evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
784 default: fatal("Unsupported type %s", type2name(elem_bt)); break;
785 }
786 } else {
787 assert(opcode == Op_UMaxV, "required");
788 switch(elem_bt) {
789 case T_BYTE: vpmaxub(dst, src1, src2, vlen_enc); break;
790 case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
791 case T_INT: vpmaxud(dst, src1, src2, vlen_enc); break;
792 case T_LONG: evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
793 default: fatal("Unsupported type %s", type2name(elem_bt)); break;
794 }
795 }
796 }
797
798 void C2_MacroAssembler::vpuminmaxq(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
799 // For optimality, leverage a full vector width of 512 bits
800 // for operations over smaller vector sizes on AVX512 targets.
801 if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
802 if (opcode == Op_UMaxV) {
803 evpmaxuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
804 } else {
805 assert(opcode == Op_UMinV, "required");
806 evpminuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
807 }
808 } else {
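// There is no unsigned 64-bit vector min/max here, so bias both operands by 2^63
// (flip the sign bit); the unsigned comparison then becomes a signed vpcmpgtq.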
809 // T1 = -1
810 vpcmpeqq(xtmp1, xtmp1, xtmp1, vlen_enc);
811 // T1 = -1 << 63
812 vpsllq(xtmp1, xtmp1, 63, vlen_enc);
813 // Convert SRC2 to signed value i.e. T2 = T1 + SRC2
814 vpaddq(xtmp2, xtmp1, src2, vlen_enc);
815 // Convert SRC1 to signed value i.e. T1 = T1 + SRC1
816 vpaddq(xtmp1, xtmp1, src1, vlen_enc);
817 // Mask = T2 > T1
818 vpcmpgtq(xtmp1, xtmp2, xtmp1, vlen_enc);
819 if (opcode == Op_UMaxV) {
820 // Res = Mask ? Src2 : Src1
821 vpblendvb(dst, src1, src2, xtmp1, vlen_enc);
822 } else {
823 // Res = Mask ? Src1 : Src2
824 vpblendvb(dst, src2, src1, xtmp1, vlen_enc);
825 }
826 }
827 }
828
829 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
830 XMMRegister src1, XMMRegister src2, int vlen_enc) {
831 assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
832 if (opcode == Op_UMinV) {
833 switch(elem_bt) {
834 case T_BYTE: vpminub(dst, src1, src2, vlen_enc); break;
835 case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
836 case T_INT: vpminud(dst, src1, src2, vlen_enc); break;
837 case T_LONG: evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
838 default: fatal("Unsupported type %s", type2name(elem_bt)); break;
839 }
840 } else {
841 assert(opcode == Op_UMaxV, "required");
842 switch(elem_bt) {
843 case T_BYTE: vpmaxub(dst, src1, src2, vlen_enc); break;
844 case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
845 case T_INT: vpmaxud(dst, src1, src2, vlen_enc); break;
846 case T_LONG: evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
847 default: fatal("Unsupported type %s", type2name(elem_bt)); break;
848 }
849 }
850 }
851
852 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
853 XMMRegister dst, XMMRegister src1, XMMRegister src2,
854 int vlen_enc) {
855 assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
856
857 if (opcode == Op_MinV) {
858 if (elem_bt == T_BYTE) {
859 vpminsb(dst, src1, src2, vlen_enc);
860 } else if (elem_bt == T_SHORT) {
861 vpminsw(dst, src1, src2, vlen_enc);
862 } else if (elem_bt == T_INT) {
863 vpminsd(dst, src1, src2, vlen_enc);
864 } else {
865 assert(elem_bt == T_LONG, "required");
866 if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
867 vpminsq(dst, src1, src2, vlen_enc);
868 } else {
869 assert_different_registers(dst, src1, src2);
870 vpcmpgtq(dst, src1, src2, vlen_enc);
871 vblendvpd(dst, src1, src2, dst, vlen_enc);
872 }
873 }
874 } else { // opcode == Op_MaxV
875 if (elem_bt == T_BYTE) {
876 vpmaxsb(dst, src1, src2, vlen_enc);
877 } else if (elem_bt == T_SHORT) {
878 vpmaxsw(dst, src1, src2, vlen_enc);
879 } else if (elem_bt == T_INT) {
880 vpmaxsd(dst, src1, src2, vlen_enc);
881 } else {
882 assert(elem_bt == T_LONG, "required");
883 if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
884 vpmaxsq(dst, src1, src2, vlen_enc);
885 } else {
886 assert_different_registers(dst, src1, src2);
887 vpcmpgtq(dst, src1, src2, vlen_enc);
888 vblendvpd(dst, src2, src1, dst, vlen_enc);
889 }
890 }
891 }
892 }
893
894 // Float/Double min max
895
896 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
897 XMMRegister dst, XMMRegister a, XMMRegister b,
898 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
899 int vlen_enc) {
900 assert(UseAVX > 0, "required");
901 assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
902 opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
903 assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
904 assert_different_registers(a, tmp, atmp, btmp);
905 assert_different_registers(b, tmp, atmp, btmp);
906
907 bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
908 bool is_double_word = is_double_word_type(elem_bt);
909
910 /* Note on 'non-obvious' assembly sequence:
911 *
912 * While there are vminps/vmaxps instructions, there are two important differences between hardware
913 * and Java on how they handle floats:
914 * a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
915 * b. NaN is not necessarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
916 *
917 * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
918 * a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
919 * (only useful when signs differ, noop otherwise)
920 * b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])
921
922 * The following pseudo code describes the algorithm for max[FD] (the min algorithm is along similar lines):
923 * btmp = (b < +0.0) ? a : b
924 * atmp = (b < +0.0) ? b : a
925 * Tmp = Max_Float(atmp , btmp)
926 * Res = (atmp == NaN) ? atmp : Tmp
927 */
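// For reference, the analogous sketch for min[FD], keyed off a's sign bit rather
// than b's:
//   atmp = (a < +0.0) ? b : a
//   btmp = (a < +0.0) ? a : b
//   Tmp  = Min_Float(atmp, btmp)
//   Res  = (atmp == NaN) ? atmp : Tmp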
928
929 void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
930 void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
931 void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
932 XMMRegister mask;
933
934 if (!is_double_word && is_min) {
935 mask = a;
936 vblend = &MacroAssembler::vblendvps;
937 vmaxmin = &MacroAssembler::vminps;
938 vcmp = &MacroAssembler::vcmpps;
939 } else if (!is_double_word && !is_min) {
940 mask = b;
941 vblend = &MacroAssembler::vblendvps;
942 vmaxmin = &MacroAssembler::vmaxps;
943 vcmp = &MacroAssembler::vcmpps;
944 } else if (is_double_word && is_min) {
945 mask = a;
946 vblend = &MacroAssembler::vblendvpd;
947 vmaxmin = &MacroAssembler::vminpd;
948 vcmp = &MacroAssembler::vcmppd;
949 } else {
950 assert(is_double_word && !is_min, "sanity");
951 mask = b;
952 vblend = &MacroAssembler::vblendvpd;
953 vmaxmin = &MacroAssembler::vmaxpd;
954 vcmp = &MacroAssembler::vcmppd;
955 }
956
957 // Make sure EnableX86ECoreOpts isn't disabled on register overlaps
958 XMMRegister maxmin, scratch;
959 if (dst == btmp) {
960 maxmin = btmp;
961 scratch = tmp;
962 } else {
963 maxmin = tmp;
964 scratch = btmp;
965 }
966
967 bool precompute_mask = EnableX86ECoreOpts && UseAVX>1;
968 if (precompute_mask && !is_double_word) {
969 vpsrad(tmp, mask, 32, vlen_enc);
970 mask = tmp;
971 } else if (precompute_mask && is_double_word) {
972 vpxor(tmp, tmp, tmp, vlen_enc);
973 vpcmpgtq(tmp, tmp, mask, vlen_enc);
974 mask = tmp;
975 }
976
977 (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);
978 (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);
979 (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
980 (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
981 (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch);
982 }
983
984 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
985 XMMRegister dst, XMMRegister a, XMMRegister b,
986 KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
987 int vlen_enc) {
988 assert(UseAVX > 2, "required");
989 assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
990 opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
991 assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
992 assert_different_registers(dst, a, atmp, btmp);
993 assert_different_registers(dst, b, atmp, btmp);
994
995 bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
996 bool is_double_word = is_double_word_type(elem_bt);
997 bool merge = true;
998
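// evpmovd2m/evpmovq2m extract the per-lane sign bits into the mask register, which
// drives the blends; the final masked move copies NaN lanes back from atmp.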
999 if (!is_double_word && is_min) {
1000 evpmovd2m(ktmp, a, vlen_enc);
1001 evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1002 evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1003 vminps(dst, atmp, btmp, vlen_enc);
1004 evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1005 evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1006 } else if (!is_double_word && !is_min) {
1007 evpmovd2m(ktmp, b, vlen_enc);
1008 evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1009 evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1010 vmaxps(dst, atmp, btmp, vlen_enc);
1011 evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1012 evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1013 } else if (is_double_word && is_min) {
1014 evpmovq2m(ktmp, a, vlen_enc);
1015 evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1016 evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1017 vminpd(dst, atmp, btmp, vlen_enc);
1018 evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1019 evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1020 } else {
1021 assert(is_double_word && !is_min, "sanity");
1022 evpmovq2m(ktmp, b, vlen_enc);
1023 evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1024 evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1025 vmaxpd(dst, atmp, btmp, vlen_enc);
1026 evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1027 evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1028 }
1029 }
1030
1031 void C2_MacroAssembler::vminmax_fp(int opc, BasicType elem_bt, XMMRegister dst, KRegister mask,
1032 XMMRegister src1, XMMRegister src2, int vlen_enc) {
1033 assert(opc == Op_MinV || opc == Op_MinReductionV ||
1034 opc == Op_MaxV || opc == Op_MaxReductionV, "sanity");
1035
1036 int imm8 = (opc == Op_MinV || opc == Op_MinReductionV) ? AVX10_MINMAX_MIN_COMPARE_SIGN
1037 : AVX10_MINMAX_MAX_COMPARE_SIGN;
1038 if (elem_bt == T_FLOAT) {
1039 evminmaxps(dst, mask, src1, src2, true, imm8, vlen_enc);
1040 } else {
1041 assert(elem_bt == T_DOUBLE, "");
1042 evminmaxpd(dst, mask, src1, src2, true, imm8, vlen_enc);
1043 }
1044 }
1045
1046 // Float/Double signum
1047 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
1048 assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");
1049
1050 Label DONE_LABEL;
1051
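// Note: movflt/movdbl do not modify the flags, so the jcc(above) below still tests
// the ucomiss/ucomisd result: if the argument was > 0.0 the loaded 1.0 is kept,
// otherwise its sign bit is flipped to produce -1.0.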
1052 if (opcode == Op_SignumF) {
1053 ucomiss(dst, zero);
1054 jcc(Assembler::equal, DONE_LABEL); // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1055 jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN
1056 movflt(dst, one);
1057 jcc(Assembler::above, DONE_LABEL);
1058 xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
1059 } else if (opcode == Op_SignumD) {
1060 ucomisd(dst, zero);
1061 jcc(Assembler::equal, DONE_LABEL); // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1062 jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN
1063 movdbl(dst, one);
1064 jcc(Assembler::above, DONE_LABEL);
1065 xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
1066 }
1067
1068 bind(DONE_LABEL);
1069 }
1070
1071 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1072 if (sign) {
1073 pmovsxbw(dst, src);
1074 } else {
1075 pmovzxbw(dst, src);
1076 }
1077 }
1078
1079 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1080 if (sign) {
1081 vpmovsxbw(dst, src, vector_len);
1082 } else {
1083 vpmovzxbw(dst, src, vector_len);
1084 }
1085 }
1086
1087 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1088 if (sign) {
1089 vpmovsxbd(dst, src, vector_len);
1090 } else {
1091 vpmovzxbd(dst, src, vector_len);
1092 }
1093 }
1094
1095 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1096 if (sign) {
1097 vpmovsxwd(dst, src, vector_len);
1098 } else {
1099 vpmovzxwd(dst, src, vector_len);
1100 }
1101 }
1102
1103 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1104 int shift, int vector_len) {
1105 if (opcode == Op_RotateLeftV) {
1106 if (etype == T_INT) {
1107 evprold(dst, src, shift, vector_len);
1108 } else {
1109 assert(etype == T_LONG, "expected type T_LONG");
1110 evprolq(dst, src, shift, vector_len);
1111 }
1112 } else {
1113 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1114 if (etype == T_INT) {
1115 evprord(dst, src, shift, vector_len);
1116 } else {
1117 assert(etype == T_LONG, "expected type T_LONG");
1118 evprorq(dst, src, shift, vector_len);
1119 }
1120 }
1121 }
1122
1123 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1124 XMMRegister shift, int vector_len) {
1125 if (opcode == Op_RotateLeftV) {
1126 if (etype == T_INT) {
1127 evprolvd(dst, src, shift, vector_len);
1128 } else {
1129 assert(etype == T_LONG, "expected type T_LONG");
1130 evprolvq(dst, src, shift, vector_len);
1131 }
1132 } else {
1133 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1134 if (etype == T_INT) {
1135 evprorvd(dst, src, shift, vector_len);
1136 } else {
1137 assert(etype == T_LONG, "expected type T_LONG");
1138 evprorvq(dst, src, shift, vector_len);
1139 }
1140 }
1141 }
1142
1143 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1144 if (opcode == Op_RShiftVI) {
1145 psrad(dst, shift);
1146 } else if (opcode == Op_LShiftVI) {
1147 pslld(dst, shift);
1148 } else {
1149 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1150 psrld(dst, shift);
1151 }
1152 }
1153
1154 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1155 switch (opcode) {
1156 case Op_RShiftVI: psrad(dst, shift); break;
1157 case Op_LShiftVI: pslld(dst, shift); break;
1158 case Op_URShiftVI: psrld(dst, shift); break;
1159
1160 default: assert(false, "%s", NodeClassNames[opcode]);
1161 }
1162 }
1163
1164 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1165 if (opcode == Op_RShiftVI) {
1166 vpsrad(dst, nds, shift, vector_len);
1167 } else if (opcode == Op_LShiftVI) {
1168 vpslld(dst, nds, shift, vector_len);
1169 } else {
1170 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1171 vpsrld(dst, nds, shift, vector_len);
1172 }
1173 }
1174
1175 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1176 switch (opcode) {
1177 case Op_RShiftVI: vpsrad(dst, src, shift, vlen_enc); break;
1178 case Op_LShiftVI: vpslld(dst, src, shift, vlen_enc); break;
1179 case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1180
1181 default: assert(false, "%s", NodeClassNames[opcode]);
1182 }
1183 }
1184
1185 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1186 switch (opcode) {
1187 case Op_RShiftVB: // fall-through
1188 case Op_RShiftVS: psraw(dst, shift); break;
1189
1190 case Op_LShiftVB: // fall-through
1191 case Op_LShiftVS: psllw(dst, shift); break;
1192
1193 case Op_URShiftVS: // fall-through
1194 case Op_URShiftVB: psrlw(dst, shift); break;
1195
1196 default: assert(false, "%s", NodeClassNames[opcode]);
1197 }
1198 }
1199
1200 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1201 switch (opcode) {
1202 case Op_RShiftVB: // fall-through
1203 case Op_RShiftVS: vpsraw(dst, src, shift, vlen_enc); break;
1204
1205 case Op_LShiftVB: // fall-through
1206 case Op_LShiftVS: vpsllw(dst, src, shift, vlen_enc); break;
1207
1208 case Op_URShiftVS: // fall-through
1209 case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1210
1211 default: assert(false, "%s", NodeClassNames[opcode]);
1212 }
1213 }
1214
1215 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1216 switch (opcode) {
1217 case Op_RShiftVL: psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
1218 case Op_LShiftVL: psllq(dst, shift); break;
1219 case Op_URShiftVL: psrlq(dst, shift); break;
1220
1221 default: assert(false, "%s", NodeClassNames[opcode]);
1222 }
1223 }
1224
1225 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1226 if (opcode == Op_RShiftVL) {
1227 psrlq(dst, shift); // using srl to implement sra on pre-avx512 systems
1228 } else if (opcode == Op_LShiftVL) {
1229 psllq(dst, shift);
1230 } else {
1231 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1232 psrlq(dst, shift);
1233 }
1234 }
1235
1236 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1237 switch (opcode) {
1238 case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1239 case Op_LShiftVL: vpsllq(dst, src, shift, vlen_enc); break;
1240 case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1241
1242 default: assert(false, "%s", NodeClassNames[opcode]);
1243 }
1244 }
1245
1246 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1247 if (opcode == Op_RShiftVL) {
1248 evpsraq(dst, nds, shift, vector_len);
1249 } else if (opcode == Op_LShiftVL) {
1250 vpsllq(dst, nds, shift, vector_len);
1251 } else {
1252 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1253 vpsrlq(dst, nds, shift, vector_len);
1254 }
1255 }
1256
1257 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1258 switch (opcode) {
1259 case Op_RShiftVB: // fall-through
1260 case Op_RShiftVS: // fall-through
1261 case Op_RShiftVI: vpsravd(dst, src, shift, vlen_enc); break;
1262
1263 case Op_LShiftVB: // fall-through
1264 case Op_LShiftVS: // fall-through
1265 case Op_LShiftVI: vpsllvd(dst, src, shift, vlen_enc); break;
1266
1267 case Op_URShiftVB: // fall-through
1268 case Op_URShiftVS: // fall-through
1269 case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1270
1271 default: assert(false, "%s", NodeClassNames[opcode]);
1272 }
1273 }
1274
1275 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1276 switch (opcode) {
1277 case Op_RShiftVB: // fall-through
1278 case Op_RShiftVS: evpsravw(dst, src, shift, vlen_enc); break;
1279
1280 case Op_LShiftVB: // fall-through
1281 case Op_LShiftVS: evpsllvw(dst, src, shift, vlen_enc); break;
1282
1283 case Op_URShiftVB: // fall-through
1284 case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1285
1286 default: assert(false, "%s", NodeClassNames[opcode]);
1287 }
1288 }
1289
1290 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
1291 assert(UseAVX >= 2, "required");
1292 switch (opcode) {
1293 case Op_RShiftVL: {
1294 if (UseAVX > 2) {
1295 assert(tmp == xnoreg, "not used");
1296 if (!VM_Version::supports_avx512vl()) {
1297 vlen_enc = Assembler::AVX_512bit;
1298 }
1299 evpsravq(dst, src, shift, vlen_enc);
1300 } else {
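// AVX2 has no variable arithmetic right shift for 64-bit lanes; emulate it:
// with m = (1 << 63) >>> shift, (x >>> shift ^ m) - m sign-extends the result.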
1301 vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
1302 vpsrlvq(dst, src, shift, vlen_enc);
1303 vpsrlvq(tmp, tmp, shift, vlen_enc);
1304 vpxor(dst, dst, tmp, vlen_enc);
1305 vpsubq(dst, dst, tmp, vlen_enc);
1306 }
1307 break;
1308 }
1309 case Op_LShiftVL: {
1310 assert(tmp == xnoreg, "not used");
1311 vpsllvq(dst, src, shift, vlen_enc);
1312 break;
1313 }
1314 case Op_URShiftVL: {
1315 assert(tmp == xnoreg, "not used");
1316 vpsrlvq(dst, src, shift, vlen_enc);
1317 break;
1318 }
1319 default: assert(false, "%s", NodeClassNames[opcode]);
1320 }
1321 }
1322
1323 // Variable shift src by shift using vtmp as a TEMP, giving word result in dst
1324 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1325 assert(opcode == Op_LShiftVB ||
1326 opcode == Op_RShiftVB ||
1327 opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1328 bool sign = (opcode != Op_URShiftVB);
1329 assert(vector_len == 0, "required");
1330 vextendbd(sign, dst, src, 1);
1331 vpmovzxbd(vtmp, shift, 1);
1332 varshiftd(opcode, dst, dst, vtmp, 1);
1333 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
1334 vextracti128_high(vtmp, dst);
1335 vpackusdw(dst, dst, vtmp, 0);
1336 }
1337
1338 // Variable shift src by shift using vtmp as a TEMP, giving byte result in dst
1339 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
1340 assert(opcode == Op_LShiftVB ||
1341 opcode == Op_RShiftVB ||
1342 opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1343 bool sign = (opcode != Op_URShiftVB);
1344 int ext_vector_len = vector_len + 1;
1345 vextendbw(sign, dst, src, ext_vector_len);
1346 vpmovzxbw(vtmp, shift, ext_vector_len);
1347 varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
1348 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
1349 if (vector_len == 0) {
1350 vextracti128_high(vtmp, dst);
1351 vpackuswb(dst, dst, vtmp, vector_len);
1352 } else {
1353 vextracti64x4_high(vtmp, dst);
1354 vpackuswb(dst, dst, vtmp, vector_len);
1355 vpermq(dst, dst, 0xD8, vector_len);
1356 }
1357 }
1358
1359 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1360 switch(typ) {
1361 case T_BYTE:
1362 pinsrb(dst, val, idx);
1363 break;
1364 case T_SHORT:
1365 pinsrw(dst, val, idx);
1366 break;
1367 case T_INT:
1368 pinsrd(dst, val, idx);
1369 break;
1370 case T_LONG:
1371 pinsrq(dst, val, idx);
1372 break;
1373 default:
1374 assert(false,"Should not reach here.");
1375 break;
1376 }
1377 }
1378
1379 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1380 switch(typ) {
1381 case T_BYTE:
1382 vpinsrb(dst, src, val, idx);
1383 break;
1384 case T_SHORT:
1385 vpinsrw(dst, src, val, idx);
1386 break;
1387 case T_INT:
1388 vpinsrd(dst, src, val, idx);
1389 break;
1390 case T_LONG:
1391 vpinsrq(dst, src, val, idx);
1392 break;
1393 default:
1394 assert(false,"Should not reach here.");
1395 break;
1396 }
1397 }
1398
1399 void C2_MacroAssembler::vgather8b_masked(BasicType elem_bt, XMMRegister dst,
1400 Register base, Register idx_base,
1401 Register mask, Register mask_idx,
1402 Register rtmp, int vlen_enc) {
1403 vpxor(dst, dst, dst, vlen_enc);
1404 if (elem_bt == T_SHORT) {
1405 for (int i = 0; i < 4; i++) {
1406 // dst[i] = mask[i] ? src[idx_base[i]] : 0
1407 Label skip_load;
1408 btq(mask, mask_idx);
1409 jccb(Assembler::carryClear, skip_load);
1410 movl(rtmp, Address(idx_base, i * 4));
1411 pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1412 bind(skip_load);
1413 incq(mask_idx);
1414 }
1415 } else {
1416 assert(elem_bt == T_BYTE, "");
1417 for (int i = 0; i < 8; i++) {
1418 // dst[i] = mask[i] ? src[idx_base[i]] : 0
1419 Label skip_load;
1420 btq(mask, mask_idx);
1421 jccb(Assembler::carryClear, skip_load);
1422 movl(rtmp, Address(idx_base, i * 4));
1423 pinsrb(dst, Address(base, rtmp), i);
1424 bind(skip_load);
1425 incq(mask_idx);
1426 }
1427 }
1428 }
1429
1430 void C2_MacroAssembler::vgather8b(BasicType elem_bt, XMMRegister dst,
1431 Register base, Register idx_base,
1432 Register rtmp, int vlen_enc) {
1433 vpxor(dst, dst, dst, vlen_enc);
1434 if (elem_bt == T_SHORT) {
1435 for (int i = 0; i < 4; i++) {
1436 // dst[i] = src[idx_base[i]]
1437 movl(rtmp, Address(idx_base, i * 4));
1438 pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1439 }
1440 } else {
1441 assert(elem_bt == T_BYTE, "");
1442 for (int i = 0; i < 8; i++) {
1443 // dst[i] = src[idx_base[i]]
1444 movl(rtmp, Address(idx_base, i * 4));
1445 pinsrb(dst, Address(base, rtmp), i);
1446 }
1447 }
1448 }
1449
1450 /*
1451 * Gather using a hybrid algorithm: first partially unroll a scalar loop
1452 * to accumulate values from gather indices into a quad-word (64-bit) slice.
1453 * A slice may hold 8 bytes or 4 short values. This is followed by a vector
1454 * permutation to place the slice into the appropriate vector lane
1455 * locations in the destination vector. The following pseudo code describes the
1456 * algorithm in detail:
1457 *
1458 * DST_VEC = ZERO_VEC
1459 * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..}
1460 * TWO_VEC = {2, 2, 2, 2, 2, 2, 2, 2, 2..}
1461 * FOREACH_ITER:
1462 * TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES
1463 * TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX
1464 * DST_VEC = DST_VEC OR TEMP_PERM_VEC
1465 * PERM_INDEX = PERM_INDEX - TWO_VEC
1466 *
1467 * With each iteration, the doubleword permute indices (0,1) corresponding
1468 * to the gathered quadword get shifted right by two lane positions.
1469 *
1470 */
1471 void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst,
1472 Register base, Register idx_base,
1473 Register mask, XMMRegister xtmp1,
1474 XMMRegister xtmp2, XMMRegister temp_dst,
1475 Register rtmp, Register mask_idx,
1476 Register length, int vector_len, int vlen_enc) {
1477 Label GATHER8_LOOP;
1478 assert(is_subword_type(elem_ty), "");
1479 movl(length, vector_len);
1480 vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...}
1481 vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...}
1482 vallones(xtmp2, vlen_enc);
1483 vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc);
1484 vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...}
1485 load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...}
1486
1487 bind(GATHER8_LOOP);
1488 // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES
1489 if (mask == noreg) {
1490 vgather8b(elem_ty, temp_dst, base, idx_base, rtmp, vlen_enc);
1491 } else {
1492 vgather8b_masked(elem_ty, temp_dst, base, idx_base, mask, mask_idx, rtmp, vlen_enc);
1493 }
1494 // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1)
1495 vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? vlen_enc : Assembler::AVX_256bit);
1496 // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2)
1497 vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc);
1498 // DST_VEC = DST_VEC OR TEMP_PERM_VEC
1499 vpor(dst, dst, temp_dst, vlen_enc);
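// Advance to the next batch of 32-bit gather indices: 8 indices (32 bytes) per
// iteration for byte elements, 4 indices (16 bytes) for short elements.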
1500 addptr(idx_base, 32 >> (type2aelembytes(elem_ty) - 1));
1501 subl(length, 8 >> (type2aelembytes(elem_ty) - 1));
1502 jcc(Assembler::notEqual, GATHER8_LOOP);
1503 }
1504
1505 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1506 switch(typ) {
1507 case T_INT:
1508 vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1509 break;
1510 case T_FLOAT:
1511 vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1512 break;
1513 case T_LONG:
1514 vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1515 break;
1516 case T_DOUBLE:
1517 vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1518 break;
1519 default:
1520 assert(false,"Should not reach here.");
1521 break;
1522 }
1523 }
1524
1525 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1526 switch(typ) {
1527 case T_INT:
1528 evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1529 break;
1530 case T_FLOAT:
1531 evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1532 break;
1533 case T_LONG:
1534 evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1535 break;
1536 case T_DOUBLE:
1537 evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1538 break;
1539 default:
1540 assert(false,"Should not reach here.");
1541 break;
1542 }
1543 }
1544
1545 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1546 switch(typ) {
1547 case T_INT:
1548 evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1549 break;
1550 case T_FLOAT:
1551 evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1552 break;
1553 case T_LONG:
1554 evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1555 break;
1556 case T_DOUBLE:
1557 evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1558 break;
1559 default:
1560 assert(false,"Should not reach here.");
1561 break;
1562 }
1563 }
1564
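// Expand a vector of byte booleans (0/1) into an element-wide mask:
// 0 - src turns each boolean byte into 0/-1, and the sign extension
// below widens the all-ones pattern to the element size.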
1565 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
1566 if (vlen_in_bytes <= 16) {
1567 pxor (dst, dst);
1568 psubb(dst, src);
1569 switch (elem_bt) {
1570 case T_BYTE: /* nothing to do */ break;
1571 case T_SHORT: pmovsxbw(dst, dst); break;
1572 case T_INT: pmovsxbd(dst, dst); break;
1573 case T_FLOAT: pmovsxbd(dst, dst); break;
1574 case T_LONG: pmovsxbq(dst, dst); break;
1575 case T_DOUBLE: pmovsxbq(dst, dst); break;
1576
1577 default: assert(false, "%s", type2name(elem_bt));
1578 }
1579 } else {
1580 assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
1581 int vlen_enc = vector_length_encoding(vlen_in_bytes);
1582
1583 vpxor (dst, dst, dst, vlen_enc);
1584 vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);
1585
1586 switch (elem_bt) {
1587 case T_BYTE: /* nothing to do */ break;
1588 case T_SHORT: vpmovsxbw(dst, dst, vlen_enc); break;
1589 case T_INT: vpmovsxbd(dst, dst, vlen_enc); break;
1590 case T_FLOAT: vpmovsxbd(dst, dst, vlen_enc); break;
1591 case T_LONG: vpmovsxbq(dst, dst, vlen_enc); break;
1592 case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
1593
1594 default: assert(false, "%s", type2name(elem_bt));
1595 }
1596 }
1597 }
1598
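// Produce an opmask from a vector of byte booleans. When the evpmovb2m
// path is not usable (novlbwdq), the booleans are sign-extended to dwords
// and compared against the reference bit pattern instead.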
1599 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) {
1600 if (novlbwdq) {
1601 vpmovsxbd(xtmp, src, vlen_enc);
1602 evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
1603 Assembler::eq, true, vlen_enc, noreg);
1604 } else {
1605 vpxor(xtmp, xtmp, xtmp, vlen_enc);
1606 vpsubb(xtmp, xtmp, src, vlen_enc);
1607 evpmovb2m(dst, xtmp, vlen_enc);
1608 }
1609 }
1610
1611 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, Address src, int vlen_in_bytes) {
1612 if (is_integral_type(bt)) {
1613 switch (vlen_in_bytes) {
1614 case 4: movdl(dst, src); break;
1615 case 8: movq(dst, src); break;
1616 case 16: movdqu(dst, src); break;
1617 case 32: vmovdqu(dst, src); break;
1618 case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break;
1619 default: ShouldNotReachHere();
1620 }
1621 } else {
1622 switch (vlen_in_bytes) {
1623 case 4: movflt(dst, src); break;
1624 case 8: movdbl(dst, src); break;
1625 case 16: movups(dst, src); break;
1626 case 32: vmovups(dst, src, Assembler::AVX_256bit); break;
1627 case 64: vmovups(dst, src, Assembler::AVX_512bit); break;
1628 default: ShouldNotReachHere();
1629 }
1630 }
1631 }
1632
1633 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) {
1634 assert(rscratch != noreg || always_reachable(src), "missing");
1635
1636 if (reachable(src)) {
1637 load_vector(bt, dst, as_Address(src), vlen_in_bytes);
1638 } else {
1639 lea(rscratch, src);
1640 load_vector(bt, dst, Address(rscratch, 0), vlen_in_bytes);
1641 }
1642 }
1643
1644 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) {
1645 int vlen_enc = vector_length_encoding(vlen);
1646 if (VM_Version::supports_avx()) {
1647 if (bt == T_LONG) {
1648 if (VM_Version::supports_avx2()) {
1649 vpbroadcastq(dst, src, vlen_enc);
1650 } else {
1651 vmovddup(dst, src, vlen_enc);
1652 }
1653 } else if (bt == T_DOUBLE) {
1654 if (vlen_enc != Assembler::AVX_128bit) {
1655 vbroadcastsd(dst, src, vlen_enc, noreg);
1656 } else {
1657 vmovddup(dst, src, vlen_enc);
1658 }
1659 } else {
1660 if (VM_Version::supports_avx2() && is_integral_type(bt)) {
1661 vpbroadcastd(dst, src, vlen_enc);
1662 } else {
1663 vbroadcastss(dst, src, vlen_enc);
1664 }
1665 }
1666 } else if (VM_Version::supports_sse3()) {
1667 movddup(dst, src);
1668 } else {
1669 load_vector(bt, dst, src, vlen);
1670 }
1671 }
1672
1673 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) {
1674 // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64.
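// exact_log2(element size) selects the B/S/I/L table (index 0..3); floating-point
// types live after those four tables, hence the extra 128-byte offset below.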
1675 int offset = exact_log2(type2aelembytes(bt)) << 6;
1676 if (is_floating_point_type(bt)) {
1677 offset += 128;
1678 }
1679 ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset);
1680 load_vector(T_BYTE, dst, addr, vlen_in_bytes);
1681 }
1682
1683 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
1684
1685 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1686 int vector_len = Assembler::AVX_128bit;
1687
1688 switch (opcode) {
1689 case Op_AndReductionV: pand(dst, src); break;
1690 case Op_OrReductionV: por (dst, src); break;
1691 case Op_XorReductionV: pxor(dst, src); break;
1692 case Op_MinReductionV:
1693 switch (typ) {
1694 case T_BYTE: pminsb(dst, src); break;
1695 case T_SHORT: pminsw(dst, src); break;
1696 case T_INT: pminsd(dst, src); break;
1697 case T_LONG: assert(UseAVX > 2, "required");
1698 vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
1699 default: assert(false, "wrong type");
1700 }
1701 break;
1702 case Op_MaxReductionV:
1703 switch (typ) {
1704 case T_BYTE: pmaxsb(dst, src); break;
1705 case T_SHORT: pmaxsw(dst, src); break;
1706 case T_INT: pmaxsd(dst, src); break;
1707 case T_LONG: assert(UseAVX > 2, "required");
1708 vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
1709 default: assert(false, "wrong type");
1710 }
1711 break;
1712 case Op_AddReductionVF: addss(dst, src); break;
1713 case Op_AddReductionVD: addsd(dst, src); break;
1714 case Op_AddReductionVI:
1715 switch (typ) {
1716 case T_BYTE: paddb(dst, src); break;
1717 case T_SHORT: paddw(dst, src); break;
1718 case T_INT: paddd(dst, src); break;
1719 default: assert(false, "wrong type");
1720 }
1721 break;
1722 case Op_AddReductionVL: paddq(dst, src); break;
1723 case Op_MulReductionVF: mulss(dst, src); break;
1724 case Op_MulReductionVD: mulsd(dst, src); break;
1725 case Op_MulReductionVI:
1726 switch (typ) {
1727 case T_SHORT: pmullw(dst, src); break;
1728 case T_INT: pmulld(dst, src); break;
1729 default: assert(false, "wrong type");
1730 }
1731 break;
1732 case Op_MulReductionVL: assert(UseAVX > 2, "required");
1733 evpmullq(dst, dst, src, vector_len); break;
1734 default: assert(false, "wrong opcode");
1735 }
1736 }
1737
1738 void C2_MacroAssembler::unordered_reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1739 switch (opcode) {
1740 case Op_AddReductionVF: addps(dst, src); break;
1741 case Op_AddReductionVD: addpd(dst, src); break;
1742 case Op_MulReductionVF: mulps(dst, src); break;
1743 case Op_MulReductionVD: mulpd(dst, src); break;
1744 default: assert(false, "%s", NodeClassNames[opcode]);
1745 }
1746 }
1747
1748 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
1749 int vector_len = Assembler::AVX_256bit;
1750
1751 switch (opcode) {
1752 case Op_AndReductionV: vpand(dst, src1, src2, vector_len); break;
1753 case Op_OrReductionV: vpor (dst, src1, src2, vector_len); break;
1754 case Op_XorReductionV: vpxor(dst, src1, src2, vector_len); break;
1755 case Op_MinReductionV:
1756 switch (typ) {
1757 case T_BYTE: vpminsb(dst, src1, src2, vector_len); break;
1758 case T_SHORT: vpminsw(dst, src1, src2, vector_len); break;
1759 case T_INT: vpminsd(dst, src1, src2, vector_len); break;
1760 case T_LONG: assert(UseAVX > 2, "required");
1761 vpminsq(dst, src1, src2, vector_len); break;
1762 default: assert(false, "wrong type");
1763 }
1764 break;
1765 case Op_MaxReductionV:
1766 switch (typ) {
1767 case T_BYTE: vpmaxsb(dst, src1, src2, vector_len); break;
1768 case T_SHORT: vpmaxsw(dst, src1, src2, vector_len); break;
1769 case T_INT: vpmaxsd(dst, src1, src2, vector_len); break;
1770 case T_LONG: assert(UseAVX > 2, "required");
1771 vpmaxsq(dst, src1, src2, vector_len); break;
1772 default: assert(false, "wrong type");
1773 }
1774 break;
1775 case Op_AddReductionVI:
1776 switch (typ) {
1777 case T_BYTE: vpaddb(dst, src1, src2, vector_len); break;
1778 case T_SHORT: vpaddw(dst, src1, src2, vector_len); break;
1779 case T_INT: vpaddd(dst, src1, src2, vector_len); break;
1780 default: assert(false, "wrong type");
1781 }
1782 break;
1783 case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
1784 case Op_MulReductionVI:
1785 switch (typ) {
1786 case T_SHORT: vpmullw(dst, src1, src2, vector_len); break;
1787 case T_INT: vpmulld(dst, src1, src2, vector_len); break;
1788 default: assert(false, "wrong type");
1789 }
1790 break;
1791 case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break;
1792 default: assert(false, "wrong opcode");
1793 }
1794 }
1795
1796 void C2_MacroAssembler::unordered_reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
1797 int vector_len = Assembler::AVX_256bit;
1798
1799 switch (opcode) {
1800 case Op_AddReductionVF: vaddps(dst, src1, src2, vector_len); break;
1801 case Op_AddReductionVD: vaddpd(dst, src1, src2, vector_len); break;
1802 case Op_MulReductionVF: vmulps(dst, src1, src2, vector_len); break;
1803 case Op_MulReductionVD: vmulpd(dst, src1, src2, vector_len); break;
1804 default: assert(false, "%s", NodeClassNames[opcode]);
1805 }
1806 }
1807
1808 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1809 XMMRegister dst, XMMRegister src,
1810 XMMRegister vtmp1, XMMRegister vtmp2) {
1811 switch (opcode) {
1812 case Op_AddReductionVF:
1813 case Op_MulReductionVF:
1814 reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1815 break;
1816
1817 case Op_AddReductionVD:
1818 case Op_MulReductionVD:
1819 reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1820 break;
1821
1822 default: assert(false, "wrong opcode");
1823 }
1824 }
1825
1826 void C2_MacroAssembler::unordered_reduce_fp(int opcode, int vlen,
1827 XMMRegister dst, XMMRegister src,
1828 XMMRegister vtmp1, XMMRegister vtmp2) {
1829 switch (opcode) {
1830 case Op_AddReductionVF:
1831 case Op_MulReductionVF:
1832 unorderedReduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1833 break;
1834
1835 case Op_AddReductionVD:
1836 case Op_MulReductionVD:
1837 unorderedReduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1838 break;
1839
1840 default: assert(false, "%s", NodeClassNames[opcode]);
1841 }
1842 }
1843
1844 void C2_MacroAssembler::reduceB(int opcode, int vlen,
1845 Register dst, Register src1, XMMRegister src2,
1846 XMMRegister vtmp1, XMMRegister vtmp2) {
1847 switch (vlen) {
1848 case 8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1849 case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1850 case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1851 case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1852
1853 default: assert(false, "wrong vector length");
1854 }
1855 }
1856
1857 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
1858 Register dst, Register src1, XMMRegister src2,
1859 XMMRegister vtmp1, XMMRegister vtmp2) {
1860 switch (vlen) {
1861 case 8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1862 case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1863 case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1864 case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1865
1866 default: assert(false, "wrong vector length");
1867 }
1868 }
1869
1870 void C2_MacroAssembler::reduceS(int opcode, int vlen,
1871 Register dst, Register src1, XMMRegister src2,
1872 XMMRegister vtmp1, XMMRegister vtmp2) {
1873 switch (vlen) {
1874 case 4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1875 case 8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1876 case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1877 case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1878
1879 default: assert(false, "wrong vector length");
1880 }
1881 }
1882
1883 void C2_MacroAssembler::reduceI(int opcode, int vlen,
1884 Register dst, Register src1, XMMRegister src2,
1885 XMMRegister vtmp1, XMMRegister vtmp2) {
1886 switch (vlen) {
1887 case 2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1888 case 4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1889 case 8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1890 case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1891
1892 default: assert(false, "wrong vector length");
1893 }
1894 }
1895
1896 void C2_MacroAssembler::reduceL(int opcode, int vlen,
1897 Register dst, Register src1, XMMRegister src2,
1898 XMMRegister vtmp1, XMMRegister vtmp2) {
1899 switch (vlen) {
1900 case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1901 case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1902 case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1903
1904 default: assert(false, "wrong vector length");
1905 }
1906 }
1907
1908 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1909 switch (vlen) {
1910 case 2:
1911 assert(vtmp2 == xnoreg, "");
1912 reduce2F(opcode, dst, src, vtmp1);
1913 break;
1914 case 4:
1915 assert(vtmp2 == xnoreg, "");
1916 reduce4F(opcode, dst, src, vtmp1);
1917 break;
1918 case 8:
1919 reduce8F(opcode, dst, src, vtmp1, vtmp2);
1920 break;
1921 case 16:
1922 reduce16F(opcode, dst, src, vtmp1, vtmp2);
1923 break;
1924 default: assert(false, "wrong vector length");
1925 }
1926 }
1927
1928 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1929 switch (vlen) {
1930 case 2:
1931 assert(vtmp2 == xnoreg, "");
1932 reduce2D(opcode, dst, src, vtmp1);
1933 break;
1934 case 4:
1935 reduce4D(opcode, dst, src, vtmp1, vtmp2);
1936 break;
1937 case 8:
1938 reduce8D(opcode, dst, src, vtmp1, vtmp2);
1939 break;
1940 default: assert(false, "wrong vector length");
1941 }
1942 }
1943
1944 void C2_MacroAssembler::unorderedReduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1945 switch (vlen) {
1946 case 2:
1947 assert(vtmp1 == xnoreg, "");
1948 assert(vtmp2 == xnoreg, "");
1949 unorderedReduce2F(opcode, dst, src);
1950 break;
1951 case 4:
1952 assert(vtmp2 == xnoreg, "");
1953 unorderedReduce4F(opcode, dst, src, vtmp1);
1954 break;
1955 case 8:
1956 unorderedReduce8F(opcode, dst, src, vtmp1, vtmp2);
1957 break;
1958 case 16:
1959 unorderedReduce16F(opcode, dst, src, vtmp1, vtmp2);
1960 break;
1961 default: assert(false, "wrong vector length");
1962 }
1963 }
1964
1965 void C2_MacroAssembler::unorderedReduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1966 switch (vlen) {
1967 case 2:
1968 assert(vtmp1 == xnoreg, "");
1969 assert(vtmp2 == xnoreg, "");
1970 unorderedReduce2D(opcode, dst, src);
1971 break;
1972 case 4:
1973 assert(vtmp2 == xnoreg, "");
1974 unorderedReduce4D(opcode, dst, src, vtmp1);
1975 break;
1976 case 8:
1977 unorderedReduce8D(opcode, dst, src, vtmp1, vtmp2);
1978 break;
1979 default: assert(false, "wrong vector length");
1980 }
1981 }
1982
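// Reduce two ints: for addition a horizontal add folds the two lanes, otherwise
// lane 1 is shuffled down and combined with lane 0; the scalar accumulator in
// src1 is then folded in and the result is moved to dst.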
1983 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1984 if (opcode == Op_AddReductionVI) {
1985 if (vtmp1 != src2) {
1986 movdqu(vtmp1, src2);
1987 }
1988 phaddd(vtmp1, vtmp1);
1989 } else {
1990 pshufd(vtmp1, src2, 0x1);
1991 reduce_operation_128(T_INT, opcode, vtmp1, src2);
1992 }
1993 movdl(vtmp2, src1);
1994 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1995 movdl(dst, vtmp1);
1996 }
1997
1998 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1999 if (opcode == Op_AddReductionVI) {
2000 if (vtmp1 != src2) {
2001 movdqu(vtmp1, src2);
2002 }
2003 phaddd(vtmp1, src2);
2004 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2005 } else {
2006 pshufd(vtmp2, src2, 0xE);
2007 reduce_operation_128(T_INT, opcode, vtmp2, src2);
2008 reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2009 }
2010 }
2011
2012 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2013 if (opcode == Op_AddReductionVI) {
2014 vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
2015 vextracti128_high(vtmp2, vtmp1);
2016 vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
2017 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2018 } else {
2019 vextracti128_high(vtmp1, src2);
2020 reduce_operation_128(T_INT, opcode, vtmp1, src2);
2021 reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2022 }
2023 }
2024
2025 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2026 vextracti64x4_high(vtmp2, src2);
2027 reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
2028 reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2029 }
2030
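// Reduce eight bytes by folding lanes pairwise (4+4, 2+2, 1+1), then sign-extend
// the surviving byte to an int, fold in the scalar accumulator from src1, and
// sign-extend the extracted byte result into dst.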
2031 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2032 pshufd(vtmp2, src2, 0x1);
2033 reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2034 movdqu(vtmp1, vtmp2);
2035 psrldq(vtmp1, 2);
2036 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2037 movdqu(vtmp2, vtmp1);
2038 psrldq(vtmp2, 1);
2039 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2040 movdl(vtmp2, src1);
2041 pmovsxbd(vtmp1, vtmp1);
2042 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2043 pextrb(dst, vtmp1, 0x0);
2044 movsbl(dst, dst);
2045 }
2046
2047 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2048 pshufd(vtmp1, src2, 0xE);
2049 reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
2050 reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2051 }
2052
2053 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2054 vextracti128_high(vtmp2, src2);
2055 reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2056 reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2057 }
2058
2059 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2060 vextracti64x4_high(vtmp1, src2);
2061 reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
2062 reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2063 }
2064
2065 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2066 pmovsxbw(vtmp2, src2);
2067 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2068 }
2069
2070 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2071 if (UseAVX > 1) {
2072 int vector_len = Assembler::AVX_256bit;
2073 vpmovsxbw(vtmp1, src2, vector_len);
2074 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2075 } else {
2076 pmovsxbw(vtmp2, src2);
2077 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2078 pshufd(vtmp2, src2, 0xE); // bring the upper 8 bytes down to the low half
2079 pmovsxbw(vtmp2, vtmp2);
2080 reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2081 }
2082 }
2083
2084 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2085 if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
2086 int vector_len = Assembler::AVX_512bit;
2087 vpmovsxbw(vtmp1, src2, vector_len);
2088 reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2089 } else {
2090 assert(UseAVX >= 2,"Should not reach here.");
2091 mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
2092 vextracti128_high(vtmp2, src2);
2093 mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2094 }
2095 }
2096
2097 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2098 mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
2099 vextracti64x4_high(vtmp2, src2);
2100 mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2101 }
2102
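// Reduce four shorts: for addition two horizontal adds fold the lanes, otherwise
// shuffle/shift and combine pairwise; the survivor is widened to an int, folded
// with the scalar accumulator from src1, and the short result sign-extended into dst.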
2103 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2104 if (opcode == Op_AddReductionVI) {
2105 if (vtmp1 != src2) {
2106 movdqu(vtmp1, src2);
2107 }
2108 phaddw(vtmp1, vtmp1);
2109 phaddw(vtmp1, vtmp1);
2110 } else {
2111 pshufd(vtmp2, src2, 0x1);
2112 reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2113 movdqu(vtmp1, vtmp2);
2114 psrldq(vtmp1, 2);
2115 reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
2116 }
2117 movdl(vtmp2, src1);
2118 pmovsxwd(vtmp1, vtmp1);
2119 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2120 pextrw(dst, vtmp1, 0x0);
2121 movswl(dst, dst);
2122 }
2123
2124 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2125 if (opcode == Op_AddReductionVI) {
2126 if (vtmp1 != src2) {
2127 movdqu(vtmp1, src2);
2128 }
2129 phaddw(vtmp1, src2);
2130 } else {
2131 pshufd(vtmp1, src2, 0xE);
2132 reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
2133 }
2134 reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2135 }
2136
2137 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2138 if (opcode == Op_AddReductionVI) {
2139 int vector_len = Assembler::AVX_256bit;
2140 vphaddw(vtmp2, src2, src2, vector_len);
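// vphaddw adds within each 128-bit lane; vpermq with 0xD8 moves both lanes'
// partial sums into the low 128 bits for the following 8-short reduction.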
2141 vpermq(vtmp2, vtmp2, 0xD8, vector_len);
2142 } else {
2143 vextracti128_high(vtmp2, src2);
2144 reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2145 }
2146 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2147 }
2148
2149 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2150 int vector_len = Assembler::AVX_256bit;
2151 vextracti64x4_high(vtmp1, src2);
2152 reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
2153 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2154 }
2155
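// Reduce two longs: fold the high qword into the low one, then combine with the
// scalar accumulator from src1 and move the result to dst.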
2156 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2157 pshufd(vtmp2, src2, 0xE);
2158 reduce_operation_128(T_LONG, opcode, vtmp2, src2);
2159 movdq(vtmp1, src1);
2160 reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
2161 movdq(dst, vtmp1);
2162 }
2163
2164 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2165 vextracti128_high(vtmp1, src2);
2166 reduce_operation_128(T_LONG, opcode, vtmp1, src2);
2167 reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2168 }
2169
2170 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2171 vextracti64x4_high(vtmp2, src2);
2172 reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
2173 reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2174 }
2175
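// Build an opmask with the low 'len' bits set: start from all ones, clear the
// bits at positions >= len with bzhi, and move the result into the mask register.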
2176 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
2177 mov64(temp, -1L);
2178 bzhiq(temp, temp, len);
2179 kmovql(dst, temp);
2180 }
2181
2182 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2183 reduce_operation_128(T_FLOAT, opcode, dst, src);
2184 pshufd(vtmp, src, 0x1);
2185 reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2186 }
2187
2188 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2189 reduce2F(opcode, dst, src, vtmp);
2190 pshufd(vtmp, src, 0x2);
2191 reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2192 pshufd(vtmp, src, 0x3);
2193 reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2194 }
2195
2196 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2197 reduce4F(opcode, dst, src, vtmp2);
2198 vextractf128_high(vtmp2, src);
2199 reduce4F(opcode, dst, vtmp2, vtmp1);
2200 }
2201
2202 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2203 reduce8F(opcode, dst, src, vtmp1, vtmp2);
2204 vextracti64x4_high(vtmp1, src);
2205 reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
2206 }
2207
2208 void C2_MacroAssembler::unorderedReduce2F(int opcode, XMMRegister dst, XMMRegister src) {
2209 pshufd(dst, src, 0x1);
2210 reduce_operation_128(T_FLOAT, opcode, dst, src);
2211 }
2212
2213 void C2_MacroAssembler::unorderedReduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2214 pshufd(vtmp, src, 0xE);
2215 unordered_reduce_operation_128(T_FLOAT, opcode, vtmp, src);
2216 unorderedReduce2F(opcode, dst, vtmp);
2217 }
2218
2219 void C2_MacroAssembler::unorderedReduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2220 vextractf128_high(vtmp1, src);
2221 unordered_reduce_operation_128(T_FLOAT, opcode, vtmp1, src);
2222 unorderedReduce4F(opcode, dst, vtmp1, vtmp2);
2223 }
2224
2225 void C2_MacroAssembler::unorderedReduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2226 vextractf64x4_high(vtmp2, src);
2227 unordered_reduce_operation_256(T_FLOAT, opcode, vtmp2, vtmp2, src);
2228 unorderedReduce8F(opcode, dst, vtmp2, vtmp1, vtmp2);
2229 }
2230
2231 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2232 reduce_operation_128(T_DOUBLE, opcode, dst, src);
2233 pshufd(vtmp, src, 0xE);
2234 reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
2235 }
2236
2237 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2238 reduce2D(opcode, dst, src, vtmp2);
2239 vextractf128_high(vtmp2, src);
2240 reduce2D(opcode, dst, vtmp2, vtmp1);
2241 }
2242
2243 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2244 reduce4D(opcode, dst, src, vtmp1, vtmp2);
2245 vextracti64x4_high(vtmp1, src);
2246 reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
2247 }
2248
2249 void C2_MacroAssembler::unorderedReduce2D(int opcode, XMMRegister dst, XMMRegister src) {
2250 pshufd(dst, src, 0xE);
2251 reduce_operation_128(T_DOUBLE, opcode, dst, src);
2252 }
2253
2254 void C2_MacroAssembler::unorderedReduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2255 vextractf128_high(vtmp, src);
2256 unordered_reduce_operation_128(T_DOUBLE, opcode, vtmp, src);
2257 unorderedReduce2D(opcode, dst, vtmp);
2258 }
2259
2260 void C2_MacroAssembler::unorderedReduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2261 vextractf64x4_high(vtmp2, src);
2262 unordered_reduce_operation_256(T_DOUBLE, opcode, vtmp2, vtmp2, src);
2263 unorderedReduce4D(opcode, dst, vtmp2, vtmp1);
2264 }
2265
2266 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
2267 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2268 }
2269
2270 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
2271 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2272 }
2273
2274 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, XMMRegister src, bool merge, int vector_len) {
2275 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
2276 }
2277
2278 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask,
2279 int vec_enc) {
2280 switch(elem_bt) {
2281 case T_INT:
2282 case T_FLOAT:
2283 vmaskmovps(dst, src, mask, vec_enc);
2284 break;
2285 case T_LONG:
2286 case T_DOUBLE:
2287 vmaskmovpd(dst, src, mask, vec_enc);
2288 break;
2289 default:
2290 fatal("Unsupported type %s", type2name(elem_bt));
2291 break;
2292 }
2293 }
2294
2295 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask,
2296 int vec_enc) {
2297 switch(elem_bt) {
2298 case T_INT:
2299 case T_FLOAT:
2300 vmaskmovps(dst, src, mask, vec_enc);
2301 break;
2302 case T_LONG:
2303 case T_DOUBLE:
2304 vmaskmovpd(dst, src, mask, vec_enc);
2305 break;
2306 default:
2307 fatal("Unsupported type %s", type2name(elem_bt));
2308 break;
2309 }
2310 }
2311
2312 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
2313 XMMRegister dst, XMMRegister src,
2314 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2315 XMMRegister xmm_0, XMMRegister xmm_1) {
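// permconst supplies the in-lane shuffles for the last two halving steps:
// 0b1110 brings the upper float pair down (4 -> 2) and 0b0001 brings element 1
// down (2 -> 1); wider steps use the 128/256-bit high-half extracts below.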
2316 const int permconst[] = {1, 14};
2317 XMMRegister wsrc = src;
2318 XMMRegister wdst = xmm_0;
2319 XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2320
2321 int vlen_enc = Assembler::AVX_128bit;
2322 if (vlen == 16) {
2323 vlen_enc = Assembler::AVX_256bit;
2324 }
2325
2326 for (int i = log2(vlen) - 1; i >=0; i--) {
2327 if (i == 0 && !is_dst_valid) {
2328 wdst = dst;
2329 }
2330 if (i == 3) {
2331 vextracti64x4_high(wtmp, wsrc);
2332 } else if (i == 2) {
2333 vextracti128_high(wtmp, wsrc);
2334 } else { // i = [0,1]
2335 vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
2336 }
2337
2338 if (VM_Version::supports_avx10_2()) {
2339 vminmax_fp(opcode, T_FLOAT, wdst, k0, wtmp, wsrc, vlen_enc);
2340 } else {
2341 vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2342 }
2343 wsrc = wdst;
2344 vlen_enc = Assembler::AVX_128bit;
2345 }
2346 if (is_dst_valid) {
2347 if (VM_Version::supports_avx10_2()) {
2348 vminmax_fp(opcode, T_FLOAT, dst, k0, wdst, dst, Assembler::AVX_128bit);
2349 } else {
2350 vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2351 }
2352 }
2353 }
2354
2355 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
2356 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2357 XMMRegister xmm_0, XMMRegister xmm_1) {
2358 XMMRegister wsrc = src;
2359 XMMRegister wdst = xmm_0;
2360 XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2361 int vlen_enc = Assembler::AVX_128bit;
2362 if (vlen == 8) {
2363 vlen_enc = Assembler::AVX_256bit;
2364 }
2365 for (int i = log2(vlen) - 1; i >=0; i--) {
2366 if (i == 0 && !is_dst_valid) {
2367 wdst = dst;
2368 }
2369 if (i == 1) {
2370 vextracti128_high(wtmp, wsrc);
2371 } else if (i == 2) {
2372 vextracti64x4_high(wtmp, wsrc);
2373 } else {
2374 assert(i == 0, "%d", i);
2375 vpermilpd(wtmp, wsrc, 1, vlen_enc);
2376 }
2377
2378 if (VM_Version::supports_avx10_2()) {
2379 vminmax_fp(opcode, T_DOUBLE, wdst, k0, wtmp, wsrc, vlen_enc);
2380 } else {
2381 vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2382 }
2383
2384 wsrc = wdst;
2385 vlen_enc = Assembler::AVX_128bit;
2386 }
2387
2388 if (is_dst_valid) {
2389 if (VM_Version::supports_avx10_2()) {
2390 vminmax_fp(opcode, T_DOUBLE, dst, k0, wdst, dst, Assembler::AVX_128bit);
2391 } else {
2392 vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2393 }
2394 }
2395 }
2396
2397 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2398 switch (bt) {
2399 case T_BYTE: pextrb(dst, src, idx); break;
2400 case T_SHORT: pextrw(dst, src, idx); break;
2401 case T_INT: pextrd(dst, src, idx); break;
2402 case T_LONG: pextrq(dst, src, idx); break;
2403
2404 default:
2405 assert(false,"Should not reach here.");
2406 break;
2407 }
2408 }
2409
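// Return the register holding the 128-bit lane that contains 'elemindex':
// lane 0 is read directly from src, higher lanes are first extracted into dst.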
2410 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2411 int esize = type2aelembytes(typ);
2412 int elem_per_lane = 16/esize;
2413 int lane = elemindex / elem_per_lane;
2414 int eindex = elemindex % elem_per_lane;
2415
2416 if (lane >= 2) {
2417 assert(UseAVX > 2, "required");
2418 vextractf32x4(dst, src, lane & 3);
2419 return dst;
2420 } else if (lane > 0) {
2421 assert(UseAVX > 0, "required");
2422 vextractf128(dst, src, lane);
2423 return dst;
2424 } else {
2425 return src;
2426 }
2427 }
2428
2429 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) {
2430 if (typ == T_BYTE) {
2431 movsbl(dst, dst);
2432 } else if (typ == T_SHORT) {
2433 movswl(dst, dst);
2434 }
2435 }
2436
2437 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
2438 int esize = type2aelembytes(typ);
2439 int elem_per_lane = 16/esize;
2440 int eindex = elemindex % elem_per_lane;
2441 assert(is_integral_type(typ),"required");
2442
2443 if (eindex == 0) {
2444 if (typ == T_LONG) {
2445 movq(dst, src);
2446 } else {
2447 movdl(dst, src);
2448 movsxl(typ, dst);
2449 }
2450 } else {
2451 extract(typ, dst, src, eindex);
2452 movsxl(typ, dst);
2453 }
2454 }
2455
2456 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) {
2457 int esize = type2aelembytes(typ);
2458 int elem_per_lane = 16/esize;
2459 int eindex = elemindex % elem_per_lane;
2460 assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
2461
2462 if (eindex == 0) {
2463 movq(dst, src);
2464 } else {
2465 if (typ == T_FLOAT) {
2466 if (UseAVX == 0) {
2467 movdqu(dst, src);
2468 shufps(dst, dst, eindex);
2469 } else {
2470 vshufps(dst, src, src, eindex, Assembler::AVX_128bit);
2471 }
2472 } else {
2473 if (UseAVX == 0) {
2474 movdqu(dst, src);
2475 psrldq(dst, eindex*esize);
2476 } else {
2477 vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
2478 }
2479 movq(dst, dst);
2480 }
2481 }
2482 // Zero upper bits
2483 if (typ == T_FLOAT) {
2484 if (UseAVX == 0) {
2485 assert(vtmp != xnoreg, "required.");
2486 movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg);
2487 pand(dst, vtmp);
2488 } else {
2489 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg);
2490 }
2491 }
2492 }
2493
2494 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2495 switch(typ) {
2496 case T_BYTE:
2497 case T_BOOLEAN:
2498 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2499 break;
2500 case T_SHORT:
2501 case T_CHAR:
2502 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2503 break;
2504 case T_INT:
2505 case T_FLOAT:
2506 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2507 break;
2508 case T_LONG:
2509 case T_DOUBLE:
2510 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2511 break;
2512 default:
2513 assert(false,"Should not reach here.");
2514 break;
2515 }
2516 }
2517
2518 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) {
2519 assert(rscratch != noreg || always_reachable(src2), "missing");
2520
2521 switch(typ) {
2522 case T_BOOLEAN:
2523 case T_BYTE:
2524 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2525 break;
2526 case T_CHAR:
2527 case T_SHORT:
2528 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2529 break;
2530 case T_INT:
2531 case T_FLOAT:
2532 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2533 break;
2534 case T_LONG:
2535 case T_DOUBLE:
2536 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2537 break;
2538 default:
2539 assert(false,"Should not reach here.");
2540 break;
2541 }
2542 }
2543
2544 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2545 switch(typ) {
2546 case T_BYTE:
2547 evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2548 break;
2549 case T_SHORT:
2550 evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2551 break;
2552 case T_INT:
2553 case T_FLOAT:
2554 evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2555 break;
2556 case T_LONG:
2557 case T_DOUBLE:
2558 evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2559 break;
2560 default:
2561 assert(false,"Should not reach here.");
2562 break;
2563 }
2564 }
2565
2566 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) {
2567 assert(vlen_in_bytes <= 32, "");
2568 int esize = type2aelembytes(bt);
2569 if (vlen_in_bytes == 32) {
2570 assert(vtmp == xnoreg, "required.");
2571 if (esize >= 4) {
2572 vtestps(src1, src2, AVX_256bit);
2573 } else {
2574 vptest(src1, src2, AVX_256bit);
2575 }
2576 return;
2577 }
2578 if (vlen_in_bytes < 16) {
2579 // Duplicate the lower part to fill the whole register;
2580 // there is no need to do so for src2.
2581 assert(vtmp != xnoreg, "required");
2582 int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04;
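// 0x00 broadcasts dword 0 across the register; 0x04 yields dwords {0, 1, 0, 0},
// so every dword of vtmp is filled from the valid low 4 or 8 bytes of src1.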
2583 pshufd(vtmp, src1, shuffle_imm);
2584 } else {
2585 assert(vtmp == xnoreg, "required");
2586 vtmp = src1;
2587 }
2588 if (esize >= 4 && VM_Version::supports_avx()) {
2589 vtestps(vtmp, src2, AVX_128bit);
2590 } else {
2591 ptest(vtmp, src2);
2592 }
2593 }
2594
2595 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
2596 #ifdef ASSERT
2597 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2598 bool is_bw_supported = VM_Version::supports_avx512bw();
2599 if (is_bw && !is_bw_supported) {
2600 assert(vlen_enc != Assembler::AVX_512bit, "required");
2601 assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16),
2602 "XMM register should be 0-15");
2603 }
2604 #endif // ASSERT
2605 switch (elem_bt) {
2606 case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return;
2607 case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return;
2608 case T_INT: vpaddd(dst, src1, src2, vlen_enc); return;
2609 case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return;
2610 case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return;
2611 case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return;
2612 default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2613 }
2614 }
2615
2616 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) {
2617 assert(UseAVX >= 2, "required");
2618 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2619 bool is_vl = vlen_enc != Assembler::AVX_512bit;
2620 if ((UseAVX > 2) &&
2621 (!is_bw || VM_Version::supports_avx512bw()) &&
2622 (!is_vl || VM_Version::supports_avx512vl())) {
2623 switch (elem_bt) {
2624 case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return;
2625 case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
2626 case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
2627 case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
2628 default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2629 }
2630 } else {
2631 assert(vlen_enc != Assembler::AVX_512bit, "required");
2632 assert((dst->encoding() < 16),"XMM register should be 0-15");
2633 switch (elem_bt) {
2634 case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return;
2635 case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return;
2636 case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return;
2637 case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
2638 case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
2639 case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
2640 default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2641 }
2642 }
2643 }
2644
2645 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
2646 switch (to_elem_bt) {
2647 case T_SHORT:
2648 vpmovsxbw(dst, src, vlen_enc);
2649 break;
2650 case T_INT:
2651 vpmovsxbd(dst, src, vlen_enc);
2652 break;
2653 case T_FLOAT:
2654 vpmovsxbd(dst, src, vlen_enc);
2655 vcvtdq2ps(dst, dst, vlen_enc);
2656 break;
2657 case T_LONG:
2658 vpmovsxbq(dst, src, vlen_enc);
2659 break;
2660 case T_DOUBLE: {
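// Byte -> double widens 8x: sign-extend to dwords at half the destination
// width, then let the dword -> double conversion double the width again.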
2661 int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
2662 vpmovsxbd(dst, src, mid_vlen_enc);
2663 vcvtdq2pd(dst, dst, vlen_enc);
2664 break;
2665 }
2666 default:
2667 fatal("Unsupported type %s", type2name(to_elem_bt));
2668 break;
2669 }
2670 }
2671
2672 //-------------------------------------------------------------------------------------------
2673
2674 // IndexOf for constant substrings with size >= 8 chars
2675 // which don't need to be loaded through the stack.
2676 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
2677 Register cnt1, Register cnt2,
2678 int int_cnt2, Register result,
2679 XMMRegister vec, Register tmp,
2680 int ae) {
2681 ShortBranchVerifier sbv(this);
2682 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2683 assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2684
2685 // This method uses the pcmpestri instruction with bound registers
2686 // inputs:
2687 // xmm - substring
2688 // rax - substring length (elements count)
2689 // mem - scanned string
2690 // rdx - string length (elements count)
2691 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2692 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2693 // outputs:
2694 // rcx - matched index in string
2695 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2696 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2697 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2698 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2699 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2700
2701 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
2702 RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
2703 MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
2704
2705 // Note, inline_string_indexOf() generates checks:
2706 // if (substr.count > string.count) return -1;
2707 // if (substr.count == 0) return 0;
2708 assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
2709
2710 // Load substring.
2711 if (ae == StrIntrinsicNode::UL) {
2712 pmovzxbw(vec, Address(str2, 0));
2713 } else {
2714 movdqu(vec, Address(str2, 0));
2715 }
2716 movl(cnt2, int_cnt2);
2717 movptr(result, str1); // string addr
2718
2719 if (int_cnt2 > stride) {
2720 jmpb(SCAN_TO_SUBSTR);
2721
2722 // Reload substr for rescan; this code
2723 // is executed only for large substrings (> 8 chars).
2724 bind(RELOAD_SUBSTR);
2725 if (ae == StrIntrinsicNode::UL) {
2726 pmovzxbw(vec, Address(str2, 0));
2727 } else {
2728 movdqu(vec, Address(str2, 0));
2729 }
2730 negptr(cnt2); // Jumped here with negative cnt2, convert to positive
2731
2732 bind(RELOAD_STR);
2733 // We came here after the beginning of the substring was
2734 // matched but the rest of it was not, so we need to search
2735 // again. Start from the next element after the previous match.
2736
2737 // cnt2 is the number of remaining substring elements and
2738 // cnt1 is the number of remaining string elements when the compare failed.
2739 // Restored cnt1 = cnt1 - cnt2 + int_cnt2
2740 subl(cnt1, cnt2);
2741 addl(cnt1, int_cnt2);
2742 movl(cnt2, int_cnt2); // Now restore cnt2
2743
2744 decrementl(cnt1); // Shift to next element
2745 cmpl(cnt1, cnt2);
2746 jcc(Assembler::negative, RET_NOT_FOUND); // Left less than the substring
2747
2748 addptr(result, (1<<scale1));
2749
2750 } // (int_cnt2 > 8)
2751
2752 // Scan string for start of substr in 16-byte vectors
2753 bind(SCAN_TO_SUBSTR);
2754 pcmpestri(vec, Address(result, 0), mode);
2755 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1
2756 subl(cnt1, stride);
2757 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2758 cmpl(cnt1, cnt2);
2759 jccb(Assembler::negative, RET_NOT_FOUND); // Left less than the substring
2760 addptr(result, 16);
2761 jmpb(SCAN_TO_SUBSTR);
2762
2763 // Found a potential substr
2764 bind(FOUND_CANDIDATE);
2765 // Matched whole vector if first element matched (tmp(rcx) == 0).
2766 if (int_cnt2 == stride) {
2767 jccb(Assembler::overflow, RET_FOUND); // OF == 1
2768 } else { // int_cnt2 > 8
2769 jccb(Assembler::overflow, FOUND_SUBSTR);
2770 }
2771 // After pcmpestri tmp(rcx) contains matched element index
2772 // Compute start addr of substr
2773 lea(result, Address(result, tmp, scale1));
2774
2775 // Make sure string is still long enough
2776 subl(cnt1, tmp);
2777 cmpl(cnt1, cnt2);
2778 if (int_cnt2 == stride) {
2779 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2780 } else { // int_cnt2 > 8
2781 jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
2782 }
2783 // Left less than the substring.
2784
2785 bind(RET_NOT_FOUND);
2786 movl(result, -1);
2787 jmp(EXIT);
2788
2789 if (int_cnt2 > stride) {
2790 // This code is optimized for the case when the whole substring
2791 // is matched if its head is matched.
2792 bind(MATCH_SUBSTR_HEAD);
2793 pcmpestri(vec, Address(result, 0), mode);
2794 // Reload only the string if it does not match
2795 jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
2796
2797 Label CONT_SCAN_SUBSTR;
2798 // Compare the rest of substring (> 8 chars).
2799 bind(FOUND_SUBSTR);
2800 // First 8 chars are already matched.
2801 negptr(cnt2);
2802 addptr(cnt2, stride);
2803
2804 bind(SCAN_SUBSTR);
2805 subl(cnt1, stride);
2806 cmpl(cnt2, -stride); // Do not read beyond substring
2807 jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
2808 // Back-up strings to avoid reading beyond substring:
2809 // cnt1 = cnt1 - cnt2 + 8
2810 addl(cnt1, cnt2); // cnt2 is negative
2811 addl(cnt1, stride);
2812 movl(cnt2, stride); negptr(cnt2);
2813 bind(CONT_SCAN_SUBSTR);
2814 if (int_cnt2 < (int)G) {
2815 int tail_off1 = int_cnt2<<scale1;
2816 int tail_off2 = int_cnt2<<scale2;
2817 if (ae == StrIntrinsicNode::UL) {
2818 pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
2819 } else {
2820 movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
2821 }
2822 pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
2823 } else {
2824 // calculate index in register to avoid integer overflow (int_cnt2*2)
2825 movl(tmp, int_cnt2);
2826 addptr(tmp, cnt2);
2827 if (ae == StrIntrinsicNode::UL) {
2828 pmovzxbw(vec, Address(str2, tmp, scale2, 0));
2829 } else {
2830 movdqu(vec, Address(str2, tmp, scale2, 0));
2831 }
2832 pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
2833 }
2834 // Need to reload string pointers if we did not match the whole vector
2835 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2836 addptr(cnt2, stride);
2837 jcc(Assembler::negative, SCAN_SUBSTR);
2838 // Fall through if found full substring
2839
2840 } // (int_cnt2 > 8)
2841
2842 bind(RET_FOUND);
2843 // Found result if we matched full small substring.
2844 // Compute substr offset
2845 subptr(result, str1);
2846 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2847 shrl(result, 1); // index
2848 }
2849 bind(EXIT);
2850
2851 } // string_indexofC8
2852
2853 // Small strings are loaded through the stack if they cross a page boundary.
2854 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
2855 Register cnt1, Register cnt2,
2856 int int_cnt2, Register result,
2857 XMMRegister vec, Register tmp,
2858 int ae) {
2859 ShortBranchVerifier sbv(this);
2860 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2861 assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2862
2863 //
2864 // int_cnt2 is the length of a small (< 8 chars) constant substring,
2865 // or -1 for a non-constant substring, in which case its length
2866 // is in the cnt2 register.
2867 //
2868 // Note, inline_string_indexOf() generates checks:
2869 // if (substr.count > string.count) return -1;
2870 // if (substr.count == 0) return 0;
2871 //
2872 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2873 assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
2874 // This method uses the pcmpestri instruction with bound registers
2875 // inputs:
2876 // xmm - substring
2877 // rax - substring length (elements count)
2878 // mem - scanned string
2879 // rdx - string length (elements count)
2880 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2881 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2882 // outputs:
2883 // rcx - matched index in string
2884 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2885 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2886 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2887 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2888
2889 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
2890 RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
2891 FOUND_CANDIDATE;
2892
2893 { //========================================================
2894 // We don't know where these strings are located
2895 // and we can't read beyond them. Load them through the stack.
2896 Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
2897
2898 movptr(tmp, rsp); // save old SP
2899
2900 if (int_cnt2 > 0) { // small (< 8 chars) constant substring
2901 if (int_cnt2 == (1>>scale2)) { // One byte
2902 assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
2903 load_unsigned_byte(result, Address(str2, 0));
2904 movdl(vec, result); // move 32 bits
2905 } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) { // Three bytes
2906 // Not enough header space in 32-bit VM: 12+3 = 15.
2907 movl(result, Address(str2, -1));
2908 shrl(result, 8);
2909 movdl(vec, result); // move 32 bits
2910 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) { // One char
2911 load_unsigned_short(result, Address(str2, 0));
2912 movdl(vec, result); // move 32 bits
2913 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
2914 movdl(vec, Address(str2, 0)); // move 32 bits
2915 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
2916 movq(vec, Address(str2, 0)); // move 64 bits
2917 } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 = {2, ..., 7})
2918 // Array header size is 12 bytes in 32-bit VM
2919 // + 6 bytes for 3 chars == 18 bytes,
2920 // enough space to load vec and shift.
2921 assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
2922 if (ae == StrIntrinsicNode::UL) {
2923 int tail_off = int_cnt2-8;
2924 pmovzxbw(vec, Address(str2, tail_off));
2925 psrldq(vec, -2*tail_off);
2926 }
2927 else {
2928 int tail_off = int_cnt2*(1<<scale2);
2929 movdqu(vec, Address(str2, tail_off-16));
2930 psrldq(vec, 16-tail_off);
2931 }
2932 }
2933 } else { // not constant substring
2934 cmpl(cnt2, stride);
2935 jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
2936
2937 // We can read beyond the string if str+16 does not cross a page boundary
2938 // since heaps are aligned and mapped by pages.
2939 assert(os::vm_page_size() < (int)G, "default page should be small");
2940 movl(result, str2); // We need only low 32 bits
2941 andl(result, ((int)os::vm_page_size()-1));
2942 cmpl(result, ((int)os::vm_page_size()-16));
2943 jccb(Assembler::belowEqual, CHECK_STR);
2944
2945 // Move small strings to the stack to allow loading 16 bytes into vec.
2946 subptr(rsp, 16);
2947 int stk_offset = wordSize-(1<<scale2);
2948 push(cnt2);
2949
2950 bind(COPY_SUBSTR);
2951 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
2952 load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
2953 movb(Address(rsp, cnt2, scale2, stk_offset), result);
2954 } else if (ae == StrIntrinsicNode::UU) {
2955 load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
2956 movw(Address(rsp, cnt2, scale2, stk_offset), result);
2957 }
2958 decrement(cnt2);
2959 jccb(Assembler::notZero, COPY_SUBSTR);
2960
2961 pop(cnt2);
2962 movptr(str2, rsp); // New substring address
2963 } // non constant
2964
2965 bind(CHECK_STR);
2966 cmpl(cnt1, stride);
2967 jccb(Assembler::aboveEqual, BIG_STRINGS);
2968
2969 // Check cross page boundary.
2970 movl(result, str1); // We need only low 32 bits
2971 andl(result, ((int)os::vm_page_size()-1));
2972 cmpl(result, ((int)os::vm_page_size()-16));
2973 jccb(Assembler::belowEqual, BIG_STRINGS);
2974
2975 subptr(rsp, 16);
2976 int stk_offset = -(1<<scale1);
2977 if (int_cnt2 < 0) { // not constant
2978 push(cnt2);
2979 stk_offset += wordSize;
2980 }
2981 movl(cnt2, cnt1);
2982
2983 bind(COPY_STR);
2984 if (ae == StrIntrinsicNode::LL) {
2985 load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
2986 movb(Address(rsp, cnt2, scale1, stk_offset), result);
2987 } else {
2988 load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
2989 movw(Address(rsp, cnt2, scale1, stk_offset), result);
2990 }
2991 decrement(cnt2);
2992 jccb(Assembler::notZero, COPY_STR);
2993
2994 if (int_cnt2 < 0) { // not constant
2995 pop(cnt2);
2996 }
2997 movptr(str1, rsp); // New string address
2998
2999 bind(BIG_STRINGS);
3000 // Load substring.
3001 if (int_cnt2 < 0) { // -1
3002 if (ae == StrIntrinsicNode::UL) {
3003 pmovzxbw(vec, Address(str2, 0));
3004 } else {
3005 movdqu(vec, Address(str2, 0));
3006 }
3007 push(cnt2); // substr count
3008 push(str2); // substr addr
3009 push(str1); // string addr
3010 } else {
3011 // Small (< 8 chars) constant substrings are loaded already.
3012 movl(cnt2, int_cnt2);
3013 }
3014 push(tmp); // original SP
3015
3016 } // Finished loading
3017
3018 //========================================================
3019 // Start search
3020 //
3021
3022 movptr(result, str1); // string addr
3023
3024 if (int_cnt2 < 0) { // Only for non constant substring
3025 jmpb(SCAN_TO_SUBSTR);
3026
3027 // SP saved at sp+0
3028 // String saved at sp+1*wordSize
3029 // Substr saved at sp+2*wordSize
3030 // Substr count saved at sp+3*wordSize
3031
3032 // Reload substr for rescan; this code
3033 // is executed only for large substrings (> 8 chars).
3034 bind(RELOAD_SUBSTR);
3035 movptr(str2, Address(rsp, 2*wordSize));
3036 movl(cnt2, Address(rsp, 3*wordSize));
3037 if (ae == StrIntrinsicNode::UL) {
3038 pmovzxbw(vec, Address(str2, 0));
3039 } else {
3040 movdqu(vec, Address(str2, 0));
3041 }
3042 // We came here after the beginning of the substring was
3043 // matched but the rest of it was not, so we need to search
3044 // again. Start from the next element after the previous match.
3045 subptr(str1, result); // Restore counter
3046 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3047 shrl(str1, 1);
3048 }
3049 addl(cnt1, str1);
3050 decrementl(cnt1); // Shift to next element
3051 cmpl(cnt1, cnt2);
3052 jcc(Assembler::negative, RET_NOT_FOUND); // Left less than the substring
3053
3054 addptr(result, (1<<scale1));
3055 } // non constant
3056
3057 // Scan string for start of substr in 16-byte vectors
3058 bind(SCAN_TO_SUBSTR);
3059 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
3060 pcmpestri(vec, Address(result, 0), mode);
3061 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1
3062 subl(cnt1, stride);
3063 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
3064 cmpl(cnt1, cnt2);
3065 jccb(Assembler::negative, RET_NOT_FOUND); // Left less than the substring
3066 addptr(result, 16);
3067
3068 bind(ADJUST_STR);
3069 cmpl(cnt1, stride); // Do not read beyond string
3070 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
3071 // Back-up string to avoid reading beyond string.
3072 lea(result, Address(result, cnt1, scale1, -16));
3073 movl(cnt1, stride);
3074 jmpb(SCAN_TO_SUBSTR);
3075
3076 // Found a potential substr
3077 bind(FOUND_CANDIDATE);
3078 // After pcmpestri tmp(rcx) contains matched element index
3079
3080 // Make sure string is still long enough
3081 subl(cnt1, tmp);
3082 cmpl(cnt1, cnt2);
3083 jccb(Assembler::greaterEqual, FOUND_SUBSTR);
3084   // Left less than substring.
3085
3086 bind(RET_NOT_FOUND);
3087 movl(result, -1);
3088 jmp(CLEANUP);
3089
3090 bind(FOUND_SUBSTR);
3091 // Compute start addr of substr
3092 lea(result, Address(result, tmp, scale1));
3093 if (int_cnt2 > 0) { // Constant substring
3094 // Repeat search for small substring (< 8 chars)
3095 // from new point without reloading substring.
3096 // Have to check that we don't read beyond string.
3097 cmpl(tmp, stride-int_cnt2);
3098 jccb(Assembler::greater, ADJUST_STR);
3099 // Fall through if matched whole substring.
3100 } else { // non constant
3101     assert(int_cnt2 == -1, "non-constant substring should be -1");
3102
3103 addl(tmp, cnt2);
3104 // Found result if we matched whole substring.
3105 cmpl(tmp, stride);
3106 jcc(Assembler::lessEqual, RET_FOUND);
3107
3108 // Repeat search for small substring (<= 8 chars)
3109 // from new point 'str1' without reloading substring.
3110 cmpl(cnt2, stride);
3111 // Have to check that we don't read beyond string.
3112 jccb(Assembler::lessEqual, ADJUST_STR);
3113
3114 Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
3115 // Compare the rest of substring (> 8 chars).
3116 movptr(str1, result);
3117
3118 cmpl(tmp, cnt2);
3119 // First 8 chars are already matched.
3120 jccb(Assembler::equal, CHECK_NEXT);
3121
3122 bind(SCAN_SUBSTR);
3123 pcmpestri(vec, Address(str1, 0), mode);
3124     // Need to reload string pointers if we did not match the whole vector
3125 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
3126
3127 bind(CHECK_NEXT);
3128 subl(cnt2, stride);
3129 jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
3130 addptr(str1, 16);
3131 if (ae == StrIntrinsicNode::UL) {
3132 addptr(str2, 8);
3133 } else {
3134 addptr(str2, 16);
3135 }
3136 subl(cnt1, stride);
3137 cmpl(cnt2, stride); // Do not read beyond substring
3138 jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
3139 // Back-up strings to avoid reading beyond substring.
3140
3141 if (ae == StrIntrinsicNode::UL) {
3142 lea(str2, Address(str2, cnt2, scale2, -8));
3143 lea(str1, Address(str1, cnt2, scale1, -16));
3144 } else {
3145 lea(str2, Address(str2, cnt2, scale2, -16));
3146 lea(str1, Address(str1, cnt2, scale1, -16));
3147 }
3148 subl(cnt1, cnt2);
3149 movl(cnt2, stride);
3150 addl(cnt1, stride);
3151 bind(CONT_SCAN_SUBSTR);
3152 if (ae == StrIntrinsicNode::UL) {
3153 pmovzxbw(vec, Address(str2, 0));
3154 } else {
3155 movdqu(vec, Address(str2, 0));
3156 }
3157 jmp(SCAN_SUBSTR);
3158
3159 bind(RET_FOUND_LONG);
3160 movptr(str1, Address(rsp, wordSize));
3161 } // non constant
3162
3163 bind(RET_FOUND);
3164 // Compute substr offset
3165 subptr(result, str1);
3166 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3167 shrl(result, 1); // index
3168 }
3169 bind(CLEANUP);
3170 pop(rsp); // restore SP
3171
3172 } // string_indexof
3173
3174 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3175 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3176 ShortBranchVerifier sbv(this);
3177 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3178
3179 int stride = 8;
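  // 8 UTF-16 chars (16 bytes) per SSE vector; the AVX2 path below scans 16 chars (32 bytes) per iteration.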
3180
3181 Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
3182 SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
3183 RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
3184 FOUND_SEQ_CHAR, DONE_LABEL;
3185
3186 movptr(result, str1);
3187 if (UseAVX >= 2) {
3188 cmpl(cnt1, stride);
3189 jcc(Assembler::less, SCAN_TO_CHAR);
3190 cmpl(cnt1, 2*stride);
3191 jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
3192 movdl(vec1, ch);
3193 vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
3194 vpxor(vec2, vec2);
3195 movl(tmp, cnt1);
3196 andl(tmp, 0xFFFFFFF0); //vector count (in chars)
3197 andl(cnt1,0x0000000F); //tail count (in chars)
3198
3199 bind(SCAN_TO_16_CHAR_LOOP);
3200 vmovdqu(vec3, Address(result, 0));
3201 vpcmpeqw(vec3, vec3, vec1, 1);
3202 vptest(vec2, vec3);
3203 jcc(Assembler::carryClear, FOUND_CHAR);
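    // With vec2 all-zero, vptest sets CF iff vec3 is all-zero, so carryClear means at least one lane matched ch.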
3204 addptr(result, 32);
3205 subl(tmp, 2*stride);
3206 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
3207 jmp(SCAN_TO_8_CHAR);
3208 bind(SCAN_TO_8_CHAR_INIT);
3209 movdl(vec1, ch);
3210 pshuflw(vec1, vec1, 0x00);
3211 pshufd(vec1, vec1, 0);
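    // pshuflw/pshufd with selector 0 replicate the 16-bit char into all eight word lanes of vec1.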
3212 pxor(vec2, vec2);
3213 }
3214 bind(SCAN_TO_8_CHAR);
3215 cmpl(cnt1, stride);
3216 jcc(Assembler::less, SCAN_TO_CHAR);
3217 if (UseAVX < 2) {
3218 movdl(vec1, ch);
3219 pshuflw(vec1, vec1, 0x00);
3220 pshufd(vec1, vec1, 0);
3221 pxor(vec2, vec2);
3222 }
3223 movl(tmp, cnt1);
3224 andl(tmp, 0xFFFFFFF8); //vector count (in chars)
3225 andl(cnt1,0x00000007); //tail count (in chars)
3226
3227 bind(SCAN_TO_8_CHAR_LOOP);
3228 movdqu(vec3, Address(result, 0));
3229 pcmpeqw(vec3, vec1);
3230 ptest(vec2, vec3);
3231 jcc(Assembler::carryClear, FOUND_CHAR);
3232 addptr(result, 16);
3233 subl(tmp, stride);
3234 jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
3235 bind(SCAN_TO_CHAR);
3236 testl(cnt1, cnt1);
3237 jcc(Assembler::zero, RET_NOT_FOUND);
3238 bind(SCAN_TO_CHAR_LOOP);
3239 load_unsigned_short(tmp, Address(result, 0));
3240 cmpl(ch, tmp);
3241 jccb(Assembler::equal, FOUND_SEQ_CHAR);
3242 addptr(result, 2);
3243 subl(cnt1, 1);
3244 jccb(Assembler::zero, RET_NOT_FOUND);
3245 jmp(SCAN_TO_CHAR_LOOP);
3246
3247 bind(RET_NOT_FOUND);
3248 movl(result, -1);
3249 jmpb(DONE_LABEL);
3250
3251 bind(FOUND_CHAR);
3252 if (UseAVX >= 2) {
3253 vpmovmskb(tmp, vec3);
3254 } else {
3255 pmovmskb(tmp, vec3);
3256 }
3257 bsfl(ch, tmp);
3258 addptr(result, ch);
3259
3260 bind(FOUND_SEQ_CHAR);
3261 subptr(result, str1);
3262 shrl(result, 1);
3263
3264 bind(DONE_LABEL);
3265 } // string_indexof_char
3266
3267 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3268 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3269 ShortBranchVerifier sbv(this);
3270 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3271
3272 int stride = 16;
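  // 16 bytes per SSE vector; the AVX2 path below scans 32 bytes per iteration.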
3273
3274 Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
3275 SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
3276 RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
3277 FOUND_SEQ_CHAR, DONE_LABEL;
3278
3279 movptr(result, str1);
3280 if (UseAVX >= 2) {
3281 cmpl(cnt1, stride);
3282 jcc(Assembler::less, SCAN_TO_CHAR_INIT);
3283 cmpl(cnt1, stride*2);
3284 jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
3285 movdl(vec1, ch);
3286 vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
3287 vpxor(vec2, vec2);
3288 movl(tmp, cnt1);
3289 andl(tmp, 0xFFFFFFE0); //vector count (in chars)
3290 andl(cnt1,0x0000001F); //tail count (in chars)
3291
3292 bind(SCAN_TO_32_CHAR_LOOP);
3293 vmovdqu(vec3, Address(result, 0));
3294 vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
3295 vptest(vec2, vec3);
3296 jcc(Assembler::carryClear, FOUND_CHAR);
3297 addptr(result, 32);
3298 subl(tmp, stride*2);
3299 jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
3300 jmp(SCAN_TO_16_CHAR);
3301
3302 bind(SCAN_TO_16_CHAR_INIT);
3303 movdl(vec1, ch);
3304 pxor(vec2, vec2);
3305 pshufb(vec1, vec2);
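    // pshufb with an all-zero shuffle mask (vec2) broadcasts the low byte of vec1 into every byte lane.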
3306 }
3307
3308 bind(SCAN_TO_16_CHAR);
3309 cmpl(cnt1, stride);
3310   jcc(Assembler::less, SCAN_TO_CHAR_INIT); // less than 16 entries left
3311 if (UseAVX < 2) {
3312 movdl(vec1, ch);
3313 pxor(vec2, vec2);
3314 pshufb(vec1, vec2);
3315 }
3316 movl(tmp, cnt1);
3317 andl(tmp, 0xFFFFFFF0); //vector count (in bytes)
3318 andl(cnt1,0x0000000F); //tail count (in bytes)
3319
3320 bind(SCAN_TO_16_CHAR_LOOP);
3321 movdqu(vec3, Address(result, 0));
3322 pcmpeqb(vec3, vec1);
3323 ptest(vec2, vec3);
3324 jcc(Assembler::carryClear, FOUND_CHAR);
3325 addptr(result, 16);
3326 subl(tmp, stride);
3327   jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); // last 16 items...
3328
3329 bind(SCAN_TO_CHAR_INIT);
3330 testl(cnt1, cnt1);
3331 jcc(Assembler::zero, RET_NOT_FOUND);
3332 bind(SCAN_TO_CHAR_LOOP);
3333 load_unsigned_byte(tmp, Address(result, 0));
3334 cmpl(ch, tmp);
3335 jccb(Assembler::equal, FOUND_SEQ_CHAR);
3336 addptr(result, 1);
3337 subl(cnt1, 1);
3338 jccb(Assembler::zero, RET_NOT_FOUND);
3339 jmp(SCAN_TO_CHAR_LOOP);
3340
3341 bind(RET_NOT_FOUND);
3342 movl(result, -1);
3343 jmpb(DONE_LABEL);
3344
3345 bind(FOUND_CHAR);
3346 if (UseAVX >= 2) {
3347 vpmovmskb(tmp, vec3);
3348 } else {
3349 pmovmskb(tmp, vec3);
3350 }
3351 bsfl(ch, tmp);
3352 addptr(result, ch);
3353
3354 bind(FOUND_SEQ_CHAR);
3355 subptr(result, str1);
3356
3357 bind(DONE_LABEL);
3358 } // stringL_indexof_char
3359
3360 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
3361 switch (eltype) {
3362 case T_BOOLEAN: return sizeof(jboolean);
3363 case T_BYTE: return sizeof(jbyte);
3364 case T_SHORT: return sizeof(jshort);
3365 case T_CHAR: return sizeof(jchar);
3366 case T_INT: return sizeof(jint);
3367 default:
3368 ShouldNotReachHere();
3369 return -1;
3370 }
3371 }
3372
3373 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
3374 switch (eltype) {
3375 // T_BOOLEAN used as surrogate for unsigned byte
3376 case T_BOOLEAN: movzbl(dst, src); break;
3377 case T_BYTE: movsbl(dst, src); break;
3378 case T_SHORT: movswl(dst, src); break;
3379 case T_CHAR: movzwl(dst, src); break;
3380 case T_INT: movl(dst, src); break;
3381 default:
3382 ShouldNotReachHere();
3383 }
3384 }
3385
3386 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) {
3387 load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
3388 }
3389
3390 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) {
3391 load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
3392 }
3393
3394 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) {
3395 const int vlen = Assembler::AVX_256bit;
3396 switch (eltype) {
3397 case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT); break;
3398 case T_BYTE: vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT); break;
3399 case T_SHORT: vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3400 case T_CHAR: vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3401 case T_INT:
3402 // do nothing
3403 break;
3404 default:
3405 ShouldNotReachHere();
3406 }
3407 }
3408
3409 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result,
3410 Register index, Register tmp2, Register tmp3, XMMRegister vnext,
3411 XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3,
3412 XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3,
3413 XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3,
3414 BasicType eltype) {
3415 ShortBranchVerifier sbv(this);
3416 assert(UseAVX >= 2, "AVX2 intrinsics are required");
3417 assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3);
3418 assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3);
3419
3420 Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN,
3421 SHORT_UNROLLED_LOOP_EXIT,
3422 UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME,
3423 UNROLLED_VECTOR_LOOP_BEGIN,
3424 END;
3425 switch (eltype) {
3426 case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
3427 case T_CHAR: BLOCK_COMMENT("arrays_hashcode(char) {"); break;
3428 case T_BYTE: BLOCK_COMMENT("arrays_hashcode(byte) {"); break;
3429 case T_SHORT: BLOCK_COMMENT("arrays_hashcode(short) {"); break;
3430 case T_INT: BLOCK_COMMENT("arrays_hashcode(int) {"); break;
3431 default: BLOCK_COMMENT("arrays_hashcode {"); break;
3432 }
3433
3434   // Local aliases ("renaming") for readability of the code
3435 const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 },
3436 vresult[] = { vresult0, vresult1, vresult2, vresult3 },
3437 vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 };
3438
3439 const int elsize = arrays_hashcode_elsize(eltype);
3440
3441 /*
3442 if (cnt1 >= 2) {
3443 if (cnt1 >= 32) {
3444 UNROLLED VECTOR LOOP
3445 }
3446 UNROLLED SCALAR LOOP
3447 }
3448 SINGLE SCALAR
3449 */
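  // This is the standard 31-based polynomial hash (result = 31*result + a[i] per element);
  // the vector loop consumes 32 elements per iteration using precomputed powers of 31.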
3450
3451 cmpl(cnt1, 32);
3452 jcc(Assembler::less, SHORT_UNROLLED_BEGIN);
3453
3454 // cnt1 >= 32 && generate_vectorized_loop
3455 xorl(index, index);
3456
3457 // vresult = IntVector.zero(I256);
3458 for (int idx = 0; idx < 4; idx++) {
3459 vpxor(vresult[idx], vresult[idx]);
3460 }
3461 // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]);
3462 Register bound = tmp2;
3463 Register next = tmp3;
3464 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint))));
3465 movl(next, Address(tmp2, 0));
3466 movdl(vnext, next);
3467 vpbroadcastd(vnext, vnext, Assembler::AVX_256bit);
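  // Note: next (power_of_31_backwards[0]) is assumed here to be 31^32, the factor that
  // advances the running hash past the 32 elements consumed by each unrolled iteration.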
3468
3469 // index = 0;
3470 // bound = cnt1 & ~(32 - 1);
3471 movl(bound, cnt1);
3472 andl(bound, ~(32 - 1));
3473 // for (; index < bound; index += 32) {
3474 bind(UNROLLED_VECTOR_LOOP_BEGIN);
3475 // result *= next;
3476 imull(result, next);
3477     // loop fission to front-load the cost of fetching from memory; OOO execution
3478     // can then hopefully do a better job of prefetching the elements
3479 for (int idx = 0; idx < 4; idx++) {
3480 arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype);
3481 }
3482 // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7];
3483 for (int idx = 0; idx < 4; idx++) {
3484 vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit);
3485 arrays_hashcode_elvcast(vtmp[idx], eltype);
3486 vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit);
3487 }
3488 // index += 32;
3489 addl(index, 32);
3490 // index < bound;
3491 cmpl(index, bound);
3492 jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN);
3493 // }
3494
3495 lea(ary1, Address(ary1, bound, Address::times(elsize)));
3496 subl(cnt1, bound);
3497 // release bound
3498
3499 // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1);
3500 for (int idx = 0; idx < 4; idx++) {
3501 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint))));
3502 arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT);
3503 vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit);
3504 }
3505 // result += vresult.reduceLanes(ADD);
3506 for (int idx = 0; idx < 4; idx++) {
3507 reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]);
3508 }
3509
3510 // } else if (cnt1 < 32) {
3511
3512 bind(SHORT_UNROLLED_BEGIN);
3513 // int i = 1;
3514 movl(index, 1);
3515 cmpl(index, cnt1);
3516 jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT);
3517
3518 // for (; i < cnt1 ; i += 2) {
3519 bind(SHORT_UNROLLED_LOOP_BEGIN);
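  // Two elements per iteration: result = result*31*31 + a[i-1]*31 + a[i],
  // where 961 == 31*31 and 31*x is computed as (x << 5) - x.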
3520 movl(tmp3, 961);
3521 imull(result, tmp3);
3522 arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3523 movl(tmp3, tmp2);
3524 shll(tmp3, 5);
3525 subl(tmp3, tmp2);
3526 addl(result, tmp3);
3527 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype);
3528 addl(result, tmp3);
3529 addl(index, 2);
3530 cmpl(index, cnt1);
3531 jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN);
3532
3533 // }
3534 // if (i >= cnt1) {
3535 bind(SHORT_UNROLLED_LOOP_EXIT);
3536 jccb(Assembler::greater, END);
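  // result = result*31 + a[i-1], with 31*result computed as (result << 5) - result.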
3537 movl(tmp2, result);
3538 shll(result, 5);
3539 subl(result, tmp2);
3540 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype);
3541 addl(result, tmp3);
3542 // }
3543 bind(END);
3544
3545 BLOCK_COMMENT("} // arrays_hashcode");
3546
3547 } // arrays_hashcode
3548
3549 // helper function for string_compare
3550 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3551 Address::ScaleFactor scale, Address::ScaleFactor scale1,
3552 Address::ScaleFactor scale2, Register index, int ae) {
3553 if (ae == StrIntrinsicNode::LL) {
3554 load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3555 load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3556 } else if (ae == StrIntrinsicNode::UU) {
3557 load_unsigned_short(elem1, Address(str1, index, scale, 0));
3558 load_unsigned_short(elem2, Address(str2, index, scale, 0));
3559 } else {
3560 load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3561 load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3562 }
3563 }
3564
3565 // Compare strings, used for char[] and byte[].
3566 void C2_MacroAssembler::string_compare(Register str1, Register str2,
3567 Register cnt1, Register cnt2, Register result,
3568 XMMRegister vec1, int ae, KRegister mask) {
3569 ShortBranchVerifier sbv(this);
3570 Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
3571   Label COMPARE_WIDE_VECTORS_LOOP_FAILED; // used only for AVX3
3572 int stride, stride2, adr_stride, adr_stride1, adr_stride2;
3573 int stride2x2 = 0x40;
3574 Address::ScaleFactor scale = Address::no_scale;
3575 Address::ScaleFactor scale1 = Address::no_scale;
3576 Address::ScaleFactor scale2 = Address::no_scale;
3577
3578 if (ae != StrIntrinsicNode::LL) {
3579 stride2x2 = 0x20;
3580 }
3581
3582 if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
3583 shrl(cnt2, 1);
3584 }
3585   // Compute the minimum of the string lengths and the
3586   // difference of the string lengths (saved on the stack).
3587   // The minimum is selected with a conditional move.
3588 movl(result, cnt1);
3589 subl(cnt1, cnt2);
3590 push(cnt1);
3591 cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2)
3592
3593 // Is the minimum length zero?
3594 testl(cnt2, cnt2);
3595 jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3596 if (ae == StrIntrinsicNode::LL) {
3597 // Load first bytes
3598 load_unsigned_byte(result, Address(str1, 0)); // result = str1[0]
3599 load_unsigned_byte(cnt1, Address(str2, 0)); // cnt1 = str2[0]
3600 } else if (ae == StrIntrinsicNode::UU) {
3601 // Load first characters
3602 load_unsigned_short(result, Address(str1, 0));
3603 load_unsigned_short(cnt1, Address(str2, 0));
3604 } else {
3605 load_unsigned_byte(result, Address(str1, 0));
3606 load_unsigned_short(cnt1, Address(str2, 0));
3607 }
3608 subl(result, cnt1);
3609 jcc(Assembler::notZero, POP_LABEL);
3610
3611 if (ae == StrIntrinsicNode::UU) {
3612 // Divide length by 2 to get number of chars
3613 shrl(cnt2, 1);
3614 }
3615 cmpl(cnt2, 1);
3616 jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3617
3618 // Check if the strings start at the same location and setup scale and stride
3619 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3620 cmpptr(str1, str2);
3621 jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3622 if (ae == StrIntrinsicNode::LL) {
3623 scale = Address::times_1;
3624 stride = 16;
3625 } else {
3626 scale = Address::times_2;
3627 stride = 8;
3628 }
3629 } else {
3630 scale1 = Address::times_1;
3631 scale2 = Address::times_2;
3632 // scale not used
3633 stride = 8;
3634 }
3635
3636 if (UseAVX >= 2 && UseSSE42Intrinsics) {
3637 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
3638 Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
3639 Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
3640 Label COMPARE_TAIL_LONG;
3641     Label COMPARE_WIDE_VECTORS_LOOP_AVX3; // used only for AVX3
3642
3643 int pcmpmask = 0x19;
3644 if (ae == StrIntrinsicNode::LL) {
3645 pcmpmask &= ~0x01;
3646 }
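    // 0x19 == 0b11001: unsigned words, equal-each aggregation (string compare) and
    // negative polarity; clearing bit 0 above switches the data format to unsigned bytes.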
3647
3648 // Setup to compare 16-chars (32-bytes) vectors,
3649 // start from first character again because it has aligned address.
3650 if (ae == StrIntrinsicNode::LL) {
3651 stride2 = 32;
3652 } else {
3653 stride2 = 16;
3654 }
3655 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3656 adr_stride = stride << scale;
3657 } else {
3658 adr_stride1 = 8; //stride << scale1;
3659 adr_stride2 = 16; //stride << scale2;
3660 }
3661
3662 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3663 // rax and rdx are used by pcmpestri as elements counters
3664 movl(result, cnt2);
3665 andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count
3666 jcc(Assembler::zero, COMPARE_TAIL_LONG);
3667
3668 // fast path : compare first 2 8-char vectors.
3669 bind(COMPARE_16_CHARS);
3670 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3671 movdqu(vec1, Address(str1, 0));
3672 } else {
3673 pmovzxbw(vec1, Address(str1, 0));
3674 }
3675 pcmpestri(vec1, Address(str2, 0), pcmpmask);
3676 jccb(Assembler::below, COMPARE_INDEX_CHAR);
3677
3678 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3679 movdqu(vec1, Address(str1, adr_stride));
3680 pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
3681 } else {
3682 pmovzxbw(vec1, Address(str1, adr_stride1));
3683 pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
3684 }
3685 jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
3686 addl(cnt1, stride);
3687
3688 // Compare the characters at index in cnt1
3689 bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
3690 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3691 subl(result, cnt2);
3692 jmp(POP_LABEL);
3693
3694 // Setup the registers to start vector comparison loop
3695 bind(COMPARE_WIDE_VECTORS);
3696 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3697 lea(str1, Address(str1, result, scale));
3698 lea(str2, Address(str2, result, scale));
3699 } else {
3700 lea(str1, Address(str1, result, scale1));
3701 lea(str2, Address(str2, result, scale2));
3702 }
3703 subl(result, stride2);
3704 subl(cnt2, stride2);
3705 jcc(Assembler::zero, COMPARE_WIDE_TAIL);
3706 negptr(result);
3707
3708 // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
3709 bind(COMPARE_WIDE_VECTORS_LOOP);
3710
3711 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3712 cmpl(cnt2, stride2x2);
3713 jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3714 testl(cnt2, stride2x2-1); // cnt2 holds the vector count
3715 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2); // means we cannot subtract by 0x40
3716
3717 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3718 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3719 evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
3720 evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3721 } else {
3722 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
3723 evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3724 }
3725 kortestql(mask, mask);
3726 jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED); // miscompare
3727 addptr(result, stride2x2); // update since we already compared at this addr
3728 subl(cnt2, stride2x2); // and sub the size too
3729 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3730
3731 vpxor(vec1, vec1);
3732 jmpb(COMPARE_WIDE_TAIL);
3733 }//if (VM_Version::supports_avx512vlbw())
3734
3735 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3736 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3737 vmovdqu(vec1, Address(str1, result, scale));
3738 vpxor(vec1, Address(str2, result, scale));
3739 } else {
3740 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
3741 vpxor(vec1, Address(str2, result, scale2));
3742 }
3743 vptest(vec1, vec1);
3744 jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
3745 addptr(result, stride2);
3746 subl(cnt2, stride2);
3747 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
3748 // clean upper bits of YMM registers
3749 vpxor(vec1, vec1);
3750
3751 // compare wide vectors tail
3752 bind(COMPARE_WIDE_TAIL);
3753 testptr(result, result);
3754 jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3755
3756 movl(result, stride2);
3757 movl(cnt2, result);
3758 negptr(result);
3759 jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3760
3761     // Identify the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
3762 bind(VECTOR_NOT_EQUAL);
3763 // clean upper bits of YMM registers
3764 vpxor(vec1, vec1);
3765 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3766 lea(str1, Address(str1, result, scale));
3767 lea(str2, Address(str2, result, scale));
3768 } else {
3769 lea(str1, Address(str1, result, scale1));
3770 lea(str2, Address(str2, result, scale2));
3771 }
3772 jmp(COMPARE_16_CHARS);
3773
3774     // Compare tail chars, length between 1 and 15 chars
3775 bind(COMPARE_TAIL_LONG);
3776 movl(cnt2, result);
3777 cmpl(cnt2, stride);
3778 jcc(Assembler::less, COMPARE_SMALL_STR);
3779
3780 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3781 movdqu(vec1, Address(str1, 0));
3782 } else {
3783 pmovzxbw(vec1, Address(str1, 0));
3784 }
3785 pcmpestri(vec1, Address(str2, 0), pcmpmask);
3786 jcc(Assembler::below, COMPARE_INDEX_CHAR);
3787 subptr(cnt2, stride);
3788 jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3789 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3790 lea(str1, Address(str1, result, scale));
3791 lea(str2, Address(str2, result, scale));
3792 } else {
3793 lea(str1, Address(str1, result, scale1));
3794 lea(str2, Address(str2, result, scale2));
3795 }
3796 negptr(cnt2);
3797 jmpb(WHILE_HEAD_LABEL);
3798
3799 bind(COMPARE_SMALL_STR);
3800 } else if (UseSSE42Intrinsics) {
3801 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
3802 int pcmpmask = 0x19;
3803 // Setup to compare 8-char (16-byte) vectors,
3804 // start from first character again because it has aligned address.
3805 movl(result, cnt2);
3806 andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count
3807 if (ae == StrIntrinsicNode::LL) {
3808 pcmpmask &= ~0x01;
3809 }
3810 jcc(Assembler::zero, COMPARE_TAIL);
3811 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3812 lea(str1, Address(str1, result, scale));
3813 lea(str2, Address(str2, result, scale));
3814 } else {
3815 lea(str1, Address(str1, result, scale1));
3816 lea(str2, Address(str2, result, scale2));
3817 }
3818 negptr(result);
3819
3820 // pcmpestri
3821 // inputs:
3822     // vec1 - substring
3823 // rax - negative string length (elements count)
3824 // mem - scanned string
3825 // rdx - string length (elements count)
3826 // pcmpmask - cmp mode: 11000 (string compare with negated result)
3827 // + 00 (unsigned bytes) or + 01 (unsigned shorts)
3828 // outputs:
3829 // rcx - first mismatched element index
3830 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3831
3832 bind(COMPARE_WIDE_VECTORS);
3833 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3834 movdqu(vec1, Address(str1, result, scale));
3835 pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3836 } else {
3837 pmovzxbw(vec1, Address(str1, result, scale1));
3838 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3839 }
3840 // After pcmpestri cnt1(rcx) contains mismatched element index
3841
3842 jccb(Assembler::below, VECTOR_NOT_EQUAL); // CF==1
3843 addptr(result, stride);
3844 subptr(cnt2, stride);
3845 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3846
3847 // compare wide vectors tail
3848 testptr(result, result);
3849 jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3850
3851 movl(cnt2, stride);
3852 movl(result, stride);
3853 negptr(result);
3854 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3855 movdqu(vec1, Address(str1, result, scale));
3856 pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3857 } else {
3858 pmovzxbw(vec1, Address(str1, result, scale1));
3859 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3860 }
3861 jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
3862
3863 // Mismatched characters in the vectors
3864 bind(VECTOR_NOT_EQUAL);
3865 addptr(cnt1, result);
3866 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3867 subl(result, cnt2);
3868 jmpb(POP_LABEL);
3869
3870 bind(COMPARE_TAIL); // limit is zero
3871 movl(cnt2, result);
3872 // Fallthru to tail compare
3873 }
3874 // Shift str2 and str1 to the end of the arrays, negate min
3875 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3876 lea(str1, Address(str1, cnt2, scale));
3877 lea(str2, Address(str2, cnt2, scale));
3878 } else {
3879 lea(str1, Address(str1, cnt2, scale1));
3880 lea(str2, Address(str2, cnt2, scale2));
3881 }
3882 decrementl(cnt2); // first character was compared already
3883 negptr(cnt2);
3884
3885 // Compare the rest of the elements
3886 bind(WHILE_HEAD_LABEL);
3887 load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
3888 subl(result, cnt1);
3889 jccb(Assembler::notZero, POP_LABEL);
3890 increment(cnt2);
3891 jccb(Assembler::notZero, WHILE_HEAD_LABEL);
3892
3893 // Strings are equal up to min length. Return the length difference.
3894 bind(LENGTH_DIFF_LABEL);
3895 pop(result);
3896 if (ae == StrIntrinsicNode::UU) {
3897 // Divide diff by 2 to get number of chars
3898 sarl(result, 1);
3899 }
3900 jmpb(DONE_LABEL);
3901
3902 if (VM_Version::supports_avx512vlbw()) {
3903
3904 bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
3905
3906 kmovql(cnt1, mask);
3907 notq(cnt1);
3908 bsfq(cnt2, cnt1);
3909 if (ae != StrIntrinsicNode::LL) {
3910 // Divide diff by 2 to get number of chars
3911 sarl(cnt2, 1);
3912 }
3913 addq(result, cnt2);
3914 if (ae == StrIntrinsicNode::LL) {
3915 load_unsigned_byte(cnt1, Address(str2, result));
3916 load_unsigned_byte(result, Address(str1, result));
3917 } else if (ae == StrIntrinsicNode::UU) {
3918 load_unsigned_short(cnt1, Address(str2, result, scale));
3919 load_unsigned_short(result, Address(str1, result, scale));
3920 } else {
3921 load_unsigned_short(cnt1, Address(str2, result, scale2));
3922 load_unsigned_byte(result, Address(str1, result, scale1));
3923 }
3924 subl(result, cnt1);
3925 jmpb(POP_LABEL);
3926 }//if (VM_Version::supports_avx512vlbw())
3927
3928 // Discard the stored length difference
3929 bind(POP_LABEL);
3930 pop(cnt1);
3931
3932 // That's it
3933 bind(DONE_LABEL);
3934   if (ae == StrIntrinsicNode::UL) {
3935 negl(result);
3936 }
3937
3938 }
3939
3940 // Search for a non-ASCII character (negative byte value) in a byte array,
3941 // return the index of the first such character, otherwise the length
3942 // of the array segment searched.
3943 // ../jdk/src/java.base/share/classes/java/lang/StringCoding.java
3944 // @IntrinsicCandidate
3945 // public static int countPositives(byte[] ba, int off, int len) {
3946 // for (int i = off; i < off + len; i++) {
3947 // if (ba[i] < 0) {
3948 // return i - off;
3949 // }
3950 // }
3951 // return len;
3952 // }
3953 void C2_MacroAssembler::count_positives(Register ary1, Register len,
3954 Register result, Register tmp1,
3955 XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
3956 // rsi: byte array
3957 // rcx: len
3958 // rax: result
3959 ShortBranchVerifier sbv(this);
3960 assert_different_registers(ary1, len, result, tmp1);
3961 assert_different_registers(vec1, vec2);
3962 Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
3963
3964 movl(result, len); // copy
3965 // len == 0
3966 testl(len, len);
3967 jcc(Assembler::zero, DONE);
3968
3969 if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
3970 VM_Version::supports_avx512vlbw() &&
3971 VM_Version::supports_bmi2()) {
3972
3973 Label test_64_loop, test_tail, BREAK_LOOP;
3974 movl(tmp1, len);
3975 vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
3976
3977 andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F
3978 andl(len, 0xffffffc0); // vector count (in chars)
3979 jccb(Assembler::zero, test_tail);
3980
3981 lea(ary1, Address(ary1, len, Address::times_1));
3982 negptr(len);
3983
3984 bind(test_64_loop);
3985 // Check whether our 64 elements of size byte contain negatives
3986 evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
3987 kortestql(mask1, mask1);
3988 jcc(Assembler::notZero, BREAK_LOOP);
3989
3990 addptr(len, 64);
3991 jccb(Assembler::notZero, test_64_loop);
3992
3993 bind(test_tail);
3994 // bail out when there is nothing to be done
3995 testl(tmp1, -1);
3996 jcc(Assembler::zero, DONE);
3997
3998
3999     // check the tail for absence of negatives
4000     // ~(~0 << len) gives a mask with the low len bits set
4001 {
4002 Register tmp3_aliased = len;
4003 mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
4004 shlxq(tmp3_aliased, tmp3_aliased, tmp1);
4005 notq(tmp3_aliased);
4006 kmovql(mask2, tmp3_aliased);
4007 }
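    // mask2 now has its low tmp1 bits set, selecting only the tail elements in the masked compare below.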
4008
4009 evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
4010 ktestq(mask1, mask2);
4011 jcc(Assembler::zero, DONE);
4012
4013     // do a full check for negative bytes in the tail
4014 movl(len, tmp1); // tmp1 holds low 6-bit from original len;
4015 // ary1 already pointing to the right place
4016 jmpb(TAIL_START);
4017
4018 bind(BREAK_LOOP);
4019 // At least one byte in the last 64 byte block was negative.
4020 // Set up to look at the last 64 bytes as if they were a tail
4021 lea(ary1, Address(ary1, len, Address::times_1));
4022 addptr(result, len);
4023 // Ignore the very last byte: if all others are positive,
4024 // it must be negative, so we can skip right to the 2+1 byte
4025 // end comparison at this point
4026 orl(result, 63);
4027 movl(len, 63);
4028 // Fallthru to tail compare
4029 } else {
4030
4031 if (UseAVX >= 2) {
4032 // With AVX2, use 32-byte vector compare
4033 Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4034
4035 // Compare 32-byte vectors
4036 testl(len, 0xffffffe0); // vector count (in bytes)
4037 jccb(Assembler::zero, TAIL_START);
4038
4039 andl(len, 0xffffffe0);
4040 lea(ary1, Address(ary1, len, Address::times_1));
4041 negptr(len);
4042
4043       movl(tmp1, 0x80808080); // create mask to test for negative bytes (high bit set) in vector
4044 movdl(vec2, tmp1);
4045 vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
4046
4047 bind(COMPARE_WIDE_VECTORS);
4048 vmovdqu(vec1, Address(ary1, len, Address::times_1));
4049 vptest(vec1, vec2);
4050 jccb(Assembler::notZero, BREAK_LOOP);
4051 addptr(len, 32);
4052 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4053
4054 testl(result, 0x0000001f); // any bytes remaining?
4055 jcc(Assembler::zero, DONE);
4056
4057 // Quick test using the already prepared vector mask
4058 movl(len, result);
4059 andl(len, 0x0000001f);
4060 vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
4061 vptest(vec1, vec2);
4062 jcc(Assembler::zero, DONE);
4063 // There are zeros, jump to the tail to determine exactly where
4064 jmpb(TAIL_START);
4065
4066 bind(BREAK_LOOP);
4067 // At least one byte in the last 32-byte vector is negative.
4068 // Set up to look at the last 32 bytes as if they were a tail
4069 lea(ary1, Address(ary1, len, Address::times_1));
4070 addptr(result, len);
4071 // Ignore the very last byte: if all others are positive,
4072 // it must be negative, so we can skip right to the 2+1 byte
4073 // end comparison at this point
4074 orl(result, 31);
4075 movl(len, 31);
4076 // Fallthru to tail compare
4077 } else if (UseSSE42Intrinsics) {
4078 // With SSE4.2, use double quad vector compare
4079 Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4080
4081 // Compare 16-byte vectors
4082 testl(len, 0xfffffff0); // vector count (in bytes)
4083 jcc(Assembler::zero, TAIL_START);
4084
4085 andl(len, 0xfffffff0);
4086 lea(ary1, Address(ary1, len, Address::times_1));
4087 negptr(len);
4088
4089 movl(tmp1, 0x80808080);
4090 movdl(vec2, tmp1);
4091 pshufd(vec2, vec2, 0);
4092
4093 bind(COMPARE_WIDE_VECTORS);
4094 movdqu(vec1, Address(ary1, len, Address::times_1));
4095 ptest(vec1, vec2);
4096 jccb(Assembler::notZero, BREAK_LOOP);
4097 addptr(len, 16);
4098 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4099
4100 testl(result, 0x0000000f); // len is zero, any bytes remaining?
4101 jcc(Assembler::zero, DONE);
4102
4103 // Quick test using the already prepared vector mask
4104 movl(len, result);
4105 andl(len, 0x0000000f); // tail count (in bytes)
4106 movdqu(vec1, Address(ary1, len, Address::times_1, -16));
4107 ptest(vec1, vec2);
4108 jcc(Assembler::zero, DONE);
4109 jmpb(TAIL_START);
4110
4111 bind(BREAK_LOOP);
4112 // At least one byte in the last 16-byte vector is negative.
4113 // Set up and look at the last 16 bytes as if they were a tail
4114 lea(ary1, Address(ary1, len, Address::times_1));
4115 addptr(result, len);
4116 // Ignore the very last byte: if all others are positive,
4117 // it must be negative, so we can skip right to the 2+1 byte
4118 // end comparison at this point
4119 orl(result, 15);
4120 movl(len, 15);
4121 // Fallthru to tail compare
4122 }
4123 }
4124
4125 bind(TAIL_START);
4126 // Compare 4-byte vectors
4127 andl(len, 0xfffffffc); // vector count (in bytes)
4128 jccb(Assembler::zero, COMPARE_CHAR);
4129
4130 lea(ary1, Address(ary1, len, Address::times_1));
4131 negptr(len);
4132
4133 bind(COMPARE_VECTORS);
4134 movl(tmp1, Address(ary1, len, Address::times_1));
4135 andl(tmp1, 0x80808080);
4136 jccb(Assembler::notZero, TAIL_ADJUST);
4137 addptr(len, 4);
4138 jccb(Assembler::notZero, COMPARE_VECTORS);
4139
4140 // Compare trailing char (final 2-3 bytes), if any
4141 bind(COMPARE_CHAR);
4142
4143 testl(result, 0x2); // tail char
4144 jccb(Assembler::zero, COMPARE_BYTE);
4145 load_unsigned_short(tmp1, Address(ary1, 0));
4146 andl(tmp1, 0x00008080);
4147 jccb(Assembler::notZero, CHAR_ADJUST);
4148 lea(ary1, Address(ary1, 2));
4149
4150 bind(COMPARE_BYTE);
4151 testl(result, 0x1); // tail byte
4152 jccb(Assembler::zero, DONE);
4153 load_unsigned_byte(tmp1, Address(ary1, 0));
4154 testl(tmp1, 0x00000080);
4155 jccb(Assembler::zero, DONE);
4156 subptr(result, 1);
4157 jmpb(DONE);
4158
4159 bind(TAIL_ADJUST);
4160   // There is at least one negative byte in the last 4-byte block.
4161 // Adjust result and check the next three bytes
4162 addptr(result, len);
4163 orl(result, 3);
4164 lea(ary1, Address(ary1, len, Address::times_1));
4165 jmpb(COMPARE_CHAR);
4166
4167 bind(CHAR_ADJUST);
4168 // We are looking at a char + optional byte tail, and found that one
4169 // of the bytes in the char is negative. Adjust the result, check the
4170 // first byte and readjust if needed.
4171 andl(result, 0xfffffffc);
4172 testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
4173 jccb(Assembler::notZero, DONE);
4174 addptr(result, 1);
4175
4176 // That's it
4177 bind(DONE);
4178 if (UseAVX >= 2) {
4179 // clean upper bits of YMM registers
4180 vpxor(vec1, vec1);
4181 vpxor(vec2, vec2);
4182 }
4183 }
4184
4185 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
4186 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
4187 Register limit, Register result, Register chr,
4188 XMMRegister vec1, XMMRegister vec2, bool is_char,
4189 KRegister mask, bool expand_ary2) {
4190 // for expand_ary2, limit is the (smaller) size of the second array.
4191 ShortBranchVerifier sbv(this);
4192 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
4193
4194 assert((!expand_ary2) || ((expand_ary2) && (UseAVX == 2)),
4195 "Expansion only implemented for AVX2");
4196
4197 int length_offset = arrayOopDesc::length_offset_in_bytes();
4198 int base_offset = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
4199
4200 Address::ScaleFactor scaleFactor = expand_ary2 ? Address::times_2 : Address::times_1;
4201 int scaleIncr = expand_ary2 ? 8 : 16;
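  // With expand_ary2, limit and result count ary2 bytes: ary1 is indexed with times_2, so each
  // wide compare consumes twice as many bytes from ary1 as from ary2.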
4202
4203 if (is_array_equ) {
4204 // Check the input args
4205 cmpoop(ary1, ary2);
4206 jcc(Assembler::equal, TRUE_LABEL);
4207
4208 // Need additional checks for arrays_equals.
4209 testptr(ary1, ary1);
4210 jcc(Assembler::zero, FALSE_LABEL);
4211 testptr(ary2, ary2);
4212 jcc(Assembler::zero, FALSE_LABEL);
4213
4214 // Check the lengths
4215 movl(limit, Address(ary1, length_offset));
4216 cmpl(limit, Address(ary2, length_offset));
4217 jcc(Assembler::notEqual, FALSE_LABEL);
4218 }
4219
4220 // count == 0
4221 testl(limit, limit);
4222 jcc(Assembler::zero, TRUE_LABEL);
4223
4224 if (is_array_equ) {
4225 // Load array address
4226 lea(ary1, Address(ary1, base_offset));
4227 lea(ary2, Address(ary2, base_offset));
4228 }
4229
4230 if (is_array_equ && is_char) {
4231 // arrays_equals when used for char[].
4232     shll(limit, 1); // convert char count to byte count; count != 0
4233 }
4234 movl(result, limit); // copy
4235
4236 if (UseAVX >= 2) {
4237 // With AVX2, use 32-byte vector compare
4238 Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16;
4239
4240 // Compare 32-byte vectors
4241 if (expand_ary2) {
4242 andl(result, 0x0000000f); // tail count (in bytes)
4243 andl(limit, 0xfffffff0); // vector count (in bytes)
4244 jcc(Assembler::zero, COMPARE_TAIL);
4245 } else {
4246 andl(result, 0x0000001f); // tail count (in bytes)
4247 andl(limit, 0xffffffe0); // vector count (in bytes)
4248 jcc(Assembler::zero, COMPARE_TAIL_16);
4249 }
4250
4251 lea(ary1, Address(ary1, limit, scaleFactor));
4252 lea(ary2, Address(ary2, limit, Address::times_1));
4253 negptr(limit);
4254
4255 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
4256 Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
4257
4258 cmpl(limit, -64);
4259 jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
4260
4261 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
4262
4263 evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
4264 evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
4265 kortestql(mask, mask);
4266 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare
4267 addptr(limit, 64); // update since we already compared at this addr
4268 cmpl(limit, -64);
4269 jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
4270
4271 // At this point we may still need to compare -limit+result bytes.
4272       // We could execute the next two instructions and just continue via the non-wide path:
4273 // cmpl(limit, 0);
4274 // jcc(Assembler::equal, COMPARE_TAIL); // true
4275 // But since we stopped at the points ary{1,2}+limit which are
4276 // not farther than 64 bytes from the ends of arrays ary{1,2}+result
4277 // (|limit| <= 32 and result < 32),
4278 // we may just compare the last 64 bytes.
4279 //
4280       addptr(result, -64); // it is safe because we just came from this area
4281 evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
4282 evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
4283 kortestql(mask, mask);
4284 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare
4285
4286 jmp(TRUE_LABEL);
4287
4288 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
4289
4290 }//if (VM_Version::supports_avx512vlbw())
4291
4292 bind(COMPARE_WIDE_VECTORS);
4293 vmovdqu(vec1, Address(ary1, limit, scaleFactor));
4294 if (expand_ary2) {
4295 vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit);
4296 } else {
4297 vmovdqu(vec2, Address(ary2, limit, Address::times_1));
4298 }
4299 vpxor(vec1, vec2);
4300
4301 vptest(vec1, vec1);
4302 jcc(Assembler::notZero, FALSE_LABEL);
4303 addptr(limit, scaleIncr * 2);
4304 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4305
4306 testl(result, result);
4307 jcc(Assembler::zero, TRUE_LABEL);
4308
4309 vmovdqu(vec1, Address(ary1, result, scaleFactor, -32));
4310 if (expand_ary2) {
4311 vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit);
4312 } else {
4313 vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
4314 }
4315 vpxor(vec1, vec2);
4316
4317 vptest(vec1, vec1);
4318 jcc(Assembler::notZero, FALSE_LABEL);
4319 jmp(TRUE_LABEL);
4320
4321 bind(COMPARE_TAIL_16); // limit is zero
4322 movl(limit, result);
4323
4324 // Compare 16-byte chunks
4325 andl(result, 0x0000000f); // tail count (in bytes)
4326 andl(limit, 0xfffffff0); // vector count (in bytes)
4327 jcc(Assembler::zero, COMPARE_TAIL);
4328
4329 lea(ary1, Address(ary1, limit, scaleFactor));
4330 lea(ary2, Address(ary2, limit, Address::times_1));
4331 negptr(limit);
4332
4333 bind(COMPARE_WIDE_VECTORS_16);
4334 movdqu(vec1, Address(ary1, limit, scaleFactor));
4335 if (expand_ary2) {
4336 vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit);
4337 } else {
4338 movdqu(vec2, Address(ary2, limit, Address::times_1));
4339 }
4340 pxor(vec1, vec2);
4341
4342 ptest(vec1, vec1);
4343 jcc(Assembler::notZero, FALSE_LABEL);
4344 addptr(limit, scaleIncr);
4345 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16);
4346
4347 bind(COMPARE_TAIL); // limit is zero
4348 movl(limit, result);
4349 // Fallthru to tail compare
4350 } else if (UseSSE42Intrinsics) {
4351 // With SSE4.2, use double quad vector compare
4352 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4353
4354 // Compare 16-byte vectors
4355 andl(result, 0x0000000f); // tail count (in bytes)
4356 andl(limit, 0xfffffff0); // vector count (in bytes)
4357 jcc(Assembler::zero, COMPARE_TAIL);
4358
4359 lea(ary1, Address(ary1, limit, Address::times_1));
4360 lea(ary2, Address(ary2, limit, Address::times_1));
4361 negptr(limit);
4362
4363 bind(COMPARE_WIDE_VECTORS);
4364 movdqu(vec1, Address(ary1, limit, Address::times_1));
4365 movdqu(vec2, Address(ary2, limit, Address::times_1));
4366 pxor(vec1, vec2);
4367
4368 ptest(vec1, vec1);
4369 jcc(Assembler::notZero, FALSE_LABEL);
4370 addptr(limit, 16);
4371 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4372
4373 testl(result, result);
4374 jcc(Assembler::zero, TRUE_LABEL);
4375
4376 movdqu(vec1, Address(ary1, result, Address::times_1, -16));
4377 movdqu(vec2, Address(ary2, result, Address::times_1, -16));
4378 pxor(vec1, vec2);
4379
4380 ptest(vec1, vec1);
4381 jccb(Assembler::notZero, FALSE_LABEL);
4382 jmpb(TRUE_LABEL);
4383
4384 bind(COMPARE_TAIL); // limit is zero
4385 movl(limit, result);
4386 // Fallthru to tail compare
4387 }
4388
4389 // Compare 4-byte vectors
4390 if (expand_ary2) {
4391 testl(result, result);
4392 jccb(Assembler::zero, TRUE_LABEL);
4393 } else {
4394 andl(limit, 0xfffffffc); // vector count (in bytes)
4395 jccb(Assembler::zero, COMPARE_CHAR);
4396 }
4397
4398 lea(ary1, Address(ary1, limit, scaleFactor));
4399 lea(ary2, Address(ary2, limit, Address::times_1));
4400 negptr(limit);
4401
4402 bind(COMPARE_VECTORS);
4403 if (expand_ary2) {
4404       // There is no vector compare between bytes and shorts, so compare element by element
4405 movzbl(chr, Address(ary2, limit, Address::times_1));
4406 cmpw(Address(ary1, limit, Address::times_2), chr);
4407 jccb(Assembler::notEqual, FALSE_LABEL);
4408 addptr(limit, 1);
4409 jcc(Assembler::notZero, COMPARE_VECTORS);
4410 jmp(TRUE_LABEL);
4411 } else {
4412 movl(chr, Address(ary1, limit, Address::times_1));
4413 cmpl(chr, Address(ary2, limit, Address::times_1));
4414 jccb(Assembler::notEqual, FALSE_LABEL);
4415 addptr(limit, 4);
4416 jcc(Assembler::notZero, COMPARE_VECTORS);
4417 }
4418
4419 // Compare trailing char (final 2 bytes), if any
4420 bind(COMPARE_CHAR);
4421 testl(result, 0x2); // tail char
4422 jccb(Assembler::zero, COMPARE_BYTE);
4423 load_unsigned_short(chr, Address(ary1, 0));
4424 load_unsigned_short(limit, Address(ary2, 0));
4425 cmpl(chr, limit);
4426 jccb(Assembler::notEqual, FALSE_LABEL);
4427
4428 if (is_array_equ && is_char) {
4429 bind(COMPARE_BYTE);
4430 } else {
4431 lea(ary1, Address(ary1, 2));
4432 lea(ary2, Address(ary2, 2));
4433
4434 bind(COMPARE_BYTE);
4435 testl(result, 0x1); // tail byte
4436 jccb(Assembler::zero, TRUE_LABEL);
4437 load_unsigned_byte(chr, Address(ary1, 0));
4438 load_unsigned_byte(limit, Address(ary2, 0));
4439 cmpl(chr, limit);
4440 jccb(Assembler::notEqual, FALSE_LABEL);
4441 }
4442 bind(TRUE_LABEL);
4443 movl(result, 1); // return true
4444 jmpb(DONE);
4445
4446 bind(FALSE_LABEL);
4447 xorl(result, result); // return false
4448
4449 // That's it
4450 bind(DONE);
4451 if (UseAVX >= 2) {
4452 // clean upper bits of YMM registers
4453 vpxor(vec1, vec1);
4454 vpxor(vec2, vec2);
4455 }
4456 }
4457
4458 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) {
4459 #define __ masm.
4460 Register dst = stub.data<0>();
4461 XMMRegister src = stub.data<1>();
4462 address target = stub.data<2>();
4463 __ bind(stub.entry());
4464 __ subptr(rsp, 8);
4465 __ movdbl(Address(rsp), src);
4466 __ call(RuntimeAddress(target));
4467 // APX REX2 encoding for pop(dst) increases the stub size by 1 byte.
4468 __ pop(dst);
4469 __ jmp(stub.continuation());
4470 #undef __
4471 }
4472
4473 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) {
4474 assert(dst_bt == T_INT || dst_bt == T_LONG, "");
4475 assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, "");
4476
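  // cvttss2si/cvttsd2si return the 'integer indefinite' value (0x80000000 or 0x8000000000000000)
  // when the input is NaN or out of range; in that case branch to a fixup stub that
  // computes the Java-specified result.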
4477 address slowpath_target;
4478 if (dst_bt == T_INT) {
4479 if (src_bt == T_FLOAT) {
4480 cvttss2sil(dst, src);
4481 cmpl(dst, 0x80000000);
4482 slowpath_target = StubRoutines::x86::f2i_fixup();
4483 } else {
4484 cvttsd2sil(dst, src);
4485 cmpl(dst, 0x80000000);
4486 slowpath_target = StubRoutines::x86::d2i_fixup();
4487 }
4488 } else {
4489 if (src_bt == T_FLOAT) {
4490 cvttss2siq(dst, src);
4491 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4492 slowpath_target = StubRoutines::x86::f2l_fixup();
4493 } else {
4494 cvttsd2siq(dst, src);
4495 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4496 slowpath_target = StubRoutines::x86::d2l_fixup();
4497 }
4498 }
4499
4500 // Using the APX extended general purpose registers increases the instruction encoding size by 1 byte.
4501 int max_size = 23 + (UseAPX ? 1 : 0);
4502 auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, max_size, convertF2I_slowpath);
4503 jcc(Assembler::equal, stub->entry());
4504 bind(stub->continuation());
4505 }
4506
4507 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4508 XMMRegister src1, int imm8, bool merge, int vlen_enc) {
4509 switch(ideal_opc) {
4510 case Op_LShiftVS:
4511 Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
4512 case Op_LShiftVI:
4513 Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
4514 case Op_LShiftVL:
4515 Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
4516 case Op_RShiftVS:
4517 Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
4518 case Op_RShiftVI:
4519 Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
4520 case Op_RShiftVL:
4521 Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
4522 case Op_URShiftVS:
4523 Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
4524 case Op_URShiftVI:
4525 Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
4526 case Op_URShiftVL:
4527 Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
4528 case Op_RotateRightV:
4529 evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4530 case Op_RotateLeftV:
4531 evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4532 default:
4533 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]);
4534 break;
4535 }
4536 }
4537
4538 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4539 XMMRegister src2, bool is_unsigned, bool merge, int vlen_enc) {
4540 if (is_unsigned) {
4541 evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4542 } else {
4543 evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4544 }
4545 }
4546
4547 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4548 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
4549 switch (elem_bt) {
4550 case T_BYTE:
4551 if (ideal_opc == Op_SaturatingAddV) {
4552 evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
4553 } else {
4554 assert(ideal_opc == Op_SaturatingSubV, "");
4555 evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
4556 }
4557 break;
4558 case T_SHORT:
4559 if (ideal_opc == Op_SaturatingAddV) {
4560 evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
4561 } else {
4562 assert(ideal_opc == Op_SaturatingSubV, "");
4563 evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
4564 }
4565 break;
4566 default:
4567 fatal("Unsupported type %s", type2name(elem_bt));
4568 break;
4569 }
4570 }
4571
4572 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4573 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
4574 switch (elem_bt) {
4575 case T_BYTE:
4576 if (ideal_opc == Op_SaturatingAddV) {
4577 evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
4578 } else {
4579 assert(ideal_opc == Op_SaturatingSubV, "");
4580 evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
4581 }
4582 break;
4583 case T_SHORT:
4584 if (ideal_opc == Op_SaturatingAddV) {
4585 evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
4586 } else {
4587 assert(ideal_opc == Op_SaturatingSubV, "");
4588 evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
4589 }
4590 break;
4591 default:
4592 fatal("Unsupported type %s", type2name(elem_bt));
4593 break;
4594 }
4595 }
4596
4597 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4598 Address src2, bool is_unsigned, bool merge, int vlen_enc) {
4599 if (is_unsigned) {
4600 evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4601 } else {
4602 evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4603 }
4604 }
4605
4606 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4607 XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4608 switch (elem_bt) {
4609 case T_BYTE:
4610 if (ideal_opc == Op_SaturatingAddV) {
4611 evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
4612 } else {
4613 assert(ideal_opc == Op_SaturatingSubV, "");
4614 evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
4615 }
4616 break;
4617 case T_SHORT:
4618 if (ideal_opc == Op_SaturatingAddV) {
4619 evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
4620 } else {
4621 assert(ideal_opc == Op_SaturatingSubV, "");
4622 evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
4623 }
4624 break;
4625 default:
4626 fatal("Unsupported type %s", type2name(elem_bt));
4627 break;
4628 }
4629 }
4630
4631 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4632 XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4633 switch (elem_bt) {
4634 case T_BYTE:
4635 if (ideal_opc == Op_SaturatingAddV) {
4636 evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
4637 } else {
4638 assert(ideal_opc == Op_SaturatingSubV, "");
4639 evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
4640 }
4641 break;
4642 case T_SHORT:
4643 if (ideal_opc == Op_SaturatingAddV) {
4644 evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
4645 } else {
4646 assert(ideal_opc == Op_SaturatingSubV, "");
4647 evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
4648 }
4649 break;
4650 default:
4651 fatal("Unsupported type %s", type2name(elem_bt));
4652 break;
4653 }
4654 }
4655
4656 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4657 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc,
4658 bool is_varshift) {
4659 switch (ideal_opc) {
4660 case Op_AddVB:
4661 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4662 case Op_AddVS:
4663 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4664 case Op_AddVI:
4665 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4666 case Op_AddVL:
4667 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4668 case Op_AddVF:
4669 evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4670 case Op_AddVD:
4671 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4672 case Op_SubVB:
4673 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4674 case Op_SubVS:
4675 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4676 case Op_SubVI:
4677 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4678 case Op_SubVL:
4679 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4680 case Op_SubVF:
4681 evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4682 case Op_SubVD:
4683 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4684 case Op_MulVS:
4685 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4686 case Op_MulVI:
4687 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4688 case Op_MulVL:
4689 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4690 case Op_MulVF:
4691 evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4692 case Op_MulVD:
4693 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4694 case Op_DivVF:
4695 evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4696 case Op_DivVD:
4697 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4698 case Op_SqrtVF:
4699 evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break;
4700 case Op_SqrtVD:
4701 evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break;
4702 case Op_AbsVB:
4703 evpabsb(dst, mask, src2, merge, vlen_enc); break;
4704 case Op_AbsVS:
4705 evpabsw(dst, mask, src2, merge, vlen_enc); break;
4706 case Op_AbsVI:
4707 evpabsd(dst, mask, src2, merge, vlen_enc); break;
4708 case Op_AbsVL:
4709 evpabsq(dst, mask, src2, merge, vlen_enc); break;
4710 case Op_FmaVF:
4711 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4712 case Op_FmaVD:
4713 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4714 case Op_VectorRearrange:
4715 evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break;
4716 case Op_LShiftVS:
4717 evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4718 case Op_LShiftVI:
4719 evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4720 case Op_LShiftVL:
4721 evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4722 case Op_RShiftVS:
4723 evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4724 case Op_RShiftVI:
4725 evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4726 case Op_RShiftVL:
4727 evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4728 case Op_URShiftVS:
4729 evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4730 case Op_URShiftVI:
4731 evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4732 case Op_URShiftVL:
4733 evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4734 case Op_RotateLeftV:
4735 evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4736 case Op_RotateRightV:
4737 evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4738 case Op_MaxV:
4739 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4740 case Op_MinV:
4741 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4742 case Op_UMinV:
4743 evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4744 case Op_UMaxV:
4745 evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4746 case Op_XorV:
4747 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4748 case Op_OrV:
4749 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4750 case Op_AndV:
4751 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4752 default:
4753 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]);
4754 break;
4755 }
4756 }
4757
4758 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4759 XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4760 switch (ideal_opc) {
4761 case Op_AddVB:
4762 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4763 case Op_AddVS:
4764 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4765 case Op_AddVI:
4766 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4767 case Op_AddVL:
4768 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4769 case Op_AddVF:
4770 evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4771 case Op_AddVD:
4772 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4773 case Op_SubVB:
4774 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4775 case Op_SubVS:
4776 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4777 case Op_SubVI:
4778 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4779 case Op_SubVL:
4780 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4781 case Op_SubVF:
4782 evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4783 case Op_SubVD:
4784 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4785 case Op_MulVS:
4786 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4787 case Op_MulVI:
4788 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4789 case Op_MulVL:
4790 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4791 case Op_MulVF:
4792 evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4793 case Op_MulVD:
4794 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4795 case Op_DivVF:
4796 evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4797 case Op_DivVD:
4798 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4799 case Op_FmaVF:
4800 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4801 case Op_FmaVD:
4802 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4803 case Op_MaxV:
4804 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4805 case Op_MinV:
4806 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4807 case Op_UMaxV:
4808 evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4809 case Op_UMinV:
4810 evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4811 case Op_XorV:
4812 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4813 case Op_OrV:
4814 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4815 case Op_AndV:
4816 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4817 default:
4818 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]);
4819 break;
4820 }
4821 }
4822
4823 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
4824 KRegister src1, KRegister src2) {
4825 BasicType etype = T_ILLEGAL;
4826 switch(mask_len) {
4827 case 2:
4828 case 4:
4829 case 8: etype = T_BYTE; break;
4830 case 16: etype = T_SHORT; break;
4831 case 32: etype = T_INT; break;
4832 case 64: etype = T_LONG; break;
4833 default: fatal("Unsupported type"); break;
4834 }
4835 assert(etype != T_ILLEGAL, "");
4836 switch(ideal_opc) {
4837 case Op_AndVMask:
4838 kand(etype, dst, src1, src2); break;
4839 case Op_OrVMask:
4840 kor(etype, dst, src1, src2); break;
4841 case Op_XorVMask:
4842 kxor(etype, dst, src1, src2); break;
4843 default:
4844 fatal("Unsupported masked operation"); break;
4845 }
4846 }
4847
4848 /*
4849 * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the casting operation.
4850 * If src is NaN, the result is 0.
4851 * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE,
4852 * the result is equal to the value of Integer.MIN_VALUE.
4853 * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE,
4854 * the result is equal to the value of Integer.MAX_VALUE.
4855 */
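// A minimal scalar sketch of the semantics described above (comment only, not
// emitted code; the helper name f2i_java is hypothetical and INT_MIN/INT_MAX are
// assumed to come from <climits>):
//
//   int f2i_java(float f) {
//     if (f != f)               return 0;        // NaN
//     if (f <= (float) INT_MIN) return INT_MIN;  // -Inf or below the int range
//     if (f >= (float) INT_MAX) return INT_MAX;  // +Inf or above the int range
//     return (int) f;                            // ordinary truncation
//   }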
4856 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4857 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
4858 Register rscratch, AddressLiteral float_sign_flip,
4859 int vec_enc) {
4860 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4861 Label done;
4862 vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch);
4863 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
4864 vptest(xtmp2, xtmp2, vec_enc);
4865 jccb(Assembler::equal, done);
4866
4867 vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc);
4868 vpxor(xtmp1, xtmp1, xtmp4, vec_enc);
4869
4870 vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
4871 vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc);
4872 vblendvps(dst, dst, xtmp4, xtmp3, vec_enc);
4873
4874 // Recompute the mask for the remaining special values.
4875 vpxor(xtmp2, xtmp2, xtmp3, vec_enc);
4876 // Extract SRC values corresponding to TRUE mask lanes.
4877 vpand(xtmp4, xtmp2, src, vec_enc);
4878 // Flip mask bits so that MSB bit of MASK lanes corresponding to +ve special
4879 // values are set.
4880 vpxor(xtmp3, xtmp2, xtmp4, vec_enc);
4881
4882 vblendvps(dst, dst, xtmp1, xtmp3, vec_enc);
4883 bind(done);
4884 }
4885
4886 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4887 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4888 Register rscratch, AddressLiteral float_sign_flip,
4889 int vec_enc) {
4890 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4891 Label done;
4892 evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
4893 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
4894 kortestwl(ktmp1, ktmp1);
4895 jccb(Assembler::equal, done);
4896
4897 vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4898 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4899 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
4900
4901 kxorwl(ktmp1, ktmp1, ktmp2);
4902 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4903 vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4904 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
4905 bind(done);
4906 }
4907
4908 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4909 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4910 Register rscratch, AddressLiteral double_sign_flip,
4911 int vec_enc) {
4912 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4913
4914 Label done;
4915 evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
4916 Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc);
4917 kortestwl(ktmp1, ktmp1);
4918 jccb(Assembler::equal, done);
4919
4920 vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4921 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4922 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
4923
4924 kxorwl(ktmp1, ktmp1, ktmp2);
4925 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4926 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4927 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
4928 bind(done);
4929 }
4930
4931 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4932 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4933 Register rscratch, AddressLiteral float_sign_flip,
4934 int vec_enc) {
4935 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4936 Label done;
4937 evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
4938 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
4939 kortestwl(ktmp1, ktmp1);
4940 jccb(Assembler::equal, done);
4941
4942 vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4943 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4944 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
4945
4946 kxorwl(ktmp1, ktmp1, ktmp2);
4947 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4948 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4949 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
4950 bind(done);
4951 }
4952
4953 /*
4954 * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the casting operation.
4955 * If src is NaN, the result is 0.
4956 * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE,
4957 * the result is equal to the value of Long.MIN_VALUE.
4958 * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
4959 * the result is equal to the value of Long.MAX_VALUE.
4960 */
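// A minimal scalar sketch of the semantics described above (comment only, not
// emitted code; the helper name d2l_java is hypothetical and LLONG_MIN/LLONG_MAX
// are assumed to come from <climits>):
//
//   long long d2l_java(double d) {
//     if (d != d)                  return 0;          // NaN
//     if (d <= (double) LLONG_MIN) return LLONG_MIN;  // -Inf or below the long range
//     if (d >= (double) LLONG_MAX) return LLONG_MAX;  // +Inf or above the long range
//     return (long long) d;                           // ordinary truncation
//   }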
4961 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4962 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4963 Register rscratch, AddressLiteral double_sign_flip,
4964 int vec_enc) {
4965 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4966
4967 Label done;
4968 evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
4969 evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
4970 kortestwl(ktmp1, ktmp1);
4971 jccb(Assembler::equal, done);
4972
4973 vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4974 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4975 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
4976
4977 kxorwl(ktmp1, ktmp1, ktmp2);
4978 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4979 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4980 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
4981 bind(done);
4982 }
4983
4984 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
4985 XMMRegister xtmp, int index, int vec_enc) {
4986 assert(vec_enc < Assembler::AVX_512bit, "");
4987 if (vec_enc == Assembler::AVX_256bit) {
4988 vextractf128_high(xtmp, src);
4989 vshufps(dst, src, xtmp, index, vec_enc);
4990 } else {
4991 vshufps(dst, src, zero, index, vec_enc);
4992 }
4993 }
4994
4995 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
4996 XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
4997 AddressLiteral float_sign_flip, int src_vec_enc) {
4998 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4999
5000 Label done;
5001 // Compare the destination lanes with float_sign_flip
5002 // value to get mask for all special values.
5003 movdqu(xtmp1, float_sign_flip, rscratch);
5004 vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
5005 ptest(xtmp2, xtmp2);
5006 jccb(Assembler::equal, done);
5007
5008 // Flip float_sign_flip to get max integer value.
5009 vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
5010 pxor(xtmp1, xtmp4);
5011
5012 // Set destination lanes corresponding to unordered source lanes to zero.
5013 vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
5014 vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);
5015
5016 // Shuffle the mask vector and pack the lower double word from each quadword lane.
5017 vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5018 vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);
5019
5020 // Recompute the mask for the remaining special values.
5021 pxor(xtmp2, xtmp3);
5022 // Extract mask corresponding to non-negative source lanes.
5023 vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);
5024
5025 // Shuffle the mask vector and pack the lower double word from each quadword lane.
5026 vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5027 pand(xtmp3, xtmp2);
5028
5029 // Replace destination lanes holding the special value (0x80000000) with max int
5030 // if the corresponding source lane holds a +ve value.
5031 vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
5032 bind(done);
5033 }
5034
5035
5036 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
5037 XMMRegister xtmp, Register rscratch, int vec_enc) {
5038 switch(to_elem_bt) {
5039 case T_SHORT:
5040 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
5041 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
5042 vpackusdw(dst, dst, zero, vec_enc);
5043 if (vec_enc == Assembler::AVX_256bit) {
5044 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5045 }
5046 break;
5047 case T_BYTE:
5048 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
5049 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
5050 vpackusdw(dst, dst, zero, vec_enc);
5051 if (vec_enc == Assembler::AVX_256bit) {
5052 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5053 }
5054 vpackuswb(dst, dst, zero, vec_enc);
5055 break;
5056 default: assert(false, "Unexpected basic type for target of vector cast int to subword: %s", type2name(to_elem_bt));
5057 }
5058 }
5059
5060 /*
5061 * Algorithm for vector D2L and F2I conversions (used when AVX 10.2 is not supported):-
5062 * a) Perform the vector D2L/F2I cast.
5063 * b) Take the fast path if none of the result vector lanes contains the value 0x80000000;
5064 * a lane holding 0x80000000 signifies that the source value could be one of the special
5065 * floating point values (NaN, -Inf, Inf, Max, Min).
5066 * c) Set the destination lane to zero if the source lane is a NaN value.
5067 * d) Replace 0x80000000 with MaxInt if the source lane contains a +ve value.
5068 */
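// A minimal scalar sketch of the fast-path check in step b) above (comment only;
// the helper name needs_fixup and the lane array/count are hypothetical). The x86
// truncating converts return the "integer indefinite" value 0x80000000 for
// NaN/Inf/out-of-range inputs, so the fix-up is needed only if some lane holds it.
//
//   bool needs_fixup(const int32_t* lanes, int n) {
//     for (int i = 0; i < n; i++) {
//       if (lanes[i] == INT32_MIN) return true;  // lane may stem from a special source value
//     }
//     return false;
//   }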
5069
5070 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5071 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
5072 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5073 int to_elem_sz = type2aelembytes(to_elem_bt);
5074 assert(to_elem_sz <= 4, "");
5075 vcvttps2dq(dst, src, vec_enc);
5076 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc);
5077 if (to_elem_sz < 4) {
5078 vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
5079 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc);
5080 }
5081 }
5082
5083 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5084 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
5085 Register rscratch, int vec_enc) {
5086 int to_elem_sz = type2aelembytes(to_elem_bt);
5087 assert(to_elem_sz <= 4, "");
5088 vcvttps2dq(dst, src, vec_enc);
5089 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc);
5090 switch(to_elem_bt) {
5091 case T_INT:
5092 break;
5093 case T_SHORT:
5094 evpmovdw(dst, dst, vec_enc);
5095 break;
5096 case T_BYTE:
5097 evpmovdb(dst, dst, vec_enc);
5098 break;
5099 default: assert(false, "Unexpected basic type for target of vector castF2X EVEX: %s", type2name(to_elem_bt));
5100 }
5101 }
5102
5103 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
5104 KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
5105 Register rscratch, int vec_enc) {
5106 evcvttps2qq(dst, src, vec_enc);
5107 vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc);
5108 }
5109
5110 // Handling for downcasting from double to integer or sub-word types on AVX2.
5111 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5112 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5,
5113 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5114 int to_elem_sz = type2aelembytes(to_elem_bt);
5115 assert(to_elem_sz < 8, "");
5116 vcvttpd2dq(dst, src, vec_enc);
5117 vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch,
5118 float_sign_flip, vec_enc);
5119 if (to_elem_sz < 4) {
5120 // xtmp4 holds all zero lanes.
5121 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit);
5122 }
5123 }
5124
5125 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src,
5126 XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1,
5127 KRegister ktmp2, AddressLiteral sign_flip,
5128 Register rscratch, int vec_enc) {
5129 if (VM_Version::supports_avx512dq()) {
5130 evcvttpd2qq(dst, src, vec_enc);
5131 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5132 switch(to_elem_bt) {
5133 case T_LONG:
5134 break;
5135 case T_INT:
5136 evpmovsqd(dst, dst, vec_enc);
5137 break;
5138 case T_SHORT:
5139 evpmovsqd(dst, dst, vec_enc);
5140 evpmovdw(dst, dst, vec_enc);
5141 break;
5142 case T_BYTE:
5143 evpmovsqd(dst, dst, vec_enc);
5144 evpmovdb(dst, dst, vec_enc);
5145 break;
5146 default: assert(false, "Unexpected basic type for target of vector castD2X AVX512DQ EVEX: %s", type2name(to_elem_bt));
5147 }
5148 } else {
5149 assert(type2aelembytes(to_elem_bt) <= 4, "");
5150 vcvttpd2dq(dst, src, vec_enc);
5151 vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
5152 switch(to_elem_bt) {
5153 case T_INT:
5154 break;
5155 case T_SHORT:
5156 evpmovdw(dst, dst, vec_enc);
5157 break;
5158 case T_BYTE:
5159 evpmovdb(dst, dst, vec_enc);
5160 break;
5161 default: assert(false, "Unexpected basic type for target of vector castD2X EVEX: %s", type2name(to_elem_bt));
5162 }
5163 }
5164 }
5165
5166 void C2_MacroAssembler::vector_castF2X_avx10(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vec_enc) {
5167 switch(to_elem_bt) {
5168 case T_LONG:
5169 evcvttps2qqs(dst, src, vec_enc);
5170 break;
5171 case T_INT:
5172 evcvttps2dqs(dst, src, vec_enc);
5173 break;
5174 case T_SHORT:
5175 evcvttps2dqs(dst, src, vec_enc);
5176 evpmovdw(dst, dst, vec_enc);
5177 break;
5178 case T_BYTE:
5179 evcvttps2dqs(dst, src, vec_enc);
5180 evpmovdb(dst, dst, vec_enc);
5181 break;
5182 default: assert(false, "Unexpected basic type for target of vector castF2X AVX10 (reg src): %s", type2name(to_elem_bt));
5183 }
5184 }
5185
5186 void C2_MacroAssembler::vector_castF2X_avx10(BasicType to_elem_bt, XMMRegister dst, Address src, int vec_enc) {
5187 switch(to_elem_bt) {
5188 case T_LONG:
5189 evcvttps2qqs(dst, src, vec_enc);
5190 break;
5191 case T_INT:
5192 evcvttps2dqs(dst, src, vec_enc);
5193 break;
5194 case T_SHORT:
5195 evcvttps2dqs(dst, src, vec_enc);
5196 evpmovdw(dst, dst, vec_enc);
5197 break;
5198 case T_BYTE:
5199 evcvttps2dqs(dst, src, vec_enc);
5200 evpmovdb(dst, dst, vec_enc);
5201 break;
5202 default: assert(false, "Unexpected basic type for target of vector castF2X AVX10 (mem src): %s", type2name(to_elem_bt));
5203 }
5204 }
5205
5206 void C2_MacroAssembler::vector_castD2X_avx10(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vec_enc) {
5207 switch(to_elem_bt) {
5208 case T_LONG:
5209 evcvttpd2qqs(dst, src, vec_enc);
5210 break;
5211 case T_INT:
5212 evcvttpd2dqs(dst, src, vec_enc);
5213 break;
5214 case T_SHORT:
5215 evcvttpd2dqs(dst, src, vec_enc);
5216 evpmovdw(dst, dst, vec_enc);
5217 break;
5218 case T_BYTE:
5219 evcvttpd2dqs(dst, src, vec_enc);
5220 evpmovdb(dst, dst, vec_enc);
5221 break;
5222 default: assert(false, "Unexpected basic type for target of vector castD2X AVX10 (reg src): %s", type2name(to_elem_bt));
5223 }
5224 }
5225
5226 void C2_MacroAssembler::vector_castD2X_avx10(BasicType to_elem_bt, XMMRegister dst, Address src, int vec_enc) {
5227 switch(to_elem_bt) {
5228 case T_LONG:
5229 evcvttpd2qqs(dst, src, vec_enc);
5230 break;
5231 case T_INT:
5232 evcvttpd2dqs(dst, src, vec_enc);
5233 break;
5234 case T_SHORT:
5235 evcvttpd2dqs(dst, src, vec_enc);
5236 evpmovdw(dst, dst, vec_enc);
5237 break;
5238 case T_BYTE:
5239 evcvttpd2dqs(dst, src, vec_enc);
5240 evpmovdb(dst, dst, vec_enc);
5241 break;
5242 default: assert(false, "Unexpected basic type for target of vector castD2X AVX10 (mem src): %s", type2name(to_elem_bt));
5243 }
5244 }
5245
5246 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src,
5247 AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5248 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
5249 // Perform the floor(val+0.5) operation under the influence of the MXCSR.RC mode rounding towards -inf,
5250 // and re-instantiate the original MXCSR.RC mode after that.
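// A minimal scalar sketch of the rounding trick above (comment only; rint_rd is a
// hypothetical stand-in for a convert executed with MXCSR.RC forced to round
// towards -inf):
//
//   long long round_double(double d) {
//     return rint_rd(d + 0.5);   // floor(d + 0.5), i.e. Math.round semantics
//   }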
5251 ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5252
5253 mov64(tmp, julong_cast(0.5L));
5254 evpbroadcastq(xtmp1, tmp, vec_enc);
5255 vaddpd(xtmp1, src , xtmp1, vec_enc);
5256 evcvtpd2qq(dst, xtmp1, vec_enc);
5257 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5258 double_sign_flip, vec_enc);
5259
5260 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5261 }
5262
5263 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src,
5264 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5265 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
5266 // Perform the floor(val+0.5) operation under the influence of the MXCSR.RC mode rounding towards -inf,
5267 // and re-instantiate the original MXCSR.RC mode after that.
5268 ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5269
5270 movl(tmp, jint_cast(0.5));
5271 movq(xtmp1, tmp);
5272 vbroadcastss(xtmp1, xtmp1, vec_enc);
5273 vaddps(xtmp1, src , xtmp1, vec_enc);
5274 vcvtps2dq(dst, xtmp1, vec_enc);
5275 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5276 float_sign_flip, vec_enc);
5277
5278 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5279 }
5280
5281 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src,
5282 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5283 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) {
5284 // Perform the floor(val+0.5) operation under the influence of the MXCSR.RC mode rounding towards -inf,
5285 // and re-instantiate the original MXCSR.RC mode after that.
5286 ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5287
5288 movl(tmp, jint_cast(0.5));
5289 movq(xtmp1, tmp);
5290 vbroadcastss(xtmp1, xtmp1, vec_enc);
5291 vaddps(xtmp1, src , xtmp1, vec_enc);
5292 vcvtps2dq(dst, xtmp1, vec_enc);
5293 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc);
5294
5295 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5296 }
5297
5298 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5299 BasicType from_elem_bt, BasicType to_elem_bt) {
5300 switch (from_elem_bt) {
5301 case T_BYTE:
5302 switch (to_elem_bt) {
5303 case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break;
5304 case T_INT: vpmovzxbd(dst, src, vlen_enc); break;
5305 case T_LONG: vpmovzxbq(dst, src, vlen_enc); break;
5306 default: ShouldNotReachHere();
5307 }
5308 break;
5309 case T_SHORT:
5310 switch (to_elem_bt) {
5311 case T_INT: vpmovzxwd(dst, src, vlen_enc); break;
5312 case T_LONG: vpmovzxwq(dst, src, vlen_enc); break;
5313 default: ShouldNotReachHere();
5314 }
5315 break;
5316 case T_INT:
5317 assert(to_elem_bt == T_LONG, "");
5318 vpmovzxdq(dst, src, vlen_enc);
5319 break;
5320 default:
5321 ShouldNotReachHere();
5322 }
5323 }
5324
5325 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5326 BasicType from_elem_bt, BasicType to_elem_bt) {
5327 switch (from_elem_bt) {
5328 case T_BYTE:
5329 switch (to_elem_bt) {
5330 case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break;
5331 case T_INT: vpmovsxbd(dst, src, vlen_enc); break;
5332 case T_LONG: vpmovsxbq(dst, src, vlen_enc); break;
5333 default: ShouldNotReachHere();
5334 }
5335 break;
5336 case T_SHORT:
5337 switch (to_elem_bt) {
5338 case T_INT: vpmovsxwd(dst, src, vlen_enc); break;
5339 case T_LONG: vpmovsxwq(dst, src, vlen_enc); break;
5340 default: ShouldNotReachHere();
5341 }
5342 break;
5343 case T_INT:
5344 assert(to_elem_bt == T_LONG, "");
5345 vpmovsxdq(dst, src, vlen_enc);
5346 break;
5347 default:
5348 ShouldNotReachHere();
5349 }
5350 }
5351
5352 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src,
5353 BasicType dst_bt, BasicType src_bt, int vlen) {
5354 int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen);
5355 assert(vlen_enc != AVX_512bit, "");
5356
5357 int dst_bt_size = type2aelembytes(dst_bt);
5358 int src_bt_size = type2aelembytes(src_bt);
5359 if (dst_bt_size > src_bt_size) {
5360 switch (dst_bt_size / src_bt_size) {
5361 case 2: vpmovsxbw(dst, src, vlen_enc); break;
5362 case 4: vpmovsxbd(dst, src, vlen_enc); break;
5363 case 8: vpmovsxbq(dst, src, vlen_enc); break;
5364 default: ShouldNotReachHere();
5365 }
5366 } else {
5367 assert(dst_bt_size < src_bt_size, "");
5368 switch (src_bt_size / dst_bt_size) {
5369 case 2: {
5370 if (vlen_enc == AVX_128bit) {
5371 vpacksswb(dst, src, src, vlen_enc);
5372 } else {
5373 vpacksswb(dst, src, src, vlen_enc);
5374 vpermq(dst, dst, 0x08, vlen_enc);
5375 }
5376 break;
5377 }
5378 case 4: {
5379 if (vlen_enc == AVX_128bit) {
5380 vpackssdw(dst, src, src, vlen_enc);
5381 vpacksswb(dst, dst, dst, vlen_enc);
5382 } else {
5383 vpackssdw(dst, src, src, vlen_enc);
5384 vpermq(dst, dst, 0x08, vlen_enc);
5385 vpacksswb(dst, dst, dst, AVX_128bit);
5386 }
5387 break;
5388 }
5389 case 8: {
5390 if (vlen_enc == AVX_128bit) {
5391 vpshufd(dst, src, 0x08, vlen_enc);
5392 vpackssdw(dst, dst, dst, vlen_enc);
5393 vpacksswb(dst, dst, dst, vlen_enc);
5394 } else {
5395 vpshufd(dst, src, 0x08, vlen_enc);
5396 vpermq(dst, dst, 0x08, vlen_enc);
5397 vpackssdw(dst, dst, dst, AVX_128bit);
5398 vpacksswb(dst, dst, dst, AVX_128bit);
5399 }
5400 break;
5401 }
5402 default: ShouldNotReachHere();
5403 }
5404 }
5405 }
5406
5407 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
5408 bool merge, BasicType bt, int vlen_enc) {
5409 if (bt == T_INT) {
5410 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5411 } else {
5412 assert(bt == T_LONG, "");
5413 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5414 }
5415 }
5416
5417 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
5418 bool merge, BasicType bt, int vlen_enc) {
5419 if (bt == T_INT) {
5420 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5421 } else {
5422 assert(bt == T_LONG, "");
5423 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5424 }
5425 }
5426
5427 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
5428 Register rtmp2, XMMRegister xtmp, int mask_len,
5429 int vec_enc) {
5430 int index = 0;
5431 int vindex = 0;
5432 mov64(rtmp1, 0x0101010101010101L);
5433 pdepq(rtmp1, src, rtmp1);
5434 if (mask_len > 8) {
5435 movq(rtmp2, src);
5436 vpxor(xtmp, xtmp, xtmp, vec_enc);
5437 movq(xtmp, rtmp1);
5438 }
5439 movq(dst, rtmp1);
5440
5441 mask_len -= 8;
5442 while (mask_len > 0) {
5443 assert ((mask_len & 0x7) == 0, "mask must be multiple of 8");
5444 index++;
5445 if ((index % 2) == 0) {
5446 pxor(xtmp, xtmp);
5447 }
5448 mov64(rtmp1, 0x0101010101010101L);
5449 shrq(rtmp2, 8);
5450 pdepq(rtmp1, rtmp2, rtmp1);
5451 pinsrq(xtmp, rtmp1, index % 2);
5452 vindex = index / 2;
5453 if (vindex) {
5454 // Write the entire 16 byte vector when both 64 bit
5455 // lanes are updated, to save redundant instructions.
5456 if (index % 2) {
5457 vinsertf128(dst, dst, xtmp, vindex);
5458 }
5459 } else {
5460 vmovdqu(dst, xtmp);
5461 }
5462 mask_len -= 8;
5463 }
5464 }
5465
5466 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
5467 switch(opc) {
5468 case Op_VectorMaskTrueCount:
5469 popcntq(dst, tmp);
5470 break;
5471 case Op_VectorMaskLastTrue:
5472 if (VM_Version::supports_lzcnt()) {
5473 lzcntq(tmp, tmp);
5474 movl(dst, 63);
5475 subl(dst, tmp);
5476 } else {
5477 movl(dst, -1);
5478 bsrq(tmp, tmp);
5479 cmov32(Assembler::notZero, dst, tmp);
5480 }
5481 break;
5482 case Op_VectorMaskFirstTrue:
5483 if (VM_Version::supports_bmi1()) {
5484 if (masklen < 32) {
5485 orl(tmp, 1 << masklen);
5486 tzcntl(dst, tmp);
5487 } else if (masklen == 32) {
5488 tzcntl(dst, tmp);
5489 } else {
5490 assert(masklen == 64, "");
5491 tzcntq(dst, tmp);
5492 }
5493 } else {
5494 if (masklen < 32) {
5495 orl(tmp, 1 << masklen);
5496 bsfl(dst, tmp);
5497 } else {
5498 assert(masklen == 32 || masklen == 64, "");
5499 movl(dst, masklen);
5500 if (masklen == 32) {
5501 bsfl(tmp, tmp);
5502 } else {
5503 bsfq(tmp, tmp);
5504 }
5505 cmov32(Assembler::notZero, dst, tmp);
5506 }
5507 }
5508 break;
5509 case Op_VectorMaskToLong:
5510 assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
5511 break;
5512 default: assert(false, "Unhandled mask operation");
5513 }
5514 }
5515
5516 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
5517 int masklen, int masksize, int vec_enc) {
5518 assert(VM_Version::supports_popcnt(), "");
5519
5520 if (VM_Version::supports_avx512bw()) {
5521 kmovql(tmp, mask);
5522 } else {
5523 assert(masklen <= 16, "");
5524 kmovwl(tmp, mask);
5525 }
5526
5527 // A mask generated out of partial vector comparison/replicate/mask manipulation
5528 // operations needs to be clipped.
5529 if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
5530 andq(tmp, (1 << masklen) - 1);
5531 }
5532
5533 vector_mask_operation_helper(opc, dst, tmp, masklen);
5534 }
5535
5536 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
5537 Register tmp, int masklen, BasicType bt, int vec_enc) {
5538 assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) ||
5539 (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), "");
5540 assert(VM_Version::supports_popcnt(), "");
5541
5542 bool need_clip = false;
5543 switch(bt) {
5544 case T_BOOLEAN:
5545 // While masks of other types contain 0/-1 lane values, boolean masks contain lane values of 0 or 1.
5546 vpxor(xtmp, xtmp, xtmp, vec_enc);
5547 vpsubb(xtmp, xtmp, mask, vec_enc);
5548 vpmovmskb(tmp, xtmp, vec_enc);
5549 need_clip = masklen < 16;
5550 break;
5551 case T_BYTE:
5552 vpmovmskb(tmp, mask, vec_enc);
5553 need_clip = masklen < 16;
5554 break;
5555 case T_SHORT:
5556 vpacksswb(xtmp, mask, mask, vec_enc);
5557 if (masklen >= 16) {
5558 vpermpd(xtmp, xtmp, 8, vec_enc);
5559 }
5560 vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
5561 need_clip = masklen < 16;
5562 break;
5563 case T_INT:
5564 case T_FLOAT:
5565 vmovmskps(tmp, mask, vec_enc);
5566 need_clip = masklen < 4;
5567 break;
5568 case T_LONG:
5569 case T_DOUBLE:
5570 vmovmskpd(tmp, mask, vec_enc);
5571 need_clip = masklen < 2;
5572 break;
5573 default: assert(false, "Unhandled type, %s", type2name(bt));
5574 }
5575
5576 // A mask generated out of partial vector comparison/replicate/mask manipulation
5577 // operations needs to be clipped.
5578 if (need_clip && opc != Op_VectorMaskFirstTrue) {
5579 // need_clip implies masklen < 32
5580 andq(tmp, (1 << masklen) - 1);
5581 }
5582
5583 vector_mask_operation_helper(opc, dst, tmp, masklen);
5584 }
5585
5586 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
5587 Register rtmp2, int mask_len) {
5588 kmov(rtmp1, src);
5589 andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));
5590 mov64(rtmp2, -1L);
5591 pextq(rtmp2, rtmp2, rtmp1);
5592 kmov(dst, rtmp2);
5593 }
5594
5595 void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src,
5596 XMMRegister mask, Register rtmp, Register rscratch,
5597 XMMRegister permv, XMMRegister xtmp, BasicType bt,
5598 int vec_enc) {
5599 assert(type2aelembytes(bt) >= 4, "");
5600 assert(opcode == Op_CompressV || opcode == Op_ExpandV, "");
5601 address compress_perm_table = nullptr;
5602 address expand_perm_table = nullptr;
5603 if (type2aelembytes(bt) == 8) {
5604 compress_perm_table = StubRoutines::x86::compress_perm_table64();
5605 expand_perm_table = StubRoutines::x86::expand_perm_table64();
5606 vmovmskpd(rtmp, mask, vec_enc);
5607 } else {
5608 compress_perm_table = StubRoutines::x86::compress_perm_table32();
5609 expand_perm_table = StubRoutines::x86::expand_perm_table32();
5610 vmovmskps(rtmp, mask, vec_enc);
5611 }
5612 shlq(rtmp, 5); // for 32 byte permute row.
5613 if (opcode == Op_CompressV) {
5614 lea(rscratch, ExternalAddress(compress_perm_table));
5615 } else {
5616 lea(rscratch, ExternalAddress(expand_perm_table));
5617 }
5618 addptr(rtmp, rscratch);
5619 vmovdqu(permv, Address(rtmp));
5620 vpermps(dst, permv, src, Assembler::AVX_256bit);
5621 vpxor(xtmp, xtmp, xtmp, vec_enc);
5622 // Blend the result with the zero vector using the permute mask: each column entry
5623 // in a permute table row contains either a valid permute index or a -1 (default)
5624 // value, so the permute vector can also be used as a blending mask after
5625 // compressing/expanding the source vector lanes.
5626 vblendvps(dst, dst, xtmp, permv, vec_enc, true, permv);
5627 }
5628
5629 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
5630 bool merge, BasicType bt, int vec_enc) {
5631 if (opcode == Op_CompressV) {
5632 switch(bt) {
5633 case T_BYTE:
5634 evpcompressb(dst, mask, src, merge, vec_enc);
5635 break;
5636 case T_CHAR:
5637 case T_SHORT:
5638 evpcompressw(dst, mask, src, merge, vec_enc);
5639 break;
5640 case T_INT:
5641 evpcompressd(dst, mask, src, merge, vec_enc);
5642 break;
5643 case T_FLOAT:
5644 evcompressps(dst, mask, src, merge, vec_enc);
5645 break;
5646 case T_LONG:
5647 evpcompressq(dst, mask, src, merge, vec_enc);
5648 break;
5649 case T_DOUBLE:
5650 evcompresspd(dst, mask, src, merge, vec_enc);
5651 break;
5652 default:
5653 fatal("Unsupported type %s", type2name(bt));
5654 break;
5655 }
5656 } else {
5657 assert(opcode == Op_ExpandV, "");
5658 switch(bt) {
5659 case T_BYTE:
5660 evpexpandb(dst, mask, src, merge, vec_enc);
5661 break;
5662 case T_CHAR:
5663 case T_SHORT:
5664 evpexpandw(dst, mask, src, merge, vec_enc);
5665 break;
5666 case T_INT:
5667 evpexpandd(dst, mask, src, merge, vec_enc);
5668 break;
5669 case T_FLOAT:
5670 evexpandps(dst, mask, src, merge, vec_enc);
5671 break;
5672 case T_LONG:
5673 evpexpandq(dst, mask, src, merge, vec_enc);
5674 break;
5675 case T_DOUBLE:
5676 evexpandpd(dst, mask, src, merge, vec_enc);
5677 break;
5678 default:
5679 fatal("Unsupported type %s", type2name(bt));
5680 break;
5681 }
5682 }
5683 }
5684
5685 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5686 KRegister ktmp1, int vec_enc) {
5687 if (opcode == Op_SignumVD) {
5688 vsubpd(dst, zero, one, vec_enc);
5689 // if src < 0 ? -1 : 1
5690 evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5691 evblendmpd(dst, ktmp1, one, dst, true, vec_enc);
5692 // if src == NaN, -0.0 or 0.0 return src.
5693 evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5694 evblendmpd(dst, ktmp1, dst, src, true, vec_enc);
5695 } else {
5696 assert(opcode == Op_SignumVF, "");
5697 vsubps(dst, zero, one, vec_enc);
5698 // if src < 0 ? -1 : 1
5699 evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
5700 evblendmps(dst, ktmp1, one, dst, true, vec_enc);
5701 // if src == NaN, -0.0 or 0.0 return src.
5702 evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
5703 evblendmps(dst, ktmp1, dst, src, true, vec_enc);
5704 }
5705 }
5706
5707 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
5708 XMMRegister xtmp1, int vec_enc) {
5709 if (opcode == Op_SignumVD) {
5710 vsubpd(dst, zero, one, vec_enc);
5711 // if src < 0 ? -1 : 1
5712 vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1);
5713 // if src == NaN, -0.0 or 0.0 return src.
5714 vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5715 vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5716 } else {
5717 assert(opcode == Op_SignumVF, "");
5718 vsubps(dst, zero, one, vec_enc);
5719 // if src < 0 ? -1 : 1
5720 vblendvps(dst, one, dst, src, vec_enc, true, xtmp1);
5721 // if src == NaN, -0.0 or 0.0 return src.
5722 vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5723 vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5724 }
5725 }
5726
5727 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
5728 if (VM_Version::supports_avx512bw()) {
5729 if (mask_len > 32) {
5730 kmovql(dst, src);
5731 } else {
5732 kmovdl(dst, src);
5733 if (mask_len != 32) {
5734 kshiftrdl(dst, dst, 32 - mask_len);
5735 }
5736 }
5737 } else {
5738 assert(mask_len <= 16, "");
5739 kmovwl(dst, src);
5740 if (mask_len != 16) {
5741 kshiftrwl(dst, dst, 16 - mask_len);
5742 }
5743 }
5744 }
5745
5746 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
5747 int lane_size = type2aelembytes(bt);
5748 if ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
5749 (is_subword_type(bt) && VM_Version::supports_avx512vlbw())) {
5750 movptr(rtmp, imm32);
5751 switch(lane_size) {
5752 case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
5753 case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
5754 case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
5755 case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
5756 default : fatal("Unsupported lane size %d", lane_size);
5757 break;
5758 }
5759 } else {
5760 movptr(rtmp, imm32);
5761 movq(dst, rtmp);
5762 switch(lane_size) {
5763 case 1 : vpbroadcastb(dst, dst, vec_enc); break;
5764 case 2 : vpbroadcastw(dst, dst, vec_enc); break;
5765 case 4 : vpbroadcastd(dst, dst, vec_enc); break;
5766 case 8 : vpbroadcastq(dst, dst, vec_enc); break;
5767 default : fatal("Unsupported lane size %d", lane_size);
5768 break;
5769 }
5770 }
5771 }
5772
5773 //
5774 // Following is a lookup table based popcount computation algorithm:-
5775 // Index Bit set count
5776 // [ 0000 -> 0,
5777 // 0001 -> 1,
5778 // 0010 -> 1,
5779 // 0011 -> 2,
5780 // 0100 -> 1,
5781 // 0101 -> 2,
5782 // 0110 -> 2,
5783 // 0111 -> 3,
5784 // 1000 -> 1,
5785 // 1001 -> 2,
5786 // 1010 -> 2,
5787 // 1011 -> 3,
5788 // 1100 -> 2,
5789 // 1101 -> 3,
// 1110 -> 3,
5790 // 1111 -> 4 ]
5791 // a. Count the number of 1s in the 4 LSB bits of each byte. These bits are used as
5792 // shuffle indices for lookup table access.
5793 // b. Right shift each byte of vector lane by 4 positions.
5794 // c. Count the number of 1s in the 4 MSB bits of each byte. These bits are used as
5795 // shuffle indices for lookup table access.
5796 // d. Add the bitset count of upper and lower 4 bits of each byte.
5797 // e. Unpack double words to quad words and compute sum of absolute difference of bitset
5798 // count of all the bytes of a quadword.
5799 // f. Perform step e. for upper 128bit vector lane.
5800 // g. Pack the bitset count of quadwords back to double word.
5801 // h. Unpacking and packing operations are not needed for 64bit vector lane.
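// A minimal scalar sketch of steps a.-d. above (comment only; the helper name
// popcount_byte_lut is hypothetical):
//
//   int popcount_byte_lut(uint8_t b) {
//     static const uint8_t lut[16] = {0,1,1,2, 1,2,2,3, 1,2,2,3, 2,3,3,4};
//     return lut[b & 0x0F] + lut[b >> 4];   // low-nibble count + high-nibble count
//   }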
5802
5803 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5804 XMMRegister xtmp2, Register rtmp, int vec_enc) {
5805 assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
5806 vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);
5807 vpsrlw(dst, src, 4, vec_enc);
5808 vpand(dst, dst, xtmp1, vec_enc);
5809 vpand(xtmp1, src, xtmp1, vec_enc);
5810 vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg);
5811 vpshufb(xtmp1, xtmp2, xtmp1, vec_enc);
5812 vpshufb(dst, xtmp2, dst, vec_enc);
5813 vpaddb(dst, dst, xtmp1, vec_enc);
5814 }
5815
5816 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5817 XMMRegister xtmp2, Register rtmp, int vec_enc) {
5818 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5819 // The following code is as per steps e, f, g and h of the above algorithm.
5820 vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5821 vpunpckhdq(dst, xtmp1, xtmp2, vec_enc);
5822 vpsadbw(dst, dst, xtmp2, vec_enc);
5823 vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc);
5824 vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc);
5825 vpackuswb(dst, xtmp1, dst, vec_enc);
5826 }
5827
5828 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5829 XMMRegister xtmp2, Register rtmp, int vec_enc) {
5830 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5831 // Add the popcount of upper and lower bytes of word.
5832 vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc);
5833 vpsrlw(dst, xtmp1, 8, vec_enc);
5834 vpand(xtmp1, xtmp1, xtmp2, vec_enc);
5835 vpaddw(dst, dst, xtmp1, vec_enc);
5836 }
5837
5838 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5839 XMMRegister xtmp2, Register rtmp, int vec_enc) {
5840 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
5841 vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5842 vpsadbw(dst, xtmp1, xtmp2, vec_enc);
5843 }
5844
5845 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5846 XMMRegister xtmp2, Register rtmp, int vec_enc) {
5847 switch(bt) {
5848 case T_LONG:
5849 vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5850 break;
5851 case T_INT:
5852 vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5853 break;
5854 case T_CHAR:
5855 case T_SHORT:
5856 vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5857 break;
5858 case T_BYTE:
5859 case T_BOOLEAN:
5860 vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5861 break;
5862 default:
5863 fatal("Unsupported type %s", type2name(bt));
5864 break;
5865 }
5866 }
5867
5868 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5869 KRegister mask, bool merge, int vec_enc) {
5870 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5871 switch(bt) {
5872 case T_LONG:
5873 assert(VM_Version::supports_avx512_vpopcntdq(), "");
5874 evpopcntq(dst, mask, src, merge, vec_enc);
5875 break;
5876 case T_INT:
5877 assert(VM_Version::supports_avx512_vpopcntdq(), "");
5878 evpopcntd(dst, mask, src, merge, vec_enc);
5879 break;
5880 case T_CHAR:
5881 case T_SHORT:
5882 assert(VM_Version::supports_avx512_bitalg(), "");
5883 evpopcntw(dst, mask, src, merge, vec_enc);
5884 break;
5885 case T_BYTE:
5886 case T_BOOLEAN:
5887 assert(VM_Version::supports_avx512_bitalg(), "");
5888 evpopcntb(dst, mask, src, merge, vec_enc);
5889 break;
5890 default:
5891 fatal("Unsupported type %s", type2name(bt));
5892 break;
5893 }
5894 }
5895
5896 // Bit reversal algorithm first reverses the bits of each byte followed by
5897 // a byte level reversal for multi-byte primitive types (short/int/long).
5898 // The algorithm performs a lookup table access to get the reverse bit sequence
5899 // corresponding to a 4 bit value. Thus a reverse bit sequence for a byte
5900 // is obtained by swapping the reverse bit sequences of upper and lower
5901 // nibble of a byte.
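// A minimal scalar sketch of the per-byte step (comment only; the helper name
// reverse_bits_byte is hypothetical):
//
//   uint8_t reverse_bits_byte(uint8_t b) {
//     static const uint8_t rev4[16] = {0x0,0x8,0x4,0xC, 0x2,0xA,0x6,0xE,
//                                      0x1,0x9,0x5,0xD, 0x3,0xB,0x7,0xF};
//     return (uint8_t)((rev4[b & 0x0F] << 4) | rev4[b >> 4]);   // swap reversed nibbles
//   }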
5902 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5903 XMMRegister xtmp2, Register rtmp, int vec_enc) {
5904 if (VM_Version::supports_avx512vlbw()) {
5905
5906 // Get the reverse bit sequence of lower nibble of each byte.
5907 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
5908 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5909 evpandq(dst, xtmp2, src, vec_enc);
5910 vpshufb(dst, xtmp1, dst, vec_enc);
5911 vpsllq(dst, dst, 4, vec_enc);
5912
5913 // Get the reverse bit sequence of upper nibble of each byte.
5914 vpandn(xtmp2, xtmp2, src, vec_enc);
5915 vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5916 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5917
5918 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
5919 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
5920 evporq(xtmp2, dst, xtmp2, vec_enc);
5921 vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5922
5923 } else if(vec_enc == Assembler::AVX_512bit) {
5924 // Shift based bit reversal.
5925 assert(bt == T_LONG || bt == T_INT, "");
5926
5927 // Swap lower and upper nibble of each byte.
5928 vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc);
5929
5930 // Swap two least and most significant bits of each nibble.
5931 vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc);
5932
5933 // Swap adjacent pair of bits.
5934 evmovdqul(xtmp1, k0, dst, true, vec_enc);
5935 vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc);
5936
5937 evmovdqul(xtmp1, k0, dst, true, vec_enc);
5938 vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);
5939 } else {
5940 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp);
5941 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
5942
5943 // Get the reverse bit sequence of lower nibble of each byte.
5944 vpand(dst, xtmp2, src, vec_enc);
5945 vpshufb(dst, xtmp1, dst, vec_enc);
5946 vpsllq(dst, dst, 4, vec_enc);
5947
5948 // Get the reverse bit sequence of upper nibble of each byte.
5949 vpandn(xtmp2, xtmp2, src, vec_enc);
5950 vpsrlq(xtmp2, xtmp2, 4, vec_enc);
5951 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
5952
5953 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
5954 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
5955 vpor(xtmp2, dst, xtmp2, vec_enc);
5956 vector_reverse_byte(bt, dst, xtmp2, vec_enc);
5957 }
5958 }
5959
5960 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc,
5961 XMMRegister xtmp, Register rscratch) {
5962 assert(VM_Version::supports_gfni(), "");
5963 assert(rscratch != noreg || always_reachable(mask), "missing");
5964
5965 // Galois field instruction based bit reversal based on following algorithm.
5966 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
5967 vpbroadcastq(xtmp, mask, vec_enc, rscratch);
5968 vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
5969 vector_reverse_byte(bt, dst, xtmp, vec_enc);
5970 }
5971
5972 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
5973 XMMRegister xtmp1, Register rtmp, int vec_enc) {
5974 vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
5975 evpandq(dst, xtmp1, src, vec_enc);
5976 vpsllq(dst, dst, nbits, vec_enc);
5977 vpandn(xtmp1, xtmp1, src, vec_enc);
5978 vpsrlq(xtmp1, xtmp1, nbits, vec_enc);
5979 evporq(dst, dst, xtmp1, vec_enc);
5980 }
5981
5982 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5983 XMMRegister xtmp2, Register rtmp, int vec_enc) {
5984 // Shift based bit reversal.
5985 assert(VM_Version::supports_evex(), "");
5986 switch(bt) {
5987 case T_LONG:
5988 // Swap upper and lower double word of each quad word.
5989 evprorq(xtmp1, k0, src, 32, true, vec_enc);
5990 evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);
5991 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
5992 break;
5993 case T_INT:
5994 // Swap upper and lower word of each double word.
5995 evprord(xtmp1, k0, src, 16, true, vec_enc);
5996 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
5997 break;
5998 case T_CHAR:
5999 case T_SHORT:
6000 // Swap upper and lower byte of each word.
6001 vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc);
6002 break;
6003 case T_BYTE:
6004 evmovdquq(dst, k0, src, true, vec_enc);
6005 break;
6006 default:
6007 fatal("Unsupported type %s", type2name(bt));
6008 break;
6009 }
6010 }
6011
6012 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) {
6013 if (bt == T_BYTE) {
6014 if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
6015 evmovdquq(dst, k0, src, true, vec_enc);
6016 } else {
6017 vmovdqu(dst, src);
6018 }
6019 return;
6020 }
6021 // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
6022 // pre-computed shuffle indices.
6023 switch(bt) {
6024 case T_LONG:
6025 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg);
6026 break;
6027 case T_INT:
6028 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg);
6029 break;
6030 case T_CHAR:
6031 case T_SHORT:
6032 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg);
6033 break;
6034 default:
6035 fatal("Unsupported type %s", type2name(bt));
6036 break;
6037 }
6038 vpshufb(dst, src, dst, vec_enc);
6039 }
6040
6041 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6042 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6043 KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
6044 assert(is_integral_type(bt), "");
6045 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
6046 assert(VM_Version::supports_avx512cd(), "");
6047 switch(bt) {
6048 case T_LONG:
6049 evplzcntq(dst, ktmp, src, merge, vec_enc);
6050 break;
6051 case T_INT:
6052 evplzcntd(dst, ktmp, src, merge, vec_enc);
6053 break;
6054 case T_SHORT:
6055 vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);
6056 vpunpcklwd(xtmp2, xtmp1, src, vec_enc);
6057 evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
6058 vpunpckhwd(dst, xtmp1, src, vec_enc);
6059 evplzcntd(dst, ktmp, dst, merge, vec_enc);
6060 vpackusdw(dst, xtmp2, dst, vec_enc);
6061 break;
6062 case T_BYTE:
6063 // T1 = Compute leading zero counts of 4 LSB bits of each byte by
6064 // accessing the lookup table.
6065 // T2 = Compute leading zero counts of 4 MSB bits of each byte by
6066 // accessing the lookup table.
6067 // Add T1 to T2 if 4 MSB bits of byte are all zeros.
6068 assert(VM_Version::supports_avx512bw(), "");
6069 evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
6070 vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
6071 vpand(xtmp2, dst, src, vec_enc);
6072 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
6073 vpsrlw(xtmp3, src, 4, vec_enc);
6074 vpand(xtmp3, dst, xtmp3, vec_enc);
6075 vpshufb(dst, xtmp1, xtmp3, vec_enc);
6076 vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
6077 evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);
6078 evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);
6079 break;
6080 default:
6081 fatal("Unsupported type %s", type2name(bt));
6082 break;
6083 }
6084 }
6085
6086 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6087 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6088 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
6089 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
6090 // T1 = Compute leading zero counts of 4 LSB bits of each byte by
6091 // accessing the lookup table.
6092 vpand(dst, xtmp2, src, vec_enc);
6093 vpshufb(dst, xtmp1, dst, vec_enc);
6094 // T2 = Compute leading zero counts of 4 MSB bits of each byte by
6095 // accessing the lookup table.
6096 vpsrlw(xtmp3, src, 4, vec_enc);
6097 vpand(xtmp3, xtmp2, xtmp3, vec_enc);
6098 vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
6099 // Add T1 to T2 if 4 MSB bits of byte are all zeros.
6100 vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
6101 vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
6102 vpaddb(dst, dst, xtmp2, vec_enc);
6103 vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
6104 }
6105
6106 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6107 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6108 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6109 // Add zero counts of lower byte and upper byte of a word if
6110 // upper byte holds a zero value.
6111 vpsrlw(xtmp3, src, 8, vec_enc);
6112 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
6113 vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
6114 vpsllw(xtmp2, dst, 8, vec_enc);
6115 vpaddw(xtmp2, xtmp2, dst, vec_enc);
6116 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6117 vpsrlw(dst, dst, 8, vec_enc);
6118 }
6119
6120 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6121 XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
6122 // Since the IEEE 754 floating point format represents the mantissa in normalized
6123 // 1.m form, the biased exponent can be used to compute the leading zero count as
6124 // per the following formula:-
6125 // LZCNT = 31 - (biased_exp - 127)
6126 // Special handling has been introduced for Zero, Max_Int and -ve source values.
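// A minimal scalar sketch of the formula above (comment only; the helper name
// clz32_via_float is hypothetical and assumes a strictly positive input whose
// conversion to float does not round up to the next power of two, which is what
// the masking step below guarantees):
//
//   int clz32_via_float(uint32_t x) {
//     float f = (float) x;
//     uint32_t bits;
//     memcpy(&bits, &f, sizeof(bits));
//     int biased_exp = (bits >> 23) & 0xFF;
//     return 31 - (biased_exp - 127);
//   }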
6127
6128 // Broadcast 0xFF
6129 vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);
6130 vpsrld(xtmp1, xtmp1, 24, vec_enc);
6131
6132 // Remove the bit to the right of the highest set bit ensuring that the conversion to float cannot round up to a higher
6133 // power of 2, which has a higher exponent than the input. This transformation is valid as only the highest set bit
6134 // contributes to the number of leading zeros.
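// For example, src = 0b0110 gives src & ~(src >> 1) = 0b0100: the highest set bit
// is preserved and the bit immediately below it is cleared.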
6135 vpsrld(xtmp2, src, 1, vec_enc);
6136 vpandn(xtmp3, xtmp2, src, vec_enc);
6137
6138 // Extract biased exponent.
6139 vcvtdq2ps(dst, xtmp3, vec_enc);
6140 vpsrld(dst, dst, 23, vec_enc);
6141 vpand(dst, dst, xtmp1, vec_enc);
6142
6143 // Broadcast 127.
6144 vpsrld(xtmp1, xtmp1, 1, vec_enc);
6145 // Exponent = biased_exp - 127
6146 vpsubd(dst, dst, xtmp1, vec_enc);
6147
6148 // Exponent_plus_one = Exponent + 1
6149 vpsrld(xtmp3, xtmp1, 6, vec_enc);
6150 vpaddd(dst, dst, xtmp3, vec_enc);
6151
6152 // Replace a -ve exponent with zero; the exponent is -ve when the src
6153 // lane contains a zero value.
6154 vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
6155 vblendvps(dst, dst, xtmp2, dst, vec_enc);
6156
6157 // Rematerialize broadcast 32.
6158 vpslld(xtmp1, xtmp3, 5, vec_enc);
6159 // Exponent is 32 if corresponding source lane contains max_int value.
6160 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
6161 // LZCNT = 32 - exponent_plus_one
6162 vpsubd(dst, xtmp1, dst, vec_enc);
6163
6164 // Replace LZCNT with a value 1 if corresponding source lane
6165 // contains max_int value.
6166 vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc);
6167
6168 // Replace the computed count with 0 if the source lane value is less than zero.
6169 vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
6170 vblendvps(dst, dst, xtmp2, src, vec_enc);
6171 }
6172
6173 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6174 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6175 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6176 // Add zero counts of lower word and upper word of a double word if
6177 // upper word holds a zero value.
6178 vpsrld(xtmp3, src, 16, vec_enc);
6179 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
6180 vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc);
6181 vpslld(xtmp2, dst, 16, vec_enc);
6182 vpaddd(xtmp2, xtmp2, dst, vec_enc);
6183 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6184 vpsrld(dst, dst, 16, vec_enc);
6185 // Add zero counts of lower doubleword and upper doubleword of a
6186 // quadword if upper doubleword holds a zero value.
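// Worked example (illustration only): for the quadword 0x0000000000001C00 the
// per-doubleword counts are 32 (upper) and 19 (lower); the upper doubleword is zero,
// so the final count is 32 + 19 = 51 = clz64(0x0000000000001C00).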
6187 vpsrlq(xtmp3, src, 32, vec_enc);
6188 vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc);
6189 vpsllq(xtmp2, dst, 32, vec_enc);
6190 vpaddq(xtmp2, xtmp2, dst, vec_enc);
6191 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
6192 vpsrlq(dst, dst, 32, vec_enc);
6193 }
6194
6195 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
6196 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6197 Register rtmp, int vec_enc) {
6198 assert(is_integral_type(bt), "unexpected type");
6199 assert(vec_enc < Assembler::AVX_512bit, "");
6200 switch(bt) {
6201 case T_LONG:
6202 vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6203 break;
6204 case T_INT:
6205 vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
6206 break;
6207 case T_SHORT:
6208 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6209 break;
6210 case T_BYTE:
6211 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6212 break;
6213 default:
6214 fatal("Unsupported type %s", type2name(bt));
6215 break;
6216 }
6217 }
6218
6219 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
6220 switch(bt) {
6221 case T_BYTE:
6222 vpsubb(dst, src1, src2, vec_enc);
6223 break;
6224 case T_SHORT:
6225 vpsubw(dst, src1, src2, vec_enc);
6226 break;
6227 case T_INT:
6228 vpsubd(dst, src1, src2, vec_enc);
6229 break;
6230 case T_LONG:
6231 vpsubq(dst, src1, src2, vec_enc);
6232 break;
6233 default:
6234 fatal("Unsupported type %s", type2name(bt));
6235 break;
6236 }
6237 }
6238
6239 // Trailing zero count is computed from the leading zero count operation using the
6240 // equation below. All AVX3 targets support the AVX512CD feature, which offers a
6241 // direct vector instruction to compute the leading zero count.
6242 // CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
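// Worked example (illustration only, byte width): x = 0b00010100 gives
// (x - 1) & ~x = 0b00000011, CLZ = 6 and CTZ = 8 - 6 = 2; for x = 0 the masked
// value is 0xFF, CLZ = 0 and CTZ = 8, as expected.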
6243 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
6244 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6245 XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
6246 assert(is_integral_type(bt), "");
6247 // xtmp = -1
6248 vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
6249 // xtmp = xtmp + src
6250 vpadd(bt, xtmp4, xtmp4, src, vec_enc);
6251 // xtmp = xtmp & ~src
6252 vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
6253 vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
6254 vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
6255 vpsub(bt, dst, xtmp4, dst, vec_enc);
6256 }
6257
6258 // Trailing zero count computation for AVX2 targets is based on the popcount operation, as per the following equation:
6259 // CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
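// Worked example (illustration only, byte width): x = 0b00010100 gives
// x | -x = 0b11111100, POPC = 6 and CTZ = 8 - 6 = 2; for x = 0, x | -x = 0,
// POPC = 0 and CTZ = 8, as expected.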
6260 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6261 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6262 assert(is_integral_type(bt), "");
6263 // xtmp = 0
6264 vpxor(xtmp3, xtmp3, xtmp3, vec_enc);
6265 // xtmp = 0 - src
6266 vpsub(bt, xtmp3, xtmp3, src, vec_enc);
6267 // xtmp = xtmp | src
6268 vpor(xtmp3, xtmp3, src, vec_enc);
6269 vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
6270 vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
6271 vpsub(bt, dst, xtmp1, dst, vec_enc);
6272 }
6273
6274 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
6275 Label done;
6276 Label neg_divisor_fastpath;
6277 cmpl(divisor, 0);
6278 jccb(Assembler::less, neg_divisor_fastpath);
6279 xorl(rdx, rdx);
6280 divl(divisor);
6281 jmpb(done);
6282 bind(neg_divisor_fastpath);
6283 // Fastpath for divisor < 0:
6284 // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6285 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
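// Since the divisor has its most significant bit set, the unsigned quotient can only be
// 0 or 1, and it is 1 exactly when dividend >=u divisor. E.g. (illustration only)
// dividend = 0xF0000000, divisor = 0x90000000: dividend - divisor = 0x60000000,
// ~0x60000000 & dividend = 0x90000000, and 0x90000000 >>> 31 = 1, the expected quotient.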
6286 movl(rdx, rax);
6287 subl(rdx, divisor);
6288 if (VM_Version::supports_bmi1()) {
6289 andnl(rax, rdx, rax);
6290 } else {
6291 notl(rdx);
6292 andl(rax, rdx);
6293 }
6294 shrl(rax, 31);
6295 bind(done);
6296 }
6297
6298 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
6299 Label done;
6300 Label neg_divisor_fastpath;
6301 cmpl(divisor, 0);
6302 jccb(Assembler::less, neg_divisor_fastpath);
6303 xorl(rdx, rdx);
6304 divl(divisor);
6305 jmpb(done);
6306 bind(neg_divisor_fastpath);
6307 // Fastpath when divisor < 0:
6308 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6309 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
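// E.g. (illustration only) dividend = 0xF0000000, divisor = 0x90000000: the arithmetic
// shift turns the quotient bit into the mask 0xFFFFFFFF, so
// remainder = 0xF0000000 - (0xFFFFFFFF & 0x90000000) = 0x60000000, as expected.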
6310 movl(rdx, rax);
6311 subl(rax, divisor);
6312 if (VM_Version::supports_bmi1()) {
6313 andnl(rax, rax, rdx);
6314 } else {
6315 notl(rax);
6316 andl(rax, rdx);
6317 }
6318 sarl(rax, 31);
6319 andl(rax, divisor);
6320 subl(rdx, rax);
6321 bind(done);
6322 }
6323
6324 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
6325 Label done;
6326 Label neg_divisor_fastpath;
6327
6328 cmpl(divisor, 0);
6329 jccb(Assembler::less, neg_divisor_fastpath);
6330 xorl(rdx, rdx);
6331 divl(divisor);
6332 jmpb(done);
6333 bind(neg_divisor_fastpath);
6334 // Fastpath for divisor < 0:
6335 // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
6336 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
6337 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6338 // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6339 movl(rdx, rax);
6340 subl(rax, divisor);
6341 if (VM_Version::supports_bmi1()) {
6342 andnl(rax, rax, rdx);
6343 } else {
6344 notl(rax);
6345 andl(rax, rdx);
6346 }
6347 movl(tmp, rax);
6348 shrl(rax, 31); // quotient
6349 sarl(tmp, 31);
6350 andl(tmp, divisor);
6351 subl(rdx, tmp); // remainder
6352 bind(done);
6353 }
6354
6355 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
6356 XMMRegister xtmp2, Register rtmp) {
6357 if (VM_Version::supports_gfni()) {
6358 // Galois field instruction based bit reversal, using the following algorithm:
6359 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6360 mov64(rtmp, 0x8040201008040201L);
6361 movq(xtmp1, src);
6362 movq(xtmp2, rtmp);
6363 gf2p8affineqb(xtmp1, xtmp2, 0);
6364 movq(dst, xtmp1);
6365 } else {
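// A rough scalar sketch of the fallback below (illustration only; ">>>" is a logical
// shift and byte_swap stands for the final bswapl):
//   x = ((x & 0x55555555) << 1) | ((x & 0xAAAAAAAA) >>> 1);  // swap adjacent bits
//   x = ((x & 0x33333333) << 2) | ((x & 0xCCCCCCCC) >>> 2);  // swap 2-bit pairs
//   x = ((x & 0x0F0F0F0F) << 4) | ((x & 0xF0F0F0F0) >>> 4);  // swap nibbles
//   x = byte_swap(x);                                        // reverse byte order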
6366 // Swap even and odd numbered bits.
6367 movl(rtmp, src);
6368 andl(rtmp, 0x55555555);
6369 shll(rtmp, 1);
6370 movl(dst, src);
6371 andl(dst, 0xAAAAAAAA);
6372 shrl(dst, 1);
6373 orl(dst, rtmp);
6374
6375 // Swap LSB and MSB 2 bits of each nibble.
6376 movl(rtmp, dst);
6377 andl(rtmp, 0x33333333);
6378 shll(rtmp, 2);
6379 andl(dst, 0xCCCCCCCC);
6380 shrl(dst, 2);
6381 orl(dst, rtmp);
6382
6383 // Swap LSB and MSB 4 bits of each byte.
6384 movl(rtmp, dst);
6385 andl(rtmp, 0x0F0F0F0F);
6386 shll(rtmp, 4);
6387 andl(dst, 0xF0F0F0F0);
6388 shrl(dst, 4);
6389 orl(dst, rtmp);
6390 }
6391 bswapl(dst);
6392 }
6393
6394 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
6395 XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
6396 if (VM_Version::supports_gfni()) {
6397 // Galois field instruction based bit reversal, using the following algorithm:
6398 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
6399 mov64(rtmp1, 0x8040201008040201L);
6400 movq(xtmp1, src);
6401 movq(xtmp2, rtmp1);
6402 gf2p8affineqb(xtmp1, xtmp2, 0);
6403 movq(dst, xtmp1);
6404 } else {
6405 // Swap even and odd numbered bits.
6406 movq(rtmp1, src);
6407 mov64(rtmp2, 0x5555555555555555L);
6408 andq(rtmp1, rtmp2);
6409 shlq(rtmp1, 1);
6410 movq(dst, src);
6411 notq(rtmp2);
6412 andq(dst, rtmp2);
6413 shrq(dst, 1);
6414 orq(dst, rtmp1);
6415
6416 // Swap LSB and MSB 2 bits of each nibble.
6417 movq(rtmp1, dst);
6418 mov64(rtmp2, 0x3333333333333333L);
6419 andq(rtmp1, rtmp2);
6420 shlq(rtmp1, 2);
6421 notq(rtmp2);
6422 andq(dst, rtmp2);
6423 shrq(dst, 2);
6424 orq(dst, rtmp1);
6425
6426 // Swap LSB and MSB 4 bits of each byte.
6427 movq(rtmp1, dst);
6428 mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
6429 andq(rtmp1, rtmp2);
6430 shlq(rtmp1, 4);
6431 notq(rtmp2);
6432 andq(dst, rtmp2);
6433 shrq(dst, 4);
6434 orq(dst, rtmp1);
6435 }
6436 bswapq(dst);
6437 }
6438
6439 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
6440 Label done;
6441 Label neg_divisor_fastpath;
6442 cmpq(divisor, 0);
6443 jccb(Assembler::less, neg_divisor_fastpath);
6444 xorl(rdx, rdx);
6445 divq(divisor);
6446 jmpb(done);
6447 bind(neg_divisor_fastpath);
6448 // Fastpath for divisor < 0:
6449 // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6450 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
6451 movq(rdx, rax);
6452 subq(rdx, divisor);
6453 if (VM_Version::supports_bmi1()) {
6454 andnq(rax, rdx, rax);
6455 } else {
6456 notq(rdx);
6457 andq(rax, rdx);
6458 }
6459 shrq(rax, 63);
6460 bind(done);
6461 }
6462
6463 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
6464 Label done;
6465 Label neg_divisor_fastpath;
6466 cmpq(divisor, 0);
6467 jccb(Assembler::less, neg_divisor_fastpath);
6468 xorq(rdx, rdx);
6469 divq(divisor);
6470 jmp(done);
6471 bind(neg_divisor_fastpath);
6472 // Fastpath when divisor < 0:
6473 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6474 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6475 movq(rdx, rax);
6476 subq(rax, divisor);
6477 if (VM_Version::supports_bmi1()) {
6478 andnq(rax, rax, rdx);
6479 } else {
6480 notq(rax);
6481 andq(rax, rdx);
6482 }
6483 sarq(rax, 63);
6484 andq(rax, divisor);
6485 subq(rdx, rax);
6486 bind(done);
6487 }
6488
6489 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
6490 Label done;
6491 Label neg_divisor_fastpath;
6492 cmpq(divisor, 0);
6493 jccb(Assembler::less, neg_divisor_fastpath);
6494 xorq(rdx, rdx);
6495 divq(divisor);
6496 jmp(done);
6497 bind(neg_divisor_fastpath);
6498 // Fastpath for divisor < 0:
6499 // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6500 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6501 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6502 // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6503 movq(rdx, rax);
6504 subq(rax, divisor);
6505 if (VM_Version::supports_bmi1()) {
6506 andnq(rax, rax, rdx);
6507 } else {
6508 notq(rax);
6509 andq(rax, rdx);
6510 }
6511 movq(tmp, rax);
6512 shrq(rax, 63); // quotient
6513 sarq(tmp, 63);
6514 andq(tmp, divisor);
6515 subq(rdx, tmp); // remainder
6516 bind(done);
6517 }
6518
6519 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
6520 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
6521 int vlen_enc) {
6522 assert(VM_Version::supports_avx512bw(), "");
6523 // Byte shuffles are in-lane operations and indices are taken from the
6524 // lower 4 bits of each shuffle byte, thus all shuffle indices are
6525 // normalized to the index range 0-15. This ensures that indices differing
6526 // only by a multiple of 16 select the same relative position within a
6527 // 128 bit lane, i.e. shuffle indices 16, 32 and 48 all select element 0
6528 // of their respective 128 bit source lanes.
6529 movl(rtmp, 16);
6530 evpbroadcastb(xtmp1, rtmp, vlen_enc);
6531
6532 // Compute a mask for the shuffle vector by comparing indices with the expression INDEX < 16,
6533 // broadcast the first 128 bit lane across the entire vector, shuffle the vector using the
6534 // original shuffle indices and move the shuffled bytes corresponding to a true
6535 // mask to the destination vector.
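// Worked example (illustration only): shuffle index 37 (0b100101) normalizes to element 5
// and satisfies 32 <= INDEX < 48, so the destination byte is produced by the pass below
// that broadcasts the third 128 bit lane; the other merge-masked passes do not overwrite it.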
6536 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
6537 evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
6538 evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);
6539
6540 // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
6541 // and broadcasting second 128 bit lane.
6542 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
6543 vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
6544 evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
6545 evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
6546 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6547
6548 // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
6549 // and broadcasting third 128 bit lane.
6550 evpcmpb(ktmp, k0, shuffle, xtmp2, Assembler::nlt, true, vlen_enc);
6551 vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
6552 evpcmpb(ktmp, ktmp, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
6553 evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
6554 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6555
6556 // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
6557 // and broadcasting fourth 128 bit lane.
6558 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
6559 vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
6560 evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
6561 evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
6562 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6563 }
6564
6565 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
6566 XMMRegister shuffle, XMMRegister src, int vlen_enc) {
6567 if (vlen_enc == AVX_128bit) {
6568 vpermilps(dst, src, shuffle, vlen_enc);
6569 } else if (bt == T_INT) {
6570 vpermd(dst, shuffle, src, vlen_enc);
6571 } else {
6572 assert(bt == T_FLOAT, "");
6573 vpermps(dst, shuffle, src, vlen_enc);
6574 }
6575 }
6576
6577 void C2_MacroAssembler::efp16sh(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
6578 switch(opcode) {
6579 case Op_AddHF: vaddsh(dst, src1, src2); break;
6580 case Op_SubHF: vsubsh(dst, src1, src2); break;
6581 case Op_MulHF: vmulsh(dst, src1, src2); break;
6582 case Op_DivHF: vdivsh(dst, src1, src2); break;
6583 default: assert(false, "%s", NodeClassNames[opcode]); break;
6584 }
6585 }
6586
6587 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6588 switch(elem_bt) {
6589 case T_BYTE:
6590 if (ideal_opc == Op_SaturatingAddV) {
6591 vpaddsb(dst, src1, src2, vlen_enc);
6592 } else {
6593 assert(ideal_opc == Op_SaturatingSubV, "");
6594 vpsubsb(dst, src1, src2, vlen_enc);
6595 }
6596 break;
6597 case T_SHORT:
6598 if (ideal_opc == Op_SaturatingAddV) {
6599 vpaddsw(dst, src1, src2, vlen_enc);
6600 } else {
6601 assert(ideal_opc == Op_SaturatingSubV, "");
6602 vpsubsw(dst, src1, src2, vlen_enc);
6603 }
6604 break;
6605 default:
6606 fatal("Unsupported type %s", type2name(elem_bt));
6607 break;
6608 }
6609 }
6610
6611 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6612 switch(elem_bt) {
6613 case T_BYTE:
6614 if (ideal_opc == Op_SaturatingAddV) {
6615 vpaddusb(dst, src1, src2, vlen_enc);
6616 } else {
6617 assert(ideal_opc == Op_SaturatingSubV, "");
6618 vpsubusb(dst, src1, src2, vlen_enc);
6619 }
6620 break;
6621 case T_SHORT:
6622 if (ideal_opc == Op_SaturatingAddV) {
6623 vpaddusw(dst, src1, src2, vlen_enc);
6624 } else {
6625 assert(ideal_opc == Op_SaturatingSubV, "");
6626 vpsubusw(dst, src1, src2, vlen_enc);
6627 }
6628 break;
6629 default:
6630 fatal("Unsupported type %s", type2name(elem_bt));
6631 break;
6632 }
6633 }
6634
6635 void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6636 XMMRegister src2, KRegister ktmp, int vlen_enc) {
6637 // For unsigned subtraction, overflow happens when the second input is greater than the first input.
6638 // overflow_mask = Inp1 <u Inp2
6639 evpcmpu(elem_bt, ktmp, src2, src1, Assembler::lt, vlen_enc);
6640 // Res = overflow_mask ? Zero : INP1 - INP2 (non-commutative and non-associative)
6641 evmasked_op(elem_bt == T_INT ? Op_SubVI : Op_SubVL, elem_bt, ktmp, dst, src1, src2, false, vlen_enc, false);
6642 }
6643
6644 void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6645 XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
6646 // Emulate unsigned comparison using signed comparison
6647 // Mask = Inp1 <u Inp2 => (Inp1 + MIN_VALUE) <s (Inp2 + MIN_VALUE)
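// E.g. (illustration only, 32-bit lanes): 1 <u 0xFFFFFFFF holds, and after adding
// MIN_VALUE the signed comparison 0x80000001 <s 0x7FFFFFFF holds as well.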
6648 vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc, true);
6649 vpadd(elem_bt, xtmp2, src1, xtmp1, vlen_enc);
6650 vpadd(elem_bt, xtmp1, src2, xtmp1, vlen_enc);
6651
6652 vpcmpgt(elem_bt, xtmp2, xtmp1, xtmp2, vlen_enc);
6653
6654 // Res = INP1 - INP2 (non-commutative and non-associative)
6655 vpsub(elem_bt, dst, src1, src2, vlen_enc);
6656 // Res = Mask ? Zero : Res
6657 vpxor(xtmp1, xtmp1, xtmp1, vlen_enc);
6658 vpblendvb(dst, dst, xtmp1, xtmp2, vlen_enc);
6659 }
6660
6661 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6662 XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp, int vlen_enc) {
6663 // Unsigned value ranges comprise only non-negative numbers, thus only upper bound saturation exists.
6664 // overflow_mask = (SRC1 + SRC2) <u (SRC1 | SRC2)
6665 // Res = Signed Add INP1, INP2
6666 vpadd(elem_bt, dst, src1, src2, vlen_enc);
6667 // T1 = SRC1 | SRC2
6668 vpor(xtmp1, src1, src2, vlen_enc);
6669 // Max_Unsigned = -1
6670 vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6671 // Unsigned compare: Mask = Res <u T1
6672 evpcmpu(elem_bt, ktmp, dst, xtmp1, Assembler::lt, vlen_enc);
6673 // res = Mask ? Max_Unsigned : Res
6674 evpblend(elem_bt, dst, ktmp, dst, xtmp2, true, vlen_enc);
6675 }
6676
6677 //
6678 // Section 2-13 of Hacker's Delight lists the following overflow detection check for
6679 // the saturating unsigned addition operation.
6680 // overflow_mask = ((a & b) | ((a | b) & ~(a + b))) >>> 31 == 1
6681 //
6682 // We empirically determined its semantic equivalence to the following reduced expression
6683 // overflow_mask = (a + b) <u (a | b)
6684 //
6685 // and also verified it through the Alive2 solver.
6686 // (https://alive2.llvm.org/ce/z/XDQ7dY)
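//
// For illustration, with 32-bit lanes: a = 0xFFFFFFFF, b = 1 wraps to a + b = 0, and
// 0 <u (a | b) = 0xFFFFFFFF flags the overflow; a = 3, b = 5 gives a + b = 8 and
// a | b = 7, and 8 <u 7 is false, so no overflow is reported.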
6687 //
6688
6689 void C2_MacroAssembler::vector_add_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
6690 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, int vlen_enc) {
6691 // Res = Signed Add INP1, INP2
6692 vpadd(elem_bt, dst, src1, src2, vlen_enc);
6693 // Compute T1 = INP1 | INP2
6694 vpor(xtmp3, src1, src2, vlen_enc);
6695 // Generate MIN_VALUE (minimum signed value) in xtmp2.
6696 vpgenmin_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
6697 // Convert T1 to signed value, T1 = T1 + MIN_VALUE
6698 vpadd(elem_bt, xtmp3, xtmp3, xtmp2, vlen_enc);
6699 // Convert Res to signed value, Res<s> = Res + MIN_VALUE
6700 vpadd(elem_bt, xtmp2, xtmp2, dst, vlen_enc);
6701 // Compute overflow detection mask = Res<s> <s T1
6702 if (elem_bt == T_INT) {
6703 vpcmpgtd(xtmp3, xtmp3, xtmp2, vlen_enc);
6704 } else {
6705 assert(elem_bt == T_LONG, "");
6706 vpcmpgtq(xtmp3, xtmp3, xtmp2, vlen_enc);
6707 }
6708 vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
6709 }
6710
6711 void C2_MacroAssembler::evpmovq2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
6712 int vlen_enc, bool xtmp2_hold_M1) {
6713 if (VM_Version::supports_avx512dq()) {
6714 evpmovq2m(ktmp, src, vlen_enc);
6715 } else {
6716 assert(VM_Version::supports_evex(), "");
6717 if (!xtmp2_hold_M1) {
6718 vpternlogq(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6719 }
6720 evpsraq(xtmp1, src, 63, vlen_enc);
6721 evpcmpeqq(ktmp, k0, xtmp1, xtmp2, vlen_enc);
6722 }
6723 }
6724
6725 void C2_MacroAssembler::evpmovd2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
6726 int vlen_enc, bool xtmp2_hold_M1) {
6727 if (VM_Version::supports_avx512dq()) {
6728 evpmovd2m(ktmp, src, vlen_enc);
6729 } else {
6730 assert(VM_Version::supports_evex(), "");
6731 if (!xtmp2_hold_M1) {
6732 vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
6733 }
6734 vpsrad(xtmp1, src, 31, vlen_enc);
6735 Assembler::evpcmpeqd(ktmp, k0, xtmp1, xtmp2, vlen_enc);
6736 }
6737 }
6738
6739
6740 void C2_MacroAssembler::vpsign_extend_dq(BasicType elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
6741 if (elem_bt == T_LONG) {
6742 if (VM_Version::supports_evex()) {
6743 evpsraq(dst, src, 63, vlen_enc);
6744 } else {
6745 vpsrad(dst, src, 31, vlen_enc);
6746 vpshufd(dst, dst, 0xF5, vlen_enc);
6747 }
6748 } else {
6749 assert(elem_bt == T_INT, "");
6750 vpsrad(dst, src, 31, vlen_enc);
6751 }
6752 }
6753
6754 void C2_MacroAssembler::vpgenmax_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
6755 if (compute_allones) {
6756 if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
6757 vpternlogd(allones, 0xff, allones, allones, vlen_enc);
6758 } else {
6759 vpcmpeqq(allones, allones, allones, vlen_enc);
6760 }
6761 }
6762 if (elem_bt == T_LONG) {
6763 vpsrlq(dst, allones, 1, vlen_enc);
6764 } else {
6765 assert(elem_bt == T_INT, "");
6766 vpsrld(dst, allones, 1, vlen_enc);
6767 }
6768 }
6769
6770 void C2_MacroAssembler::vpgenmin_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
6771 if (compute_allones) {
6772 if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
6773 vpternlogd(allones, 0xff, allones, allones, vlen_enc);
6774 } else {
6775 vpcmpeqq(allones, allones, allones, vlen_enc);
6776 }
6777 }
6778 if (elem_bt == T_LONG) {
6779 vpsllq(dst, allones, 63, vlen_enc);
6780 } else {
6781 assert(elem_bt == T_INT, "");
6782 vpslld(dst, allones, 31, vlen_enc);
6783 }
6784 }
6785
6786 void C2_MacroAssembler::evpcmpu(BasicType elem_bt, KRegister kmask, XMMRegister src1, XMMRegister src2,
6787 Assembler::ComparisonPredicate cond, int vlen_enc) {
6788 switch(elem_bt) {
6789 case T_LONG: evpcmpuq(kmask, src1, src2, cond, vlen_enc); break;
6790 case T_INT: evpcmpud(kmask, src1, src2, cond, vlen_enc); break;
6791 case T_SHORT: evpcmpuw(kmask, src1, src2, cond, vlen_enc); break;
6792 case T_BYTE: evpcmpub(kmask, src1, src2, cond, vlen_enc); break;
6793 default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6794 }
6795 }
6796
6797 void C2_MacroAssembler::vpcmpgt(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6798 switch(elem_bt) {
6799 case T_LONG: vpcmpgtq(dst, src1, src2, vlen_enc); break;
6800 case T_INT: vpcmpgtd(dst, src1, src2, vlen_enc); break;
6801 case T_SHORT: vpcmpgtw(dst, src1, src2, vlen_enc); break;
6802 case T_BYTE: vpcmpgtb(dst, src1, src2, vlen_enc); break;
6803 default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6804 }
6805 }
6806
6807 void C2_MacroAssembler::evpmov_vec_to_mask(BasicType elem_bt, KRegister ktmp, XMMRegister src, XMMRegister xtmp1,
6808 XMMRegister xtmp2, int vlen_enc, bool xtmp2_hold_M1) {
6809 if (elem_bt == T_LONG) {
6810 evpmovq2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6811 } else {
6812 assert(elem_bt == T_INT, "");
6813 evpmovd2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6814 }
6815 }
6816
6817 void C2_MacroAssembler::vector_addsub_dq_saturating_evex(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6818 XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
6819 KRegister ktmp1, KRegister ktmp2, int vlen_enc) {
6820 assert(elem_bt == T_INT || elem_bt == T_LONG, "");
6821 // Addition/subtraction happens over the two's complement representation of numbers and is agnostic to signedness.
6822 // Overflow detection is based on Hacker's Delight, section 2-13.
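// For illustration (not emitted code), with 32-bit lanes: 0x7FFFFFFF + 1 gives res = 0x80000000;
// (res ^ src1) & (res ^ src2) = 0xFFFFFFFF & 0x80000001 = 0x80000001 has its sign bit set,
// so the lane saturates to MAX_VALUE because src1 is non-negative.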
6823 if (ideal_opc == Op_SaturatingAddV) {
6824 // res = src1 + src2
6825 vpadd(elem_bt, dst, src1, src2, vlen_enc);
6826 // Overflow occurs if both inputs have the same sign and the result's sign differs from it.
6827 // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
6828 vpxor(xtmp1, dst, src1, vlen_enc);
6829 vpxor(xtmp2, dst, src2, vlen_enc);
6830 vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6831 } else {
6832 assert(ideal_opc == Op_SaturatingSubV, "");
6833 // res = src1 - src2
6834 vpsub(elem_bt, dst, src1, src2, vlen_enc);
6835 // Overflow occurs when the inputs have opposite signs and the
6836 // result's sign differs from the sign of the first input.
6837 // overflow = (((src1 ^ src2) & (res ^ src1)) >>> 31(I)/63(L)) == 1
6838 vpxor(xtmp1, src1, src2, vlen_enc);
6839 vpxor(xtmp2, dst, src1, vlen_enc);
6840 vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6841 }
6842
6843 // Compute overflow detection mask.
6844 evpmov_vec_to_mask(elem_bt, ktmp1, xtmp2, xtmp2, xtmp1, vlen_enc);
6845 // Note: xtmp1 holds -1 in all its lanes after the above call.
6846
6847 // Compute mask based on first input polarity.
6848 evpmov_vec_to_mask(elem_bt, ktmp2, src1, xtmp2, xtmp1, vlen_enc, true);
6849
6850 vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
6851 vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);
6852
6853 // Compose a vector of saturating (MAX/MIN) values, where lanes corresponding to
6854 // set bits in the first input polarity mask hold the MIN value.
6855 evpblend(elem_bt, xtmp2, ktmp2, xtmp2, xtmp1, true, vlen_enc);
6856 // Blend destination lanes with saturated values using overflow detection mask.
6857 evpblend(elem_bt, dst, ktmp1, dst, xtmp2, true, vlen_enc);
6858 }
6859
6860
6861 void C2_MacroAssembler::vector_addsub_dq_saturating_avx(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6862 XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
6863 XMMRegister xtmp3, XMMRegister xtmp4, int vlen_enc) {
6864 assert(elem_bt == T_INT || elem_bt == T_LONG, "");
6865 // Addition/subtraction happens over the two's complement representation of numbers and is agnostic to signedness.
6866 // Overflow detection is based on Hacker's Delight, section 2-13.
6867 if (ideal_opc == Op_SaturatingAddV) {
6868 // res = src1 + src2
6869 vpadd(elem_bt, dst, src1, src2, vlen_enc);
6870 // Overflow occurs if both inputs have the same sign and the result's sign differs from it.
6871 // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
6872 vpxor(xtmp1, dst, src1, vlen_enc);
6873 vpxor(xtmp2, dst, src2, vlen_enc);
6874 vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6875 } else {
6876 assert(ideal_opc == Op_SaturatingSubV, "");
6877 // res = src1 - src2
6878 vpsub(elem_bt, dst, src1, src2, vlen_enc);
6879 // Overflow occurs when the inputs have opposite signs and the
6880 // result's sign differs from the sign of the first input.
6881 // overflow = (((src1 ^ src2) & (res ^ src1)) >>> 31(I)/63(L)) == 1
6882 vpxor(xtmp1, src1, src2, vlen_enc);
6883 vpxor(xtmp2, dst, src1, vlen_enc);
6884 vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
6885 }
6886
6887 // Sign-extend to compute overflow detection mask.
6888 vpsign_extend_dq(elem_bt, xtmp3, xtmp2, vlen_enc);
6889
6890 vpcmpeqd(xtmp1, xtmp1, xtmp1, vlen_enc);
6891 vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc);
6892 vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);
6893
6894 // Compose saturating min/max vector using first input polarity mask.
6895 vpsign_extend_dq(elem_bt, xtmp4, src1, vlen_enc);
6896 vpblendvb(xtmp1, xtmp2, xtmp1, xtmp4, vlen_enc);
6897
6898 // Blend result with saturating vector using overflow detection mask.
6899 vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
6900 }
6901
6902 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
6903 switch(elem_bt) {
6904 case T_BYTE:
6905 if (ideal_opc == Op_SaturatingAddV) {
6906 vpaddsb(dst, src1, src2, vlen_enc);
6907 } else {
6908 assert(ideal_opc == Op_SaturatingSubV, "");
6909 vpsubsb(dst, src1, src2, vlen_enc);
6910 }
6911 break;
6912 case T_SHORT:
6913 if (ideal_opc == Op_SaturatingAddV) {
6914 vpaddsw(dst, src1, src2, vlen_enc);
6915 } else {
6916 assert(ideal_opc == Op_SaturatingSubV, "");
6917 vpsubsw(dst, src1, src2, vlen_enc);
6918 }
6919 break;
6920 default:
6921 fatal("Unsupported type %s", type2name(elem_bt));
6922 break;
6923 }
6924 }
6925
6926 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
6927 switch(elem_bt) {
6928 case T_BYTE:
6929 if (ideal_opc == Op_SaturatingAddV) {
6930 vpaddusb(dst, src1, src2, vlen_enc);
6931 } else {
6932 assert(ideal_opc == Op_SaturatingSubV, "");
6933 vpsubusb(dst, src1, src2, vlen_enc);
6934 }
6935 break;
6936 case T_SHORT:
6937 if (ideal_opc == Op_SaturatingAddV) {
6938 vpaddusw(dst, src1, src2, vlen_enc);
6939 } else {
6940 assert(ideal_opc == Op_SaturatingSubV, "");
6941 vpsubusw(dst, src1, src2, vlen_enc);
6942 }
6943 break;
6944 default:
6945 fatal("Unsupported type %s", type2name(elem_bt));
6946 break;
6947 }
6948 }
6949
6950 void C2_MacroAssembler::select_from_two_vectors_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
6951 XMMRegister src2, int vlen_enc) {
6952 switch(elem_bt) {
6953 case T_BYTE:
6954 evpermi2b(dst, src1, src2, vlen_enc);
6955 break;
6956 case T_SHORT:
6957 evpermi2w(dst, src1, src2, vlen_enc);
6958 break;
6959 case T_INT:
6960 evpermi2d(dst, src1, src2, vlen_enc);
6961 break;
6962 case T_LONG:
6963 evpermi2q(dst, src1, src2, vlen_enc);
6964 break;
6965 case T_FLOAT:
6966 evpermi2ps(dst, src1, src2, vlen_enc);
6967 break;
6968 case T_DOUBLE:
6969 evpermi2pd(dst, src1, src2, vlen_enc);
6970 break;
6971 default:
6972 fatal("Unsupported type %s", type2name(elem_bt));
6973 break;
6974 }
6975 }
6976
6977 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, bool is_unsigned, int vlen_enc) {
6978 if (is_unsigned) {
6979 vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
6980 } else {
6981 vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
6982 }
6983 }
6984
6985 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, bool is_unsigned, int vlen_enc) {
6986 if (is_unsigned) {
6987 vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
6988 } else {
6989 vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
6990 }
6991 }
6992
6993 void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6994 switch(opcode) {
6995 case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
6996 case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
6997 case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
6998 case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
6999 default: assert(false, "%s", NodeClassNames[opcode]); break;
7000 }
7001 }
7002
7003 void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
7004 switch(opcode) {
7005 case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
7006 case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
7007 case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
7008 case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
7009 default: assert(false, "%s", NodeClassNames[opcode]); break;
7010 }
7011 }
7012
7013 void C2_MacroAssembler::scalar_max_min_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
7014 KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2) {
7015 vector_max_min_fp16(opcode, dst, src1, src2, ktmp, xtmp1, xtmp2, Assembler::AVX_128bit);
7016 }
7017
7018 void C2_MacroAssembler::vector_max_min_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
7019 KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
7020 if (opcode == Op_MaxVHF || opcode == Op_MaxHF) {
7021 // Move sign bits of src2 to mask register.
7022 evpmovw2m(ktmp, src2, vlen_enc);
7023 // xtmp1 = src2 < 0 ? src2 : src1
7024 evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
7025 // xtmp2 = src2 < 0 ? src1 : src2
7026 evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
7027 // The idea behind the above swapping is to make the second source operand a non-negative value.
7028 // As per the instruction semantics, if the values being compared are both 0.0s (of either sign), the value in
7029 // the second source operand is returned. If only one value is a NaN (SNaN or QNaN) for this instruction,
7030 // the second source operand, either a NaN or a valid floating-point value, is returned.
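// E.g. (illustration only): for src1 = +0.0, src2 = -0.0 the operands are swapped so that
// vmaxph sees (-0.0, +0.0) and returns the second operand +0.0, as Java requires; for
// src1 = NaN (sign bit clear), src2 = 2.0 no swap happens, vmaxph(NaN, 2.0) returns 2.0,
// and the unordered-compare fix-up below restores the required NaN result.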
7031 // dst = max(xtmp1, xtmp2)
7032 evmaxph(dst, xtmp1, xtmp2, vlen_enc);
7033 // isNaN = is_unordered_quiet(xtmp1)
7034 evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
7035 // The final result is the same as the first source if it is a NaN value;
7036 // in case the second operand holds a NaN value then, as per the above semantics,
7037 // the result is the same as the second operand.
7038 Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
7039 } else {
7040 assert(opcode == Op_MinVHF || opcode == Op_MinHF, "");
7041 // Move sign bits of src1 to mask register.
7042 evpmovw2m(ktmp, src1, vlen_enc);
7043 // xtmp1 = src1 < 0 ? src2 : src1
7044 evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
7045 // xtmp2 = src1 < 0 ? src1 : src2
7046 evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
7047 // The idea behind the above swapping is to make the second source operand a negative value.
7048 // As per instruction semantics, if the values being compared are both 0.0s (of either sign), the value in
7049 // the second source operand is returned.
7050 // If only one value is a NaN (SNaN or QNaN) for this instruction, the second source operand, either a NaN
7051 // or a valid floating-point value, is written to the result.
7052 // dst = min(xtmp1, xtmp2)
7053 evminph(dst, xtmp1, xtmp2, vlen_enc);
7054 // isNaN = is_unordered_quiet(xtmp1)
7055 evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
7056 // The final result is the same as the first source if it is a NaN value;
7057 // in case the second operand holds a NaN value then, as per the above semantics,
7058 // the result is the same as the second operand.
7059 Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
7060 }
7061 }