1 /*
2 * Copyright (c) 2020, 2026, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
24
25 #include "asm/assembler.hpp"
26 #include "asm/assembler.inline.hpp"
27 #include "gc/shared/barrierSet.hpp"
28 #include "gc/shared/barrierSetAssembler.hpp"
29 #include "oops/methodData.hpp"
30 #include "opto/c2_MacroAssembler.hpp"
31 #include "opto/intrinsicnode.hpp"
32 #include "opto/output.hpp"
33 #include "opto/opcodes.hpp"
34 #include "opto/subnode.hpp"
35 #include "runtime/globals.hpp"
36 #include "runtime/objectMonitor.hpp"
37 #include "runtime/objectMonitorTable.hpp"
38 #include "runtime/stubRoutines.hpp"
39 #include "runtime/synchronizer.hpp"
40 #include "utilities/checkedCast.hpp"
41 #include "utilities/globalDefinitions.hpp"
42 #include "utilities/powerOfTwo.hpp"
43 #include "utilities/sizes.hpp"
44
// In product builds block comments are elided entirely; in debug builds they
// are recorded by the assembler so they show up in disassembly output.
#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
// Fold the two statements into a single one (do/while(0) idiom) so that
// STOP(...) expands safely even inside an unbraced if/else body.
#define STOP(error) do { block_comment(error); stop(error); } while (0)
#endif
52
53 // C2 compiled method's prolog code.
// Emits the frame-setup prolog: optional stack bang, save of the caller's
// rbp, frame allocation, optional stack-alignment verification and (unless
// this is a stub) the nmethod entry barrier.  'framesize' counts the slots
// for the return address and the saved rbp; both are peeled off below.
// 'fp_mode_24b' is unused in this body.  'is_stub' suppresses the entry
// barrier.
void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {
  assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");

  assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  // Remove word for return addr
  framesize -= wordSize;
  stack_bang_size -= wordSize;

  // Calls to C2R adapters often do not accept exceptional returns.
  // We require that their callers must bang for them. But be careful, because
  // some VM calls (such as call site linkage) can use several kilobytes of
  // stack. But the stack safety zone should account for that.
  // See bugs 4446381, 4468289, 4497237.
  if (stack_bang_size > 0) {
    generate_stack_overflow_check(stack_bang_size);

    // We always push rbp, so that on return to interpreter rbp, will be
    // restored correctly and we can correct the stack.
    push(rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      mov(rbp, rsp);
    }
    // Remove word for ebp
    framesize -= wordSize;

    // Create frame
    if (framesize) {
      subptr(rsp, framesize);
    }
  } else {
    // No stack bang required: allocate the whole frame in one subtraction
    // and store the caller's rbp into its slot instead of pushing it.
    subptr(rsp, framesize);

    // Save RBP register now.
    framesize -= wordSize;
    movptr(Address(rsp, framesize), rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      movptr(rbp, rsp);
      if (framesize > 0) {
        addptr(rbp, framesize);
      }
    }
  }

  if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
    framesize -= wordSize;
    movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
  }

#ifdef ASSERT
  if (VerifyStackAtCalls) {
    // Check rsp alignment; the -wordSize bias accounts for the extra
    // push(rax) performed here on top of the expected call alignment.
    Label L;
    push(rax);
    mov(rax, rsp);
    andptr(rax, StackAlignmentInBytes-1);
    cmpptr(rax, StackAlignmentInBytes-wordSize);
    pop(rax);
    jcc(Assembler::equal, L);
    STOP("Stack is not properly aligned!");
    bind(L);
  }
#endif

  if (!is_stub) {
    BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
    // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
    Label dummy_slow_path;
    Label dummy_continuation;
    Label* slow_path = &dummy_slow_path;
    Label* continuation = &dummy_continuation;
    if (!Compile::current()->output()->in_scratch_emit_size()) {
      // Use real labels from actual stub when not emitting code for the purpose of measuring its size
      C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
      Compile::current()->output()->add_stub(stub);
      slow_path = &stub->entry();
      continuation = &stub->continuation();
    }
    bs->nmethod_entry_barrier(this, slow_path, continuation);
  }
}
135
136 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
137 switch (vlen_in_bytes) {
138 case 4: // fall-through
139 case 8: // fall-through
140 case 16: return Assembler::AVX_128bit;
141 case 32: return Assembler::AVX_256bit;
142 case 64: return Assembler::AVX_512bit;
143
144 default: {
145 ShouldNotReachHere();
146 return Assembler::AVX_NoVec;
147 }
148 }
149 }
150
151 // fast_lock and fast_unlock used by C2
152
153 // Because the transitions from emitted code to the runtime
154 // monitorenter/exit helper stubs are so slow it's critical that
155 // we inline both the stack-locking fast path and the inflated fast path.
156 //
157 // See also: cmpFastLock and cmpFastUnlock.
158 //
159 // What follows is a specialized inline transliteration of the code
160 // in enter() and exit(). If we're concerned about I$ bloat another
161 // option would be to emit TrySlowEnter and TrySlowExit methods
162 // at startup-time. These methods would accept arguments as
163 // (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
164 // indications in the icc.ZFlag. fast_lock and fast_unlock would simply
165 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
166 // In practice, however, the # of lock sites is bounded and is usually small.
167 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
168 // if the processor uses simple bimodal branch predictors keyed by EIP
169 // Since the helper routines would be called from multiple synchronization
170 // sites.
171 //
172 // An even better approach would be write "MonitorEnter()" and "MonitorExit()"
173 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
174 // to those specialized methods. That'd give us a mostly platform-independent
175 // implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
177 // to park() or unpark() threads. We'd also need a few more unsafe operators
178 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
179 // (b) explicit barriers or fence operations.
180 //
181 // TODO:
182 //
183 // * Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
184 // This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
185 // Given TLAB allocation, Self is usually manifested in a register, so passing it into
186 // the lock operators would typically be faster than reifying Self.
187 //
188 // * Ideally I'd define the primitives as:
189 // fast_lock (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
190 // fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
191 // Unfortunately ADLC bugs prevent us from expressing the ideal form.
192 // Instead, we're stuck with a rather awkward and brittle register assignments below.
193 // Furthermore the register assignments are overconstrained, possibly resulting in
194 // sub-optimal code near the synchronization site.
195 //
196 // * Eliminate the sp-proximity tests and just use "== Self" tests instead.
197 // Alternately, use a better sp-proximity test.
198 //
199 // * Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
200 // Either one is sufficient to uniquely identify a thread.
201 // TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
202 //
203 // * Intrinsify notify() and notifyAll() for the common cases where the
204 // object is locked by the calling thread but the waitlist is empty.
205 // avoid the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
206 //
207 // * use jccb and jmpb instead of jcc and jmp to improve code density.
208 // But beware of excessive branch density on AMD Opterons.
209 //
210 // * Both fast_lock and fast_unlock set the ICC.ZF to indicate success
211 // or failure of the fast path. If the fast path fails then we pass
212 // control to the slow path, typically in C. In fast_lock and
213 // fast_unlock we often branch to DONE_LABEL, just to find that C2
214 // will emit a conditional branch immediately after the node.
215 // So we have branches to branches and lots of ICC.ZF games.
216 // Instead, it might be better to have C2 pass a "FailureLabel"
217 // into fast_lock and fast_unlock. In the case of success, control
218 // will drop through the node. ICC.ZF is undefined at exit.
219 // In the case of failure, the node will branch directly to the
220 // FailureLabel
221
222 // obj: object to lock
223 // box: on-stack box address -- KILLED
224 // rax: tmp -- KILLED
225 // t : tmp -- KILLED
// Emits the C2 fast path for monitorenter: first a lightweight lock via the
// per-thread lock-stack, then the inflated ObjectMonitor path.  On exit
// ZF == 1 means the lock was acquired; ZF == 0 means the caller must branch
// to the runtime slow path.  box, rax_reg and t are killed.
void C2_MacroAssembler::fast_lock(Register obj, Register box, Register rax_reg,
                                  Register t, Register thread) {
  assert(rax_reg == rax, "Used for CAS");
  assert_different_registers(obj, box, rax_reg, t, thread);

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. ZF value is irrelevant.
  Label locked;
  // Finish fast lock unsuccessfully. MUST jump with ZF == 0
  Label slow_path;

  if (UseObjectMonitorTable) {
    // Clear cache in case fast locking succeeds or we need to take the slow-path.
    movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), 0);
  }

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    // Synchronizing on a value-based class is diagnosed in the runtime.
    load_klass(rax_reg, obj, t);
    testb(Address(rax_reg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
    jcc(Assembler::notZero, slow_path);
  }

  const Register mark = t;

  { // Fast Lock

    Label push;

    const Register top = UseObjectMonitorTable ? rax_reg : box;

    // Load the mark.
    movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));

    // Prefetch top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    // Check for monitor (0b10).
    testptr(mark, markWord::monitor_value);
    jcc(Assembler::notZero, inflated);

    // Check if lock-stack is full.
    cmpl(top, LockStack::end_offset() - 1);
    jcc(Assembler::greater, slow_path);

    // Check if recursive. (Top-of-stack already holds obj.)
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    jccb(Assembler::equal, push);

    // Try to lock. Transition lock bits 0b01 => 0b00
    movptr(rax_reg, mark);
    orptr(rax_reg, markWord::unlocked_value);
    andptr(mark, ~(int32_t)markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    jcc(Assembler::notEqual, slow_path);

    if (UseObjectMonitorTable) {
      // Need to reload top, clobbered by CAS.
      movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
    }
    bind(push);
    // After successful lock, push object on lock-stack.
    movptr(Address(thread, top), obj);
    addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
    jmp(locked);
  }

  { // Handle inflated monitor.
    bind(inflated);

    const Register monitor = t;

    if (!UseObjectMonitorTable) {
      // Without the table, the tagged mark word itself is the monitor pointer.
      assert(mark == monitor, "should be the same here");
    } else {
      const Register hash = t;
      Label monitor_found;

      // Look for the monitor in the om_cache.

      ByteSize cache_offset = JavaThread::om_cache_oops_offset();
      ByteSize monitor_offset = OMCache::oop_to_monitor_difference();
      const int num_unrolled = OMCache::CAPACITY;
      // Unrolled linear probe over the per-thread cache entries.
      for (int i = 0; i < num_unrolled; i++) {
        movptr(monitor, Address(thread, cache_offset + monitor_offset));
        cmpptr(obj, Address(thread, cache_offset));
        jccb(Assembler::equal, monitor_found);
        cache_offset = cache_offset + OMCache::oop_to_oop_difference();
      }

      // Look for the monitor in the table.

      // Get the hash code.
      movptr(hash, Address(obj, oopDesc::mark_offset_in_bytes()));
      shrq(hash, markWord::hash_shift);
      andq(hash, markWord::hash_mask);

      // Get the table and calculate the bucket's address.
      lea(rax_reg, ExternalAddress(ObjectMonitorTable::current_table_address()));
      movptr(rax_reg, Address(rax_reg));
      andq(hash, Address(rax_reg, ObjectMonitorTable::table_capacity_mask_offset()));
      movptr(rax_reg, Address(rax_reg, ObjectMonitorTable::table_buckets_offset()));

      // Read the monitor from the bucket.
      movptr(monitor, Address(rax_reg, hash, Address::times_ptr));

      // Check if the monitor in the bucket is special (empty, tombstone or removed)
      cmpptr(monitor, ObjectMonitorTable::SpecialPointerValues::below_is_special);
      jcc(Assembler::below, slow_path);

      // Check if object matches.
      movptr(rax_reg, Address(monitor, ObjectMonitor::object_offset()));
      BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
      bs_asm->try_resolve_weak_handle_in_c2(this, rax_reg, slow_path);
      cmpptr(rax_reg, obj);
      jcc(Assembler::notEqual, slow_path);

      bind(monitor_found);
    }
    // With the table, 'monitor' is untagged; otherwise subtract the 0b10 tag
    // from every field offset.
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address recursions_address(monitor, ObjectMonitor::recursions_offset() - monitor_tag);
    const Address owner_address(monitor, ObjectMonitor::owner_offset() - monitor_tag);

    Label monitor_locked;
    // Lock the monitor.

    if (UseObjectMonitorTable) {
      // Cache the monitor for unlock before trashing box. On failure to acquire
      // the lock, the slow path will reset the entry accordingly (see CacheSetter).
      movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), monitor);
    }

    // Try to CAS owner (no owner => current thread's _monitor_owner_id).
    xorptr(rax_reg, rax_reg);
    movptr(box, Address(thread, JavaThread::monitor_owner_id_offset()));
    lock(); cmpxchgptr(box, owner_address);
    jccb(Assembler::equal, monitor_locked);

    // Check if recursive. (CAS left the current owner in rax.)
    cmpptr(box, rax_reg);
    jccb(Assembler::notEqual, slow_path);

    // Recursive.
    increment(recursions_address);

    bind(monitor_locked);
  }

  bind(locked);
  // Set ZF = 1
  xorl(rax_reg, rax_reg);

#ifdef ASSERT
  // Check that locked label is reached with ZF set.
  Label zf_correct;
  Label zf_bad_zero;
  jcc(Assembler::zero, zf_correct);
  jmp(zf_bad_zero);
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with ZF not set.
  jcc(Assembler::notZero, zf_correct);
  stop("Fast Lock ZF != 0");
  bind(zf_bad_zero);
  stop("Fast Lock ZF != 1");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}
397
398 // obj: object to lock
399 // rax: tmp -- KILLED
400 // t : tmp - cannot be obj nor rax -- KILLED
401 //
402 // Some commentary on balanced locking:
403 //
404 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
405 // Methods that don't have provably balanced locking are forced to run in the
406 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
407 // The interpreter provides two properties:
408 // I1: At return-time the interpreter automatically and quietly unlocks any
409 // objects acquired in the current activation (frame). Recall that the
410 // interpreter maintains an on-stack list of locks currently held by
411 // a frame.
412 // I2: If a method attempts to unlock an object that is not held by the
413 // frame the interpreter throws IMSX.
414 //
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
416 // B() doesn't have provably balanced locking so it runs in the interpreter.
417 // Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O
418 // is still locked by A().
419 //
420 // The only other source of unbalanced locking would be JNI. The "Java Native Interface
421 // Specification" states that an object locked by JNI's MonitorEnter should not be
422 // unlocked by "normal" java-level locking and vice-versa. The specification doesn't
423 // specify what will occur if a program engages in such mixed-mode locking, however.
424 // Arguably given that the spec legislates the JNI case as undefined our implementation
425 // could reasonably *avoid* checking owner in fast_unlock().
426 // In the interest of performance we elide m->Owner==Self check in unlock.
427 // A perfectly viable alternative is to elide the owner check except when
428 // Xcheck:jni is enabled.
429
// Emits the C2 fast path for monitorexit, mirroring fast_lock: pop the
// lock-stack entry for a lightweight lock, otherwise release the inflated
// ObjectMonitor.  On exit ZF == 1 means unlocked; ZF == 0 means the caller
// must take the runtime slow path.  reg_rax and t are killed.
void C2_MacroAssembler::fast_unlock(Register obj, Register reg_rax, Register t, Register thread) {
  assert(reg_rax == rax, "Used for CAS");
  assert_different_registers(obj, reg_rax, t);

  // Handle inflated monitor.
  Label inflated, inflated_check_lock_stack;
  // Finish fast unlock successfully. MUST jump with ZF == 1
  Label unlocked, slow_path;

  const Register mark = t;
  const Register monitor = t;
  const Register top = UseObjectMonitorTable ? t : reg_rax;
  const Register box = reg_rax;

  Label dummy;
  C2FastUnlockStub* stub = nullptr;

  if (!Compile::current()->output()->in_scratch_emit_size()) {
    // Out-of-line stub used when the mark-word CAS fails and the popped
    // lock-stack entry must be restored before going to the slow path.
    stub = new (Compile::current()->comp_arena()) C2FastUnlockStub(obj, mark, reg_rax, thread);
    Compile::current()->output()->add_stub(stub);
  }

  Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();

  { // Fast Unlock

    // Load top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    if (!UseObjectMonitorTable) {
      // Prefetch mark.
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }

    // Check if obj is top of lock-stack.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    // Top of lock stack was not obj. Must be monitor.
    jcc(Assembler::notEqual, inflated_check_lock_stack);

    // Pop lock-stack.
    DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
    subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);

    // Check if recursive. (Next entry down is also obj.)
    cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
    jcc(Assembler::equal, unlocked);

    // We elide the monitor check, let the CAS fail instead.

    if (UseObjectMonitorTable) {
      // Load mark.
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }

    // Try to unlock. Transition lock bits 0b00 => 0b01
    movptr(reg_rax, mark);
    andptr(reg_rax, ~(int32_t)markWord::lock_mask);
    orptr(mark, markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    // CAS failure: re-push the popped entry in the stub, then slow path.
    jcc(Assembler::notEqual, push_and_slow_path);
    jmp(unlocked);
  }


  { // Handle inflated monitor.
    bind(inflated_check_lock_stack);
#ifdef ASSERT
    // Verify obj is nowhere on this thread's lock-stack (it must be
    // monitor-locked); scans downward from top.
    Label check_done;
    subl(top, oopSize);
    cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
    jcc(Assembler::below, check_done);
    cmpptr(obj, Address(thread, top));
    jcc(Assembler::notEqual, inflated_check_lock_stack);
    stop("Fast Unlock lock on stack");
    bind(check_done);
    if (UseObjectMonitorTable) {
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }
    testptr(mark, markWord::monitor_value);
    jcc(Assembler::notZero, inflated);
    stop("Fast Unlock not monitor");
#endif

    bind(inflated);

    if (!UseObjectMonitorTable) {
      // Without the table, the tagged mark word itself is the monitor pointer.
      assert(mark == monitor, "should be the same here");
    } else {
      // Uses ObjectMonitorTable. Look for the monitor in our BasicLock on the stack.
      movptr(monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
      // null check with ZF == 0, no valid pointer below alignof(ObjectMonitor*)
      cmpptr(monitor, alignof(ObjectMonitor*));
      jcc(Assembler::below, slow_path);
    }
    // With the table, 'monitor' is untagged; otherwise subtract the 0b10 tag
    // from every field offset.
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag};
    const Address succ_address{monitor, ObjectMonitor::succ_offset() - monitor_tag};
    const Address entry_list_address{monitor, ObjectMonitor::entry_list_offset() - monitor_tag};
    const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag};

    Label recursive;

    // Check if recursive.
    cmpptr(recursions_address, 0);
    jcc(Assembler::notZero, recursive);

    // Set owner to null.
    // Release to satisfy the JMM
    movptr(owner_address, NULL_WORD);
    // We need a full fence after clearing owner to avoid stranding.
    // StoreLoad achieves this.
    membar(StoreLoad);

    // Check if the entry_list is empty.
    cmpptr(entry_list_address, NULL_WORD);
    jcc(Assembler::zero, unlocked); // If so we are done.

    // Check if there is a successor.
    cmpptr(succ_address, NULL_WORD);
    jcc(Assembler::notZero, unlocked); // If so we are done.

    // Save the monitor pointer in the current thread, so we can try to
    // reacquire the lock in SharedRuntime::monitor_exit_helper().
    if (!UseObjectMonitorTable) {
      andptr(monitor, ~(int32_t)markWord::monitor_value);
    }
    movptr(Address(thread, JavaThread::unlocked_inflated_monitor_offset()), monitor);

    orl(t, 1); // Fast Unlock ZF = 0
    jmpb(slow_path);

    // Recursive unlock.
    bind(recursive);
    decrement(recursions_address);
  }

  bind(unlocked);
  xorl(t, t); // Fast Unlock ZF = 1

#ifdef ASSERT
  // Check that unlocked label is reached with ZF set.
  Label zf_correct;
  Label zf_bad_zero;
  jcc(Assembler::zero, zf_correct);
  jmp(zf_bad_zero);
#endif

  bind(slow_path);
  if (stub != nullptr) {
    bind(stub->slow_path_continuation());
  }
#ifdef ASSERT
  // Check that stub->continuation() label is reached with ZF not set.
  jcc(Assembler::notZero, zf_correct);
  stop("Fast Unlock ZF != 0");
  bind(zf_bad_zero);
  stop("Fast Unlock ZF != 1");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}
591
// Runtime target called (never returning) when a CastII range check emitted
// by verify_int_in_range fails.
static void abort_verify_int_in_range(uint idx, jint val, jint lo, jint hi) {
  fatal("Invalid CastII, idx: %u, val: %d, lo: %d, hi: %d", idx, val, lo, hi);
}
595
596 static void reconstruct_frame_pointer_helper(MacroAssembler* masm, Register dst) {
597 const int framesize = Compile::current()->output()->frame_size_in_bytes();
598 masm->movptr(dst, rsp);
599 if (framesize > 2 * wordSize) {
600 masm->addptr(dst, framesize - 2 * wordSize);
601 }
602 }
603
604 void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) {
605 if (PreserveFramePointer) {
606 // frame pointer is valid
607 #ifdef ASSERT
608 // Verify frame pointer value in rbp.
609 reconstruct_frame_pointer_helper(this, rtmp);
610 Label L_success;
611 cmpq(rbp, rtmp);
612 jccb(Assembler::equal, L_success);
613 STOP("frame pointer mismatch");
614 bind(L_success);
615 #endif // ASSERT
616 } else {
617 reconstruct_frame_pointer_helper(this, rbp);
618 }
619 }
620
// Emit a runtime check that 'val' lies within the CastII type range
// [t->_lo, t->_hi]; on violation, call abort_verify_int_in_range (which is
// fatal).  No code is emitted for the unconstrained TypeInt::INT.
void C2_MacroAssembler::verify_int_in_range(uint idx, const TypeInt* t, Register val) {
  jint lo = t->_lo;
  jint hi = t->_hi;
  assert(lo < hi, "type should not be empty or constant, idx: %u, lo: %d, hi: %d", idx, lo, hi);
  if (t == TypeInt::INT) {
    return;
  }

  BLOCK_COMMENT("CastII {");
  Label fail;
  Label succeed;

  // Only test the bounds that actually constrain the value.
  if (lo != min_jint) {
    cmpl(val, lo);
    jccb(Assembler::less, fail);
  }
  if (hi != max_jint) {
    cmpl(val, hi);
    jccb(Assembler::greater, fail);
  }
  jmpb(succeed);

  bind(fail);
  // Marshal arguments and call the fatal reporter; hlt() is unreachable.
  movl(c_rarg0, idx);
  movl(c_rarg1, val);
  movl(c_rarg2, lo);
  movl(c_rarg3, hi);
  reconstruct_frame_pointer(rscratch1);
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_int_in_range)));
  hlt();
  bind(succeed);
  BLOCK_COMMENT("} // CastII");
}
654
// Runtime target called (never returning) when a CastLL range check emitted
// by verify_long_in_range fails.
static void abort_verify_long_in_range(uint idx, jlong val, jlong lo, jlong hi) {
  fatal("Invalid CastLL, idx: %u, val: " JLONG_FORMAT ", lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, val, lo, hi);
}
658
// Emit a runtime check that 'val' lies within the CastLL type range
// [t->_lo, t->_hi]; on violation, call abort_verify_long_in_range (which is
// fatal).  'tmp' is needed to materialize 64-bit bounds that do not fit an
// immediate.  No code is emitted for the unconstrained TypeLong::LONG.
void C2_MacroAssembler::verify_long_in_range(uint idx, const TypeLong* t, Register val, Register tmp) {
  jlong lo = t->_lo;
  jlong hi = t->_hi;
  assert(lo < hi, "type should not be empty or constant, idx: %u, lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, lo, hi);
  if (t == TypeLong::LONG) {
    return;
  }

  BLOCK_COMMENT("CastLL {");
  Label fail;
  Label succeed;

  // Compare val against a 64-bit bound: use an immediate when it fits in a
  // sign-extended 32-bit field, otherwise load it into tmp first.
  auto cmp_val = [&](jlong bound) {
    if (is_simm32(bound)) {
      cmpq(val, checked_cast<int>(bound));
    } else {
      mov64(tmp, bound);
      cmpq(val, tmp);
    }
  };

  // Only test the bounds that actually constrain the value.
  if (lo != min_jlong) {
    cmp_val(lo);
    jccb(Assembler::less, fail);
  }
  if (hi != max_jlong) {
    cmp_val(hi);
    jccb(Assembler::greater, fail);
  }
  jmpb(succeed);

  bind(fail);
  // Marshal arguments and call the fatal reporter; hlt() is unreachable.
  movl(c_rarg0, idx);
  movq(c_rarg1, val);
  mov64(c_rarg2, lo);
  mov64(c_rarg3, hi);
  reconstruct_frame_pointer(rscratch1);
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_long_in_range)));
  hlt();
  bind(succeed);
  BLOCK_COMMENT("} // CastLL");
}
701
702 //-------------------------------------------------------------------------------------------
703 // Generic instructions support for use in .ad files C2 code generation
704
705 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
706 if (dst != src) {
707 movdqu(dst, src);
708 }
709 if (opcode == Op_AbsVD) {
710 andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
711 } else {
712 assert((opcode == Op_NegVD),"opcode should be Op_NegD");
713 xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
714 }
715 }
716
717 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
718 if (opcode == Op_AbsVD) {
719 vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
720 } else {
721 assert((opcode == Op_NegVD),"opcode should be Op_NegD");
722 vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
723 }
724 }
725
726 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
727 if (dst != src) {
728 movdqu(dst, src);
729 }
730 if (opcode == Op_AbsVF) {
731 andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
732 } else {
733 assert((opcode == Op_NegVF),"opcode should be Op_NegF");
734 xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
735 }
736 }
737
738 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
739 if (opcode == Op_AbsVF) {
740 vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
741 } else {
742 assert((opcode == Op_NegVF),"opcode should be Op_NegF");
743 vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
744 }
745 }
746
// SSE signed elementwise min/max (Op_MinV/Op_MaxV) of dst and src into dst.
// For T_LONG there is no direct pminsq/pmaxsq, so a pcmpgtq mask is built
// in xmm0 ('tmp' must be xmm0: the non-VEX blendvpd implicitly reads xmm0)
// and used to blend src into dst.
void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
  assert(tmp == xnoreg || elem_bt == T_LONG, "unused");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      pminsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pminsw(dst, src);
    } else if (elem_bt == T_INT) {
      pminsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      // Mask lanes where dst > src, then pick src there (the minimum).
      movdqu(xmm0, dst);
      pcmpgtq(xmm0, src);
      blendvpd(dst, src);  // xmm0 as mask
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      pmaxsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pmaxsw(dst, src);
    } else if (elem_bt == T_INT) {
      pmaxsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      // Mask lanes where src > dst, then pick src there (the maximum).
      movdqu(xmm0, src);
      pcmpgtq(xmm0, dst);
      blendvpd(dst, src);  // xmm0 as mask
    }
  }
}
783
784 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
785 XMMRegister src1, Address src2, int vlen_enc) {
786 assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
787 if (opcode == Op_UMinV) {
788 switch(elem_bt) {
789 case T_BYTE: vpminub(dst, src1, src2, vlen_enc); break;
790 case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
791 case T_INT: vpminud(dst, src1, src2, vlen_enc); break;
792 case T_LONG: evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
793 default: fatal("Unsupported type %s", type2name(elem_bt)); break;
794 }
795 } else {
796 assert(opcode == Op_UMaxV, "required");
797 switch(elem_bt) {
798 case T_BYTE: vpmaxub(dst, src1, src2, vlen_enc); break;
799 case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
800 case T_INT: vpmaxud(dst, src1, src2, vlen_enc); break;
801 case T_LONG: evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
802 default: fatal("Unsupported type %s", type2name(elem_bt)); break;
803 }
804 }
805 }
806
// Unsigned 64-bit elementwise min/max without AVX-512VL: either widen to the
// full 512-bit EVEX form, or emulate unsigned compare by biasing both inputs
// with 1<<63 (turning an unsigned compare into a signed one) and blending.
// xtmp1/xtmp2 are clobbered in the emulation path.
void C2_MacroAssembler::vpuminmaxq(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  // For optimality, leverage a full vector width of 512 bits
  // for operations over smaller vector sizes on AVX512 targets.
  if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
    if (opcode == Op_UMaxV) {
      evpmaxuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
    } else {
      assert(opcode == Op_UMinV, "required");
      evpminuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
    }
  } else {
    // T1 = -1
    vpcmpeqq(xtmp1, xtmp1, xtmp1, vlen_enc);
    // T1 = -1 << 63
    vpsllq(xtmp1, xtmp1, 63, vlen_enc);
    // Convert SRC2 to signed value i.e. T2 = T1 + SRC2
    vpaddq(xtmp2, xtmp1, src2, vlen_enc);
    // Convert SRC1 to signed value i.e. T1 = T1 + SRC1
    vpaddq(xtmp1, xtmp1, src1, vlen_enc);
    // Mask = T2 > T1
    vpcmpgtq(xtmp1, xtmp2, xtmp1, vlen_enc);
    if (opcode == Op_UMaxV) {
      // Res = Mask ? Src2 : Src1
      vpblendvb(dst, src1, src2, xtmp1, vlen_enc);
    } else {
      // Res = Mask ? Src1 : Src2
      vpblendvb(dst, src2, src1, xtmp1, vlen_enc);
    }
  }
}
837
838 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
839 XMMRegister src1, XMMRegister src2, int vlen_enc) {
840 assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
841 if (opcode == Op_UMinV) {
842 switch(elem_bt) {
843 case T_BYTE: vpminub(dst, src1, src2, vlen_enc); break;
844 case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
845 case T_INT: vpminud(dst, src1, src2, vlen_enc); break;
846 case T_LONG: evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
847 default: fatal("Unsupported type %s", type2name(elem_bt)); break;
848 }
849 } else {
850 assert(opcode == Op_UMaxV, "required");
851 switch(elem_bt) {
852 case T_BYTE: vpmaxub(dst, src1, src2, vlen_enc); break;
853 case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
854 case T_INT: vpmaxud(dst, src1, src2, vlen_enc); break;
855 case T_LONG: evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
856 default: fatal("Unsupported type %s", type2name(elem_bt)); break;
857 }
858 }
859 }
860
861 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
862 XMMRegister dst, XMMRegister src1, XMMRegister src2,
863 int vlen_enc) {
864 assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
865
866 if (opcode == Op_MinV) {
867 if (elem_bt == T_BYTE) {
868 vpminsb(dst, src1, src2, vlen_enc);
869 } else if (elem_bt == T_SHORT) {
870 vpminsw(dst, src1, src2, vlen_enc);
871 } else if (elem_bt == T_INT) {
872 vpminsd(dst, src1, src2, vlen_enc);
873 } else {
874 assert(elem_bt == T_LONG, "required");
875 if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
876 vpminsq(dst, src1, src2, vlen_enc);
877 } else {
878 assert_different_registers(dst, src1, src2);
879 vpcmpgtq(dst, src1, src2, vlen_enc);
880 vblendvpd(dst, src1, src2, dst, vlen_enc);
881 }
882 }
883 } else { // opcode == Op_MaxV
884 if (elem_bt == T_BYTE) {
885 vpmaxsb(dst, src1, src2, vlen_enc);
886 } else if (elem_bt == T_SHORT) {
887 vpmaxsw(dst, src1, src2, vlen_enc);
888 } else if (elem_bt == T_INT) {
889 vpmaxsd(dst, src1, src2, vlen_enc);
890 } else {
891 assert(elem_bt == T_LONG, "required");
892 if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
893 vpmaxsq(dst, src1, src2, vlen_enc);
894 } else {
895 assert_different_registers(dst, src1, src2);
896 vpcmpgtq(dst, src1, src2, vlen_enc);
897 vblendvpd(dst, src2, src1, dst, vlen_enc);
898 }
899 }
900 }
901 }
902
903 // Float/Double min max
904
// Vector float/double min/max with Java semantics (-0.0 < +0.0, NaN
// propagates). Clobbers tmp, atmp and btmp; a and b are preserved.
void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
                                   XMMRegister dst, XMMRegister a, XMMRegister b,
                                   XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                   int vlen_enc) {
  assert(UseAVX > 0, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(a, tmp, atmp, btmp);
  assert_different_registers(b, tmp, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);

  /* Note on 'non-obvious' assembly sequence:
   *
   * While there are vminps/vmaxps instructions, there are two important differences between hardware
   * and Java on how they handle floats:
   *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
   *  b. NaN is not necessarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
   *
   * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
   *  a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
   *                (only useful when signs differ, noop otherwise)
   *  b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])
   *
   * Following pseudo code describes the algorithm for max[FD] (Min algorithm is on similar lines):
   *   btmp = (b < +0.0) ? a : b
   *   atmp = (b < +0.0) ? b : a
   *   Tmp  = Max_Float(atmp , btmp)
   *   Res  = (atmp == NaN) ? atmp : Tmp
   */

  // Select the instruction forms once so the blend/compare sequence below can
  // be written generically for all four (lane width, min/max) combinations.
  void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
  void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
  void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
  // 'mask' is the operand whose sign drives the pre-blend: a for min, b for
  // max (matching the pseudo code above).
  XMMRegister mask;

  if (!is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vminps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (!is_double_word && !is_min) {
    mask = b;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vmaxps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vminpd;
    vcmp = &MacroAssembler::vcmppd;
  } else {
    assert(is_double_word && !is_min, "sanity");
    mask = b;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vmaxpd;
    vcmp = &MacroAssembler::vcmppd;
  }

  // Make sure EnableX86ECoreOpts isn't disabled on register overlaps
  XMMRegister maxmin, scratch;
  if (dst == btmp) {
    maxmin = btmp;
    scratch = tmp;
  } else {
    maxmin = tmp;
    scratch = btmp;
  }

  // E-core tuned path: materialize the sign mask explicitly (arithmetic shift
  // / signed compare against zero) so the blends can take a plain mask operand
  // instead of the implicit sign-of-operand form.
  bool precompute_mask = EnableX86ECoreOpts && UseAVX>1;
  if (precompute_mask && !is_double_word) {
    vpsrad(tmp, mask, 32, vlen_enc);
    mask = tmp;
  } else if (precompute_mask && is_double_word) {
    vpxor(tmp, tmp, tmp, vlen_enc);
    vpcmpgtq(tmp, tmp, mask, vlen_enc);
    mask = tmp;
  }

  (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);
  (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);
  (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
  // NaN handling: lanes where atmp is unordered with itself (i.e. NaN) take
  // atmp rather than the hardware min/max result.
  (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
  (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch);
}
992
// AVX512 variant of vminmax_fp above: same Java min/max semantics (signed
// zero ordering, NaN propagated from the first operand) implemented with
// opmask registers. Clobbers ktmp, atmp and btmp.
//
// Common shape of all four branches:
//   ktmp      = sign bits of the biasing operand (a for min, b for max)
//   atmp/btmp = a and b swapped per-lane on that sign mask
//   dst       = vmin/vmax(atmp, btmp)
//   dst       = atmp in lanes where atmp is NaN (unordered with itself)
void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
                                    XMMRegister dst, XMMRegister a, XMMRegister b,
                                    KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
                                    int vlen_enc) {
  assert(UseAVX > 2, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(dst, a, atmp, btmp);
  assert_different_registers(dst, b, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);
  bool merge = true;

  if (!is_double_word && is_min) {
    evpmovd2m(ktmp, a, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vminps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (!is_double_word && !is_min) {
    evpmovd2m(ktmp, b, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (is_double_word && is_min) {
    evpmovq2m(ktmp, a, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vminpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  } else {
    assert(is_double_word && !is_min, "sanity");
    evpmovq2m(ktmp, b, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  }
}
1039
1040 void C2_MacroAssembler::vminmax_fp(int opc, BasicType elem_bt, XMMRegister dst, KRegister mask,
1041 XMMRegister src1, XMMRegister src2, int vlen_enc) {
1042 assert(opc == Op_MinV || opc == Op_MinReductionV ||
1043 opc == Op_MaxV || opc == Op_MaxReductionV, "sanity");
1044
1045 int imm8 = (opc == Op_MinV || opc == Op_MinReductionV) ? AVX10_2_MINMAX_MIN_COMPARE_SIGN
1046 : AVX10_2_MINMAX_MAX_COMPARE_SIGN;
1047 if (elem_bt == T_FLOAT) {
1048 evminmaxps(dst, mask, src1, src2, true, imm8, vlen_enc);
1049 } else {
1050 assert(elem_bt == T_DOUBLE, "");
1051 evminmaxpd(dst, mask, src1, src2, true, imm8, vlen_enc);
1052 }
1053 }
1054
1055 // Float/Double signum
// Computes signum(dst) in place: +0.0/-0.0/NaN are returned unchanged,
// strictly positive inputs yield +1.0 and strictly negative inputs -1.0.
// 'zero' and 'one' must hold the constants 0.0 and 1.0 of the matching type.
void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
  assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");

  Label DONE_LABEL;

  // Handle special cases +0.0/-0.0 and NaN, if argument is +0.0/-0.0 or NaN, return argument
  // If AVX10.2 (or newer) floating point comparison instructions used, SF=1 for equal and unordered cases
  // If other floating point comparison instructions used, ZF=1 for equal and unordered cases
  if (opcode == Op_SignumF) {
    if (VM_Version::supports_avx10_2()) {
      vucomxss(dst, zero);
      jcc(Assembler::negative, DONE_LABEL);
    } else {
      ucomiss(dst, zero);
      jcc(Assembler::equal, DONE_LABEL);
    }
    // Input is strictly positive or negative: start from +1.0. The flags from
    // the comparison above are still live (movflt does not modify them).
    movflt(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    // Negative input: flip the sign bit of +1.0 to produce -1.0.
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
  } else if (opcode == Op_SignumD) {
    if (VM_Version::supports_avx10_2()) {
      vucomxsd(dst, zero);
      jcc(Assembler::negative, DONE_LABEL);
    } else {
      ucomisd(dst, zero);
      jcc(Assembler::equal, DONE_LABEL);
    }
    // Same as the float path, using the double flavors of the instructions.
    movdbl(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
  }

  bind(DONE_LABEL);
}
1090
1091 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1092 if (sign) {
1093 pmovsxbw(dst, src);
1094 } else {
1095 pmovzxbw(dst, src);
1096 }
1097 }
1098
1099 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1100 if (sign) {
1101 vpmovsxbw(dst, src, vector_len);
1102 } else {
1103 vpmovzxbw(dst, src, vector_len);
1104 }
1105 }
1106
1107 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1108 if (sign) {
1109 vpmovsxbd(dst, src, vector_len);
1110 } else {
1111 vpmovzxbd(dst, src, vector_len);
1112 }
1113 }
1114
1115 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1116 if (sign) {
1117 vpmovsxwd(dst, src, vector_len);
1118 } else {
1119 vpmovzxwd(dst, src, vector_len);
1120 }
1121 }
1122
1123 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1124 int shift, int vector_len) {
1125 if (opcode == Op_RotateLeftV) {
1126 if (etype == T_INT) {
1127 evprold(dst, src, shift, vector_len);
1128 } else {
1129 assert(etype == T_LONG, "expected type T_LONG");
1130 evprolq(dst, src, shift, vector_len);
1131 }
1132 } else {
1133 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1134 if (etype == T_INT) {
1135 evprord(dst, src, shift, vector_len);
1136 } else {
1137 assert(etype == T_LONG, "expected type T_LONG");
1138 evprorq(dst, src, shift, vector_len);
1139 }
1140 }
1141 }
1142
1143 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1144 XMMRegister shift, int vector_len) {
1145 if (opcode == Op_RotateLeftV) {
1146 if (etype == T_INT) {
1147 evprolvd(dst, src, shift, vector_len);
1148 } else {
1149 assert(etype == T_LONG, "expected type T_LONG");
1150 evprolvq(dst, src, shift, vector_len);
1151 }
1152 } else {
1153 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1154 if (etype == T_INT) {
1155 evprorvd(dst, src, shift, vector_len);
1156 } else {
1157 assert(etype == T_LONG, "expected type T_LONG");
1158 evprorvq(dst, src, shift, vector_len);
1159 }
1160 }
1161 }
1162
1163 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1164 if (opcode == Op_RShiftVI) {
1165 psrad(dst, shift);
1166 } else if (opcode == Op_LShiftVI) {
1167 pslld(dst, shift);
1168 } else {
1169 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1170 psrld(dst, shift);
1171 }
1172 }
1173
1174 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1175 switch (opcode) {
1176 case Op_RShiftVI: psrad(dst, shift); break;
1177 case Op_LShiftVI: pslld(dst, shift); break;
1178 case Op_URShiftVI: psrld(dst, shift); break;
1179
1180 default: assert(false, "%s", NodeClassNames[opcode]);
1181 }
1182 }
1183
1184 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1185 if (opcode == Op_RShiftVI) {
1186 vpsrad(dst, nds, shift, vector_len);
1187 } else if (opcode == Op_LShiftVI) {
1188 vpslld(dst, nds, shift, vector_len);
1189 } else {
1190 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1191 vpsrld(dst, nds, shift, vector_len);
1192 }
1193 }
1194
1195 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1196 switch (opcode) {
1197 case Op_RShiftVI: vpsrad(dst, src, shift, vlen_enc); break;
1198 case Op_LShiftVI: vpslld(dst, src, shift, vlen_enc); break;
1199 case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1200
1201 default: assert(false, "%s", NodeClassNames[opcode]);
1202 }
1203 }
1204
1205 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1206 switch (opcode) {
1207 case Op_RShiftVB: // fall-through
1208 case Op_RShiftVS: psraw(dst, shift); break;
1209
1210 case Op_LShiftVB: // fall-through
1211 case Op_LShiftVS: psllw(dst, shift); break;
1212
1213 case Op_URShiftVS: // fall-through
1214 case Op_URShiftVB: psrlw(dst, shift); break;
1215
1216 default: assert(false, "%s", NodeClassNames[opcode]);
1217 }
1218 }
1219
1220 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1221 switch (opcode) {
1222 case Op_RShiftVB: // fall-through
1223 case Op_RShiftVS: vpsraw(dst, src, shift, vlen_enc); break;
1224
1225 case Op_LShiftVB: // fall-through
1226 case Op_LShiftVS: vpsllw(dst, src, shift, vlen_enc); break;
1227
1228 case Op_URShiftVS: // fall-through
1229 case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1230
1231 default: assert(false, "%s", NodeClassNames[opcode]);
1232 }
1233 }
1234
1235 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1236 switch (opcode) {
1237 case Op_RShiftVL: psrlq(dst, shift); break; // using srl to implement sra on pre-avs512 systems
1238 case Op_LShiftVL: psllq(dst, shift); break;
1239 case Op_URShiftVL: psrlq(dst, shift); break;
1240
1241 default: assert(false, "%s", NodeClassNames[opcode]);
1242 }
1243 }
1244
1245 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1246 if (opcode == Op_RShiftVL) {
1247 psrlq(dst, shift); // using srl to implement sra on pre-avs512 systems
1248 } else if (opcode == Op_LShiftVL) {
1249 psllq(dst, shift);
1250 } else {
1251 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1252 psrlq(dst, shift);
1253 }
1254 }
1255
1256 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1257 switch (opcode) {
1258 case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1259 case Op_LShiftVL: vpsllq(dst, src, shift, vlen_enc); break;
1260 case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1261
1262 default: assert(false, "%s", NodeClassNames[opcode]);
1263 }
1264 }
1265
1266 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1267 if (opcode == Op_RShiftVL) {
1268 evpsraq(dst, nds, shift, vector_len);
1269 } else if (opcode == Op_LShiftVL) {
1270 vpsllq(dst, nds, shift, vector_len);
1271 } else {
1272 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1273 vpsrlq(dst, nds, shift, vector_len);
1274 }
1275 }
1276
1277 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1278 switch (opcode) {
1279 case Op_RShiftVB: // fall-through
1280 case Op_RShiftVS: // fall-through
1281 case Op_RShiftVI: vpsravd(dst, src, shift, vlen_enc); break;
1282
1283 case Op_LShiftVB: // fall-through
1284 case Op_LShiftVS: // fall-through
1285 case Op_LShiftVI: vpsllvd(dst, src, shift, vlen_enc); break;
1286
1287 case Op_URShiftVB: // fall-through
1288 case Op_URShiftVS: // fall-through
1289 case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1290
1291 default: assert(false, "%s", NodeClassNames[opcode]);
1292 }
1293 }
1294
1295 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1296 switch (opcode) {
1297 case Op_RShiftVB: // fall-through
1298 case Op_RShiftVS: evpsravw(dst, src, shift, vlen_enc); break;
1299
1300 case Op_LShiftVB: // fall-through
1301 case Op_LShiftVS: evpsllvw(dst, src, shift, vlen_enc); break;
1302
1303 case Op_URShiftVB: // fall-through
1304 case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1305
1306 default: assert(false, "%s", NodeClassNames[opcode]);
1307 }
1308 }
1309
// Per-lane variable shift of 64-bit lanes. 'tmp' is only used (and clobbered)
// on the AVX2 signed-right-shift emulation path; all other paths require it
// to be xnoreg.
void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
  assert(UseAVX >= 2, "required");
  switch (opcode) {
    case Op_RShiftVL: {
      if (UseAVX > 2) {
        assert(tmp == xnoreg, "not used");
        if (!VM_Version::supports_avx512vl()) {
          // evpsravq below 512 bits needs AVX512VL; widen to 512 bits instead.
          vlen_enc = Assembler::AVX_512bit;
        }
        evpsravq(dst, src, shift, vlen_enc);
      } else {
        // AVX2 has no variable arithmetic right shift for quadwords.
        // Emulate via logical shift plus sign fix-up:
        //   x >>s n == ((x >>u n) ^ m) - m, where m = sign_mask >>u n.
        vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
        vpsrlvq(dst, src, shift, vlen_enc);
        vpsrlvq(tmp, tmp, shift, vlen_enc);
        vpxor(dst, dst, tmp, vlen_enc);
        vpsubq(dst, dst, tmp, vlen_enc);
      }
      break;
    }
    case Op_LShiftVL: {
      assert(tmp == xnoreg, "not used");
      vpsllvq(dst, src, shift, vlen_enc);
      break;
    }
    case Op_URShiftVL: {
      assert(tmp == xnoreg, "not used");
      vpsrlvq(dst, src, shift, vlen_enc);
      break;
    }
    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}
1342
// Variable shift src by shift using vtmp as a TEMP, giving word result in dst
void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
  assert(opcode == Op_LShiftVB ||
         opcode == Op_RShiftVB ||
         opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
  bool sign = (opcode != Op_URShiftVB);
  assert(vector_len == 0, "required");
  // Widen the byte lanes to dwords (sign-extend for arithmetic right shift).
  vextendbd(sign, dst, src, 1);
  // The per-lane shift counts are bytes too; zero-extend them to dwords.
  vpmovzxbd(vtmp, shift, 1);
  // Shift each dword lane by its own count.
  varshiftd(opcode, dst, dst, vtmp, 1);
  // Keep only the low byte of each dword before narrowing.
  vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
  // Pack the dwords back down to words.
  vextracti128_high(vtmp, dst);
  vpackusdw(dst, dst, vtmp, 0);
}
1357
// Variable shift src by shift using vtmp as a TEMP, giving byte result in dst
void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
  assert(opcode == Op_LShiftVB ||
         opcode == Op_RShiftVB ||
         opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
  bool sign = (opcode != Op_URShiftVB);
  // Work one vector size up: bytes are widened to words before shifting.
  int ext_vector_len = vector_len + 1;
  vextendbw(sign, dst, src, ext_vector_len);
  vpmovzxbw(vtmp, shift, ext_vector_len);
  varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
  // Clear the high byte of every word so the unsigned pack below keeps the
  // intended low-byte values.
  vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
  if (vector_len == 0) {
    vextracti128_high(vtmp, dst);
    vpackuswb(dst, dst, vtmp, vector_len);
  } else {
    vextracti64x4_high(vtmp, dst);
    vpackuswb(dst, dst, vtmp, vector_len);
    // vpackuswb operates per 128-bit block; restore the lane order.
    vpermq(dst, dst, 0xD8, vector_len);
  }
}
1378
1379 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1380 switch(typ) {
1381 case T_BYTE:
1382 pinsrb(dst, val, idx);
1383 break;
1384 case T_SHORT:
1385 pinsrw(dst, val, idx);
1386 break;
1387 case T_INT:
1388 pinsrd(dst, val, idx);
1389 break;
1390 case T_LONG:
1391 pinsrq(dst, val, idx);
1392 break;
1393 default:
1394 assert(false,"Should not reach here.");
1395 break;
1396 }
1397 }
1398
1399 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1400 switch(typ) {
1401 case T_BYTE:
1402 vpinsrb(dst, src, val, idx);
1403 break;
1404 case T_SHORT:
1405 vpinsrw(dst, src, val, idx);
1406 break;
1407 case T_INT:
1408 vpinsrd(dst, src, val, idx);
1409 break;
1410 case T_LONG:
1411 vpinsrq(dst, src, val, idx);
1412 break;
1413 default:
1414 assert(false,"Should not reach here.");
1415 break;
1416 }
1417 }
1418
1419 void C2_MacroAssembler::vgather8b_masked(BasicType elem_bt, XMMRegister dst,
1420 Register base, Register idx_base,
1421 Register mask, Register mask_idx,
1422 Register rtmp, int vlen_enc) {
1423 vpxor(dst, dst, dst, vlen_enc);
1424 if (elem_bt == T_SHORT) {
1425 for (int i = 0; i < 4; i++) {
1426 // dst[i] = mask[i] ? src[idx_base[i]] : 0
1427 Label skip_load;
1428 btq(mask, mask_idx);
1429 jccb(Assembler::carryClear, skip_load);
1430 movl(rtmp, Address(idx_base, i * 4));
1431 pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1432 bind(skip_load);
1433 incq(mask_idx);
1434 }
1435 } else {
1436 assert(elem_bt == T_BYTE, "");
1437 for (int i = 0; i < 8; i++) {
1438 // dst[i] = mask[i] ? src[idx_base[i]] : 0
1439 Label skip_load;
1440 btq(mask, mask_idx);
1441 jccb(Assembler::carryClear, skip_load);
1442 movl(rtmp, Address(idx_base, i * 4));
1443 pinsrb(dst, Address(base, rtmp), i);
1444 bind(skip_load);
1445 incq(mask_idx);
1446 }
1447 }
1448 }
1449
1450 void C2_MacroAssembler::vgather8b(BasicType elem_bt, XMMRegister dst,
1451 Register base, Register idx_base,
1452 Register rtmp, int vlen_enc) {
1453 vpxor(dst, dst, dst, vlen_enc);
1454 if (elem_bt == T_SHORT) {
1455 for (int i = 0; i < 4; i++) {
1456 // dst[i] = src[idx_base[i]]
1457 movl(rtmp, Address(idx_base, i * 4));
1458 pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1459 }
1460 } else {
1461 assert(elem_bt == T_BYTE, "");
1462 for (int i = 0; i < 8; i++) {
1463 // dst[i] = src[idx_base[i]]
1464 movl(rtmp, Address(idx_base, i * 4));
1465 pinsrb(dst, Address(base, rtmp), i);
1466 }
1467 }
1468 }
1469
1470 /*
1471 * Gather using hybrid algorithm, first partially unroll scalar loop
1472 * to accumulate values from gather indices into a quad-word(64bit) slice.
1473 * A slice may hold 8 bytes or 4 short values. This is followed by a vector
1474 * permutation to place the slice into appropriate vector lane
1475 * locations in destination vector. Following pseudo code describes the
1476 * algorithm in detail:
1477 *
1478 * DST_VEC = ZERO_VEC
1479 * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..}
1480 * TWO_VEC = {2, 2, 2, 2, 2, 2, 2, 2, 2..}
1481 * FOREACH_ITER:
1482 * TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES
1483 * TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX
1484 * DST_VEC = DST_VEC OR TEMP_PERM_VEC
1485 * PERM_INDEX = PERM_INDEX - TWO_VEC
1486 *
1487 * With each iteration, doubleword permute indices (0,1) corresponding
1488 * to gathered quadword gets right shifted by two lane positions.
1489 *
1490 */
// Hybrid subword gather; see the algorithm description in the comment block
// above. Gathers 'vector_len' byte/short elements into dst, 64 bits at a
// time, ORing each permuted slice into place. Clobbers xtmp1, xtmp2,
// temp_dst, rtmp, length and (when masked) mask_idx.
void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst,
                                        Register base, Register idx_base,
                                        Register mask, XMMRegister xtmp1,
                                        XMMRegister xtmp2, XMMRegister temp_dst,
                                        Register rtmp, Register mask_idx,
                                        Register length, int vector_len, int vlen_enc) {
  Label GATHER8_LOOP;
  assert(is_subword_type(elem_ty), "");
  movl(length, vector_len);                 // lanes still to gather
  vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...}
  vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...}
  vallones(xtmp2, vlen_enc);                // xtmp2 = {-1, ...}
  vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc);    // xtmp2 = 0 - (-1) = {1, ...}
  vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...}
  load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...}

  bind(GATHER8_LOOP);
    // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES
    if (mask == noreg) {
      vgather8b(elem_ty, temp_dst, base, idx_base, rtmp, vlen_enc);
    } else {
      vgather8b_masked(elem_ty, temp_dst, base, idx_base, mask, mask_idx, rtmp, vlen_enc);
    }
    // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1)
    vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? vlen_enc : Assembler::AVX_256bit);
    // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2)
    vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc);
    // DST_VEC = DST_VEC OR TEMP_PERM_VEC
    vpor(dst, dst, temp_dst, vlen_enc);
    // Advance past the 8 (byte) / 4 (short) int indices just consumed.
    addptr(idx_base,  32 >> (type2aelembytes(elem_ty) - 1));
    subl(length, 8 >> (type2aelembytes(elem_ty) - 1));
    jcc(Assembler::notEqual, GATHER8_LOOP);
}
1524
1525 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1526 switch(typ) {
1527 case T_INT:
1528 vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1529 break;
1530 case T_FLOAT:
1531 vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1532 break;
1533 case T_LONG:
1534 vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1535 break;
1536 case T_DOUBLE:
1537 vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1538 break;
1539 default:
1540 assert(false,"Should not reach here.");
1541 break;
1542 }
1543 }
1544
1545 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1546 switch(typ) {
1547 case T_INT:
1548 evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1549 break;
1550 case T_FLOAT:
1551 evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1552 break;
1553 case T_LONG:
1554 evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1555 break;
1556 case T_DOUBLE:
1557 evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1558 break;
1559 default:
1560 assert(false,"Should not reach here.");
1561 break;
1562 }
1563 }
1564
1565 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1566 switch(typ) {
1567 case T_INT:
1568 evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1569 break;
1570 case T_FLOAT:
1571 evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1572 break;
1573 case T_LONG:
1574 evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1575 break;
1576 case T_DOUBLE:
1577 evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1578 break;
1579 default:
1580 assert(false,"Should not reach here.");
1581 break;
1582 }
1583 }
1584
// Expand a byte-per-lane boolean vector 'src' into a full element-width mask
// in 'dst': 0 - b negates each byte (1 becomes 0xFF), and the result is then
// sign-extended to the element width.
// NOTE(review): assumes src lanes hold 0 or 1 — confirm with callers.
void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
  if (vlen_in_bytes <= 16) {
    pxor (dst, dst);
    psubb(dst, src);
    switch (elem_bt) {
      case T_BYTE:   /* nothing to do */ break;
      case T_SHORT:  pmovsxbw(dst, dst); break;
      case T_INT:    pmovsxbd(dst, dst); break;
      case T_FLOAT:  pmovsxbd(dst, dst); break;
      case T_LONG:   pmovsxbq(dst, dst); break;
      case T_DOUBLE: pmovsxbq(dst, dst); break;

      default: assert(false, "%s", type2name(elem_bt));
    }
  } else {
    assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
    int vlen_enc = vector_length_encoding(vlen_in_bytes);

    vpxor (dst, dst, dst, vlen_enc);
    // On the legacy (VEX) path the boolean bytes fit in 256 bits even when
    // the widened mask needs the full vlen_enc width.
    vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);

    switch (elem_bt) {
      case T_BYTE:   /* nothing to do */ break;
      case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
      case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
      case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
      case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
      case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;

      default: assert(false, "%s", type2name(elem_bt));
    }
  }
}
1618
1619 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) {
1620 if (novlbwdq) {
1621 vpmovsxbd(xtmp, src, vlen_enc);
1622 evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
1623 Assembler::eq, true, vlen_enc, noreg);
1624 } else {
1625 vpxor(xtmp, xtmp, xtmp, vlen_enc);
1626 vpsubb(xtmp, xtmp, src, vlen_enc);
1627 evpmovb2m(dst, xtmp, vlen_enc);
1628 }
1629 }
1630
1631 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, Address src, int vlen_in_bytes) {
1632 if (is_integral_type(bt)) {
1633 switch (vlen_in_bytes) {
1634 case 4: movdl(dst, src); break;
1635 case 8: movq(dst, src); break;
1636 case 16: movdqu(dst, src); break;
1637 case 32: vmovdqu(dst, src); break;
1638 case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break;
1639 default: ShouldNotReachHere();
1640 }
1641 } else {
1642 switch (vlen_in_bytes) {
1643 case 4: movflt(dst, src); break;
1644 case 8: movdbl(dst, src); break;
1645 case 16: movups(dst, src); break;
1646 case 32: vmovups(dst, src, Assembler::AVX_256bit); break;
1647 case 64: vmovups(dst, src, Assembler::AVX_512bit); break;
1648 default: ShouldNotReachHere();
1649 }
1650 }
1651 }
1652
1653 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) {
1654 assert(rscratch != noreg || always_reachable(src), "missing");
1655
1656 if (reachable(src)) {
1657 load_vector(bt, dst, as_Address(src), vlen_in_bytes);
1658 } else {
1659 lea(rscratch, src);
1660 load_vector(bt, dst, Address(rscratch, 0), vlen_in_bytes);
1661 }
1662 }
1663
1664 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) {
1665 int vlen_enc = vector_length_encoding(vlen);
1666 if (VM_Version::supports_avx()) {
1667 if (bt == T_LONG) {
1668 if (VM_Version::supports_avx2()) {
1669 vpbroadcastq(dst, src, vlen_enc);
1670 } else {
1671 vmovddup(dst, src, vlen_enc);
1672 }
1673 } else if (bt == T_DOUBLE) {
1674 if (vlen_enc != Assembler::AVX_128bit) {
1675 vbroadcastsd(dst, src, vlen_enc, noreg);
1676 } else {
1677 vmovddup(dst, src, vlen_enc);
1678 }
1679 } else {
1680 if (VM_Version::supports_avx2() && is_integral_type(bt)) {
1681 vpbroadcastd(dst, src, vlen_enc);
1682 } else {
1683 vbroadcastss(dst, src, vlen_enc);
1684 }
1685 }
1686 } else if (VM_Version::supports_sse3()) {
1687 movddup(dst, src);
1688 } else {
1689 load_vector(bt, dst, src, vlen);
1690 }
1691 }
1692
1693 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) {
1694 // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64.
1695 int offset = exact_log2(type2aelembytes(bt)) << 6;
1696 if (is_floating_point_type(bt)) {
1697 offset += 128;
1698 }
1699 ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset);
1700 load_vector(T_BYTE, dst, addr, vlen_in_bytes);
1701 }
1702
1703 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
1704
// Combine the 128-bit vector 'src' into 'dst' element-wise with the reduction
// operation selected by 'opcode': dst = dst op src. 'typ' selects the lane
// type where the opcode alone is ambiguous. Used as a combining step when
// folding a vector reduction down to a scalar.
void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
  int vector_len = Assembler::AVX_128bit;

  switch (opcode) {
    case Op_AndReductionV:  pand(dst, src); break;
    case Op_OrReductionV:   por (dst, src); break;
    case Op_XorReductionV:  pxor(dst, src); break;
    case Op_MinReductionV:
      switch (typ) {
        case T_BYTE:        pminsb(dst, src); break;
        case T_SHORT:       pminsw(dst, src); break;
        case T_INT:         pminsd(dst, src); break;
        // 64-bit signed min/max only exist as AVX512 instructions.
        case T_LONG:        assert(UseAVX > 2, "required");
                            vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_MaxReductionV:
      switch (typ) {
        case T_BYTE:        pmaxsb(dst, src); break;
        case T_SHORT:       pmaxsw(dst, src); break;
        case T_INT:         pmaxsd(dst, src); break;
        case T_LONG:        assert(UseAVX > 2, "required");
                            vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
        default:            assert(false, "wrong type");
      }
      break;
    // Unsigned min/max: 64-bit lanes use the EVEX-masked forms with k0
    // (no masking) and merge semantics.
    case Op_UMinReductionV:
      switch (typ) {
        case T_BYTE:        vpminub(dst, dst, src, Assembler::AVX_128bit); break;
        case T_SHORT:       vpminuw(dst, dst, src, Assembler::AVX_128bit); break;
        case T_INT:         vpminud(dst, dst, src, Assembler::AVX_128bit); break;
        case T_LONG:        evpminuq(dst, k0, dst, src, true, Assembler::AVX_128bit); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_UMaxReductionV:
      switch (typ) {
        case T_BYTE:        vpmaxub(dst, dst, src, Assembler::AVX_128bit); break;
        case T_SHORT:       vpmaxuw(dst, dst, src, Assembler::AVX_128bit); break;
        case T_INT:         vpmaxud(dst, dst, src, Assembler::AVX_128bit); break;
        case T_LONG:        evpmaxuq(dst, k0, dst, src, true, Assembler::AVX_128bit); break;
        default:            assert(false, "wrong type");
      }
      break;
    // Floating-point add/mul reductions combine scalars (lane 0 only here).
    case Op_AddReductionVF: addss(dst, src); break;
    case Op_AddReductionVD: addsd(dst, src); break;
    case Op_AddReductionVI:
      switch (typ) {
        case T_BYTE:        paddb(dst, src); break;
        case T_SHORT:       paddw(dst, src); break;
        case T_INT:         paddd(dst, src); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_AddReductionVL: paddq(dst, src); break;
    case Op_MulReductionVF: mulss(dst, src); break;
    case Op_MulReductionVD: mulsd(dst, src); break;
    case Op_MulReductionVI:
      switch (typ) {
        case T_SHORT:       pmullw(dst, src); break;
        case T_INT:         pmulld(dst, src); break;
        default:            assert(false, "wrong type");
      }
      break;
    // 64-bit lane multiply is AVX512DQ-only (evpmullq).
    case Op_MulReductionVL: assert(UseAVX > 2, "required");
                            evpmullq(dst, dst, src, vector_len); break;
    default:                assert(false, "wrong opcode");
  }
}
1775
1776 void C2_MacroAssembler::unordered_reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1777 switch (opcode) {
1778 case Op_AddReductionVF: addps(dst, src); break;
1779 case Op_AddReductionVD: addpd(dst, src); break;
1780 case Op_MulReductionVF: mulps(dst, src); break;
1781 case Op_MulReductionVD: mulpd(dst, src); break;
1782 default: assert(false, "%s", NodeClassNames[opcode]);
1783 }
1784 }
1785
// 256-bit combining step used by the reduction sequences below:
// dst := src1 OP src2 over all lanes of a 256-bit vector.
// T_LONG signed min/max and MulReductionVL require AVX-512; the unsigned
// long min/max use the EVEX-masked forms with k0 (i.e. no masking).
void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
  int vector_len = Assembler::AVX_256bit;

  switch (opcode) {
    case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
    case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
    case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
    case Op_MinReductionV:
      switch (typ) {
        case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
        case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
        case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
        case T_LONG:        assert(UseAVX > 2, "required");
                            vpminsq(dst, src1, src2, vector_len); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_MaxReductionV:
      switch (typ) {
        case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
        case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
        case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
        case T_LONG:        assert(UseAVX > 2, "required");
                            vpmaxsq(dst, src1, src2, vector_len); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_UMinReductionV:
      switch (typ) {
        case T_BYTE:        vpminub(dst, src1, src2, vector_len); break;
        case T_SHORT:       vpminuw(dst, src1, src2, vector_len); break;
        case T_INT:         vpminud(dst, src1, src2, vector_len); break;
        case T_LONG:        evpminuq(dst, k0, src1, src2, true, vector_len); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_UMaxReductionV:
      switch (typ) {
        case T_BYTE:        vpmaxub(dst, src1, src2, vector_len); break;
        case T_SHORT:       vpmaxuw(dst, src1, src2, vector_len); break;
        case T_INT:         vpmaxud(dst, src1, src2, vector_len); break;
        case T_LONG:        evpmaxuq(dst, k0, src1, src2, true, vector_len); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_AddReductionVI:
      switch (typ) {
        case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
        case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
        case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
    case Op_MulReductionVI:
      switch (typ) {
        case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
        case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break;
    default:                assert(false, "wrong opcode");
  }
}
1851
1852 void C2_MacroAssembler::unordered_reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
1853 int vector_len = Assembler::AVX_256bit;
1854
1855 switch (opcode) {
1856 case Op_AddReductionVF: vaddps(dst, src1, src2, vector_len); break;
1857 case Op_AddReductionVD: vaddpd(dst, src1, src2, vector_len); break;
1858 case Op_MulReductionVF: vmulps(dst, src1, src2, vector_len); break;
1859 case Op_MulReductionVD: vmulpd(dst, src1, src2, vector_len); break;
1860 default: assert(false, "%s", NodeClassNames[opcode]);
1861 }
1862 }
1863
1864 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1865 XMMRegister dst, XMMRegister src,
1866 XMMRegister vtmp1, XMMRegister vtmp2) {
1867 switch (opcode) {
1868 case Op_AddReductionVF:
1869 case Op_MulReductionVF:
1870 reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1871 break;
1872
1873 case Op_AddReductionVD:
1874 case Op_MulReductionVD:
1875 reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1876 break;
1877
1878 default: assert(false, "wrong opcode");
1879 }
1880 }
1881
1882 void C2_MacroAssembler::unordered_reduce_fp(int opcode, int vlen,
1883 XMMRegister dst, XMMRegister src,
1884 XMMRegister vtmp1, XMMRegister vtmp2) {
1885 switch (opcode) {
1886 case Op_AddReductionVF:
1887 case Op_MulReductionVF:
1888 unorderedReduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1889 break;
1890
1891 case Op_AddReductionVD:
1892 case Op_MulReductionVD:
1893 unorderedReduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1894 break;
1895
1896 default: assert(false, "%s", NodeClassNames[opcode]);
1897 }
1898 }
1899
1900 void C2_MacroAssembler::reduceB(int opcode, int vlen,
1901 Register dst, Register src1, XMMRegister src2,
1902 XMMRegister vtmp1, XMMRegister vtmp2) {
1903 switch (vlen) {
1904 case 8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1905 case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1906 case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1907 case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1908
1909 default: assert(false, "wrong vector length");
1910 }
1911 }
1912
1913 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
1914 Register dst, Register src1, XMMRegister src2,
1915 XMMRegister vtmp1, XMMRegister vtmp2) {
1916 switch (vlen) {
1917 case 8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1918 case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1919 case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1920 case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1921
1922 default: assert(false, "wrong vector length");
1923 }
1924 }
1925
1926 void C2_MacroAssembler::reduceS(int opcode, int vlen,
1927 Register dst, Register src1, XMMRegister src2,
1928 XMMRegister vtmp1, XMMRegister vtmp2) {
1929 switch (vlen) {
1930 case 4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1931 case 8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1932 case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1933 case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1934
1935 default: assert(false, "wrong vector length");
1936 }
1937 }
1938
1939 void C2_MacroAssembler::reduceI(int opcode, int vlen,
1940 Register dst, Register src1, XMMRegister src2,
1941 XMMRegister vtmp1, XMMRegister vtmp2) {
1942 switch (vlen) {
1943 case 2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1944 case 4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1945 case 8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1946 case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1947
1948 default: assert(false, "wrong vector length");
1949 }
1950 }
1951
1952 void C2_MacroAssembler::reduceL(int opcode, int vlen,
1953 Register dst, Register src1, XMMRegister src2,
1954 XMMRegister vtmp1, XMMRegister vtmp2) {
1955 switch (vlen) {
1956 case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1957 case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1958 case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1959
1960 default: assert(false, "wrong vector length");
1961 }
1962 }
1963
1964 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1965 switch (vlen) {
1966 case 2:
1967 assert(vtmp2 == xnoreg, "");
1968 reduce2F(opcode, dst, src, vtmp1);
1969 break;
1970 case 4:
1971 assert(vtmp2 == xnoreg, "");
1972 reduce4F(opcode, dst, src, vtmp1);
1973 break;
1974 case 8:
1975 reduce8F(opcode, dst, src, vtmp1, vtmp2);
1976 break;
1977 case 16:
1978 reduce16F(opcode, dst, src, vtmp1, vtmp2);
1979 break;
1980 default: assert(false, "wrong vector length");
1981 }
1982 }
1983
1984 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1985 switch (vlen) {
1986 case 2:
1987 assert(vtmp2 == xnoreg, "");
1988 reduce2D(opcode, dst, src, vtmp1);
1989 break;
1990 case 4:
1991 reduce4D(opcode, dst, src, vtmp1, vtmp2);
1992 break;
1993 case 8:
1994 reduce8D(opcode, dst, src, vtmp1, vtmp2);
1995 break;
1996 default: assert(false, "wrong vector length");
1997 }
1998 }
1999
2000 void C2_MacroAssembler::unorderedReduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2001 switch (vlen) {
2002 case 2:
2003 assert(vtmp1 == xnoreg, "");
2004 assert(vtmp2 == xnoreg, "");
2005 unorderedReduce2F(opcode, dst, src);
2006 break;
2007 case 4:
2008 assert(vtmp2 == xnoreg, "");
2009 unorderedReduce4F(opcode, dst, src, vtmp1);
2010 break;
2011 case 8:
2012 unorderedReduce8F(opcode, dst, src, vtmp1, vtmp2);
2013 break;
2014 case 16:
2015 unorderedReduce16F(opcode, dst, src, vtmp1, vtmp2);
2016 break;
2017 default: assert(false, "wrong vector length");
2018 }
2019 }
2020
2021 void C2_MacroAssembler::unorderedReduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2022 switch (vlen) {
2023 case 2:
2024 assert(vtmp1 == xnoreg, "");
2025 assert(vtmp2 == xnoreg, "");
2026 unorderedReduce2D(opcode, dst, src);
2027 break;
2028 case 4:
2029 assert(vtmp2 == xnoreg, "");
2030 unorderedReduce4D(opcode, dst, src, vtmp1);
2031 break;
2032 case 8:
2033 unorderedReduce8D(opcode, dst, src, vtmp1, vtmp2);
2034 break;
2035 default: assert(false, "wrong vector length");
2036 }
2037 }
2038
// Reduce 2 int lanes of src2 into a scalar, then fold in the incoming
// accumulator src1: dst = src1 OP src2[0] OP src2[1].
void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    // Horizontal add folds lanes 0 and 1 into lane 0.
    phaddd(vtmp1, vtmp1);
  } else {
    // Bring lane 1 down to lane 0 and combine with the original lane 0.
    pshufd(vtmp1, src2, 0x1);
    reduce_operation_128(T_INT, opcode, vtmp1, src2);
  }
  // Fold in the scalar accumulator and move the result back to a GPR.
  movdl(vtmp2, src1);
  reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
  movdl(dst, vtmp1);
}
2053
// Reduce 4 int lanes of src2 (plus accumulator src1) into dst by halving
// the live lane count and delegating the final step to reduce2I.
void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    // phaddd leaves the two pairwise sums in lanes 0 and 1.
    phaddd(vtmp1, src2);
    reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  } else {
    // 0xE moves the upper qword (lanes 2,3) down; combine with lanes 0,1.
    pshufd(vtmp2, src2, 0xE);
    reduce_operation_128(T_INT, opcode, vtmp2, src2);
    reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
  }
}
2067
// Reduce 8 int lanes of a 256-bit src2 (plus accumulator src1) into dst,
// folding the high 128-bit half into the low half first.
void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    // 256-bit horizontal add, then collapse the two 128-bit halves.
    vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
    vextracti128_high(vtmp2, vtmp1);
    vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
    reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  } else {
    // Combine the high 128 bits with the low 128 bits, then reduce 4 lanes.
    vextracti128_high(vtmp1, src2);
    reduce_operation_128(T_INT, opcode, vtmp1, src2);
    reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  }
}
2080
// Reduce 16 int lanes of a 512-bit src2 (plus accumulator src1) into dst:
// fold the high 256 bits into the low 256, then reduce 8 lanes.
void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti64x4_high(vtmp2, src2);
  reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
  reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
2086
// Reduce 8 byte lanes of src2 (plus accumulator src1) into dst by repeated
// halving: 8 -> 4 -> 2 -> 1 byte lanes, then widen and fold in src1.
void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  // Fold lanes 4..7 (dword 1) onto lanes 0..3.
  pshufd(vtmp2, src2, 0x1);
  reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
  // Fold bytes 2..3 onto bytes 0..1.
  movdqu(vtmp1, vtmp2);
  psrldq(vtmp1, 2);
  reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
  // Fold byte 1 onto byte 0.
  movdqu(vtmp2, vtmp1);
  psrldq(vtmp2, 1);
  reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
  movdl(vtmp2, src1);
  // Widen the surviving byte to int before folding in the accumulator:
  // zero-extend for the unsigned min/max ops, sign-extend otherwise.
  if (opcode == Op_UMinReductionV || opcode == Op_UMaxReductionV) {
    pmovzxbd(vtmp1, vtmp1);
  } else {
    pmovsxbd(vtmp1, vtmp1);
  }
  reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
  pextrb(dst, vtmp1, 0x0);
  movsbl(dst, dst);
}
2106
// Reduce 16 byte lanes: fold the upper qword onto the lower (0xE selects
// dwords 2,3), then reduce the remaining 8 byte lanes.
void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  pshufd(vtmp1, src2, 0xE);
  reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
  reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}
2112
// Reduce 32 byte lanes: fold the high 128 bits onto the low 128 bits,
// then reduce the remaining 16 byte lanes.
void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti128_high(vtmp2, src2);
  reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
  reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
2118
// Reduce 64 byte lanes: fold the high 256 bits onto the low 256 bits,
// then reduce the remaining 32 byte lanes.
void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti64x4_high(vtmp1, src2);
  reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
  reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}
2124
// Multiply-reduce 8 bytes: there is no byte multiply instruction, so
// sign-extend the bytes to shorts and reuse the 8-short reduction.
void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  pmovsxbw(vtmp2, src2);
  reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
2129
2130 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2131 if (UseAVX > 1) {
2132 int vector_len = Assembler::AVX_256bit;
2133 vpmovsxbw(vtmp1, src2, vector_len);
2134 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2135 } else {
2136 pmovsxbw(vtmp2, src2);
2137 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2138 pshufd(vtmp2, src2, 0x1);
2139 pmovsxbw(vtmp2, src2);
2140 reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2141 }
2142 }
2143
// Multiply-reduce 32 bytes. With AVX512BW the bytes are widened to 32
// shorts in one 512-bit register; otherwise the two 128-bit halves are
// multiply-reduced separately, chaining the accumulator through dst.
void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
    int vector_len = Assembler::AVX_512bit;
    vpmovsxbw(vtmp1, src2, vector_len);
    reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  } else {
    assert(UseAVX >= 2,"Should not reach here.");
    mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
    vextracti128_high(vtmp2, src2);
    mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
  }
}
2156
// Multiply-reduce 64 bytes: reduce the low 256 bits, then the high 256
// bits, chaining the accumulator through dst.
void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
  vextracti64x4_high(vtmp2, src2);
  mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
}
2162
// Reduce 4 short lanes of src2 (plus accumulator src1) into dst.
void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    // Two horizontal adds collapse 4 shorts into lane 0.
    phaddw(vtmp1, vtmp1);
    phaddw(vtmp1, vtmp1);
  } else {
    // Fold lanes 2..3 (dword 1) onto lanes 0..1, then lane 1 onto lane 0.
    pshufd(vtmp2, src2, 0x1);
    reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
    movdqu(vtmp1, vtmp2);
    psrldq(vtmp1, 2);
    reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
  }
  movdl(vtmp2, src1);
  // Widen the surviving short to int before folding in the accumulator:
  // zero-extend for the unsigned min/max ops, sign-extend otherwise.
  if (opcode == Op_UMinReductionV || opcode == Op_UMaxReductionV) {
    pmovzxwd(vtmp1, vtmp1);
  } else {
    pmovsxwd(vtmp1, vtmp1);
  }
  reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
  pextrw(dst, vtmp1, 0x0);
  movswl(dst, dst);
}
2187
// Reduce 8 short lanes: halve to 4 live lanes, then finish in reduce4S.
void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    // phaddw leaves the 4 pairwise sums in lanes 0..3.
    phaddw(vtmp1, src2);
  } else {
    // 0xE moves the upper qword (lanes 4..7) down; combine with lanes 0..3.
    pshufd(vtmp1, src2, 0xE);
    reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
  }
  reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}
2200
// Reduce 16 short lanes of a 256-bit src2: halve to 8 live lanes, then
// finish in reduce8S.
void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    int vector_len = Assembler::AVX_256bit;
    // vphaddw operates per 128-bit lane; the vpermq (0xD8 = interleave
    // qwords 0,2,1,3) gathers the live sums into the low 128 bits.
    vphaddw(vtmp2, src2, src2, vector_len);
    vpermq(vtmp2, vtmp2, 0xD8, vector_len);
  } else {
    // Fold the high 128 bits onto the low 128 bits.
    vextracti128_high(vtmp2, src2);
    reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
  }
  reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
2212
2213 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2214 int vector_len = Assembler::AVX_256bit;
2215 vextracti64x4_high(vtmp1, src2);
2216 reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
2217 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2218 }
2219
// Reduce 2 long lanes of src2 (plus accumulator src1) into dst.
void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  // Bring the upper qword down (0xE selects dwords 2,3) and combine.
  pshufd(vtmp2, src2, 0xE);
  reduce_operation_128(T_LONG, opcode, vtmp2, src2);
  // Fold in the scalar accumulator and move the result back to a GPR.
  movdq(vtmp1, src1);
  reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
  movdq(dst, vtmp1);
}
2227
// Reduce 4 long lanes: fold the high 128 bits onto the low 128 bits,
// then reduce the remaining 2 lanes.
void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti128_high(vtmp1, src2);
  reduce_operation_128(T_LONG, opcode, vtmp1, src2);
  reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}
2233
// Reduce 8 long lanes: fold the high 256 bits onto the low 256 bits,
// then reduce the remaining 4 lanes.
void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti64x4_high(vtmp2, src2);
  reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
  reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
2239
// Materialize an opmask with the low 'len' bits set: build an all-ones
// value, zero the bits at and above 'len' with BZHI, and move it into dst.
// Clobbers 'temp'.
void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
  mov64(temp, -1L);
  bzhiq(temp, temp, len);
  kmovql(dst, temp);
}
2245
// Ordered reduction of 2 float lanes: dst (holding the running scalar) is
// combined with src lane 0, then with src lane 1.
void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  reduce_operation_128(T_FLOAT, opcode, dst, src);
  pshufd(vtmp, src, 0x1);
  reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
}
2251
// Ordered reduction of 4 float lanes: fold lanes 0..1 via reduce2F, then
// lanes 2 and 3 individually, preserving strict left-to-right order.
void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  reduce2F(opcode, dst, src, vtmp);
  pshufd(vtmp, src, 0x2);
  reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
  pshufd(vtmp, src, 0x3);
  reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
}
2259
// Ordered reduction of 8 float lanes: low 128-bit half first, then the
// high half, preserving lane order.
void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce4F(opcode, dst, src, vtmp2);
  vextractf128_high(vtmp2, src);
  reduce4F(opcode, dst, vtmp2, vtmp1);
}
2265
// Ordered reduction of 16 float lanes: low 256-bit half first, then the
// high half, preserving lane order.
void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce8F(opcode, dst, src, vtmp1, vtmp2);
  vextracti64x4_high(vtmp1, src);
  reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
}
2271
// Unordered reduction of 2 float lanes: lane 1 is shuffled down and
// combined with lane 0; only the low scalar lane of dst is meaningful.
void C2_MacroAssembler::unorderedReduce2F(int opcode, XMMRegister dst, XMMRegister src) {
  pshufd(dst, src, 0x1);
  reduce_operation_128(T_FLOAT, opcode, dst, src);
}
2276
// Unordered reduction of 4 float lanes: fold the upper qword onto the
// lower pairwise, then finish with the 2-lane step.
void C2_MacroAssembler::unorderedReduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  pshufd(vtmp, src, 0xE);
  unordered_reduce_operation_128(T_FLOAT, opcode, vtmp, src);
  unorderedReduce2F(opcode, dst, vtmp);
}
2282
// Unordered reduction of 8 float lanes: fold the high 128 bits onto the
// low 128 bits, then reduce the remaining 4 lanes.
void C2_MacroAssembler::unorderedReduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextractf128_high(vtmp1, src);
  unordered_reduce_operation_128(T_FLOAT, opcode, vtmp1, src);
  unorderedReduce4F(opcode, dst, vtmp1, vtmp2);
}
2288
// Unordered reduction of 16 float lanes: fold the high 256 bits onto the
// low 256 bits, then reduce the remaining 8 lanes.
void C2_MacroAssembler::unorderedReduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextractf64x4_high(vtmp2, src);
  unordered_reduce_operation_256(T_FLOAT, opcode, vtmp2, vtmp2, src);
  unorderedReduce8F(opcode, dst, vtmp2, vtmp1, vtmp2);
}
2294
// Ordered reduction of 2 double lanes: dst (holding the running scalar) is
// combined with src lane 0, then with src lane 1 (0xE moves it down).
void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  reduce_operation_128(T_DOUBLE, opcode, dst, src);
  pshufd(vtmp, src, 0xE);
  reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
}
2300
// Ordered reduction of 4 double lanes: low 128-bit half first, then the
// high half, preserving lane order.
void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce2D(opcode, dst, src, vtmp2);
  vextractf128_high(vtmp2, src);
  reduce2D(opcode, dst, vtmp2, vtmp1);
}
2306
// Ordered reduction of 8 double lanes: low 256-bit half first, then the
// high half, preserving lane order.
void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce4D(opcode, dst, src, vtmp1, vtmp2);
  vextracti64x4_high(vtmp1, src);
  reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
}
2312
// Unordered reduction of 2 double lanes: lane 1 is shuffled down and
// combined with lane 0; only the low scalar lane of dst is meaningful.
void C2_MacroAssembler::unorderedReduce2D(int opcode, XMMRegister dst, XMMRegister src) {
  pshufd(dst, src, 0xE);
  reduce_operation_128(T_DOUBLE, opcode, dst, src);
}
2317
// Unordered reduction of 4 double lanes: fold the high 128 bits onto the
// low 128 bits, then finish with the 2-lane step.
void C2_MacroAssembler::unorderedReduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  vextractf128_high(vtmp, src);
  unordered_reduce_operation_128(T_DOUBLE, opcode, vtmp, src);
  unorderedReduce2D(opcode, dst, vtmp);
}
2323
// Unordered reduction of 8 double lanes: fold the high 256 bits onto the
// low 256 bits, then reduce the remaining 4 lanes.
void C2_MacroAssembler::unorderedReduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextractf64x4_high(vtmp2, src);
  unordered_reduce_operation_256(T_DOUBLE, opcode, vtmp2, vtmp2, src);
  unorderedReduce4D(opcode, dst, vtmp2, vtmp1);
}
2329
// Masked vector load (memory -> register): forwards to the shared
// MacroAssembler implementation.
void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
  MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
}
2333
// Masked vector store (register -> memory): forwards to the shared
// MacroAssembler implementation.
void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
  MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
}
2337
// Masked register-to-register vector move: forwards to the shared
// MacroAssembler implementation.
void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, XMMRegister src, bool merge, int vector_len) {
  MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
}
2341
2342 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask,
2343 int vec_enc) {
2344 switch(elem_bt) {
2345 case T_INT:
2346 case T_FLOAT:
2347 vmaskmovps(dst, src, mask, vec_enc);
2348 break;
2349 case T_LONG:
2350 case T_DOUBLE:
2351 vmaskmovpd(dst, src, mask, vec_enc);
2352 break;
2353 default:
2354 fatal("Unsupported type %s", type2name(elem_bt));
2355 break;
2356 }
2357 }
2358
2359 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask,
2360 int vec_enc) {
2361 switch(elem_bt) {
2362 case T_INT:
2363 case T_FLOAT:
2364 vmaskmovps(dst, src, mask, vec_enc);
2365 break;
2366 case T_LONG:
2367 case T_DOUBLE:
2368 vmaskmovpd(dst, src, mask, vec_enc);
2369 break;
2370 default:
2371 fatal("Unsupported type %s", type2name(elem_bt));
2372 break;
2373 }
2374 }
2375
// Min/max reduction over a float vector of 'vlen' lanes (2/4/8/16),
// repeatedly folding the upper half onto the lower half. If 'is_dst_valid'
// the incoming value in dst is folded into the final result; otherwise the
// last iteration writes straight into dst. On AVX10.2 the single-instruction
// vminmax form is used; otherwise the multi-register NaN/-0.0-aware
// sequence (tmp/atmp/btmp) is emitted.
void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
                                          XMMRegister dst, XMMRegister src,
                                          XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                          XMMRegister xmm_0, XMMRegister xmm_1) {
  // Shuffle immediates for the final two steps: 1 swaps adjacent floats,
  // 14 (0b1110) brings the upper pair down.
  const int permconst[] = {1, 14};
  XMMRegister wsrc = src;
  XMMRegister wdst = xmm_0;
  XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;

  int vlen_enc = Assembler::AVX_128bit;
  if (vlen == 16) {
    vlen_enc = Assembler::AVX_256bit;
  }

  // Each iteration halves the number of live lanes; i indexes the step size.
  for (int i = log2(vlen) - 1; i >=0; i--) {
    if (i == 0 && !is_dst_valid) {
      wdst = dst;
    }
    if (i == 3) {
      vextracti64x4_high(wtmp, wsrc);
    } else if (i == 2) {
      vextracti128_high(wtmp, wsrc);
    } else { // i = [0,1]
      vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
    }

    if (VM_Version::supports_avx10_2()) {
      vminmax_fp(opcode, T_FLOAT, wdst, k0, wtmp, wsrc, vlen_enc);
    } else {
      vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
    }
    // The folded result becomes the source of the next, narrower step.
    wsrc = wdst;
    vlen_enc = Assembler::AVX_128bit;
  }
  if (is_dst_valid) {
    // Fold the caller-provided accumulator in dst into the reduced value.
    if (VM_Version::supports_avx10_2()) {
      vminmax_fp(opcode, T_FLOAT, dst, k0, wdst, dst, Assembler::AVX_128bit);
    } else {
      vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
    }
  }
}
2418
// Min/max reduction over a double vector of 'vlen' lanes (2/4/8),
// repeatedly folding the upper half onto the lower half. If 'is_dst_valid'
// the incoming value in dst is folded into the final result; otherwise the
// last iteration writes straight into dst. On AVX10.2 the single-instruction
// vminmax form is used; otherwise the multi-register NaN/-0.0-aware
// sequence (tmp/atmp/btmp) is emitted.
void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                           XMMRegister xmm_0, XMMRegister xmm_1) {
  XMMRegister wsrc = src;
  XMMRegister wdst = xmm_0;
  XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
  int vlen_enc = Assembler::AVX_128bit;
  if (vlen == 8) {
    vlen_enc = Assembler::AVX_256bit;
  }
  // Each iteration halves the number of live lanes; i indexes the step size.
  for (int i = log2(vlen) - 1; i >=0; i--) {
    if (i == 0 && !is_dst_valid) {
      wdst = dst;
    }
    if (i == 1) {
      vextracti128_high(wtmp, wsrc);
    } else if (i == 2) {
      vextracti64x4_high(wtmp, wsrc);
    } else {
      assert(i == 0, "%d", i);
      // Swap the two doubles of the final 128-bit pair.
      vpermilpd(wtmp, wsrc, 1, vlen_enc);
    }

    if (VM_Version::supports_avx10_2()) {
      vminmax_fp(opcode, T_DOUBLE, wdst, k0, wtmp, wsrc, vlen_enc);
    } else {
      vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
    }

    // The folded result becomes the source of the next, narrower step.
    wsrc = wdst;
    vlen_enc = Assembler::AVX_128bit;
  }

  if (is_dst_valid) {
    // Fold the caller-provided accumulator in dst into the reduced value.
    if (VM_Version::supports_avx10_2()) {
      vminmax_fp(opcode, T_DOUBLE, dst, k0, wdst, dst, Assembler::AVX_128bit);
    } else {
      vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
    }
  }
}
2460
2461 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2462 switch (bt) {
2463 case T_BYTE: pextrb(dst, src, idx); break;
2464 case T_SHORT: pextrw(dst, src, idx); break;
2465 case T_INT: pextrd(dst, src, idx); break;
2466 case T_LONG: pextrq(dst, src, idx); break;
2467
2468 default:
2469 assert(false,"Should not reach here.");
2470 break;
2471 }
2472 }
2473
2474 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2475 int esize = type2aelembytes(typ);
2476 int elem_per_lane = 16/esize;
2477 int lane = elemindex / elem_per_lane;
2478 int eindex = elemindex % elem_per_lane;
2479
2480 if (lane >= 2) {
2481 assert(UseAVX > 2, "required");
2482 vextractf32x4(dst, src, lane & 3);
2483 return dst;
2484 } else if (lane > 0) {
2485 assert(UseAVX > 0, "required");
2486 vextractf128(dst, src, lane);
2487 return dst;
2488 } else {
2489 return src;
2490 }
2491 }
2492
2493 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) {
2494 if (typ == T_BYTE) {
2495 movsbl(dst, dst);
2496 } else if (typ == T_SHORT) {
2497 movswl(dst, dst);
2498 }
2499 }
2500
// Extract integral element 'elemindex' (taken modulo the elements per
// 128-bit lane — the caller is expected to have selected the right lane,
// e.g. via get_lane) from 'src' into GPR 'dst', sign-extending sub-int
// types to a full int.
void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
  int esize = type2aelembytes(typ);
  int elem_per_lane = 16/esize;
  int eindex = elemindex % elem_per_lane;
  assert(is_integral_type(typ),"required");

  if (eindex == 0) {
    // Element 0 can be moved directly without a PEXTR.
    if (typ == T_LONG) {
      movq(dst, src);
    } else {
      movdl(dst, src);
      movsxl(typ, dst);
    }
  } else {
    extract(typ, dst, src, eindex);
    movsxl(typ, dst);
  }
}
2519
// Extract FP element 'elemindex' (taken modulo the elements per 128-bit
// lane — the caller is expected to have selected the right lane) from
// 'src' into the low lane of 'dst', with the upper bits of dst zeroed.
// 'vtmp' is only required on the no-AVX T_FLOAT path.
void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) {
  int esize = type2aelembytes(typ);
  int elem_per_lane = 16/esize;
  int eindex = elemindex % elem_per_lane;
  assert((typ == T_FLOAT || typ == T_DOUBLE),"required");

  if (eindex == 0) {
    // movq zeroes the upper 64 bits, so no extra masking is needed.
    movq(dst, src);
  } else {
    if (typ == T_FLOAT) {
      // Broadcast/shuffle the wanted float into lane 0.
      if (UseAVX == 0) {
        movdqu(dst, src);
        shufps(dst, dst, eindex);
      } else {
        vshufps(dst, src, src, eindex, Assembler::AVX_128bit);
      }
    } else {
      // Shift the wanted double down to lane 0, then movq to clear the rest.
      if (UseAVX == 0) {
        movdqu(dst, src);
        psrldq(dst, eindex*esize);
      } else {
        vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
      }
      movq(dst, dst);
    }
  }
  // Zero upper bits
  // For floats the shuffle left garbage above bit 31; mask it off.
  if (typ == T_FLOAT) {
    if (UseAVX == 0) {
      assert(vtmp != xnoreg, "required.");
      movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg);
      pand(dst, vtmp);
    } else {
      vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg);
    }
  }
}
2557
2558 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2559 switch(typ) {
2560 case T_BYTE:
2561 case T_BOOLEAN:
2562 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2563 break;
2564 case T_SHORT:
2565 case T_CHAR:
2566 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2567 break;
2568 case T_INT:
2569 case T_FLOAT:
2570 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2571 break;
2572 case T_LONG:
2573 case T_DOUBLE:
2574 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2575 break;
2576 default:
2577 assert(false,"Should not reach here.");
2578 break;
2579 }
2580 }
2581
2582 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) {
2583 assert(rscratch != noreg || always_reachable(src2), "missing");
2584
2585 switch(typ) {
2586 case T_BOOLEAN:
2587 case T_BYTE:
2588 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2589 break;
2590 case T_CHAR:
2591 case T_SHORT:
2592 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2593 break;
2594 case T_INT:
2595 case T_FLOAT:
2596 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2597 break;
2598 case T_LONG:
2599 case T_DOUBLE:
2600 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2601 break;
2602 default:
2603 assert(false,"Should not reach here.");
2604 break;
2605 }
2606 }
2607
2608 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2609 switch(typ) {
2610 case T_BYTE:
2611 evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2612 break;
2613 case T_SHORT:
2614 evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2615 break;
2616 case T_INT:
2617 case T_FLOAT:
2618 evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2619 break;
2620 case T_LONG:
2621 case T_DOUBLE:
2622 evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2623 break;
2624 default:
2625 assert(false,"Should not reach here.");
2626 break;
2627 }
2628 }
2629
2630 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) {
2631 assert(vlen_in_bytes <= 32, "");
2632 int esize = type2aelembytes(bt);
2633 if (vlen_in_bytes == 32) {
2634 assert(vtmp == xnoreg, "required.");
2635 if (esize >= 4) {
2636 vtestps(src1, src2, AVX_256bit);
2637 } else {
2638 vptest(src1, src2, AVX_256bit);
2639 }
2640 return;
2641 }
2642 if (vlen_in_bytes < 16) {
2643 // Duplicate the lower part to fill the whole register,
2644 // Don't need to do so for src2
2645 assert(vtmp != xnoreg, "required");
2646 int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04;
2647 pshufd(vtmp, src1, shuffle_imm);
2648 } else {
2649 assert(vtmp == xnoreg, "required");
2650 vtmp = src1;
2651 }
2652 if (esize >= 4 && VM_Version::supports_avx()) {
2653 vtestps(vtmp, src2, AVX_128bit);
2654 } else {
2655 ptest(vtmp, src2);
2656 }
2657 }
2658
2659 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
2660 #ifdef ASSERT
2661 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2662 bool is_bw_supported = VM_Version::supports_avx512bw();
2663 if (is_bw && !is_bw_supported) {
2664 assert(vlen_enc != Assembler::AVX_512bit, "required");
2665 assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16),
2666 "XMM register should be 0-15");
2667 }
2668 #endif // ASSERT
2669 switch (elem_bt) {
2670 case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return;
2671 case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return;
2672 case T_INT: vpaddd(dst, src1, src2, vlen_enc); return;
2673 case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return;
2674 case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return;
2675 case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return;
2676 default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2677 }
2678 }
2679
// Broadcast the scalar held in GPR 'src' across all lanes of vector 'dst'.
// When AVX-512 is available together with the BW/VL features that the element
// width and vector length require, a direct GPR->vector broadcast is emitted.
// Otherwise the value is first moved into the low part of 'dst' (movdl/movdq)
// and then broadcast register-to-register; that fallback is restricted to
// xmm0-15 and to vectors narrower than 512 bits (asserted below).
void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) {
  assert(UseAVX >= 2, "required");
  bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
  bool is_vl = vlen_enc != Assembler::AVX_512bit;
  if ((UseAVX > 2) &&
      (!is_bw || VM_Version::supports_avx512bw()) &&
      (!is_vl || VM_Version::supports_avx512vl())) {
    // Fast path: single GPR-source broadcast instruction.
    switch (elem_bt) {
      case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return;
      case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
      case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
      case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
      default: fatal("Unsupported type %s", type2name(elem_bt)); return;
    }
  } else {
    // Fallback: move the scalar into dst, then broadcast dst into itself.
    assert(vlen_enc != Assembler::AVX_512bit, "required");
    assert((dst->encoding() < 16),"XMM register should be 0-15");
    switch (elem_bt) {
      case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return;
      case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return;
      case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return;
      case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
      case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
      case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
      default: fatal("Unsupported type %s", type2name(elem_bt)); return;
    }
  }
}
2708
2709 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
2710 switch (to_elem_bt) {
2711 case T_SHORT:
2712 vpmovsxbw(dst, src, vlen_enc);
2713 break;
2714 case T_INT:
2715 vpmovsxbd(dst, src, vlen_enc);
2716 break;
2717 case T_FLOAT:
2718 vpmovsxbd(dst, src, vlen_enc);
2719 vcvtdq2ps(dst, dst, vlen_enc);
2720 break;
2721 case T_LONG:
2722 vpmovsxbq(dst, src, vlen_enc);
2723 break;
2724 case T_DOUBLE: {
2725 int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
2726 vpmovsxbd(dst, src, mid_vlen_enc);
2727 vcvtdq2pd(dst, dst, vlen_enc);
2728 break;
2729 }
2730 default:
2731 fatal("Unsupported type %s", type2name(to_elem_bt));
2732 break;
2733 }
2734 }
2735
2736 //-------------------------------------------------------------------------------------------
2737
2738 // IndexOf for constant substrings with size >= 8 chars
2739 // which don't need to be loaded through stack.
2740 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
2741 Register cnt1, Register cnt2,
2742 int int_cnt2, Register result,
2743 XMMRegister vec, Register tmp,
2744 int ae) {
2745 ShortBranchVerifier sbv(this);
2746 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2747 assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2748
2749 // This method uses the pcmpestri instruction with bound registers
2750 // inputs:
2751 // xmm - substring
2752 // rax - substring length (elements count)
2753 // mem - scanned string
2754 // rdx - string length (elements count)
2755 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2756 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2757 // outputs:
2758 // rcx - matched index in string
2759 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2760 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2761 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2762 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2763 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2764
2765 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
2766 RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
2767 MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
2768
2769 // Note, inline_string_indexOf() generates checks:
2770 // if (substr.count > string.count) return -1;
2771 // if (substr.count == 0) return 0;
2772 assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
2773
2774 // Load substring.
2775 if (ae == StrIntrinsicNode::UL) {
2776 pmovzxbw(vec, Address(str2, 0));
2777 } else {
2778 movdqu(vec, Address(str2, 0));
2779 }
2780 movl(cnt2, int_cnt2);
2781 movptr(result, str1); // string addr
2782
2783 if (int_cnt2 > stride) {
2784 jmpb(SCAN_TO_SUBSTR);
2785
2786 // Reload substr for rescan, this code
2787 // is executed only for large substrings (> 8 chars)
2788 bind(RELOAD_SUBSTR);
2789 if (ae == StrIntrinsicNode::UL) {
2790 pmovzxbw(vec, Address(str2, 0));
2791 } else {
2792 movdqu(vec, Address(str2, 0));
2793 }
2794 negptr(cnt2); // Jumped here with negative cnt2, convert to positive
2795
2796 bind(RELOAD_STR);
2797 // We came here after the beginning of the substring was
2798 // matched but the rest of it was not so we need to search
2799 // again. Start from the next element after the previous match.
2800
2801 // cnt2 is number of substring reminding elements and
2802 // cnt1 is number of string reminding elements when cmp failed.
2803 // Restored cnt1 = cnt1 - cnt2 + int_cnt2
2804 subl(cnt1, cnt2);
2805 addl(cnt1, int_cnt2);
2806 movl(cnt2, int_cnt2); // Now restore cnt2
2807
2808 decrementl(cnt1); // Shift to next element
2809 cmpl(cnt1, cnt2);
2810 jcc(Assembler::negative, RET_NOT_FOUND); // Left less then substring
2811
2812 addptr(result, (1<<scale1));
2813
2814 } // (int_cnt2 > 8)
2815
2816 // Scan string for start of substr in 16-byte vectors
2817 bind(SCAN_TO_SUBSTR);
2818 pcmpestri(vec, Address(result, 0), mode);
2819 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1
2820 subl(cnt1, stride);
2821 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2822 cmpl(cnt1, cnt2);
2823 jccb(Assembler::negative, RET_NOT_FOUND); // Left less then substring
2824 addptr(result, 16);
2825 jmpb(SCAN_TO_SUBSTR);
2826
2827 // Found a potential substr
2828 bind(FOUND_CANDIDATE);
2829 // Matched whole vector if first element matched (tmp(rcx) == 0).
2830 if (int_cnt2 == stride) {
2831 jccb(Assembler::overflow, RET_FOUND); // OF == 1
2832 } else { // int_cnt2 > 8
2833 jccb(Assembler::overflow, FOUND_SUBSTR);
2834 }
2835 // After pcmpestri tmp(rcx) contains matched element index
2836 // Compute start addr of substr
2837 lea(result, Address(result, tmp, scale1));
2838
2839 // Make sure string is still long enough
2840 subl(cnt1, tmp);
2841 cmpl(cnt1, cnt2);
2842 if (int_cnt2 == stride) {
2843 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2844 } else { // int_cnt2 > 8
2845 jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
2846 }
2847 // Left less then substring.
2848
2849 bind(RET_NOT_FOUND);
2850 movl(result, -1);
2851 jmp(EXIT);
2852
2853 if (int_cnt2 > stride) {
2854 // This code is optimized for the case when whole substring
2855 // is matched if its head is matched.
2856 bind(MATCH_SUBSTR_HEAD);
2857 pcmpestri(vec, Address(result, 0), mode);
2858 // Reload only string if does not match
2859 jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
2860
2861 Label CONT_SCAN_SUBSTR;
2862 // Compare the rest of substring (> 8 chars).
2863 bind(FOUND_SUBSTR);
2864 // First 8 chars are already matched.
2865 negptr(cnt2);
2866 addptr(cnt2, stride);
2867
2868 bind(SCAN_SUBSTR);
2869 subl(cnt1, stride);
2870 cmpl(cnt2, -stride); // Do not read beyond substring
2871 jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
2872 // Back-up strings to avoid reading beyond substring:
2873 // cnt1 = cnt1 - cnt2 + 8
2874 addl(cnt1, cnt2); // cnt2 is negative
2875 addl(cnt1, stride);
2876 movl(cnt2, stride); negptr(cnt2);
2877 bind(CONT_SCAN_SUBSTR);
2878 if (int_cnt2 < (int)G) {
2879 int tail_off1 = int_cnt2<<scale1;
2880 int tail_off2 = int_cnt2<<scale2;
2881 if (ae == StrIntrinsicNode::UL) {
2882 pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
2883 } else {
2884 movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
2885 }
2886 pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
2887 } else {
2888 // calculate index in register to avoid integer overflow (int_cnt2*2)
2889 movl(tmp, int_cnt2);
2890 addptr(tmp, cnt2);
2891 if (ae == StrIntrinsicNode::UL) {
2892 pmovzxbw(vec, Address(str2, tmp, scale2, 0));
2893 } else {
2894 movdqu(vec, Address(str2, tmp, scale2, 0));
2895 }
2896 pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
2897 }
2898 // Need to reload strings pointers if not matched whole vector
2899 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2900 addptr(cnt2, stride);
2901 jcc(Assembler::negative, SCAN_SUBSTR);
2902 // Fall through if found full substring
2903
2904 } // (int_cnt2 > 8)
2905
2906 bind(RET_FOUND);
2907 // Found result if we matched full small substring.
2908 // Compute substr offset
2909 subptr(result, str1);
2910 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2911 shrl(result, 1); // index
2912 }
2913 bind(EXIT);
2914
2915 } // string_indexofC8
2916
// Small strings are loaded through stack if they cross page boundary.
//
// General pcmpestri-based indexOf. Handles both a small constant substring
// (0 < int_cnt2 < stride, preloaded into 'vec' without touching memory past
// the substring) and a non-constant substring (int_cnt2 == -1, length in
// cnt2). Strings shorter than one 16-byte vector that sit near a page end
// are copied to the stack first so a full vector load is always safe.
// On exit 'result' is the element index of the match, or -1.
// Register constraints (pcmpestri): cnt1 == rdx, cnt2 == rax, tmp == rcx.
void C2_MacroAssembler::string_indexof(Register str1, Register str2,
                                       Register cnt1, Register cnt2,
                                       int int_cnt2, Register result,
                                       XMMRegister vec, Register tmp,
                                       int ae) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");

  //
  // int_cnt2 is length of small (< 8 chars) constant substring
  // or (-1) for non constant substring in which case its length
  // is in cnt2 register.
  //
  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;
  //
  int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
  assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
  // This method uses the pcmpestri instruction with bound registers
  //   inputs:
  //     xmm - substring
  //     rax - substring length (elements count)
  //     mem - scanned string
  //     rdx - string length (elements count)
  //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
  //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
  //   outputs:
  //     rcx - matched index in string
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
  int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
  Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
  Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;

  Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
        RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
        FOUND_CANDIDATE;

  { //========================================================
    // We don't know where these strings are located
    // and we can't read beyond them. Load them through stack.
    Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;

    movptr(tmp, rsp); // save old SP

    if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
      // Each branch below loads exactly int_cnt2 elements into 'vec'
      // without reading memory the substring does not own (relying on the
      // array header in front of str2 where a wider load is used).
      if (int_cnt2 == (1>>scale2)) { // One byte
        assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
        load_unsigned_byte(result, Address(str2, 0));
        movdl(vec, result); // move 32 bits
      } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
        // Not enough header space in 32-bit VM: 12+3 = 15.
        movl(result, Address(str2, -1));
        shrl(result, 8);
        movdl(vec, result); // move 32 bits
      } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
        load_unsigned_short(result, Address(str2, 0));
        movdl(vec, result); // move 32 bits
      } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
        movdl(vec, Address(str2, 0)); // move 32 bits
      } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
        movq(vec, Address(str2, 0));  // move 64 bits
      } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7})
        // Array header size is 12 bytes in 32-bit VM
        // + 6 bytes for 3 chars == 18 bytes,
        // enough space to load vec and shift.
        assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
        if (ae == StrIntrinsicNode::UL) {
          int tail_off = int_cnt2-8;
          pmovzxbw(vec, Address(str2, tail_off));
          psrldq(vec, -2*tail_off);
        }
        else {
          int tail_off = int_cnt2*(1<<scale2);
          movdqu(vec, Address(str2, tail_off-16));
          psrldq(vec, 16-tail_off);
        }
      }
    } else { // not constant substring
      cmpl(cnt2, stride);
      jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough

      // We can read beyond string if srt+16 does not cross page boundary
      // since heaps are aligned and mapped by pages.
      assert(os::vm_page_size() < (int)G, "default page should be small");
      movl(result, str2); // We need only low 32 bits
      andl(result, ((int)os::vm_page_size()-1));
      cmpl(result, ((int)os::vm_page_size()-16));
      jccb(Assembler::belowEqual, CHECK_STR);

      // Move small strings to stack to allow load 16 bytes into vec.
      subptr(rsp, 16);
      int stk_offset = wordSize-(1<<scale2);
      push(cnt2);

      // Copy substring element-by-element, back to front, onto the stack.
      bind(COPY_SUBSTR);
      if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
        load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
        movb(Address(rsp, cnt2, scale2, stk_offset), result);
      } else if (ae == StrIntrinsicNode::UU) {
        load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
        movw(Address(rsp, cnt2, scale2, stk_offset), result);
      }
      decrement(cnt2);
      jccb(Assembler::notZero, COPY_SUBSTR);

      pop(cnt2);
      movptr(str2, rsp);  // New substring address
    } // non constant

    bind(CHECK_STR);
    cmpl(cnt1, stride);
    jccb(Assembler::aboveEqual, BIG_STRINGS);

    // Check cross page boundary.
    movl(result, str1); // We need only low 32 bits
    andl(result, ((int)os::vm_page_size()-1));
    cmpl(result, ((int)os::vm_page_size()-16));
    jccb(Assembler::belowEqual, BIG_STRINGS);

    subptr(rsp, 16);
    int stk_offset = -(1<<scale1);
    if (int_cnt2 < 0) { // not constant
      push(cnt2);
      stk_offset += wordSize;
    }
    movl(cnt2, cnt1);

    // Copy the scanned string onto the stack, back to front.
    bind(COPY_STR);
    if (ae == StrIntrinsicNode::LL) {
      load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
      movb(Address(rsp, cnt2, scale1, stk_offset), result);
    } else {
      load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
      movw(Address(rsp, cnt2, scale1, stk_offset), result);
    }
    decrement(cnt2);
    jccb(Assembler::notZero, COPY_STR);

    if (int_cnt2 < 0) { // not constant
      pop(cnt2);
    }
    movptr(str1, rsp);  // New string address

    bind(BIG_STRINGS);
    // Load substring.
    if (int_cnt2 < 0) { // -1
      if (ae == StrIntrinsicNode::UL) {
        pmovzxbw(vec, Address(str2, 0));
      } else {
        movdqu(vec, Address(str2, 0));
      }
      push(cnt2); // substr count
      push(str2); // substr addr
      push(str1); // string addr
    } else {
      // Small (< 8 chars) constant substrings are loaded already.
      movl(cnt2, int_cnt2);
    }
    push(tmp);  // original SP

  } // Finished loading

  //========================================================
  // Start search
  //

  movptr(result, str1); // string addr

  if (int_cnt2  < 0) {  // Only for non constant substring
    jmpb(SCAN_TO_SUBSTR);

    // SP saved at sp+0
    // String saved at sp+1*wordSize
    // Substr saved at sp+2*wordSize
    // Substr count saved at sp+3*wordSize

    // Reload substr for rescan, this code
    // is executed only for large substrings (> 8 chars)
    bind(RELOAD_SUBSTR);
    movptr(str2, Address(rsp, 2*wordSize));
    movl(cnt2, Address(rsp, 3*wordSize));
    if (ae == StrIntrinsicNode::UL) {
      pmovzxbw(vec, Address(str2, 0));
    } else {
      movdqu(vec, Address(str2, 0));
    }
    // We came here after the beginning of the substring was
    // matched but the rest of it was not so we need to search
    // again. Start from the next element after the previous match.
    subptr(str1, result); // Restore counter
    if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
      shrl(str1, 1);
    }
    addl(cnt1, str1);
    decrementl(cnt1);   // Shift to next element
    cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND);  // Left less then substring

    addptr(result, (1<<scale1));
  } // non constant

  // Scan string for start of substr in 16-byte vectors
  bind(SCAN_TO_SUBSTR);
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
  pcmpestri(vec, Address(result, 0), mode);
  jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
  subl(cnt1, stride);
  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
  cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less then substring
  addptr(result, 16);

  bind(ADJUST_STR);
  cmpl(cnt1, stride); // Do not read beyond string
  jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
  // Back-up string to avoid reading beyond string.
  lea(result, Address(result, cnt1, scale1, -16));
  movl(cnt1, stride);
  jmpb(SCAN_TO_SUBSTR);

  // Found a potential substr
  bind(FOUND_CANDIDATE);
  // After pcmpestri tmp(rcx) contains matched element index

  // Make sure string is still long enough
  subl(cnt1, tmp);
  cmpl(cnt1, cnt2);
  jccb(Assembler::greaterEqual, FOUND_SUBSTR);
  // Left less then substring.

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmp(CLEANUP);

  bind(FOUND_SUBSTR);
  // Compute start addr of substr
  lea(result, Address(result, tmp, scale1));
  if (int_cnt2 > 0) { // Constant substring
    // Repeat search for small substring (< 8 chars)
    // from new point without reloading substring.
    // Have to check that we don't read beyond string.
    cmpl(tmp, stride-int_cnt2);
    jccb(Assembler::greater, ADJUST_STR);
    // Fall through if matched whole substring.
  } else { // non constant
    assert(int_cnt2 == -1, "should be != 0");

    addl(tmp, cnt2);
    // Found result if we matched whole substring.
    cmpl(tmp, stride);
    jcc(Assembler::lessEqual, RET_FOUND);

    // Repeat search for small substring (<= 8 chars)
    // from new point 'str1' without reloading substring.
    cmpl(cnt2, stride);
    // Have to check that we don't read beyond string.
    jccb(Assembler::lessEqual, ADJUST_STR);

    Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
    // Compare the rest of substring (> 8 chars).
    movptr(str1, result);

    cmpl(tmp, cnt2);
    // First 8 chars are already matched.
    jccb(Assembler::equal, CHECK_NEXT);

    bind(SCAN_SUBSTR);
    pcmpestri(vec, Address(str1, 0), mode);
    // Need to reload strings pointers if not matched whole vector
    jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0

    bind(CHECK_NEXT);
    subl(cnt2, stride);
    jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
    addptr(str1, 16);
    if (ae == StrIntrinsicNode::UL) {
      addptr(str2, 8);
    } else {
      addptr(str2, 16);
    }
    subl(cnt1, stride);
    cmpl(cnt2, stride); // Do not read beyond substring
    jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
    // Back-up strings to avoid reading beyond substring.

    if (ae == StrIntrinsicNode::UL) {
      lea(str2, Address(str2, cnt2, scale2, -8));
      lea(str1, Address(str1, cnt2, scale1, -16));
    } else {
      lea(str2, Address(str2, cnt2, scale2, -16));
      lea(str1, Address(str1, cnt2, scale1, -16));
    }
    subl(cnt1, cnt2);
    movl(cnt2, stride);
    addl(cnt1, stride);
    bind(CONT_SCAN_SUBSTR);
    if (ae == StrIntrinsicNode::UL) {
      pmovzxbw(vec, Address(str2, 0));
    } else {
      movdqu(vec, Address(str2, 0));
    }
    jmp(SCAN_SUBSTR);

    bind(RET_FOUND_LONG);
    // Restore the original str1 saved on the stack so the offset below is
    // computed relative to the true string start.
    movptr(str1, Address(rsp, wordSize));
  } // non constant

  bind(RET_FOUND);
  // Compute substr offset
  subptr(result, str1);
  if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
    shrl(result, 1); // index
  }
  bind(CLEANUP);
  pop(rsp); // restore SP

} // string_indexof
3237
// Find the first occurrence of the UTF-16 char 'ch' in string [str1, cnt1
// chars]. On exit 'result' is the char index of the match, or -1.
// With AVX2 the bulk is scanned 16 chars at a time (vpcmpeqw + vptest),
// then 8 chars at a time with SSE, then a scalar tail loop. 'ch' is
// clobbered on the vector-hit path (bsfl result).
void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
                                            XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");

  int stride = 8;

  Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
        SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
        RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
        FOUND_SEQ_CHAR, DONE_LABEL;

  movptr(result, str1);
  if (UseAVX >= 2) {
    cmpl(cnt1, stride);
    jcc(Assembler::less, SCAN_TO_CHAR);
    cmpl(cnt1, 2*stride);
    jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
    // Broadcast the needle char into all 16 word lanes of vec1;
    // vec2 stays zero for the vptest carry trick below.
    movdl(vec1, ch);
    vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
    vpxor(vec2, vec2);
    movl(tmp, cnt1);
    andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
    andl(cnt1,0x0000000F);  //tail count (in chars)

    bind(SCAN_TO_16_CHAR_LOOP);
    vmovdqu(vec3, Address(result, 0));
    vpcmpeqw(vec3, vec3, vec1, 1);
    // vptest with zero vec2 sets CF iff vec3 is all-zero (no lane matched).
    vptest(vec2, vec3);
    jcc(Assembler::carryClear, FOUND_CHAR);
    addptr(result, 32);
    subl(tmp, 2*stride);
    jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
    jmp(SCAN_TO_8_CHAR);
    bind(SCAN_TO_8_CHAR_INIT);
    movdl(vec1, ch);
    pshuflw(vec1, vec1, 0x00);
    pshufd(vec1, vec1, 0);
    pxor(vec2, vec2);
  }
  bind(SCAN_TO_8_CHAR);
  cmpl(cnt1, stride);
  jcc(Assembler::less, SCAN_TO_CHAR);
  if (UseAVX < 2) {
    movdl(vec1, ch);
    pshuflw(vec1, vec1, 0x00);
    pshufd(vec1, vec1, 0);
    pxor(vec2, vec2);
  }
  movl(tmp, cnt1);
  andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
  andl(cnt1,0x00000007);  //tail count (in chars)

  bind(SCAN_TO_8_CHAR_LOOP);
  movdqu(vec3, Address(result, 0));
  pcmpeqw(vec3, vec1);
  ptest(vec2, vec3);
  jcc(Assembler::carryClear, FOUND_CHAR);
  addptr(result, 16);
  subl(tmp, stride);
  jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
  bind(SCAN_TO_CHAR);
  testl(cnt1, cnt1);
  jcc(Assembler::zero, RET_NOT_FOUND);
  // Scalar tail: compare one char at a time.
  bind(SCAN_TO_CHAR_LOOP);
  load_unsigned_short(tmp, Address(result, 0));
  cmpl(ch, tmp);
  jccb(Assembler::equal, FOUND_SEQ_CHAR);
  addptr(result, 2);
  subl(cnt1, 1);
  jccb(Assembler::zero, RET_NOT_FOUND);
  jmp(SCAN_TO_CHAR_LOOP);

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmpb(DONE_LABEL);

  bind(FOUND_CHAR);
  // Turn the compare-result lanes into a byte mask and locate the lowest
  // set bit: that byte offset is added to the block start address.
  if (UseAVX >= 2) {
    vpmovmskb(tmp, vec3);
  } else {
    pmovmskb(tmp, vec3);
  }
  bsfl(ch, tmp);
  addptr(result, ch);

  bind(FOUND_SEQ_CHAR);
  // Convert byte distance from the string start into a char index.
  subptr(result, str1);
  shrl(result, 1);

  bind(DONE_LABEL);
} // string_indexof_char
3330
// Find the first occurrence of the latin1 byte 'ch' in string [str1, cnt1
// bytes]. On exit 'result' is the byte index of the match, or -1.
// Mirrors string_indexof_char but at byte granularity: 32 bytes per AVX2
// iteration, 16 bytes per SSE iteration, then a scalar tail loop.
// 'ch' is clobbered on the vector-hit path (bsfl result).
void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");

  int stride = 16;

  Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
        SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
        RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
        FOUND_SEQ_CHAR, DONE_LABEL;

  movptr(result, str1);
  if (UseAVX >= 2) {
    cmpl(cnt1, stride);
    jcc(Assembler::less, SCAN_TO_CHAR_INIT);
    cmpl(cnt1, stride*2);
    jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
    // Broadcast the needle byte into all 32 lanes of vec1;
    // vec2 stays zero for the vptest carry trick below.
    movdl(vec1, ch);
    vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
    vpxor(vec2, vec2);
    movl(tmp, cnt1);
    andl(tmp, 0xFFFFFFE0);  //vector count (in chars)
    andl(cnt1,0x0000001F);  //tail count (in chars)

    bind(SCAN_TO_32_CHAR_LOOP);
    vmovdqu(vec3, Address(result, 0));
    vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
    // vptest with zero vec2 sets CF iff vec3 is all-zero (no lane matched).
    vptest(vec2, vec3);
    jcc(Assembler::carryClear, FOUND_CHAR);
    addptr(result, 32);
    subl(tmp, stride*2);
    jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
    jmp(SCAN_TO_16_CHAR);

    bind(SCAN_TO_16_CHAR_INIT);
    // pshufb with an all-zero mask replicates byte 0 across the register.
    movdl(vec1, ch);
    pxor(vec2, vec2);
    pshufb(vec1, vec2);
  }

  bind(SCAN_TO_16_CHAR);
  cmpl(cnt1, stride);
  jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left
  if (UseAVX < 2) {
    movdl(vec1, ch);
    pxor(vec2, vec2);
    pshufb(vec1, vec2);
  }
  movl(tmp, cnt1);
  andl(tmp, 0xFFFFFFF0);  //vector count (in bytes)
  andl(cnt1,0x0000000F);  //tail count (in bytes)

  bind(SCAN_TO_16_CHAR_LOOP);
  movdqu(vec3, Address(result, 0));
  pcmpeqb(vec3, vec1);
  ptest(vec2, vec3);
  jcc(Assembler::carryClear, FOUND_CHAR);
  addptr(result, 16);
  subl(tmp, stride);
  jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items...

  bind(SCAN_TO_CHAR_INIT);
  testl(cnt1, cnt1);
  jcc(Assembler::zero, RET_NOT_FOUND);
  // Scalar tail: compare one byte at a time.
  bind(SCAN_TO_CHAR_LOOP);
  load_unsigned_byte(tmp, Address(result, 0));
  cmpl(ch, tmp);
  jccb(Assembler::equal, FOUND_SEQ_CHAR);
  addptr(result, 1);
  subl(cnt1, 1);
  jccb(Assembler::zero, RET_NOT_FOUND);
  jmp(SCAN_TO_CHAR_LOOP);

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmpb(DONE_LABEL);

  bind(FOUND_CHAR);
  // Lowest set bit of the byte mask is the offset of the match within the
  // 16/32-byte block just tested.
  if (UseAVX >= 2) {
    vpmovmskb(tmp, vec3);
  } else {
    pmovmskb(tmp, vec3);
  }
  bsfl(ch, tmp);
  addptr(result, ch);

  bind(FOUND_SEQ_CHAR);
  subptr(result, str1);

  bind(DONE_LABEL);
} // stringL_indexof_char
3423
3424 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
3425 switch (eltype) {
3426 case T_BOOLEAN: return sizeof(jboolean);
3427 case T_BYTE: return sizeof(jbyte);
3428 case T_SHORT: return sizeof(jshort);
3429 case T_CHAR: return sizeof(jchar);
3430 case T_INT: return sizeof(jint);
3431 default:
3432 ShouldNotReachHere();
3433 return -1;
3434 }
3435 }
3436
3437 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
3438 switch (eltype) {
3439 // T_BOOLEAN used as surrogate for unsigned byte
3440 case T_BOOLEAN: movzbl(dst, src); break;
3441 case T_BYTE: movsbl(dst, src); break;
3442 case T_SHORT: movswl(dst, src); break;
3443 case T_CHAR: movzwl(dst, src); break;
3444 case T_INT: movl(dst, src); break;
3445 default:
3446 ShouldNotReachHere();
3447 }
3448 }
3449
// Vector-load 8 hash elements from memory (load width = 8 * element size bytes).
void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) {
  load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
}
3453
// AddressLiteral overload: vector-load 8 hash elements from a literal address.
void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) {
  load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
}
3457
3458 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) {
3459 const int vlen = Assembler::AVX_256bit;
3460 switch (eltype) {
3461 case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT); break;
3462 case T_BYTE: vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT); break;
3463 case T_SHORT: vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3464 case T_CHAR: vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3465 case T_INT:
3466 // do nothing
3467 break;
3468 default:
3469 ShouldNotReachHere();
3470 }
3471 }
3472
// Emits code computing the polynomial hash (multiplier 31, as used by
// Arrays.hashCode / String.hashCode) of cnt1 elements of ary1, folded into
// the incoming value of 'result'. Arrays with >= 32 elements are handled by
// an AVX2 loop processing 32 elements per iteration (four 8-lane vectors);
// the remainder (and short arrays) go through a 2-way unrolled scalar loop
// plus an optional single-element epilogue for an odd count.
// Clobbers: index, tmp2, tmp3 and all the vector registers passed in.
void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result,
                                        Register index, Register tmp2, Register tmp3, XMMRegister vnext,
                                        XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3,
                                        XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3,
                                        XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3,
                                        BasicType eltype) {
  ShortBranchVerifier sbv(this);
  assert(UseAVX >= 2, "AVX2 intrinsics are required");
  assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3);
  assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3);

  Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN,
        SHORT_UNROLLED_LOOP_EXIT,
        UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME,
        UNROLLED_VECTOR_LOOP_BEGIN,
        END;
  switch (eltype) {
  case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
  case T_CHAR:    BLOCK_COMMENT("arrays_hashcode(char) {");          break;
  case T_BYTE:    BLOCK_COMMENT("arrays_hashcode(byte) {");          break;
  case T_SHORT:   BLOCK_COMMENT("arrays_hashcode(short) {");         break;
  case T_INT:     BLOCK_COMMENT("arrays_hashcode(int) {");           break;
  default:        BLOCK_COMMENT("arrays_hashcode {");                break;
  }

  // For "renaming" for readability of the code
  const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 },
                    vresult[] = { vresult0, vresult1, vresult2, vresult3 },
                    vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 };

  const int elsize = arrays_hashcode_elsize(eltype);

  /*
    if (cnt1 >= 2) {
      if (cnt1 >= 32) {
        UNROLLED VECTOR LOOP
      }
      UNROLLED SCALAR LOOP
    }
    SINGLE SCALAR
   */

  cmpl(cnt1, 32);
  jcc(Assembler::less, SHORT_UNROLLED_BEGIN);

  // cnt1 >= 32 && generate_vectorized_loop
  xorl(index, index);

  // vresult = IntVector.zero(I256);
  for (int idx = 0; idx < 4; idx++) {
    vpxor(vresult[idx], vresult[idx]);
  }
  // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]);
  // i.e. every lane of vnext holds the per-iteration multiplier 31^32.
  Register bound = tmp2;
  Register next = tmp3;
  lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint))));
  movl(next, Address(tmp2, 0));
  movdl(vnext, next);
  vpbroadcastd(vnext, vnext, Assembler::AVX_256bit);

  // index = 0;
  // bound = cnt1 & ~(32 - 1);  -- number of elements covered by the vector loop
  movl(bound, cnt1);
  andl(bound, ~(32 - 1));
  // for (; index < bound; index += 32) {
  bind(UNROLLED_VECTOR_LOOP_BEGIN);
  // result *= next;
  imull(result, next);
  // loop fission to upfront the cost of fetching from memory, OOO execution
  // can then hopefully do a better job of prefetching
  for (int idx = 0; idx < 4; idx++) {
    arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype);
  }
  // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7];
  for (int idx = 0; idx < 4; idx++) {
    vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit);
    arrays_hashcode_elvcast(vtmp[idx], eltype);
    vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit);
  }
  // index += 32;
  addl(index, 32);
  // index < bound;
  cmpl(index, bound);
  jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN);
  // }

  // Advance ary1 past the vectorized portion; the scalar code below handles
  // the remaining cnt1 - bound elements.
  lea(ary1, Address(ary1, bound, Address::times(elsize)));
  subl(cnt1, bound);
  // release bound

  // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1);
  // Each lane is scaled by its distance-from-the-end power of 31 so the four
  // accumulators can simply be summed afterwards.
  for (int idx = 0; idx < 4; idx++) {
    lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint))));
    arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT);
    vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit);
  }
  // result += vresult.reduceLanes(ADD);
  for (int idx = 0; idx < 4; idx++) {
    reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]);
  }

  // } else if (cnt1 < 32) {

  bind(SHORT_UNROLLED_BEGIN);
  // int i = 1;
  movl(index, 1);
  cmpl(index, cnt1);
  jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT);

  // Process two elements per iteration:
  //   result = 961 * result + 31 * ary1[i-1] + ary1[i]   (961 == 31*31)
  // for (; i < cnt1 ; i += 2) {
  bind(SHORT_UNROLLED_LOOP_BEGIN);
  movl(tmp3, 961);
  imull(result, tmp3);
  arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype);
  // tmp3 = 31 * tmp2 computed as (tmp2 << 5) - tmp2
  movl(tmp3, tmp2);
  shll(tmp3, 5);
  subl(tmp3, tmp2);
  addl(result, tmp3);
  arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype);
  addl(result, tmp3);
  addl(index, 2);
  cmpl(index, cnt1);
  jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN);

  // }
  // if (i >= cnt1) {
  bind(SHORT_UNROLLED_LOOP_EXIT);
  // NB: relies on the flags still set by the cmpl(index, cnt1) executed just
  // before reaching this label (either path): 'greater' means the loop
  // consumed all elements and there is no odd trailing element left.
  jccb(Assembler::greater, END);
  // Odd element left: result = 31 * result + ary1[i-1],
  // with 31 * result computed as (result << 5) - result.
  movl(tmp2, result);
  shll(result, 5);
  subl(result, tmp2);
  arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype);
  addl(result, tmp3);
  // }
  bind(END);

  BLOCK_COMMENT("} // arrays_hashcode");

} // arrays_hashcode
3612
3613 // helper function for string_compare
3614 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3615 Address::ScaleFactor scale, Address::ScaleFactor scale1,
3616 Address::ScaleFactor scale2, Register index, int ae) {
3617 if (ae == StrIntrinsicNode::LL) {
3618 load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3619 load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3620 } else if (ae == StrIntrinsicNode::UU) {
3621 load_unsigned_short(elem1, Address(str1, index, scale, 0));
3622 load_unsigned_short(elem2, Address(str2, index, scale, 0));
3623 } else {
3624 load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3625 load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3626 }
3627 }
3628
// Compare strings, used for char[] and byte[].
// 'ae' encodes the element widths of the two strings (StrIntrinsicNode::
// LL/UU/LU/UL; L = Latin-1 bytes, U = UTF-16 chars). On exit 'result' is
// negative, zero or positive per the compareTo contract. The length
// difference is kept on the stack (push below) while the contents are
// compared, and becomes the result when one string is a prefix of the other.
// Register constraints: result must be rax, cnt2 rdx and cnt1 rcx because
// pcmpestri uses those registers implicitly (asserted below).
void C2_MacroAssembler::string_compare(Register str1, Register str2,
                                       Register cnt1, Register cnt2, Register result,
                                       XMMRegister vec1, int ae, KRegister mask) {
  ShortBranchVerifier sbv(this);
  Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
  Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only AVX3
  int stride, stride2, adr_stride, adr_stride1, adr_stride2;
  // Byte step of the 64-byte AVX-512 inner loop; halved below when elements
  // are widened to chars (i.e. for anything but LL).
  int stride2x2 = 0x40;
  Address::ScaleFactor scale = Address::no_scale;
  Address::ScaleFactor scale1 = Address::no_scale;
  Address::ScaleFactor scale2 = Address::no_scale;

  if (ae != StrIntrinsicNode::LL) {
    stride2x2 = 0x20;
  }

  if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
    // Mixed encodings: cnt2 arrives as the byte length of the UTF-16 string;
    // convert it to a char count (cf. the analogous shrl for UU below).
    shrl(cnt2, 1);
  }
  // Compute the minimum of the string lengths and the
  // difference of the string lengths (stack).
  // Do the conditional move stuff
  movl(result, cnt1);
  subl(cnt1, cnt2);
  push(cnt1);  // saved length difference, popped at LENGTH_DIFF_LABEL/POP_LABEL
  cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2)

  // Is the minimum length zero?
  testl(cnt2, cnt2);
  jcc(Assembler::zero, LENGTH_DIFF_LABEL);
  if (ae == StrIntrinsicNode::LL) {
    // Load first bytes
    load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
    load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
  } else if (ae == StrIntrinsicNode::UU) {
    // Load first characters
    load_unsigned_short(result, Address(str1, 0));
    load_unsigned_short(cnt1, Address(str2, 0));
  } else {
    load_unsigned_byte(result, Address(str1, 0));
    load_unsigned_short(cnt1, Address(str2, 0));
  }
  subl(result, cnt1);
  jcc(Assembler::notZero, POP_LABEL);  // first elements differ: result is set

  if (ae == StrIntrinsicNode::UU) {
    // Divide length by 2 to get number of chars
    shrl(cnt2, 1);
  }
  cmpl(cnt2, 1);
  jcc(Assembler::equal, LENGTH_DIFF_LABEL);

  // Check if the strings start at the same location and setup scale and stride
  if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
    cmpptr(str1, str2);
    jcc(Assembler::equal, LENGTH_DIFF_LABEL);
    if (ae == StrIntrinsicNode::LL) {
      scale = Address::times_1;
      stride = 16;
    } else {
      scale = Address::times_2;
      stride = 8;
    }
  } else {
    scale1 = Address::times_1;
    scale2 = Address::times_2;
    // scale not used
    stride = 8;
  }

  if (UseAVX >= 2 && UseSSE42Intrinsics) {
    Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
    Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
    Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
    Label COMPARE_TAIL_LONG;
    Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only AVX3

    // pcmpestri mode: equal-each comparison with negated result; the low bit
    // selects unsigned word (1) vs unsigned byte (0) elements.
    int pcmpmask = 0x19;
    if (ae == StrIntrinsicNode::LL) {
      pcmpmask &= ~0x01;
    }

    // Setup to compare 16-chars (32-bytes) vectors,
    // start from first character again because it has aligned address.
    if (ae == StrIntrinsicNode::LL) {
      stride2 = 32;
    } else {
      stride2 = 16;
    }
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      adr_stride = stride << scale;
    } else {
      adr_stride1 = 8;  //stride << scale1;
      adr_stride2 = 16; //stride << scale2;
    }

    assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
    // rax and rdx are used by pcmpestri as elements counters
    movl(result, cnt2);
    andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
    jcc(Assembler::zero, COMPARE_TAIL_LONG);

    // fast path : compare first 2 8-char vectors.
    bind(COMPARE_16_CHARS);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, 0));
    } else {
      pmovzxbw(vec1, Address(str1, 0));  // widen str1 bytes to chars for the compare
    }
    pcmpestri(vec1, Address(str2, 0), pcmpmask);
    jccb(Assembler::below, COMPARE_INDEX_CHAR);  // CF==1: mismatch, index in rcx

    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, adr_stride));
      pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
    } else {
      pmovzxbw(vec1, Address(str1, adr_stride1));
      pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
    }
    jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
    addl(cnt1, stride);  // mismatch was in the second vector: bias the index

    // Compare the characters at index in cnt1
    bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
    load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
    subl(result, cnt2);
    jmp(POP_LABEL);

    // Setup the registers to start vector comparison loop
    bind(COMPARE_WIDE_VECTORS);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    subl(result, stride2);
    subl(cnt2, stride2);
    jcc(Assembler::zero, COMPARE_WIDE_TAIL);
    negptr(result);  // result becomes the (negative) running index from the end

    //  In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
    bind(COMPARE_WIDE_VECTORS_LOOP);

    if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
      cmpl(cnt2, stride2x2);
      jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
      testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40

      bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
      if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
        evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
        evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
      } else {
        vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
        evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
      }
      kortestql(mask, mask);
      jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
      addptr(result, stride2x2);  // update since we already compared at this addr
      subl(cnt2, stride2x2);      // and sub the size too
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);

      vpxor(vec1, vec1);  // clean upper bits of YMM registers
      jmpb(COMPARE_WIDE_TAIL);
    }//if (VM_Version::supports_avx512vlbw())

    bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      vmovdqu(vec1, Address(str1, result, scale));
      vpxor(vec1, Address(str2, result, scale));
    } else {
      vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
      vpxor(vec1, Address(str2, result, scale2));
    }
    vptest(vec1, vec1);  // xor is all-zero iff the 32-byte chunks are equal
    jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
    addptr(result, stride2);
    subl(cnt2, stride2);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);

    // compare wide vectors tail
    bind(COMPARE_WIDE_TAIL);
    testptr(result, result);
    jcc(Assembler::zero, LENGTH_DIFF_LABEL);

    // Re-run the AVX2 loop body once over the last full-width chunk,
    // positioned so it ends exactly at the end of the compared region.
    movl(result, stride2);
    movl(cnt2, result);
    negptr(result);
    jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);

    // Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors.
    bind(VECTOR_NOT_EQUAL);
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    jmp(COMPARE_16_CHARS);  // locate the exact mismatch with pcmpestri

    // Compare tail chars, length between 1 to 15 chars
    bind(COMPARE_TAIL_LONG);
    movl(cnt2, result);
    cmpl(cnt2, stride);
    jcc(Assembler::less, COMPARE_SMALL_STR);

    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, 0));
    } else {
      pmovzxbw(vec1, Address(str1, 0));
    }
    pcmpestri(vec1, Address(str2, 0), pcmpmask);
    jcc(Assembler::below, COMPARE_INDEX_CHAR);
    subptr(cnt2, stride);
    jcc(Assembler::zero, LENGTH_DIFF_LABEL);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    negptr(cnt2);
    jmpb(WHILE_HEAD_LABEL);  // finish the few remaining chars scalar-wise

    bind(COMPARE_SMALL_STR);
  } else if (UseSSE42Intrinsics) {
    Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
    int pcmpmask = 0x19;
    // Setup to compare 8-char (16-byte) vectors,
    // start from first character again because it has aligned address.
    movl(result, cnt2);
    andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
    if (ae == StrIntrinsicNode::LL) {
      pcmpmask &= ~0x01;
    }
    jcc(Assembler::zero, COMPARE_TAIL);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    negptr(result);

    // pcmpestri
    //   inputs:
    //     vec1- substring
    //     rax - negative string length (elements count)
    //     mem - scanned string
    //     rdx - string length (elements count)
    //     pcmpmask - cmp mode: 11000 (string compare with negated result)
    //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
    //   outputs:
    //     rcx - first mismatched element index
    assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");

    bind(COMPARE_WIDE_VECTORS);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, result, scale));
      pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
    } else {
      pmovzxbw(vec1, Address(str1, result, scale1));
      pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
    }
    // After pcmpestri cnt1(rcx) contains mismatched element index

    jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
    addptr(result, stride);
    subptr(cnt2, stride);
    jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);

    // compare wide vectors tail
    testptr(result, result);
    jcc(Assembler::zero, LENGTH_DIFF_LABEL);

    // Compare the last full vector, aligned to end at the region's end.
    movl(cnt2, stride);
    movl(result, stride);
    negptr(result);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, result, scale));
      pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
    } else {
      pmovzxbw(vec1, Address(str1, result, scale1));
      pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
    }
    jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);

    // Mismatched characters in the vectors
    bind(VECTOR_NOT_EQUAL);
    addptr(cnt1, result);  // cnt1 = absolute index of the mismatching element
    load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
    subl(result, cnt2);
    jmpb(POP_LABEL);

    bind(COMPARE_TAIL); // limit is zero
    movl(cnt2, result);
    // Fallthru to tail compare
  }
  // Shift str2 and str1 to the end of the arrays, negate min
  if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
    lea(str1, Address(str1, cnt2, scale));
    lea(str2, Address(str2, cnt2, scale));
  } else {
    lea(str1, Address(str1, cnt2, scale1));
    lea(str2, Address(str2, cnt2, scale2));
  }
  decrementl(cnt2);  // first character was compared already
  negptr(cnt2);

  // Compare the rest of the elements
  bind(WHILE_HEAD_LABEL);
  load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
  subl(result, cnt1);
  jccb(Assembler::notZero, POP_LABEL);
  increment(cnt2);
  jccb(Assembler::notZero, WHILE_HEAD_LABEL);

  // Strings are equal up to min length.  Return the length difference.
  bind(LENGTH_DIFF_LABEL);
  pop(result);  // reload the saved strlen(str1) - strlen(str2)
  if (ae == StrIntrinsicNode::UU) {
    // Divide diff by 2 to get number of chars
    sarl(result, 1);
  }
  jmpb(DONE_LABEL);

  if (VM_Version::supports_avx512vlbw()) {

    bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);

    // mask has a 1 bit per equal byte pair; find the first unequal byte.
    kmovql(cnt1, mask);
    notq(cnt1);
    bsfq(cnt2, cnt1);
    if (ae != StrIntrinsicNode::LL) {
      // Divide diff by 2 to get number of chars
      sarl(cnt2, 1);
    }
    addq(result, cnt2);
    if (ae == StrIntrinsicNode::LL) {
      load_unsigned_byte(cnt1, Address(str2, result));
      load_unsigned_byte(result, Address(str1, result));
    } else if (ae == StrIntrinsicNode::UU) {
      load_unsigned_short(cnt1, Address(str2, result, scale));
      load_unsigned_short(result, Address(str1, result, scale));
    } else {
      load_unsigned_short(cnt1, Address(str2, result, scale2));
      load_unsigned_byte(result, Address(str1, result, scale1));
    }
    subl(result, cnt1);
    jmpb(POP_LABEL);
  }//if (VM_Version::supports_avx512vlbw())

  // Discard the stored length difference
  bind(POP_LABEL);
  pop(cnt1);

  // That's it
  bind(DONE_LABEL);
  if(ae == StrIntrinsicNode::UL) {
    // str1/str2 were swapped for UL by the caller; flip the sign back.
    negl(result);
  }

}
4003
// Search for Non-ASCII character (Negative byte value) in a byte array,
// return the index of the first such character, otherwise the length
// of the array segment searched.
//   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
//   @IntrinsicCandidate
//   public static int countPositives(byte[] ba, int off, int len) {
//     for (int i = off; i < off + len; i++) {
//       if (ba[i] < 0) {
//         return i - off;
//       }
//     }
//     return len;
//   }
//
// Strategy: scan wide vectors (64/32/16 bytes depending on CPU features)
// testing the sign bits; on a hit, re-scan that last chunk as a "tail"
// (4-byte words, then a 2-byte char, then a single byte) to pin down the
// exact index. 'result' carries len on entry and is adjusted downwards to
// the index of the first negative byte if one is found.
// Clobbers: ary1, len, tmp1, vec1, vec2 (and mask1/mask2 on AVX-512).
void C2_MacroAssembler::count_positives(Register ary1, Register len,
  Register result, Register tmp1,
  XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
  // rsi: byte array
  // rcx: len
  // rax: result
  ShortBranchVerifier sbv(this);
  assert_different_registers(ary1, len, result, tmp1);
  assert_different_registers(vec1, vec2);
  Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;

  movl(result, len); // copy
  // len == 0
  testl(len, len);
  jcc(Assembler::zero, DONE);

  if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
    VM_Version::supports_avx512vlbw() &&
    VM_Version::supports_bmi2()) {

    Label test_64_loop, test_tail, BREAK_LOOP;
    movl(tmp1, len);
    vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);  // zero vector to compare against

    andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F
    andl(len, 0xffffffc0); // vector count (in chars)
    jccb(Assembler::zero, test_tail);

    lea(ary1, Address(ary1, len, Address::times_1));
    negptr(len);  // run len up from -count to 0

    bind(test_64_loop);
    // Check whether our 64 elements of size byte contain negatives
    evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
    kortestql(mask1, mask1);
    jcc(Assembler::notZero, BREAK_LOOP);

    addptr(len, 64);
    jccb(Assembler::notZero, test_64_loop);

    bind(test_tail);
    // bail out when there is nothing to be done
    testl(tmp1, -1);
    jcc(Assembler::zero, DONE);

    // check the tail for absence of negatives
    // ~(~0 << len) applied up to two times (for 32-bit scenario)
    {
      Register tmp3_aliased = len;
      mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
      shlxq(tmp3_aliased, tmp3_aliased, tmp1);
      notq(tmp3_aliased);
      kmovql(mask2, tmp3_aliased);  // mask2 selects only the tmp1 tail bytes
    }

    evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
    ktestq(mask1, mask2);
    jcc(Assembler::zero, DONE);  // no negatives in the tail: result == len

    // do a full check for negative registers in the tail
    movl(len, tmp1); // tmp1 holds low 6-bit from original len;
                     // ary1 already pointing to the right place
    jmpb(TAIL_START);

    bind(BREAK_LOOP);
    // At least one byte in the last 64 byte block was negative.
    // Set up to look at the last 64 bytes as if they were a tail
    lea(ary1, Address(ary1, len, Address::times_1));
    addptr(result, len);
    // Ignore the very last byte: if all others are positive,
    // it must be negative, so we can skip right to the 2+1 byte
    // end comparison at this point
    orl(result, 63);
    movl(len, 63);
    // Fallthru to tail compare
  } else {

    if (UseAVX >= 2) {
      // With AVX2, use 32-byte vector compare
      Label COMPARE_WIDE_VECTORS, BREAK_LOOP;

      // Compare 32-byte vectors
      testl(len, 0xffffffe0);   // vector count (in bytes)
      jccb(Assembler::zero, TAIL_START);

      andl(len, 0xffffffe0);
      lea(ary1, Address(ary1, len, Address::times_1));
      negptr(len);

      movl(tmp1, 0x80808080);   // create mask to test for Unicode chars in vector
      movdl(vec2, tmp1);
      vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);

      bind(COMPARE_WIDE_VECTORS);
      vmovdqu(vec1, Address(ary1, len, Address::times_1));
      vptest(vec1, vec2);  // any sign bit set under the 0x80 mask?
      jccb(Assembler::notZero, BREAK_LOOP);
      addptr(len, 32);
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);

      testl(result, 0x0000001f);   // any bytes remaining?
      jcc(Assembler::zero, DONE);

      // Quick test using the already prepared vector mask
      movl(len, result);
      andl(len, 0x0000001f);
      vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
      vptest(vec1, vec2);
      jcc(Assembler::zero, DONE);
      // There are zeros, jump to the tail to determine exactly where
      jmpb(TAIL_START);

      bind(BREAK_LOOP);
      // At least one byte in the last 32-byte vector is negative.
      // Set up to look at the last 32 bytes as if they were a tail
      lea(ary1, Address(ary1, len, Address::times_1));
      addptr(result, len);
      // Ignore the very last byte: if all others are positive,
      // it must be negative, so we can skip right to the 2+1 byte
      // end comparison at this point
      orl(result, 31);
      movl(len, 31);
      // Fallthru to tail compare
    } else if (UseSSE42Intrinsics) {
      // With SSE4.2, use double quad vector compare
      Label COMPARE_WIDE_VECTORS, BREAK_LOOP;

      // Compare 16-byte vectors
      testl(len, 0xfffffff0);   // vector count (in bytes)
      jcc(Assembler::zero, TAIL_START);

      andl(len, 0xfffffff0);
      lea(ary1, Address(ary1, len, Address::times_1));
      negptr(len);

      movl(tmp1, 0x80808080);  // sign-bit mask, one 0x80 per byte lane
      movdl(vec2, tmp1);
      pshufd(vec2, vec2, 0);

      bind(COMPARE_WIDE_VECTORS);
      movdqu(vec1, Address(ary1, len, Address::times_1));
      ptest(vec1, vec2);
      jccb(Assembler::notZero, BREAK_LOOP);
      addptr(len, 16);
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);

      testl(result, 0x0000000f); // len is zero, any bytes remaining?
      jcc(Assembler::zero, DONE);

      // Quick test using the already prepared vector mask
      movl(len, result);
      andl(len, 0x0000000f);   // tail count (in bytes)
      movdqu(vec1, Address(ary1, len, Address::times_1, -16));
      ptest(vec1, vec2);
      jcc(Assembler::zero, DONE);
      jmpb(TAIL_START);

      bind(BREAK_LOOP);
      // At least one byte in the last 16-byte vector is negative.
      // Set up and look at the last 16 bytes as if they were a tail
      lea(ary1, Address(ary1, len, Address::times_1));
      addptr(result, len);
      // Ignore the very last byte: if all others are positive,
      // it must be negative, so we can skip right to the 2+1 byte
      // end comparison at this point
      orl(result, 15);
      movl(len, 15);
      // Fallthru to tail compare
    }
  }

  bind(TAIL_START);
  // Compare 4-byte vectors
  andl(len, 0xfffffffc); // vector count (in bytes)
  jccb(Assembler::zero, COMPARE_CHAR);

  lea(ary1, Address(ary1, len, Address::times_1));
  negptr(len);

  bind(COMPARE_VECTORS);
  movl(tmp1, Address(ary1, len, Address::times_1));
  andl(tmp1, 0x80808080);
  jccb(Assembler::notZero, TAIL_ADJUST);
  addptr(len, 4);
  jccb(Assembler::notZero, COMPARE_VECTORS);

  // Compare trailing char (final 2-3 bytes), if any
  bind(COMPARE_CHAR);

  testl(result, 0x2);   // tail  char
  jccb(Assembler::zero, COMPARE_BYTE);
  load_unsigned_short(tmp1, Address(ary1, 0));
  andl(tmp1, 0x00008080);
  jccb(Assembler::notZero, CHAR_ADJUST);
  lea(ary1, Address(ary1, 2));

  bind(COMPARE_BYTE);
  testl(result, 0x1);   // tail  byte
  jccb(Assembler::zero, DONE);
  load_unsigned_byte(tmp1, Address(ary1, 0));
  testl(tmp1, 0x00000080);
  jccb(Assembler::zero, DONE);
  subptr(result, 1);  // last byte is negative: index is len - 1
  jmpb(DONE);

  bind(TAIL_ADJUST);
  // there are negative bits in the last 4 byte block.
  // Adjust result and check the next three bytes
  addptr(result, len);
  orl(result, 3);
  lea(ary1, Address(ary1, len, Address::times_1));
  jmpb(COMPARE_CHAR);

  bind(CHAR_ADJUST);
  // We are looking at a char + optional byte tail, and found that one
  // of the bytes in the char is negative. Adjust the result, check the
  // first byte and readjust if needed.
  andl(result, 0xfffffffc);
  testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
  jccb(Assembler::notZero, DONE);
  addptr(result, 1);

  // That's it
  bind(DONE);
  if (UseAVX >= 2) {
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    vpxor(vec2, vec2);
  }
}
4248
4249 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
4250 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
4251 Register limit, Register result, Register chr,
4252 XMMRegister vec1, XMMRegister vec2, bool is_char,
4253 KRegister mask, bool expand_ary2) {
4254 // for expand_ary2, limit is the (smaller) size of the second array.
4255 ShortBranchVerifier sbv(this);
4256 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
4257
4258 assert((!expand_ary2) || ((expand_ary2) && (UseAVX == 2)),
4259 "Expansion only implemented for AVX2");
4260
4261 int length_offset = arrayOopDesc::length_offset_in_bytes();
4262 int base_offset = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
4263
4264 Address::ScaleFactor scaleFactor = expand_ary2 ? Address::times_2 : Address::times_1;
4265 int scaleIncr = expand_ary2 ? 8 : 16;
4266
4267 if (is_array_equ) {
4268 // Check the input args
4269 cmpoop(ary1, ary2);
4270 jcc(Assembler::equal, TRUE_LABEL);
4271
4272 // Need additional checks for arrays_equals.
4273 testptr(ary1, ary1);
4274 jcc(Assembler::zero, FALSE_LABEL);
4275 testptr(ary2, ary2);
4276 jcc(Assembler::zero, FALSE_LABEL);
4277
4278 // Check the lengths
4279 movl(limit, Address(ary1, length_offset));
4280 cmpl(limit, Address(ary2, length_offset));
4281 jcc(Assembler::notEqual, FALSE_LABEL);
4282 }
4283
4284 // count == 0
4285 testl(limit, limit);
4286 jcc(Assembler::zero, TRUE_LABEL);
4287
4288 if (is_array_equ) {
4289 // Load array address
4290 lea(ary1, Address(ary1, base_offset));
4291 lea(ary2, Address(ary2, base_offset));
4292 }
4293
4294 if (is_array_equ && is_char) {
4295 // arrays_equals when used for char[].
4296 shll(limit, 1); // byte count != 0
4297 }
4298 movl(result, limit); // copy
4299
4300 if (UseAVX >= 2) {
4301 // With AVX2, use 32-byte vector compare
4302 Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16;
4303
4304 // Compare 32-byte vectors
4305 if (expand_ary2) {
4306 andl(result, 0x0000000f); // tail count (in bytes)
4307 andl(limit, 0xfffffff0); // vector count (in bytes)
4308 jcc(Assembler::zero, COMPARE_TAIL);
4309 } else {
4310 andl(result, 0x0000001f); // tail count (in bytes)
4311 andl(limit, 0xffffffe0); // vector count (in bytes)
4312 jcc(Assembler::zero, COMPARE_TAIL_16);
4313 }
4314
4315 lea(ary1, Address(ary1, limit, scaleFactor));
4316 lea(ary2, Address(ary2, limit, Address::times_1));
4317 negptr(limit);
4318
4319 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
4320 Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
4321
4322 cmpl(limit, -64);
4323 jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
4324
4325 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
4326
4327 evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
4328 evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
4329 kortestql(mask, mask);
4330 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare
4331 addptr(limit, 64); // update since we already compared at this addr
4332 cmpl(limit, -64);
4333 jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
4334
4335 // At this point we may still need to compare -limit+result bytes.
4336 // We could execute the next two instruction and just continue via non-wide path:
4337 // cmpl(limit, 0);
4338 // jcc(Assembler::equal, COMPARE_TAIL); // true
4339 // But since we stopped at the points ary{1,2}+limit which are
4340 // not farther than 64 bytes from the ends of arrays ary{1,2}+result
4341 // (|limit| <= 32 and result < 32),
4342 // we may just compare the last 64 bytes.
4343 //
4344 addptr(result, -64); // it is safe, bc we just came from this area
4345 evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
4346 evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
4347 kortestql(mask, mask);
4348 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare
4349
4350 jmp(TRUE_LABEL);
4351
4352 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
4353
4354 }//if (VM_Version::supports_avx512vlbw())
4355
4356 bind(COMPARE_WIDE_VECTORS);
4357 vmovdqu(vec1, Address(ary1, limit, scaleFactor));
4358 if (expand_ary2) {
4359 vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit);
4360 } else {
4361 vmovdqu(vec2, Address(ary2, limit, Address::times_1));
4362 }
4363 vpxor(vec1, vec2);
4364
4365 vptest(vec1, vec1);
4366 jcc(Assembler::notZero, FALSE_LABEL);
4367 addptr(limit, scaleIncr * 2);
4368 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4369
4370 testl(result, result);
4371 jcc(Assembler::zero, TRUE_LABEL);
4372
4373 vmovdqu(vec1, Address(ary1, result, scaleFactor, -32));
4374 if (expand_ary2) {
4375 vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit);
4376 } else {
4377 vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
4378 }
4379 vpxor(vec1, vec2);
4380
4381 vptest(vec1, vec1);
4382 jcc(Assembler::notZero, FALSE_LABEL);
4383 jmp(TRUE_LABEL);
4384
4385 bind(COMPARE_TAIL_16); // limit is zero
4386 movl(limit, result);
4387
4388 // Compare 16-byte chunks
4389 andl(result, 0x0000000f); // tail count (in bytes)
4390 andl(limit, 0xfffffff0); // vector count (in bytes)
4391 jcc(Assembler::zero, COMPARE_TAIL);
4392
4393 lea(ary1, Address(ary1, limit, scaleFactor));
4394 lea(ary2, Address(ary2, limit, Address::times_1));
4395 negptr(limit);
4396
4397 bind(COMPARE_WIDE_VECTORS_16);
4398 movdqu(vec1, Address(ary1, limit, scaleFactor));
4399 if (expand_ary2) {
4400 vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit);
4401 } else {
4402 movdqu(vec2, Address(ary2, limit, Address::times_1));
4403 }
4404 pxor(vec1, vec2);
4405
4406 ptest(vec1, vec1);
4407 jcc(Assembler::notZero, FALSE_LABEL);
4408 addptr(limit, scaleIncr);
4409 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16);
4410
4411 bind(COMPARE_TAIL); // limit is zero
4412 movl(limit, result);
4413 // Fallthru to tail compare
4414 } else if (UseSSE42Intrinsics) {
4415 // With SSE4.2, use double quad vector compare
4416 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4417
4418 // Compare 16-byte vectors
4419 andl(result, 0x0000000f); // tail count (in bytes)
4420 andl(limit, 0xfffffff0); // vector count (in bytes)
4421 jcc(Assembler::zero, COMPARE_TAIL);
4422
4423 lea(ary1, Address(ary1, limit, Address::times_1));
4424 lea(ary2, Address(ary2, limit, Address::times_1));
4425 negptr(limit);
4426
4427 bind(COMPARE_WIDE_VECTORS);
4428 movdqu(vec1, Address(ary1, limit, Address::times_1));
4429 movdqu(vec2, Address(ary2, limit, Address::times_1));
4430 pxor(vec1, vec2);
4431
4432 ptest(vec1, vec1);
4433 jcc(Assembler::notZero, FALSE_LABEL);
4434 addptr(limit, 16);
4435 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4436
4437 testl(result, result);
4438 jcc(Assembler::zero, TRUE_LABEL);
4439
4440 movdqu(vec1, Address(ary1, result, Address::times_1, -16));
4441 movdqu(vec2, Address(ary2, result, Address::times_1, -16));
4442 pxor(vec1, vec2);
4443
4444 ptest(vec1, vec1);
4445 jccb(Assembler::notZero, FALSE_LABEL);
4446 jmpb(TRUE_LABEL);
4447
4448 bind(COMPARE_TAIL); // limit is zero
4449 movl(limit, result);
4450 // Fallthru to tail compare
4451 }
4452
4453 // Compare 4-byte vectors
4454 if (expand_ary2) {
4455 testl(result, result);
4456 jccb(Assembler::zero, TRUE_LABEL);
4457 } else {
4458 andl(limit, 0xfffffffc); // vector count (in bytes)
4459 jccb(Assembler::zero, COMPARE_CHAR);
4460 }
4461
4462 lea(ary1, Address(ary1, limit, scaleFactor));
4463 lea(ary2, Address(ary2, limit, Address::times_1));
4464 negptr(limit);
4465
4466 bind(COMPARE_VECTORS);
4467 if (expand_ary2) {
4468 // There are no "vector" operations for bytes to shorts
4469 movzbl(chr, Address(ary2, limit, Address::times_1));
4470 cmpw(Address(ary1, limit, Address::times_2), chr);
4471 jccb(Assembler::notEqual, FALSE_LABEL);
4472 addptr(limit, 1);
4473 jcc(Assembler::notZero, COMPARE_VECTORS);
4474 jmp(TRUE_LABEL);
4475 } else {
4476 movl(chr, Address(ary1, limit, Address::times_1));
4477 cmpl(chr, Address(ary2, limit, Address::times_1));
4478 jccb(Assembler::notEqual, FALSE_LABEL);
4479 addptr(limit, 4);
4480 jcc(Assembler::notZero, COMPARE_VECTORS);
4481 }
4482
4483 // Compare trailing char (final 2 bytes), if any
4484 bind(COMPARE_CHAR);
4485 testl(result, 0x2); // tail char
4486 jccb(Assembler::zero, COMPARE_BYTE);
4487 load_unsigned_short(chr, Address(ary1, 0));
4488 load_unsigned_short(limit, Address(ary2, 0));
4489 cmpl(chr, limit);
4490 jccb(Assembler::notEqual, FALSE_LABEL);
4491
4492 if (is_array_equ && is_char) {
4493 bind(COMPARE_BYTE);
4494 } else {
4495 lea(ary1, Address(ary1, 2));
4496 lea(ary2, Address(ary2, 2));
4497
4498 bind(COMPARE_BYTE);
4499 testl(result, 0x1); // tail byte
4500 jccb(Assembler::zero, TRUE_LABEL);
4501 load_unsigned_byte(chr, Address(ary1, 0));
4502 load_unsigned_byte(limit, Address(ary2, 0));
4503 cmpl(chr, limit);
4504 jccb(Assembler::notEqual, FALSE_LABEL);
4505 }
4506 bind(TRUE_LABEL);
4507 movl(result, 1); // return true
4508 jmpb(DONE);
4509
4510 bind(FALSE_LABEL);
4511 xorl(result, result); // return false
4512
4513 // That's it
4514 bind(DONE);
4515 if (UseAVX >= 2) {
4516 // clean upper bits of YMM registers
4517 vpxor(vec1, vec1);
4518 vpxor(vec2, vec2);
4519 }
4520 }
4521
4522 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) {
4523 #define __ masm.
4524 Register dst = stub.data<0>();
4525 XMMRegister src = stub.data<1>();
4526 address target = stub.data<2>();
4527 __ bind(stub.entry());
4528 __ subptr(rsp, 8);
4529 __ movdbl(Address(rsp), src);
4530 __ call(RuntimeAddress(target));
4531 // APX REX2 encoding for pop(dst) increases the stub size by 1 byte.
4532 __ pop(dst);
4533 __ jmp(stub.continuation());
4534 #undef __
4535 }
4536
// Convert a float/double in 'src' to an int/long in 'dst' with Java semantics.
// The x86 cvtt* instructions produce the "integer indefinite" value
// (0x80000000 / 0x8000000000000000) for NaN and out-of-range inputs; when that
// sentinel is produced we branch to an out-of-line stub (convertF2I_slowpath)
// which calls the matching fixup routine to compute the Java-specified result.
void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) {
  assert(dst_bt == T_INT || dst_bt == T_LONG, "");
  assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, "");

  address slowpath_target;
  if (dst_bt == T_INT) {
    if (src_bt == T_FLOAT) {
      cvttss2sil(dst, src);
      cmpl(dst, 0x80000000); // int indefinite: NaN or out-of-int-range input
      slowpath_target = StubRoutines::x86::f2i_fixup();
    } else {
      cvttsd2sil(dst, src);
      cmpl(dst, 0x80000000);
      slowpath_target = StubRoutines::x86::d2i_fixup();
    }
  } else {
    if (src_bt == T_FLOAT) {
      cvttss2siq(dst, src);
      // double_sign_flip holds 0x8000000000000000, the long indefinite value.
      cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
      slowpath_target = StubRoutines::x86::f2l_fixup();
    } else {
      cvttsd2siq(dst, src);
      cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
      slowpath_target = StubRoutines::x86::d2l_fixup();
    }
  }

  // Using the APX extended general purpose registers increases the instruction encoding size by 1 byte.
  int max_size = 23 + (UseAPX ? 1 : 0);
  auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, max_size, convertF2I_slowpath);
  // NOTE: stub creation emits no instructions, so the flags produced by the
  // cmp above are still live for this branch.
  jcc(Assembler::equal, stub->entry());
  bind(stub->continuation());
}
4570
// Masked vector shift/rotate with an immediate count: dispatch the ideal
// opcode to the matching AVX-512 masked instruction. 'merge' selects
// merge-masking vs zero-masking; 'eType' is only consulted by the rotate
// helpers, which pick the element width from it.
void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
                                    XMMRegister src1, int imm8, bool merge, int vlen_enc) {
  switch(ideal_opc) {
    // Left shifts
    case Op_LShiftVS:
      Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_LShiftVI:
      Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_LShiftVL:
      Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
    // Arithmetic (sign-propagating) right shifts
    case Op_RShiftVS:
      Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_RShiftVI:
      Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_RShiftVL:
      Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
    // Logical (zero-filling) right shifts
    case Op_URShiftVS:
      Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_URShiftVI:
      Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_URShiftVL:
      Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
    // Rotates (element width taken from eType)
    case Op_RotateRightV:
      evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_RotateLeftV:
      evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
    default:
      fatal("Unsupported operation %s", NodeClassNames[ideal_opc]);
      break;
  }
}
4601
4602 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4603 XMMRegister src2, bool is_unsigned, bool merge, int vlen_enc) {
4604 if (is_unsigned) {
4605 evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4606 } else {
4607 evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4608 }
4609 }
4610
4611 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4612 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
4613 switch (elem_bt) {
4614 case T_BYTE:
4615 if (ideal_opc == Op_SaturatingAddV) {
4616 evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
4617 } else {
4618 assert(ideal_opc == Op_SaturatingSubV, "");
4619 evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
4620 }
4621 break;
4622 case T_SHORT:
4623 if (ideal_opc == Op_SaturatingAddV) {
4624 evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
4625 } else {
4626 assert(ideal_opc == Op_SaturatingSubV, "");
4627 evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
4628 }
4629 break;
4630 default:
4631 fatal("Unsupported type %s", type2name(elem_bt));
4632 break;
4633 }
4634 }
4635
4636 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4637 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
4638 switch (elem_bt) {
4639 case T_BYTE:
4640 if (ideal_opc == Op_SaturatingAddV) {
4641 evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
4642 } else {
4643 assert(ideal_opc == Op_SaturatingSubV, "");
4644 evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
4645 }
4646 break;
4647 case T_SHORT:
4648 if (ideal_opc == Op_SaturatingAddV) {
4649 evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
4650 } else {
4651 assert(ideal_opc == Op_SaturatingSubV, "");
4652 evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
4653 }
4654 break;
4655 default:
4656 fatal("Unsupported type %s", type2name(elem_bt));
4657 break;
4658 }
4659 }
4660
4661 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4662 Address src2, bool is_unsigned, bool merge, int vlen_enc) {
4663 if (is_unsigned) {
4664 evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4665 } else {
4666 evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4667 }
4668 }
4669
4670 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4671 XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4672 switch (elem_bt) {
4673 case T_BYTE:
4674 if (ideal_opc == Op_SaturatingAddV) {
4675 evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
4676 } else {
4677 assert(ideal_opc == Op_SaturatingSubV, "");
4678 evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
4679 }
4680 break;
4681 case T_SHORT:
4682 if (ideal_opc == Op_SaturatingAddV) {
4683 evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
4684 } else {
4685 assert(ideal_opc == Op_SaturatingSubV, "");
4686 evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
4687 }
4688 break;
4689 default:
4690 fatal("Unsupported type %s", type2name(elem_bt));
4691 break;
4692 }
4693 }
4694
4695 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4696 XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4697 switch (elem_bt) {
4698 case T_BYTE:
4699 if (ideal_opc == Op_SaturatingAddV) {
4700 evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
4701 } else {
4702 assert(ideal_opc == Op_SaturatingSubV, "");
4703 evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
4704 }
4705 break;
4706 case T_SHORT:
4707 if (ideal_opc == Op_SaturatingAddV) {
4708 evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
4709 } else {
4710 assert(ideal_opc == Op_SaturatingSubV, "");
4711 evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
4712 }
4713 break;
4714 default:
4715 fatal("Unsupported type %s", type2name(elem_bt));
4716 break;
4717 }
4718 }
4719
// Masked (predicated) vector operation with a register second operand:
// dispatch the ideal opcode to the matching AVX-512 masked instruction.
// 'merge' selects merge-masking vs zero-masking. Unary ops (Abs) consume only
// src2; 'is_varshift' selects the per-lane variable-count shift encodings;
// 'eType' is consulted by the helpers that must pick an element width.
void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
                                    XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc,
                                    bool is_varshift) {
  switch (ideal_opc) {
    // Add
    case Op_AddVB:
      evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVS:
      evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVI:
      evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVL:
      evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVF:
      evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVD:
      evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
    // Subtract
    case Op_SubVB:
      evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVS:
      evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVI:
      evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVL:
      evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVF:
      evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVD:
      evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
    // Multiply / divide / sqrt
    case Op_MulVS:
      evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVI:
      evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVL:
      evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVF:
      evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVD:
      evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_DivVF:
      evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_DivVD:
      evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SqrtVF:
      evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SqrtVD:
      evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break;
    // Unary ops: only src2 is the input operand
    case Op_AbsVB:
      evpabsb(dst, mask, src2, merge, vlen_enc); break;
    case Op_AbsVS:
      evpabsw(dst, mask, src2, merge, vlen_enc); break;
    case Op_AbsVI:
      evpabsd(dst, mask, src2, merge, vlen_enc); break;
    case Op_AbsVL:
      evpabsq(dst, mask, src2, merge, vlen_enc); break;
    // Fused multiply-add
    case Op_FmaVF:
      evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_FmaVD:
      evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
    // Rearrange: note the swapped operand order (src2 is the shuffle source)
    case Op_VectorRearrange:
      evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break;
    // Shifts (variable per-lane counts when is_varshift is set)
    case Op_LShiftVS:
      evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_LShiftVI:
      evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_LShiftVL:
      evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_RShiftVS:
      evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_RShiftVI:
      evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_RShiftVL:
      evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_URShiftVS:
      evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_URShiftVI:
      evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_URShiftVL:
      evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    // Rotates
    case Op_RotateLeftV:
      evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_RotateRightV:
      evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    // Min/max (signed and unsigned)
    case Op_MaxV:
      evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MinV:
      evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_UMinV:
      evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_UMaxV:
      evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    // Bitwise logic
    case Op_XorV:
      evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_OrV:
      evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AndV:
      evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    default:
      fatal("Unsupported operation %s", NodeClassNames[ideal_opc]);
      break;
  }
}
4821
// Masked (predicated) vector operation with a memory second operand:
// dispatch the ideal opcode to the matching AVX-512 masked instruction.
// Smaller opcode set than the register-register form: ops without a
// memory-operand encoding path (shifts, rotates, rearrange, abs) are absent.
void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
                                    XMMRegister src1, Address src2, bool merge, int vlen_enc) {
  switch (ideal_opc) {
    // Add
    case Op_AddVB:
      evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVS:
      evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVI:
      evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVL:
      evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVF:
      evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVD:
      evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
    // Subtract
    case Op_SubVB:
      evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVS:
      evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVI:
      evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVL:
      evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVF:
      evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVD:
      evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
    // Multiply / divide
    case Op_MulVS:
      evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVI:
      evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVL:
      evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVF:
      evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVD:
      evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_DivVF:
      evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_DivVD:
      evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
    // Fused multiply-add
    case Op_FmaVF:
      evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_FmaVD:
      evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
    // Min/max (signed and unsigned)
    case Op_MaxV:
      evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MinV:
      evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_UMaxV:
      evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_UMinV:
      evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    // Bitwise logic
    case Op_XorV:
      evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_OrV:
      evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AndV:
      evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    default:
      fatal("Unsupported operation %s", NodeClassNames[ideal_opc]);
      break;
  }
}
4886
4887 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
4888 KRegister src1, KRegister src2) {
4889 BasicType etype = T_ILLEGAL;
4890 switch(mask_len) {
4891 case 2:
4892 case 4:
4893 case 8: etype = T_BYTE; break;
4894 case 16: etype = T_SHORT; break;
4895 case 32: etype = T_INT; break;
4896 case 64: etype = T_LONG; break;
4897 default: fatal("Unsupported type"); break;
4898 }
4899 assert(etype != T_ILLEGAL, "");
4900 switch(ideal_opc) {
4901 case Op_AndVMask:
4902 kand(etype, dst, src1, src2); break;
4903 case Op_OrVMask:
4904 kor(etype, dst, src1, src2); break;
4905 case Op_XorVMask:
4906 kxor(etype, dst, src1, src2); break;
4907 default:
4908 fatal("Unsupported masked operation"); break;
4909 }
4910 }
4911
4912 /*
4913 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation.
4914 * If src is NaN, the result is 0.
4915 * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE,
4916 * the result is equal to the value of Integer.MIN_VALUE.
4917 * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE,
4918 * the result is equal to the value of Integer.MAX_VALUE.
4919 */
4920 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4921 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
4922 Register rscratch, AddressLiteral float_sign_flip,
4923 int vec_enc) {
4924 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4925 Label done;
4926 vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch);
4927 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
4928 vptest(xtmp2, xtmp2, vec_enc);
4929 jccb(Assembler::equal, done);
4930
4931 vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc);
4932 vpxor(xtmp1, xtmp1, xtmp4, vec_enc);
4933
4934 vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
4935 vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc);
4936 vblendvps(dst, dst, xtmp4, xtmp3, vec_enc);
4937
4938 // Recompute the mask for remaining special value.
4939 vpxor(xtmp2, xtmp2, xtmp3, vec_enc);
4940 // Extract SRC values corresponding to TRUE mask lanes.
4941 vpand(xtmp4, xtmp2, src, vec_enc);
4942 // Flip mask bits so that MSB bit of MASK lanes corresponding to +ve special
4943 // values are set.
4944 vpxor(xtmp3, xtmp2, xtmp4, vec_enc);
4945
4946 vblendvps(dst, dst, xtmp1, xtmp3, vec_enc);
4947 bind(done);
4948 }
4949
// EVEX variant of the float->int special-value fixup: same Java semantics as
// the AVX routine above, but uses opmask registers instead of vector blends.
void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                    XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
                                                                    Register rscratch, AddressLiteral float_sign_flip,
                                                                    int vec_enc) {
  assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
  Label done;
  // ktmp1 = lanes holding the 0x80000000 sentinel; fast path if none.
  evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
  Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
  kortestwl(ktmp1, ktmp1);
  jccb(Assembler::equal, done);

  // Zero destination lanes whose source is NaN (unordered self-compare).
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);

  // ktmp1 = special lanes that are not NaN; of those, keep lanes with a
  // non-negative source (NLT_UQ vs zero) and overwrite with Integer.MAX_VALUE
  // (xtmp2 = ~xtmp1 = 0x7FFFFFFF via the ternary-logic NOT, imm8 0x11).
  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}
4971
// EVEX float->long special-value fixup: result lanes equal to the
// 0x8000000000000000 sentinel came from special float sources. NaN lanes
// become 0; non-negative special lanes become Long.MAX_VALUE. Note the float
// (ps) compares: the source vector holds floats even though dst holds longs.
void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
                                                                     Register rscratch, AddressLiteral double_sign_flip,
                                                                     int vec_enc) {
  assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");

  Label done;
  // ktmp1 = long lanes holding the sentinel; fast path if none.
  evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
  Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc);
  kortestwl(ktmp1, ktmp1);
  jccb(Assembler::equal, done);

  // Zero destination lanes whose (float) source is NaN.
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);

  // Remaining special lanes with a non-negative source get Long.MAX_VALUE
  // (xtmp2 = ~xtmp1 = 0x7FFF...FFFF via the ternary-logic NOT, imm8 0x11).
  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}
4994
4995 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4996 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4997 Register rscratch, AddressLiteral float_sign_flip,
4998 int vec_enc) {
4999 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
5000 Label done;
5001 evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
5002 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
5003 kortestwl(ktmp1, ktmp1);
5004 jccb(Assembler::equal, done);
5005
5006 vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5007 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5008 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
5009
5010 kxorwl(ktmp1, ktmp1, ktmp2);
5011 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5012 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5013 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
5014 bind(done);
5015 }
5016
5017 /*
5018 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation.
5019 * If src is NaN, the result is 0.
5020 * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE,
5021 * the result is equal to the value of Long.MIN_VALUE.
5022 * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
5023 * the result is equal to the value of Long.MAX_VALUE.
5024 */
5025 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5026 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
5027 Register rscratch, AddressLiteral double_sign_flip,
5028 int vec_enc) {
5029 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
5030
5031 Label done;
5032 evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
5033 evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
5034 kortestwl(ktmp1, ktmp1);
5035 jccb(Assembler::equal, done);
5036
5037 vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
5038 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
5039 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
5040
5041 kxorwl(ktmp1, ktmp1, ktmp2);
5042 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
5043 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
5044 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
5045 bind(done);
5046 }
5047
5048 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
5049 XMMRegister xtmp, int index, int vec_enc) {
5050 assert(vec_enc < Assembler::AVX_512bit, "");
5051 if (vec_enc == Assembler::AVX_256bit) {
5052 vextractf128_high(xtmp, src);
5053 vshufps(dst, src, xtmp, index, vec_enc);
5054 } else {
5055 vshufps(dst, src, zero, index, vec_enc);
5056 }
5057 }
5058
// AVX/AVX2 variant of the double->int special-value fixup: masks are computed
// on the double (source-width) vector and packed down to doubleword lanes
// before blending into the 128-bit int result.
void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                                                    XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
                                                                    AddressLiteral float_sign_flip, int src_vec_enc) {
  assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");

  Label done;
  // Compare the destination lanes with float_sign_flip
  // value to get mask for all special values.
  movdqu(xtmp1, float_sign_flip, rscratch);
  vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
  ptest(xtmp2, xtmp2);
  jccb(Assembler::equal, done); // fast path: no special values present

  // Flip float_sign_flip to get max integer value.
  vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
  pxor(xtmp1, xtmp4);

  // Set destination lanes corresponding to unordered source lanes as zero.
  vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
  vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);

  // Shuffle mask vector and pack lower doubleword from each quadword lane.
  vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
  vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);

  // Recompute the mask for remaining special value.
  pxor(xtmp2, xtmp3);
  // Extract mask corresponding to non-negative source lanes.
  vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);

  // Shuffle mask vector and pack lower doubleword from each quadword lane.
  vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
  pand(xtmp3, xtmp2);

  // Replace destination lanes holding special value(0x80000000) with max int
  // if corresponding source lane holds a +ve value.
  vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
  bind(done);
}
5098
5099
// Narrow the int lanes of 'dst' to short or byte lanes (AVX/AVX2 path).
// 'zero' must hold an all-zero vector; 'xtmp' is scratch for the 256-bit
// cross-lane repack that is needed because the pack instructions operate
// within 128-bit lanes.
void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
                                                   XMMRegister xtmp, Register rscratch, int vec_enc) {
  switch(to_elem_bt) {
    case T_SHORT:
      assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
      // Mask each int lane to 16 bits so the unsigned-saturating pack cannot clip.
      vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
      vpackusdw(dst, dst, zero, vec_enc);
      if (vec_enc == Assembler::AVX_256bit) {
        vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
      }
      break;
    case T_BYTE:
      assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
      // Mask each int lane to 8 bits, pack ints to words, then words to bytes.
      vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
      vpackusdw(dst, dst, zero, vec_enc);
      if (vec_enc == Assembler::AVX_256bit) {
        vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
      }
      vpackuswb(dst, dst, zero, vec_enc);
      break;
    default: assert(false, "Unexpected basic type for target of vector cast int to subword: %s", type2name(to_elem_bt));
  }
}
5123
5124 /*
5125 * Algorithm for vector D2L and F2I conversions (AVX 10.2 unsupported):-
5126 * a) Perform vector D2L/F2I cast.
5127 * b) Choose fast path if none of the result vector lane contains 0x80000000 value.
5128 * It signifies that source value could be any of the special floating point
5129 * values(NaN,-Inf,Inf,Max,-Min).
5130 * c) Set destination to zero if source is NaN value.
5131 * d) Replace 0x80000000 with MaxInt if source lane contains a +ve value.
5132 */
5133
// Vector cast float -> int/short/byte on AVX/AVX2; see the algorithm comment
// above for the special-value handling.
void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                           XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
                                           AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
  int to_elem_sz = type2aelembytes(to_elem_bt);
  assert(to_elem_sz <= 4, "");
  vcvttps2dq(dst, src, vec_enc); // truncating convert; special inputs yield 0x80000000
  vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc);
  if (to_elem_sz < 4) {
    // Narrow int lanes to short/byte; xtmp4 is zeroed as the helper requires.
    vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
    vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc);
  }
}
5146
// Vector cast float -> int/short/byte on EVEX: truncating convert, fix up
// special values, then narrow the int lanes if a sub-word type is requested.
void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                            XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
                                            Register rscratch, int vec_enc) {
  int to_elem_sz = type2aelembytes(to_elem_bt);
  assert(to_elem_sz <= 4, "");
  vcvttps2dq(dst, src, vec_enc); // truncating convert; special inputs yield 0x80000000
  vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc);
  switch(to_elem_bt) {
    case T_INT:
      break;
    case T_SHORT:
      evpmovdw(dst, dst, vec_enc); // truncate int lanes to shorts
      break;
    case T_BYTE:
      evpmovdb(dst, dst, vec_enc); // truncate int lanes to bytes
      break;
    default: assert(false, "Unexpected basic type for target of vector castF2X EVEX: %s", type2name(to_elem_bt));
  }
}
5166
// Vector cast float -> long (EVEX): truncating convert, then fix up lanes
// whose source was NaN/Inf/out-of-range to the Java-specified results.
void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                            KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
                                            Register rscratch, int vec_enc) {
  evcvttps2qq(dst, src, vec_enc);
  vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc);
}
5173
// Handling for downcasting from double to integer or sub-word types on AVX2:
// truncating pd->dq convert, special-value fixup, then optional narrowing of
// the 128-bit int result to short/byte lanes.
void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                           XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5,
                                           AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
  int to_elem_sz = type2aelembytes(to_elem_bt);
  assert(to_elem_sz < 8, "");
  vcvttpd2dq(dst, src, vec_enc); // truncating convert; special inputs yield 0x80000000
  vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch,
                                              float_sign_flip, vec_enc);
  if (to_elem_sz < 4) {
    // xtmp4 holds all zero lanes.
    vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit);
  }
}
5188
// Casts a vector of doubles to an integral type using EVEX encodings.
// With AVX512DQ, doubles are converted to longs and then narrowed as needed;
// without it, only targets of <= 4 bytes are supported via a double -> int
// conversion. In both paths, special-case lanes are patched by the matching
// *_special_cases_evex helper before narrowing.
void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src,
                                            XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1,
                                            KRegister ktmp2, AddressLiteral sign_flip,
                                            Register rscratch, int vec_enc) {
  if (VM_Version::supports_avx512dq()) {
    evcvttpd2qq(dst, src, vec_enc);  // truncating double -> long conversion
    vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
    switch(to_elem_bt) {
      case T_LONG:
        break;
      case T_INT:
        evpmovsqd(dst, dst, vec_enc);  // signed-saturating qword -> dword narrowing
        break;
      case T_SHORT:
        evpmovsqd(dst, dst, vec_enc);
        evpmovdw(dst, dst, vec_enc);
        break;
      case T_BYTE:
        evpmovsqd(dst, dst, vec_enc);
        evpmovdb(dst, dst, vec_enc);
        break;
      default: assert(false, "Unexpected basic type for target of vector castD2X AVX512DQ EVEX: %s", type2name(to_elem_bt));
    }
  } else {
    assert(type2aelembytes(to_elem_bt) <= 4, "");
    vcvttpd2dq(dst, src, vec_enc);  // truncating double -> int conversion
    vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
    switch(to_elem_bt) {
      case T_INT:
        break;
      case T_SHORT:
        evpmovdw(dst, dst, vec_enc);
        break;
      case T_BYTE:
        evpmovdb(dst, dst, vec_enc);
        break;
      default: assert(false, "Unexpected basic type for target of vector castD2X EVEX: %s", type2name(to_elem_bt));
    }
  }
}
5229
5230 void C2_MacroAssembler::vector_castF2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vec_enc) {
5231 switch(to_elem_bt) {
5232 case T_LONG:
5233 evcvttps2qqs(dst, src, vec_enc);
5234 break;
5235 case T_INT:
5236 evcvttps2dqs(dst, src, vec_enc);
5237 break;
5238 case T_SHORT:
5239 evcvttps2dqs(dst, src, vec_enc);
5240 evpmovdw(dst, dst, vec_enc);
5241 break;
5242 case T_BYTE:
5243 evcvttps2dqs(dst, src, vec_enc);
5244 evpmovdb(dst, dst, vec_enc);
5245 break;
5246 default: assert(false, "Unexpected basic type for target of vector castF2X AVX10 (reg src): %s", type2name(to_elem_bt));
5247 }
5248 }
5249
5250 void C2_MacroAssembler::vector_castF2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, Address src, int vec_enc) {
5251 switch(to_elem_bt) {
5252 case T_LONG:
5253 evcvttps2qqs(dst, src, vec_enc);
5254 break;
5255 case T_INT:
5256 evcvttps2dqs(dst, src, vec_enc);
5257 break;
5258 case T_SHORT:
5259 evcvttps2dqs(dst, src, vec_enc);
5260 evpmovdw(dst, dst, vec_enc);
5261 break;
5262 case T_BYTE:
5263 evcvttps2dqs(dst, src, vec_enc);
5264 evpmovdb(dst, dst, vec_enc);
5265 break;
5266 default: assert(false, "Unexpected basic type for target of vector castF2X AVX10 (mem src): %s", type2name(to_elem_bt));
5267 }
5268 }
5269
5270 void C2_MacroAssembler::vector_castD2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vec_enc) {
5271 switch(to_elem_bt) {
5272 case T_LONG:
5273 evcvttpd2qqs(dst, src, vec_enc);
5274 break;
5275 case T_INT:
5276 evcvttpd2dqs(dst, src, vec_enc);
5277 break;
5278 case T_SHORT:
5279 evcvttpd2dqs(dst, src, vec_enc);
5280 evpmovdw(dst, dst, vec_enc);
5281 break;
5282 case T_BYTE:
5283 evcvttpd2dqs(dst, src, vec_enc);
5284 evpmovdb(dst, dst, vec_enc);
5285 break;
5286 default: assert(false, "Unexpected basic type for target of vector castD2X AVX10 (reg src): %s", type2name(to_elem_bt));
5287 }
5288 }
5289
5290 void C2_MacroAssembler::vector_castD2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, Address src, int vec_enc) {
5291 switch(to_elem_bt) {
5292 case T_LONG:
5293 evcvttpd2qqs(dst, src, vec_enc);
5294 break;
5295 case T_INT:
5296 evcvttpd2dqs(dst, src, vec_enc);
5297 break;
5298 case T_SHORT:
5299 evcvttpd2dqs(dst, src, vec_enc);
5300 evpmovdw(dst, dst, vec_enc);
5301 break;
5302 case T_BYTE:
5303 evcvttpd2dqs(dst, src, vec_enc);
5304 evpmovdb(dst, dst, vec_enc);
5305 break;
5306 default: assert(false, "Unexpected basic type for target of vector castD2X AVX10 (mem src): %s", type2name(to_elem_bt));
5307 }
5308 }
5309
5310 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src,
5311 AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5312 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
5313 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf.
5314 // and re-instantiate original MXCSR.RC mode after that.
5315 ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5316
5317 mov64(tmp, julong_cast(0.5L));
5318 evpbroadcastq(xtmp1, tmp, vec_enc);
5319 vaddpd(xtmp1, src , xtmp1, vec_enc);
5320 evcvtpd2qq(dst, xtmp1, vec_enc);
5321 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5322 double_sign_flip, vec_enc);;
5323
5324 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5325 }
5326
// Rounds packed floats to ints using EVEX encodings by computing
// floor(val + 0.5): 0.5f is broadcast and added under MXCSR.RC = round
// towards -inf (loaded from new_mxcsr), the sum is converted to ints,
// special-case lanes are patched by the helper, and the standard MXCSR
// state is restored.
void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src,
                                                AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
                                                Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
  // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf.
  // and re-instantiate original MXCSR.RC mode after that.
  ldmxcsr(new_mxcsr, tmp /*rscratch*/);

  movl(tmp, jint_cast(0.5));        // bit pattern of 0.5f
  movq(xtmp1, tmp);
  vbroadcastss(xtmp1, xtmp1, vec_enc);
  vaddps(xtmp1, src , xtmp1, vec_enc);
  vcvtps2dq(dst, xtmp1, vec_enc);
  vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
                                              float_sign_flip, vec_enc);

  // Restore the default MXCSR state.
  ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
}
5344
// AVX (non-EVEX) variant of float rounding: floor(val + 0.5) under
// MXCSR.RC = round towards -inf, with special-case lanes patched by the
// AVX helper, then the standard MXCSR state is restored.
void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src,
                                               AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
                                               Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) {
  // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf.
  // and re-instantiate original MXCSR.RC mode after that.
  ldmxcsr(new_mxcsr, tmp /*rscratch*/);

  movl(tmp, jint_cast(0.5));        // bit pattern of 0.5f
  movq(xtmp1, tmp);
  vbroadcastss(xtmp1, xtmp1, vec_enc);
  vaddps(xtmp1, src , xtmp1, vec_enc);
  vcvtps2dq(dst, xtmp1, vec_enc);
  vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc);

  // Restore the default MXCSR state.
  ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
}
5361
5362 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5363 BasicType from_elem_bt, BasicType to_elem_bt) {
5364 switch (from_elem_bt) {
5365 case T_BYTE:
5366 switch (to_elem_bt) {
5367 case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break;
5368 case T_INT: vpmovzxbd(dst, src, vlen_enc); break;
5369 case T_LONG: vpmovzxbq(dst, src, vlen_enc); break;
5370 default: ShouldNotReachHere();
5371 }
5372 break;
5373 case T_SHORT:
5374 switch (to_elem_bt) {
5375 case T_INT: vpmovzxwd(dst, src, vlen_enc); break;
5376 case T_LONG: vpmovzxwq(dst, src, vlen_enc); break;
5377 default: ShouldNotReachHere();
5378 }
5379 break;
5380 case T_INT:
5381 assert(to_elem_bt == T_LONG, "");
5382 vpmovzxdq(dst, src, vlen_enc);
5383 break;
5384 default:
5385 ShouldNotReachHere();
5386 }
5387 }
5388
5389 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5390 BasicType from_elem_bt, BasicType to_elem_bt) {
5391 switch (from_elem_bt) {
5392 case T_BYTE:
5393 switch (to_elem_bt) {
5394 case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break;
5395 case T_INT: vpmovsxbd(dst, src, vlen_enc); break;
5396 case T_LONG: vpmovsxbq(dst, src, vlen_enc); break;
5397 default: ShouldNotReachHere();
5398 }
5399 break;
5400 case T_SHORT:
5401 switch (to_elem_bt) {
5402 case T_INT: vpmovsxwd(dst, src, vlen_enc); break;
5403 case T_LONG: vpmovsxwq(dst, src, vlen_enc); break;
5404 default: ShouldNotReachHere();
5405 }
5406 break;
5407 case T_INT:
5408 assert(to_elem_bt == T_LONG, "");
5409 vpmovsxdq(dst, src, vlen_enc);
5410 break;
5411 default:
5412 ShouldNotReachHere();
5413 }
5414 }
5415
// Casts a boolean-style vector mask (lanes 0/-1) between element sizes on
// AVX/AVX2 (AVX_512bit is explicitly excluded). Widening sign-extends, which
// maps 0 -> 0 and -1 -> -1; narrowing uses signed saturating packs, which
// also preserve 0/-1 lane values.
void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src,
                                         BasicType dst_bt, BasicType src_bt, int vlen) {
  // Encode the vector length from the larger of the two element sizes.
  int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen);
  assert(vlen_enc != AVX_512bit, "");

  int dst_bt_size = type2aelembytes(dst_bt);
  int src_bt_size = type2aelembytes(src_bt);
  if (dst_bt_size > src_bt_size) {
    // Widening path: sign-extend by the size ratio.
    switch (dst_bt_size / src_bt_size) {
      case 2: vpmovsxbw(dst, src, vlen_enc); break;
      case 4: vpmovsxbd(dst, src, vlen_enc); break;
      case 8: vpmovsxbq(dst, src, vlen_enc); break;
      default: ShouldNotReachHere();
    }
  } else {
    assert(dst_bt_size < src_bt_size, "");
    // Narrowing path. The pack instructions operate within each 128-bit lane,
    // so for 256-bit vectors a cross-lane vpermq (selector 0x08) is needed to
    // compact the valid quadwords into the low half.
    switch (src_bt_size / dst_bt_size) {
      case 2: {
        if (vlen_enc == AVX_128bit) {
          vpacksswb(dst, src, src, vlen_enc);
        } else {
          vpacksswb(dst, src, src, vlen_enc);
          vpermq(dst, dst, 0x08, vlen_enc);
        }
        break;
      }
      case 4: {
        if (vlen_enc == AVX_128bit) {
          vpackssdw(dst, src, src, vlen_enc);
          vpacksswb(dst, dst, dst, vlen_enc);
        } else {
          vpackssdw(dst, src, src, vlen_enc);
          vpermq(dst, dst, 0x08, vlen_enc);
          vpacksswb(dst, dst, dst, AVX_128bit);
        }
        break;
      }
      case 8: {
        if (vlen_enc == AVX_128bit) {
          vpshufd(dst, src, 0x08, vlen_enc);
          vpackssdw(dst, dst, dst, vlen_enc);
          vpacksswb(dst, dst, dst, vlen_enc);
        } else {
          vpshufd(dst, src, 0x08, vlen_enc);
          vpermq(dst, dst, 0x08, vlen_enc);
          vpackssdw(dst, dst, dst, AVX_128bit);
          vpacksswb(dst, dst, dst, AVX_128bit);
        }
        break;
      }
      default: ShouldNotReachHere();
    }
  }
}
5470
5471 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
5472 bool merge, BasicType bt, int vlen_enc) {
5473 if (bt == T_INT) {
5474 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5475 } else {
5476 assert(bt == T_LONG, "");
5477 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5478 }
5479 }
5480
5481 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
5482 bool merge, BasicType bt, int vlen_enc) {
5483 if (bt == T_INT) {
5484 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5485 } else {
5486 assert(bt == T_LONG, "");
5487 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5488 }
5489 }
5490
// Expands the low mask_len bits of scalar src into a byte-per-lane vector in
// dst: PDEP with the 0x01 byte pattern deposits one mask bit into the least
// significant bit of each result byte (yielding 0x00/0x01 bytes). Processes
// 8 mask bits (one quadword of result) per iteration; mask_len must be a
// multiple of 8.
void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
                                               Register rtmp2, XMMRegister xtmp, int mask_len,
                                               int vec_enc) {
  int index = 0;
  int vindex = 0;
  mov64(rtmp1, 0x0101010101010101L);
  pdepq(rtmp1, src, rtmp1);   // expand lowest 8 mask bits to 8 bytes
  if (mask_len > 8) {
    movq(rtmp2, src);         // keep remaining mask bits for the loop below
    vpxor(xtmp, xtmp, xtmp, vec_enc);
    movq(xtmp, rtmp1);
  }
  movq(dst, rtmp1);

  mask_len -= 8;
  while (mask_len > 0) {
    assert ((mask_len & 0x7) == 0, "mask must be multiple of 8");
    index++;
    if ((index % 2) == 0) {
      pxor(xtmp, xtmp);       // start a fresh 128-bit staging register
    }
    mov64(rtmp1, 0x0101010101010101L);
    shrq(rtmp2, 8);           // consume the next 8 mask bits
    pdepq(rtmp1, rtmp2, rtmp1);
    pinsrq(xtmp, rtmp1, index % 2);
    vindex = index / 2;
    if (vindex) {
      // Write entire 16 byte vector when both 64 bit
      // lanes are updated to save redundant instructions.
      if (index % 2) {
        vinsertf128(dst, dst, xtmp, vindex);
      }
    } else {
      vmovdqu(dst, xtmp);
    }
    mask_len -= 8;
  }
}
5529
// Reduces the mask bits already materialized in GPR tmp to the scalar result
// of the requested mask query: true-bit count, index of the last/first set
// bit, or the raw mask bits for toLong.
void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
  switch(opc) {
    case Op_VectorMaskTrueCount:
      popcntq(dst, tmp);
      break;
    case Op_VectorMaskLastTrue:
      if (VM_Version::supports_lzcnt()) {
        lzcntq(tmp, tmp);
        movl(dst, 63);
        subl(dst, tmp);                          // last true index = 63 - lzcnt(mask)
      } else {
        movl(dst, -1);                           // default result for an empty mask
        bsrq(tmp, tmp);
        cmov32(Assembler::notZero, dst, tmp);    // keep -1 only when no bit was set
      }
      break;
    case Op_VectorMaskFirstTrue:
      if (VM_Version::supports_bmi1()) {
        if (masklen < 32) {
          // Plant a sentinel bit at position masklen so an empty mask yields masklen.
          orl(tmp, 1 << masklen);
          tzcntl(dst, tmp);
        } else if (masklen == 32) {
          tzcntl(dst, tmp);
        } else {
          assert(masklen == 64, "");
          tzcntq(dst, tmp);
        }
      } else {
        if (masklen < 32) {
          orl(tmp, 1 << masklen);                // sentinel, as above
          bsfl(dst, tmp);
        } else {
          assert(masklen == 32 || masklen == 64, "");
          movl(dst, masklen);                    // default result for an empty mask
          if (masklen == 32) {
            bsfl(tmp, tmp);
          } else {
            bsfq(tmp, tmp);
          }
          cmov32(Assembler::notZero, dst, tmp);
        }
      }
      break;
    case Op_VectorMaskToLong:
      assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
      break;
    default: assert(false, "Unhandled mask operation");
  }
}
5579
// Computes a vector mask query for a mask held in an opmask (K) register:
// copies the mask bits to GPR tmp, clips them when the mask came from a
// partial vector, and reduces via vector_mask_operation_helper.
void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
                                              int masklen, int masksize, int vec_enc) {
  assert(VM_Version::supports_popcnt(), "");

  if(VM_Version::supports_avx512bw()) {
    kmovql(tmp, mask);
  } else {
    // Without AVX512BW only 16-bit opmask moves are available.
    assert(masklen <= 16, "");
    kmovwl(tmp, mask);
  }

  // Mask generated out of partial vector comparisons/replicate/mask manipulation
  // operations needs to be clipped.
  // (FirstTrue plants its own sentinel bit, so clipping is skipped for it.)
  if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
    andq(tmp, (1 << masklen) - 1);
  }

  vector_mask_operation_helper(opc, dst, tmp, masklen);
}
5599
// Computes a vector mask query for a mask held in an XMM/YMM register
// (AVX/AVX2 path): extracts the mask bits to GPR tmp with the movmsk-style
// instruction matching the lane type, clips partial-vector masks, and
// reduces via vector_mask_operation_helper.
void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
                                              Register tmp, int masklen, BasicType bt, int vec_enc) {
  assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) ||
         (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), "");
  assert(VM_Version::supports_popcnt(), "");

  bool need_clip = false;
  switch(bt) {
    case T_BOOLEAN:
      // While masks of other types contain 0, -1; boolean masks contain lane values of 0, 1
      // Negate (0 - lane) to turn 0/1 into 0/-1 before extracting the sign bits.
      vpxor(xtmp, xtmp, xtmp, vec_enc);
      vpsubb(xtmp, xtmp, mask, vec_enc);
      vpmovmskb(tmp, xtmp, vec_enc);
      need_clip = masklen < 16;
      break;
    case T_BYTE:
      vpmovmskb(tmp, mask, vec_enc);
      need_clip = masklen < 16;
      break;
    case T_SHORT:
      // Narrow word lanes to bytes first; the pack works per 128-bit lane, so
      // a cross-lane permute compacts the result for >= 16 lanes.
      vpacksswb(xtmp, mask, mask, vec_enc);
      if (masklen >= 16) {
        vpermpd(xtmp, xtmp, 8, vec_enc);
      }
      vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
      need_clip = masklen < 16;
      break;
    case T_INT:
    case T_FLOAT:
      vmovmskps(tmp, mask, vec_enc);
      need_clip = masklen < 4;
      break;
    case T_LONG:
    case T_DOUBLE:
      vmovmskpd(tmp, mask, vec_enc);
      need_clip = masklen < 2;
      break;
    default: assert(false, "Unhandled type, %s", type2name(bt));
  }

  // Mask generated out of partial vector comparisons/replicate/mask manipulation
  // operations needs to be clipped.
  if (need_clip && opc != Op_VectorMaskFirstTrue) {
    // need_clip implies masklen < 32
    andq(tmp, (1 << masklen) - 1);
  }

  vector_mask_operation_helper(opc, dst, tmp, masklen);
}
5649
// Compresses an opmask: dst receives popcount(src & ((1 << mask_len) - 1))
// contiguous low bits set. PEXT extracts bits of the all-ones value at the
// positions selected by the clipped source mask, packing them to the bottom.
void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
                                             Register rtmp2, int mask_len) {
  kmov(rtmp1, src);
  andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));  // clip to mask_len bits
  mov64(rtmp2, -1L);
  pextq(rtmp2, rtmp2, rtmp1);
  kmov(dst, rtmp2);
}
5658
// AVX2 fallback for vector compress/expand on lanes of >= 4 bytes. The scalar
// mask bits select a 32-byte permutation row from a stub-generated table; the
// row is applied with vpermps and lanes whose table entry is -1 are blended
// with zero.
void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src,
                                                    XMMRegister mask, Register rtmp, Register rscratch,
                                                    XMMRegister permv, XMMRegister xtmp, BasicType bt,
                                                    int vec_enc) {
  assert(type2aelembytes(bt) >= 4, "");
  assert(opcode == Op_CompressV || opcode == Op_ExpandV, "");
  address compress_perm_table = nullptr;
  address expand_perm_table = nullptr;
  if (type2aelembytes(bt) == 8) {
    compress_perm_table = StubRoutines::x86::compress_perm_table64();
    expand_perm_table  = StubRoutines::x86::expand_perm_table64();
    vmovmskpd(rtmp, mask, vec_enc);
  } else {
    compress_perm_table = StubRoutines::x86::compress_perm_table32();
    expand_perm_table  = StubRoutines::x86::expand_perm_table32();
    vmovmskps(rtmp, mask, vec_enc);
  }
  shlq(rtmp, 5); // for 32 byte permute row.
  if (opcode == Op_CompressV) {
    lea(rscratch, ExternalAddress(compress_perm_table));
  } else {
    lea(rscratch, ExternalAddress(expand_perm_table));
  }
  addptr(rtmp, rscratch);
  vmovdqu(permv, Address(rtmp));   // load the selected permutation row
  vpermps(dst, permv, src, Assembler::AVX_256bit);
  vpxor(xtmp, xtmp, xtmp, vec_enc);
  // Blend the result with zero vector using permute mask, each column entry
  // in a permute table row contains either a valid permute index or a -1 (default)
  // value, this can potentially be used as a blending mask after
  // compressing/expanding the source vector lanes.
  vblendvps(dst, dst, xtmp, permv, vec_enc, true, permv);
}
5692
5693 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
5694 bool merge, BasicType bt, int vec_enc) {
5695 if (opcode == Op_CompressV) {
5696 switch(bt) {
5697 case T_BYTE:
5698 evpcompressb(dst, mask, src, merge, vec_enc);
5699 break;
5700 case T_CHAR:
5701 case T_SHORT:
5702 evpcompressw(dst, mask, src, merge, vec_enc);
5703 break;
5704 case T_INT:
5705 evpcompressd(dst, mask, src, merge, vec_enc);
5706 break;
5707 case T_FLOAT:
5708 evcompressps(dst, mask, src, merge, vec_enc);
5709 break;
5710 case T_LONG:
5711 evpcompressq(dst, mask, src, merge, vec_enc);
5712 break;
5713 case T_DOUBLE:
5714 evcompresspd(dst, mask, src, merge, vec_enc);
5715 break;
5716 default:
5717 fatal("Unsupported type %s", type2name(bt));
5718 break;
5719 }
5720 } else {
5721 assert(opcode == Op_ExpandV, "");
5722 switch(bt) {
5723 case T_BYTE:
5724 evpexpandb(dst, mask, src, merge, vec_enc);
5725 break;
5726 case T_CHAR:
5727 case T_SHORT:
5728 evpexpandw(dst, mask, src, merge, vec_enc);
5729 break;
5730 case T_INT:
5731 evpexpandd(dst, mask, src, merge, vec_enc);
5732 break;
5733 case T_FLOAT:
5734 evexpandps(dst, mask, src, merge, vec_enc);
5735 break;
5736 case T_LONG:
5737 evpexpandq(dst, mask, src, merge, vec_enc);
5738 break;
5739 case T_DOUBLE:
5740 evexpandpd(dst, mask, src, merge, vec_enc);
5741 break;
5742 default:
5743 fatal("Unsupported type %s", type2name(bt));
5744 break;
5745 }
5746 }
5747 }
5748
// Vector signum using EVEX opmask blends: each lane becomes -1.0 when
// src < 0, 1.0 when src > 0, and src itself when src is NaN, -0.0 or 0.0.
void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
                                           KRegister ktmp1, int vec_enc) {
  if (opcode == Op_SignumVD) {
    vsubpd(dst, zero, one, vec_enc);   // dst = -1.0 in every lane
    // if src < 0 ? -1 : 1
    evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
    evblendmpd(dst, ktmp1, one, dst, true, vec_enc);
    // if src == NaN, -0.0 or 0.0 return src.
    evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
    evblendmpd(dst, ktmp1, dst, src, true, vec_enc);
  } else {
    assert(opcode == Op_SignumVF, "");
    vsubps(dst, zero, one, vec_enc);   // dst = -1.0f in every lane
    // if src < 0 ? -1 : 1
    evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
    evblendmps(dst, ktmp1, one, dst, true, vec_enc);
    // if src == NaN, -0.0 or 0.0 return src.
    evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
    evblendmps(dst, ktmp1, dst, src, true, vec_enc);
  }
}
5770
// Vector signum using AVX variable blends: each lane becomes -1.0 when
// src < 0, 1.0 when src > 0, and src itself when src is NaN, -0.0 or 0.0.
void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
                                          XMMRegister xtmp1, int vec_enc) {
  if (opcode == Op_SignumVD) {
    vsubpd(dst, zero, one, vec_enc);   // dst = -1.0 in every lane
    // if src < 0 ? -1 : 1
    vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1);
    // if src == NaN, -0.0 or 0.0 return src.
    vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
    vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
  } else {
    assert(opcode == Op_SignumVF, "");
    vsubps(dst, zero, one, vec_enc);   // dst = -1.0f in every lane
    // if src < 0 ? -1 : 1
    vblendvps(dst, one, dst, src, vec_enc, true, xtmp1);
    // if src == NaN, -0.0 or 0.0 return src.
    vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
    vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
  }
}
5790
// Loads a scalar mask value from GPR src into opmask dst, keeping only the
// low mask_len bits (the kshift right by the unused bit count discards the
// rest). Assumes src holds the replicated mask pattern expected by callers
// — TODO confirm against call sites.
void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
  if (VM_Version::supports_avx512bw()) {
    if (mask_len > 32) {
      kmovql(dst, src);
    } else {
      kmovdl(dst, src);
      if (mask_len != 32) {
        kshiftrdl(dst, dst, 32 - mask_len);  // clip to mask_len bits
      }
    }
  } else {
    // Without AVX512BW only 16-bit opmask moves are available.
    assert(mask_len <= 16, "");
    kmovwl(dst, src);
    if (mask_len != 16) {
      kshiftrwl(dst, dst, 16 - mask_len);    // clip to mask_len bits
    }
  }
}
5809
5810 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
5811 int lane_size = type2aelembytes(bt);
5812 if ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
5813 (is_subword_type(bt) && VM_Version::supports_avx512vlbw())) {
5814 movptr(rtmp, imm32);
5815 switch(lane_size) {
5816 case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
5817 case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
5818 case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
5819 case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
5820 fatal("Unsupported lane size %d", lane_size);
5821 break;
5822 }
5823 } else {
5824 movptr(rtmp, imm32);
5825 movq(dst, rtmp);
5826 switch(lane_size) {
5827 case 1 : vpbroadcastb(dst, dst, vec_enc); break;
5828 case 2 : vpbroadcastw(dst, dst, vec_enc); break;
5829 case 4 : vpbroadcastd(dst, dst, vec_enc); break;
5830 case 8 : vpbroadcastq(dst, dst, vec_enc); break;
5831 fatal("Unsupported lane size %d", lane_size);
5832 break;
5833 }
5834 }
5835 }
5836
5837 //
5838 // Following is lookup table based popcount computation algorithm:-
5839 // Index Bit set count
5840 // [ 0000 -> 0,
5841 // 0001 -> 1,
5842 // 0010 -> 1,
5843 // 0011 -> 2,
5844 // 0100 -> 1,
5845 // 0101 -> 2,
5846 // 0110 -> 2,
5847 // 0111 -> 3,
5848 // 1000 -> 1,
5849 // 1001 -> 2,
5850 // 1010 -> 3,
5851 // 1011 -> 3,
//        1100 -> 2,
//        1101 -> 3,
//        1110 -> 3,
//        1111 -> 4 ]
5855 // a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as
5856 // shuffle indices for lookup table access.
5857 // b. Right shift each byte of vector lane by 4 positions.
5858 // c. Count the number of 1s in 4 MSB bits each byte. These bits are used as
5859 // shuffle indices for lookup table access.
5860 // d. Add the bitset count of upper and lower 4 bits of each byte.
5861 // e. Unpack double words to quad words and compute sum of absolute difference of bitset
5862 // count of all the bytes of a quadword.
5863 // f. Perform step e. for upper 128bit vector lane.
5864 // g. Pack the bitset count of quadwords back to double word.
5865 // h. Unpacking and packing operations are not needed for 64bit vector lane.
5866
// Byte-wise popcount via nibble lookup table (steps a-d of the algorithm
// described above): the 4-entry-per-nibble LUT is indexed with vpshufb for
// the low and (shifted) high nibble of each byte, and the two counts are
// summed per byte.
void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
  assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
  vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);  // nibble mask
  vpsrlw(dst, src, 4, vec_enc);                          // high nibbles into low position
  vpand(dst, dst, xtmp1, vec_enc);
  vpand(xtmp1, src, xtmp1, vec_enc);                     // low nibbles
  vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg);
  vpshufb(xtmp1, xtmp2, xtmp1, vec_enc);                 // LUT lookup for low nibbles
  vpshufb(dst, xtmp2, dst, vec_enc);                     // LUT lookup for high nibbles
  vpaddb(dst, dst, xtmp1, vec_enc);                      // per-byte popcount
}
5879
// Popcount of 32-bit lanes: byte-wise popcount, then steps e-h of the
// algorithm above — vpsadbw sums byte counts horizontally and the quadword
// sums are packed back into dwords.
void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                            XMMRegister xtmp2, Register rtmp, int vec_enc) {
  vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
  // Following code is as per steps e,f,g and h of above algorithm.
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  vpunpckhdq(dst, xtmp1, xtmp2, vec_enc);
  vpsadbw(dst, dst, xtmp2, vec_enc);
  vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc);
  vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc);
  vpackuswb(dst, xtmp1, dst, vec_enc);
}
5891
// Popcount of 16-bit lanes: byte-wise popcount followed by summing the two
// byte counts within each word.
void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
  vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
  // Add the popcount of upper and lower bytes of word.
  vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc);   // low-byte mask
  vpsrlw(dst, xtmp1, 8, vec_enc);                        // upper-byte counts
  vpand(xtmp1, xtmp1, xtmp2, vec_enc);                   // lower-byte counts
  vpaddw(dst, dst, xtmp1, vec_enc);
}
5901
// Popcount of 64-bit lanes: byte-wise popcount, then vpsadbw against zero
// sums the eight byte counts of each quadword.
void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
  vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  vpsadbw(dst, xtmp1, xtmp2, vec_enc);
}
5908
5909 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5910 XMMRegister xtmp2, Register rtmp, int vec_enc) {
5911 switch(bt) {
5912 case T_LONG:
5913 vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5914 break;
5915 case T_INT:
5916 vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5917 break;
5918 case T_CHAR:
5919 case T_SHORT:
5920 vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5921 break;
5922 case T_BYTE:
5923 case T_BOOLEAN:
5924 vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5925 break;
5926 default:
5927 fatal("Unsupported type %s", type2name(bt));
5928 break;
5929 }
5930 }
5931
5932 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5933 KRegister mask, bool merge, int vec_enc) {
5934 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5935 switch(bt) {
5936 case T_LONG:
5937 assert(VM_Version::supports_avx512_vpopcntdq(), "");
5938 evpopcntq(dst, mask, src, merge, vec_enc);
5939 break;
5940 case T_INT:
5941 assert(VM_Version::supports_avx512_vpopcntdq(), "");
5942 evpopcntd(dst, mask, src, merge, vec_enc);
5943 break;
5944 case T_CHAR:
5945 case T_SHORT:
5946 assert(VM_Version::supports_avx512_bitalg(), "");
5947 evpopcntw(dst, mask, src, merge, vec_enc);
5948 break;
5949 case T_BYTE:
5950 case T_BOOLEAN:
5951 assert(VM_Version::supports_avx512_bitalg(), "");
5952 evpopcntb(dst, mask, src, merge, vec_enc);
5953 break;
5954 default:
5955 fatal("Unsupported type %s", type2name(bt));
5956 break;
5957 }
5958 }
5959
// Bit reversal algorithm first reverses the bits of each byte followed by
// a byte level reversal for multi-byte primitive types (short/int/long).
// Algorithm performs a lookup table access to get reverse bit sequence
// corresponding to a 4 bit value. Thus a reverse bit sequence for a byte
// is obtained by swapping the reverse bit sequences of upper and lower
// nibble of a byte.
// Three code paths: AVX512VLBW lookup-table path, a shift-based path for
// 512-bit vectors without it, and an AVX/AVX2 lookup-table path.
void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                           XMMRegister xtmp2, Register rtmp, int vec_enc) {
  if (VM_Version::supports_avx512vlbw()) {

    // Get the reverse bit sequence of lower nibble of each byte.
    vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
    vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
    evpandq(dst, xtmp2, src, vec_enc);
    vpshufb(dst, xtmp1, dst, vec_enc);
    vpsllq(dst, dst, 4, vec_enc);

    // Get the reverse bit sequence of upper nibble of each byte.
    vpandn(xtmp2, xtmp2, src, vec_enc);
    vpsrlq(xtmp2, xtmp2, 4, vec_enc);
    vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);

    // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
    // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
    evporq(xtmp2, dst, xtmp2, vec_enc);
    vector_reverse_byte(bt, dst, xtmp2, vec_enc);

  } else if(vec_enc == Assembler::AVX_512bit) {
    // Shift based bit reversal.
    assert(bt == T_LONG || bt == T_INT, "");

    // Swap lower and upper nibble of each byte.
    vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc);

    // Swap two least and most significant bits of each nibble.
    vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc);

    // Swap adjacent pair of bits.
    evmovdqul(xtmp1, k0, dst, true, vec_enc);
    vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc);

    evmovdqul(xtmp1, k0, dst, true, vec_enc);
    vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);
  } else {
    // AVX/AVX2 lookup-table path (non-EVEX logical ops).
    vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp);
    vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);

    // Get the reverse bit sequence of lower nibble of each byte.
    vpand(dst, xtmp2, src, vec_enc);
    vpshufb(dst, xtmp1, dst, vec_enc);
    vpsllq(dst, dst, 4, vec_enc);

    // Get the reverse bit sequence of upper nibble of each byte.
    vpandn(xtmp2, xtmp2, src, vec_enc);
    vpsrlq(xtmp2, xtmp2, 4, vec_enc);
    vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);

    // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
    // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
    vpor(xtmp2, dst, xtmp2, vec_enc);
    vector_reverse_byte(bt, dst, xtmp2, vec_enc);
  }
}
6023
// Bit reversal of each element using a single GF(2^8) affine transform
// (vgf2p8affineqb) to reverse bits within each byte, followed by a byte-order
// reversal for multi-byte element types.
void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc,
                                                XMMRegister xtmp, Register rscratch) {
  assert(VM_Version::supports_gfni(), "");
  assert(rscratch != noreg || always_reachable(mask), "missing");

  // Galois field instruction based bit reversal based on following algorithm.
  // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
  vpbroadcastq(xtmp, mask, vec_enc, rscratch);   // affine matrix operand
  vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
  vector_reverse_byte(bt, dst, xtmp, vec_enc);
}
6035
// Swaps adjacent nbits-wide bit groups selected by bitmask within each lane:
// dst = ((src & bitmask) << nbits) | ((src & ~bitmask) >> nbits).
void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
                                          XMMRegister xtmp1, Register rtmp, int vec_enc) {
  vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
  evpandq(dst, xtmp1, src, vec_enc);     // masked bits ...
  vpsllq(dst, dst, nbits, vec_enc);      // ... shifted up
  vpandn(xtmp1, xtmp1, src, vec_enc);    // complementary bits ...
  vpsrlq(xtmp1, xtmp1, nbits, vec_enc);  // ... shifted down
  evporq(dst, dst, xtmp1, vec_enc);
}
6045
// Shift/rotate based byte-order reversal of each element: progressively swap
// halves (dwords within qwords, words within dwords, bytes within words)
// until byte granularity is reached. T_BYTE needs no reversal and is copied.
void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
  // Shift based bit reversal.
  assert(VM_Version::supports_evex(), "");
  switch(bt) {
    case T_LONG:
      // Swap upper and lower double word of each quad word.
      evprorq(xtmp1, k0, src, 32, true, vec_enc);
      evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);
      vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
      break;
    case T_INT:
      // Swap upper and lower word of each double word.
      evprord(xtmp1, k0, src, 16, true, vec_enc);
      vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
      break;
    case T_CHAR:
    case T_SHORT:
      // Swap upper and lower byte of each word.
      vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc);
      break;
    case T_BYTE:
      evmovdquq(dst, k0, src, true, vec_enc);  // no byte reversal needed, just copy
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
}
6075
// Reverses the byte order within each element of a vector using a
// table-driven byte shuffle (vpshufb) with pre-computed per-type indices.
void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) {
  if (bt == T_BYTE) {
    // Byte reversal of a byte vector is the identity; just copy src to dst.
    if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
      evmovdquq(dst, k0, src, true, vec_enc);
    } else {
      vmovdqu(dst, src);
    }
    return;
  }
  // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
  // pre-computed shuffle indices.
  switch(bt) {
    case T_LONG:
      vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg);
      break;
    case T_INT:
      vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg);
      break;
    case T_CHAR:
    case T_SHORT:
      vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
  // dst holds the shuffle mask at this point; permute the bytes of src with it.
  vpshufb(dst, src, dst, vec_enc);
}
6104
// Vector leading zero count for EVEX targets. AVX512CD provides direct
// lzcnt instructions for int and long lanes; short lanes are widened to
// dwords first, and byte lanes are computed with a 4-bit lookup table.
void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
                                                        KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
  assert(is_integral_type(bt), "");
  assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
  assert(VM_Version::supports_avx512cd(), "");
  switch(bt) {
    case T_LONG:
      evplzcntq(dst, ktmp, src, merge, vec_enc);
      break;
    case T_INT:
      evplzcntd(dst, ktmp, src, merge, vec_enc);
      break;
    case T_SHORT:
      // xtmp1 = all ones.
      vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);
      // Pair each short with 0xFFFF in the low half of a dword so the dword
      // lzcnt equals the short lzcnt (the all-ones low half caps the count at 16).
      vpunpcklwd(xtmp2, xtmp1, src, vec_enc);
      evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
      vpunpckhwd(dst, xtmp1, src, vec_enc);
      evplzcntd(dst, ktmp, dst, merge, vec_enc);
      // Repack the dword counts back into word lanes.
      vpackusdw(dst, xtmp2, dst, vec_enc);
      break;
    case T_BYTE:
      // T1 = Compute leading zero counts of 4 LSB bits of each byte by
      // accessing the lookup table.
      // T2 = Compute leading zero counts of 4 MSB bits of each byte by
      // accessing the lookup table.
      // Add T1 to T2 if 4 MSB bits of byte are all zeros.
      assert(VM_Version::supports_avx512bw(), "");
      evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
      // dst = 0x0F nibble mask in every byte.
      vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
      vpand(xtmp2, dst, src, vec_enc);
      // xtmp2 = T1 (lzcnt of the low nibble, via the lookup table).
      vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
      vpsrlw(xtmp3, src, 4, vec_enc);
      vpand(xtmp3, dst, xtmp3, vec_enc);
      // dst = T2 (lzcnt of the high nibble, via the lookup table).
      vpshufb(dst, xtmp1, xtmp3, vec_enc);
      vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
      // ktmp = mask of bytes whose high nibble is zero ...
      evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);
      // ... and only those bytes add T1 on top of T2.
      evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
}
6149
// Vector leading zero count for byte lanes on AVX (non-EVEX) targets, using
// a lookup table indexed by 4-bit nibbles.
// Postcondition: xtmp1 holds all zeros (vector_count_leading_zeros_short_avx
// relies on this).
void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                            XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  // xtmp1 = lzcnt lookup table for 4-bit values; xtmp2 = 0x0F nibble mask.
  vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
  vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
  // T1 = Compute leading zero counts of 4 LSB bits of each byte by
  // accessing the lookup table.
  vpand(dst, xtmp2, src, vec_enc);
  vpshufb(dst, xtmp1, dst, vec_enc);
  // T2 = Compute leading zero counts of 4 MSB bits of each byte by
  // accessing the lookup table.
  vpsrlw(xtmp3, src, 4, vec_enc);
  vpand(xtmp3, xtmp2, xtmp3, vec_enc);
  vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
  // Add T1 to T2 if 4 MSB bits of byte are all zeros.
  vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
  // xtmp3 = mask of bytes whose high nibble is zero.
  vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
  // dst = T1 + T2; select that sum where the high nibble was zero, else T2.
  vpaddb(dst, dst, xtmp2, vec_enc);
  vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
}
6169
// Vector leading zero count for short lanes on AVX (non-EVEX) targets.
// Computes per-byte counts first, then combines the two byte counts of
// each word: if the upper byte is zero its count (8) is added to the lower
// byte's count, otherwise the upper byte's count is the answer.
void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  // dst = per-byte leading zero counts.
  vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
  // Add zero counts of lower byte and upper byte of a word if
  // upper byte holds a zero value.
  vpsrlw(xtmp3, src, 8, vec_enc);
  // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
  // xtmp3 = mask of words whose upper byte is zero.
  vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
  // xtmp2 = (dst << 8) + dst; its upper byte holds upperCnt + lowerCnt.
  vpsllw(xtmp2, dst, 8, vec_enc);
  vpaddw(xtmp2, xtmp2, dst, vec_enc);
  // Words with a zero upper byte take the combined count; others keep just
  // the upper byte's count (already sitting in dst's upper byte).
  vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
  // Extract the result from the upper byte of each word.
  vpsrlw(dst, dst, 8, vec_enc);
}
6183
// Vector leading zero count for int lanes on AVX (non-EVEX) targets, based
// on the exponent produced by an int-to-float conversion.
void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                           XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
  // By converting the integer to a float, we can obtain the number of leading zeros based on the exponent of the float.
  // As the float exponent contains a bias of 127 for nonzero values, the bias must be removed before interpreting the
  // exponent as the leading zero count.

  // Remove the bit to the right of the highest set bit ensuring that the conversion to float cannot round up to a higher
  // power of 2, which has a higher exponent than the input. This transformation is valid as only the highest set bit
  // contributes to the leading number of zeros.
  vpsrld(dst, src, 1, vec_enc);
  vpandn(dst, dst, src, vec_enc);

  vcvtdq2ps(dst, dst, vec_enc);

  // By comparing the register to itself, all the bits in the destination are set.
  vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);

  // Move the biased exponent to the low end of the lane and mask with 0xFF to discard the sign bit.
  // (xtmp2 = 0xFFFFFFFF >>> 24 = 0xFF.)
  vpsrld(xtmp2, xtmp1, 24, vec_enc);
  vpsrld(dst, dst, 23, vec_enc);
  vpand(dst, xtmp2, dst, vec_enc);

  // Subtract 127 from the exponent, which removes the bias from the exponent.
  // (xtmp2 = 0xFFFFFFFF >>> 25 = 127.)
  vpsrld(xtmp2, xtmp1, 25, vec_enc);
  vpsubd(dst, dst, xtmp2, vec_enc);

  // xtmp2 = 0xFFFFFFFF >>> 27 = 31 in every lane.
  vpsrld(xtmp2, xtmp1, 27, vec_enc);

  // If the original value is 0 the exponent would not have bias, so the subtraction creates a negative number. If this
  // is found in any of the lanes, replace the lane with -1 from xtmp1.
  vblendvps(dst, dst, xtmp1, dst, vec_enc, true, xtmp3);

  // If the original value is negative, replace the lane with 31.
  vblendvps(dst, dst, xtmp2, src, vec_enc, true, xtmp3);

  // Subtract the exponent from 31, giving the final result. For 0, the result is 32 as the exponent was replaced with -1,
  // and for negative numbers the result is 0 as the exponent was replaced with 31.
  vpsubd(dst, xtmp2, dst, vec_enc);
}
6223
// Vector leading zero count for long lanes on AVX (non-EVEX) targets.
// Computes a 32-bit lzcnt on both halves of each long and merges the results.
// Note: rtmp is not used by this implementation.
void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                            XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  // Find the leading zeros of the top and bottom halves of the long individually.
  vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);

  // Move the top half result to the bottom half of xtmp1, setting the top half to 0.
  vpsrlq(xtmp1, dst, 32, vec_enc);
  // By moving the top half result to the right by 6 bits, if the top half was empty (i.e. 32 is returned) the result bit will
  // be in the most significant position of the bottom half.
  vpsrlq(xtmp2, dst, 6, vec_enc);

  // In the bottom half, add the top half and bottom half results.
  vpaddq(dst, xtmp1, dst, vec_enc);

  // For the bottom half, choose between the values using the most significant bit of xtmp2.
  // If the MSB is set, then bottom+top in dst is the resulting value. If the top half is less than 32 xtmp1 is chosen,
  // which contains only the top half result.
  // In the top half the MSB is always zero, so the value in xtmp1 is always chosen. This value is always 0, which clears
  // the lane as required.
  vblendvps(dst, xtmp1, dst, xtmp2, vec_enc, true, xtmp3);
}
6245
6246 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
6247 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6248 Register rtmp, int vec_enc) {
6249 assert(is_integral_type(bt), "unexpected type");
6250 assert(vec_enc < Assembler::AVX_512bit, "");
6251 switch(bt) {
6252 case T_LONG:
6253 vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6254 break;
6255 case T_INT:
6256 vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
6257 break;
6258 case T_SHORT:
6259 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6260 break;
6261 case T_BYTE:
6262 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6263 break;
6264 default:
6265 fatal("Unsupported type %s", type2name(bt));
6266 break;
6267 }
6268 }
6269
6270 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
6271 switch(bt) {
6272 case T_BYTE:
6273 vpsubb(dst, src1, src2, vec_enc);
6274 break;
6275 case T_SHORT:
6276 vpsubw(dst, src1, src2, vec_enc);
6277 break;
6278 case T_INT:
6279 vpsubd(dst, src1, src2, vec_enc);
6280 break;
6281 case T_LONG:
6282 vpsubq(dst, src1, src2, vec_enc);
6283 break;
6284 default:
6285 fatal("Unsupported type %s", type2name(bt));
6286 break;
6287 }
6288 }
6289
// Trailing zero count computation is based on the leading zero count operation as per
// the following equation. All AVX3 targets support the AVX512CD feature, which offers
// a direct vector instruction to compute the leading zero count.
// CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
                                                         XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "");
  // xtmp = -1
  vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
  // xtmp = xtmp + src  (i.e. src - 1)
  vpadd(bt, xtmp4, xtmp4, src, vec_enc);
  // xtmp = xtmp & ~src  (isolates a mask of the trailing zero bits)
  vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
  // dst = CLZ((src - 1) & ~src)
  vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
  // xtmp = element width in bits.
  vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
  // CTZ = width - CLZ((src - 1) & ~src)
  vpsub(bt, dst, xtmp4, dst, vec_enc);
}
6308
// Trailing zero count computation for AVX2 targets is based on the popcount operation as per
// the following equation
// CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                        XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "");
  // xtmp = 0
  vpxor(xtmp3 , xtmp3, xtmp3, vec_enc);
  // xtmp = 0 - src
  vpsub(bt, xtmp3, xtmp3, src, vec_enc);
  // xtmp = xtmp | src; src | -src sets the lowest set bit and every bit above it.
  vpor(xtmp3, xtmp3, src, vec_enc);
  // dst = POPC(src | -src) = width - CTZ(src)
  vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
  // xtmp1 = element width in bits.
  vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
  // CTZ = width - POPC(src | -src)
  vpsub(bt, dst, xtmp1, dst, vec_enc);
}
6324
// Unsigned 32-bit division: rax = rax /u divisor. rdx is clobbered.
// For divisor >= 0 the hardware divide is used; for divisor < 0 the
// unsigned divisor exceeds 2^31, so the quotient can only be 0 or 1 and is
// computed branch-free.
void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // Ordinary unsigned divide: zero the high half of the dividend (rdx).
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
  movl(rdx, rax);
  subl(rdx, divisor);
  if (VM_Version::supports_bmi1()) {
    // rax = ~(dividend - divisor) & dividend, in a single instruction.
    andnl(rax, rdx, rax);
  } else {
    notl(rdx);
    andl(rax, rdx);
  }
  // Extract the sign bit: the quotient is 0 or 1.
  shrl(rax, 31);
  bind(done);
}
6348
// Unsigned 32-bit modulo: rdx = rax %u divisor. rax is clobbered.
// For divisor < 0 (unsigned divisor > 2^31) the quotient is 0 or 1, so the
// remainder is dividend - (quotient_mask & divisor), computed branch-free.
void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // Ordinary unsigned divide: remainder ends up in rdx.
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath when divisor < 0:
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
  // Save the dividend in rdx before clobbering rax.
  movl(rdx, rax);
  subl(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    // rax = ~(dividend - divisor) & dividend.
    andnl(rax, rax, rdx);
  } else {
    notl(rax);
    andl(rax, rdx);
  }
  // rax = 0 or -1 mask derived from the sign bit, then select the divisor.
  sarl(rax, 31);
  andl(rax, divisor);
  // remainder = dividend - (mask & divisor)
  subl(rdx, rax);
  bind(done);
}
6374
// Combined unsigned 32-bit divide and modulo:
// rax = rax /u divisor, rdx = rax %u divisor. tmp is clobbered.
// For divisor < 0 (unsigned divisor > 2^31) the quotient is 0 or 1; both
// results are derived branch-free from the same intermediate value.
void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
  Label done;
  Label neg_divisor_fastpath;

  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // Ordinary unsigned divide: quotient in rax, remainder in rdx.
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
  // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
  // Save the dividend in rdx before clobbering rax.
  movl(rdx, rax);
  subl(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    // rax = ~(dividend - divisor) & dividend.
    andnl(rax, rax, rdx);
  } else {
    notl(rax);
    andl(rax, rdx);
  }
  // Keep a copy for the remainder computation.
  movl(tmp, rax);
  shrl(rax, 31); // quotient
  // tmp = 0 or -1 mask; remainder = dividend - (mask & divisor).
  sarl(tmp, 31);
  andl(tmp, divisor);
  subl(rdx, tmp); // remainder
  bind(done);
}
6405
// Reverses the bit order of a 32-bit value: dst = bit-reverse(src).
// With GFNI, the bits of each byte are reversed by an affine transform and
// the final bswap reverses the byte order. Without GFNI, a classic
// divide-and-conquer swap sequence is used (bit pairs, then 2-bit fields,
// then nibbles), again followed by a byte swap.
void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
                                 XMMRegister xtmp2, Register rtmp) {
  if(VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal based on following algorithm.
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
    // rtmp = the 8x8 bit matrix that reverses the bits within each byte.
    mov64(rtmp, 0x8040201008040201L);
    movq(xtmp1, src);
    movq(xtmp2, rtmp);
    gf2p8affineqb(xtmp1, xtmp2, 0);
    movq(dst, xtmp1);
  } else {
    // Swap even and odd numbered bits.
    movl(rtmp, src);
    andl(rtmp, 0x55555555);
    shll(rtmp, 1);
    movl(dst, src);
    andl(dst, 0xAAAAAAAA);
    shrl(dst, 1);
    orl(dst, rtmp);

    // Swap LSB and MSB 2 bits of each nibble.
    movl(rtmp, dst);
    andl(rtmp, 0x33333333);
    shll(rtmp, 2);
    andl(dst, 0xCCCCCCCC);
    shrl(dst, 2);
    orl(dst, rtmp);

    // Swap LSB and MSB 4 bits of each byte.
    movl(rtmp, dst);
    andl(rtmp, 0x0F0F0F0F);
    shll(rtmp, 4);
    andl(dst, 0xF0F0F0F0);
    shrl(dst, 4);
    orl(dst, rtmp);
  }
  // Reverse the byte order to complete the 32-bit bit reversal.
  bswapl(dst);
}
6444
// Reverses the bit order of a 64-bit value: dst = bit-reverse(src).
// Same structure as reverseI, but the 64-bit mask constants do not fit in
// immediates, so a second scratch register (rtmp2) holds each mask and is
// complemented in place for the inverse-mask step.
void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
                                 XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
  if(VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal based on following algorithm.
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
    // rtmp1 = the 8x8 bit matrix that reverses the bits within each byte.
    mov64(rtmp1, 0x8040201008040201L);
    movq(xtmp1, src);
    movq(xtmp2, rtmp1);
    gf2p8affineqb(xtmp1, xtmp2, 0);
    movq(dst, xtmp1);
  } else {
    // Swap even and odd numbered bits.
    movq(rtmp1, src);
    mov64(rtmp2, 0x5555555555555555L);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 1);
    movq(dst, src);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 1);
    orq(dst, rtmp1);

    // Swap LSB and MSB 2 bits of each nibble.
    movq(rtmp1, dst);
    mov64(rtmp2, 0x3333333333333333L);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 2);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 2);
    orq(dst, rtmp1);

    // Swap LSB and MSB 4 bits of each byte.
    movq(rtmp1, dst);
    mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 4);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 4);
    orq(dst, rtmp1);
  }
  // Reverse the byte order to complete the 64-bit bit reversal.
  bswapq(dst);
}
6489
// Unsigned 64-bit division: rax = rax /u divisor. rdx is clobbered.
// For divisor < 0 the unsigned divisor exceeds 2^63, so the quotient can
// only be 0 or 1 and is computed branch-free. See udivI for the 32-bit
// counterpart.
void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // Ordinary unsigned divide: zero the high half of the dividend (rdx).
  // (xorl suffices: the 32-bit xor zero-extends to the full register.)
  xorl(rdx, rdx);
  divq(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
  movq(rdx, rax);
  subq(rdx, divisor);
  if (VM_Version::supports_bmi1()) {
    // rax = ~(dividend - divisor) & dividend, in a single instruction.
    andnq(rax, rdx, rax);
  } else {
    notq(rdx);
    andq(rax, rdx);
  }
  // Extract the sign bit: the quotient is 0 or 1.
  shrq(rax, 63);
  bind(done);
}
6513
// Unsigned 64-bit modulo: rdx = rax %u divisor. rax is clobbered.
// For divisor < 0 (unsigned divisor > 2^63) the quotient is 0 or 1, so the
// remainder is dividend - (quotient_mask & divisor), computed branch-free.
void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // Ordinary unsigned divide: remainder ends up in rdx.
  xorq(rdx, rdx);
  divq(divisor);
  jmp(done);
  bind(neg_divisor_fastpath);
  // Fastpath when divisor < 0:
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
  // Save the dividend in rdx before clobbering rax.
  movq(rdx, rax);
  subq(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    // rax = ~(dividend - divisor) & dividend.
    andnq(rax, rax, rdx);
  } else {
    notq(rax);
    andq(rax, rdx);
  }
  // rax = 0 or -1 mask derived from the sign bit, then select the divisor.
  sarq(rax, 63);
  andq(rax, divisor);
  // remainder = dividend - (mask & divisor)
  subq(rdx, rax);
  bind(done);
}
6539
// Combined unsigned 64-bit divide and modulo:
// rax = rax /u divisor, rdx = rax %u divisor. tmp is clobbered.
// For divisor < 0 (unsigned divisor > 2^63) the quotient is 0 or 1; both
// results are derived branch-free from the same intermediate value.
void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // Ordinary unsigned divide: quotient in rax, remainder in rdx.
  xorq(rdx, rdx);
  divq(divisor);
  jmp(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
  // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
  // Save the dividend in rdx before clobbering rax.
  movq(rdx, rax);
  subq(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    // rax = ~(dividend - divisor) & dividend.
    andnq(rax, rax, rdx);
  } else {
    notq(rax);
    andq(rax, rdx);
  }
  // Keep a copy for the remainder computation.
  movq(tmp, rax);
  shrq(rax, 63); // quotient
  // tmp = 0 or -1 mask; remainder = dividend - (mask & divisor).
  sarq(tmp, 63);
  andq(tmp, divisor);
  subq(rdx, tmp); // remainder
  bind(done);
}
6569
// Cross-lane byte rearrangement for AVX512BW targets: dst[i] = src[shuffle[i]].
// Since vpshufb only shuffles within 128-bit lanes, each of the four 128-bit
// lanes of src is broadcast across the whole vector in turn, shuffled, and
// merged into dst under a mask selecting the indices that map to that lane.
void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
                                        XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
                                        int vlen_enc) {
  assert(VM_Version::supports_avx512bw(), "");
  // Byte shuffles are inlane operations and indices are determined using
  // lower 4 bit of each shuffle lane, thus all shuffle indices are
  // normalized to index range 0-15. This makes sure that all the multiples
  // of an index value are placed at same relative position in 128 bit
  // lane i.e. elements corresponding to shuffle indices 16, 32 and 64
  // will be 16th element in their respective 128 bit lanes.
  movl(rtmp, 16);
  evpbroadcastb(xtmp1, rtmp, vlen_enc);

  // Compute a mask for shuffle vector by comparing indices with expression INDEX < 16,
  // Broadcast first 128 bit lane across entire vector, shuffle the vector lanes using
  // original shuffle indices and move the shuffled lanes corresponding to true
  // mask to destination vector.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
  evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);

  // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
  // and broadcasting second 128 bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
  // xtmp2 = 32 in every byte (each byte 0x10 doubles to 0x20; no cross-byte carry).
  vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
  // and broadcasting third 128 bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp2, Assembler::nlt, true, vlen_enc);
  // xtmp1 = 16 + 32 = 48 in every byte.
  vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
  // and broadcasting fourth 128 bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
  // xtmp2 = 64 in every byte.
  vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
}
6615
6616 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
6617 XMMRegister shuffle, XMMRegister src, int vlen_enc) {
6618 if (vlen_enc == AVX_128bit) {
6619 vpermilps(dst, src, shuffle, vlen_enc);
6620 } else if (bt == T_INT) {
6621 vpermd(dst, shuffle, src, vlen_enc);
6622 } else {
6623 assert(bt == T_FLOAT, "");
6624 vpermps(dst, shuffle, src, vlen_enc);
6625 }
6626 }
6627
6628 void C2_MacroAssembler::efp16sh(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
6629 switch(opcode) {
6630 case Op_AddHF: vaddsh(dst, src1, src2); break;
6631 case Op_SubHF: vsubsh(dst, src1, src2); break;
6632 case Op_MulHF: vmulsh(dst, src1, src2); break;
6633 case Op_DivHF: vdivsh(dst, src1, src2); break;
6634 default: assert(false, "%s", NodeClassNames[opcode]); break;
6635 }
6636 }
6637
6638 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6639 switch(elem_bt) {
6640 case T_BYTE:
6641 if (ideal_opc == Op_SaturatingAddV) {
6642 vpaddsb(dst, src1, src2, vlen_enc);
6643 } else {
6644 assert(ideal_opc == Op_SaturatingSubV, "");
6645 vpsubsb(dst, src1, src2, vlen_enc);
6646 }
6647 break;
6648 case T_SHORT:
6649 if (ideal_opc == Op_SaturatingAddV) {
6650 vpaddsw(dst, src1, src2, vlen_enc);
6651 } else {
6652 assert(ideal_opc == Op_SaturatingSubV, "");
6653 vpsubsw(dst, src1, src2, vlen_enc);
6654 }
6655 break;
6656 default:
6657 fatal("Unsupported type %s", type2name(elem_bt));
6658 break;
6659 }
6660 }
6661
6662 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6663 switch(elem_bt) {
6664 case T_BYTE:
6665 if (ideal_opc == Op_SaturatingAddV) {
6666 vpaddusb(dst, src1, src2, vlen_enc);
6667 } else {
6668 assert(ideal_opc == Op_SaturatingSubV, "");
6669 vpsubusb(dst, src1, src2, vlen_enc);
6670 }
6671 break;
6672 case T_SHORT:
6673 if (ideal_opc == Op_SaturatingAddV) {
6674 vpaddusw(dst, src1, src2, vlen_enc);
6675 } else {
6676 assert(ideal_opc == Op_SaturatingSubV, "");
6677 vpsubusw(dst, src1, src2, vlen_enc);
6678 }
6679 break;
6680 default:
6681 fatal("Unsupported type %s", type2name(elem_bt));
6682 break;
6683 }
6684 }
6685
// Saturating unsigned subtraction for int/long lanes on EVEX targets;
// underflowing lanes saturate to zero.
void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                               XMMRegister src2, KRegister ktmp, int vlen_enc) {
  // For unsigned subtraction, overflow happens when magnitude of second input is greater than first input.
  // overflow_mask = Inp1 <u Inp2
  evpcmpu(elem_bt, ktmp, src2, src1, Assembler::lt, vlen_enc);
  // Res = overflow_mask ? Zero : INP1 - INP2 (non-commutative and non-associative)
  evmasked_op(elem_bt == T_INT ? Op_SubVI : Op_SubVL, elem_bt, ktmp, dst, src1, src2, false, vlen_enc, false);
}
6694
// Saturating unsigned subtraction for int/long lanes on AVX (non-EVEX)
// targets; underflowing lanes (src1 <u src2) saturate to zero.
void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                                              XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  // Emulate unsigned comparison using signed comparison
  // Mask = Inp1 <u Inp2 => Inp1 + MIN_VALUE < Inp2 + MIN_VALUE
  // xtmp1 = MIN_VALUE broadcast into every lane.
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc, true);
  // Bias both inputs into signed range.
  vpadd(elem_bt, xtmp2, src1, xtmp1, vlen_enc);
  vpadd(elem_bt, xtmp1, src2, xtmp1, vlen_enc);

  // xtmp2 = mask of lanes where src2 >u src1, i.e. where the subtraction underflows.
  vpcmpgt(elem_bt, xtmp2, xtmp1, xtmp2, vlen_enc);

  // Res = INP1 - INP2 (non-commutative and non-associative)
  vpsub(elem_bt, dst, src1, src2, vlen_enc);
  // Res = Mask ? Zero : Res
  vpxor(xtmp1, xtmp1, xtmp1, vlen_enc);
  vpblendvb(dst, dst, xtmp1, xtmp2, vlen_enc);
}
6711
// Saturating unsigned addition for int/long lanes on EVEX targets;
// overflowing lanes saturate to the unsigned maximum (all ones).
void C2_MacroAssembler::vector_add_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                                               XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp, int vlen_enc) {
  // Unsigned values ranges comprise of only +ve numbers, thus there exist only an upper bound saturation.
  // overflow_mask = (SRC1 + SRC2) <u (SRC1 | SRC2)
  // Res = Signed Add INP1, INP2
  vpadd(elem_bt, dst, src1, src2, vlen_enc);
  // T1 = SRC1 | SRC2
  vpor(xtmp1, src1, src2, vlen_enc);
  // Max_Unsigned = -1
  vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
  // Unsigned compare: Mask = Res <u T1
  evpcmpu(elem_bt, ktmp, dst, xtmp1, Assembler::lt, vlen_enc);
  // res = Mask ? Max_Unsigned : Res
  evpblend(elem_bt, dst, ktmp, dst, xtmp2, true, vlen_enc);
}
6727
6728 //
6729 // Section 2-13 Hacker's Delight list following overflow detection check for saturating
6730 // unsigned addition operation.
6731 // overflow_mask = ((a & b) | ((a | b) & ~( a + b))) >>> 31 == 1
6732 //
6733 // We empirically determined its semantic equivalence to following reduced expression
6734 // overflow_mask = (a + b) <u (a | b)
6735 //
6736 // and also verified it though Alive2 solver.
6737 // (https://alive2.llvm.org/ce/z/XDQ7dY)
6738 //
6739
// Saturating unsigned addition for int/long lanes on AVX (non-EVEX) targets;
// overflowing lanes saturate to the unsigned maximum (all ones).
// Overflow detection: (a + b) <u (a | b), per the reduced form of
// Hacker's Delight section 2-13 derived in the comment above this function.
void C2_MacroAssembler::vector_add_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                                              XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, int vlen_enc) {
  // Res = Signed Add INP1, INP2
  vpadd(elem_bt, dst, src1, src2, vlen_enc);
  // Compute T1 = INP1 | INP2
  vpor(xtmp3, src1, src2, vlen_enc);
  // xtmp2 = minimum signed value; xtmp1 = all ones (= unsigned max, used as
  // the saturation value in the final blend).
  vpgenmin_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
  // Convert T1 to signed value, T1 = T1 + MIN_VALUE
  vpadd(elem_bt, xtmp3, xtmp3, xtmp2, vlen_enc);
  // Convert Res to signed value, Res<s> = Res + MIN_VALUE
  vpadd(elem_bt, xtmp2, xtmp2, dst, vlen_enc);
  // Compute overflow detection mask = Res<s> <s T1 (signed compare emulates unsigned).
  if (elem_bt == T_INT) {
    vpcmpgtd(xtmp3, xtmp3, xtmp2, vlen_enc);
  } else {
    assert(elem_bt == T_LONG, "");
    vpcmpgtq(xtmp3, xtmp3, xtmp2, vlen_enc);
  }
  // Saturate the overflowing lanes to all ones.
  vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
}
6761
// Sets ktmp to the sign bits (MSBs) of the quadword lanes of src.
// Uses VPMOVQ2M directly on AVX512DQ targets; otherwise emulates it with an
// arithmetic shift and a compare against -1.
// xtmp2_hold_M1 - caller guarantees xtmp2 already contains all ones,
//                 allowing the materialization step to be skipped.
void C2_MacroAssembler::evpmovq2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                      int vlen_enc, bool xtmp2_hold_M1) {
  if (VM_Version::supports_avx512dq()) {
    evpmovq2m(ktmp, src, vlen_enc);
  } else {
    assert(VM_Version::supports_evex(), "");
    // xtmp2 = -1 in every lane (unless the caller already set it up).
    if (!xtmp2_hold_M1) {
      vpternlogq(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
    }
    // xtmp1 = sign-extended sign bit: -1 for negative lanes, 0 otherwise.
    evpsraq(xtmp1, src, 63, vlen_enc);
    // Mask bit set exactly where the lane was negative.
    evpcmpeqq(ktmp, k0, xtmp1, xtmp2, vlen_enc);
  }
}
6775
// Sets ktmp to the sign bits (MSBs) of the doubleword lanes of src.
// Uses VPMOVD2M directly on AVX512DQ targets; otherwise emulates it with an
// arithmetic shift and a compare against -1.
// xtmp2_hold_M1 - caller guarantees xtmp2 already contains all ones,
//                 allowing the materialization step to be skipped.
void C2_MacroAssembler::evpmovd2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                      int vlen_enc, bool xtmp2_hold_M1) {
  if (VM_Version::supports_avx512dq()) {
    evpmovd2m(ktmp, src, vlen_enc);
  } else {
    assert(VM_Version::supports_evex(), "");
    // xtmp2 = -1 in every lane (unless the caller already set it up).
    if (!xtmp2_hold_M1) {
      vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
    }
    // xtmp1 = sign-extended sign bit: -1 for negative lanes, 0 otherwise.
    vpsrad(xtmp1, src, 31, vlen_enc);
    // Mask bit set exactly where the lane was negative.
    Assembler::evpcmpeqd(ktmp, k0, xtmp1, xtmp2, vlen_enc);
  }
}
6789
6790
// Fills each int/long lane of dst with its sign: -1 for negative lanes of
// src, 0 otherwise.
void C2_MacroAssembler::vpsign_extend_dq(BasicType elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
  if (elem_bt == T_LONG) {
    if (VM_Version::supports_evex()) {
      evpsraq(dst, src, 63, vlen_enc);
    } else {
      // No 64-bit arithmetic right shift without EVEX: shift the dwords,
      // then replicate each qword's upper (sign) dword into both halves
      // (0xF5 selects dwords {1,1,3,3}).
      vpsrad(dst, src, 31, vlen_enc);
      vpshufd(dst, dst, 0xF5, vlen_enc);
    }
  } else {
    assert(elem_bt == T_INT, "");
    vpsrad(dst, src, 31, vlen_enc);
  }
}
6804
// Fills each int/long lane of dst with the maximum signed value
// (all-ones shifted right by one, i.e. 0x7FFF...).
// allones         - register holding (or receiving) an all-ones vector
// compute_allones - when true, materialize the all-ones vector first
void C2_MacroAssembler::vpgenmax_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
  if (compute_allones) {
    if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
      vpternlogd(allones, 0xff, allones, allones, vlen_enc);
    } else {
      vpcmpeqq(allones, allones, allones, vlen_enc);
    }
  }
  if (elem_bt == T_LONG) {
    // MAX_LONG = 0xFFFF...FF >>> 1
    vpsrlq(dst, allones, 1, vlen_enc);
  } else {
    assert(elem_bt == T_INT, "");
    // MAX_INT = 0xFFFFFFFF >>> 1
    vpsrld(dst, allones, 1, vlen_enc);
  }
}
6820
// Fills each int/long lane of dst with the minimum signed value
// (all-ones shifted left to leave only the sign bit, i.e. 0x8000...).
// allones         - register holding (or receiving) an all-ones vector
// compute_allones - when true, materialize the all-ones vector first
void C2_MacroAssembler::vpgenmin_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
  if (compute_allones) {
    if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
      vpternlogd(allones, 0xff, allones, allones, vlen_enc);
    } else {
      vpcmpeqq(allones, allones, allones, vlen_enc);
    }
  }
  if (elem_bt == T_LONG) {
    // MIN_LONG = 0xFFFF...FF << 63
    vpsllq(dst, allones, 63, vlen_enc);
  } else {
    assert(elem_bt == T_INT, "");
    // MIN_INT = 0xFFFFFFFF << 31
    vpslld(dst, allones, 31, vlen_enc);
  }
}
6836
6837 void C2_MacroAssembler::evpcmpu(BasicType elem_bt, KRegister kmask, XMMRegister src1, XMMRegister src2,
6838 Assembler::ComparisonPredicate cond, int vlen_enc) {
6839 switch(elem_bt) {
6840 case T_LONG: evpcmpuq(kmask, src1, src2, cond, vlen_enc); break;
6841 case T_INT: evpcmpud(kmask, src1, src2, cond, vlen_enc); break;
6842 case T_SHORT: evpcmpuw(kmask, src1, src2, cond, vlen_enc); break;
6843 case T_BYTE: evpcmpub(kmask, src1, src2, cond, vlen_enc); break;
6844 default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6845 }
6846 }
6847
6848 void C2_MacroAssembler::vpcmpgt(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6849 switch(elem_bt) {
6850 case T_LONG: vpcmpgtq(dst, src1, src2, vlen_enc); break;
6851 case T_INT: vpcmpgtd(dst, src1, src2, vlen_enc); break;
6852 case T_SHORT: vpcmpgtw(dst, src1, src2, vlen_enc); break;
6853 case T_BYTE: vpcmpgtb(dst, src1, src2, vlen_enc); break;
6854 default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6855 }
6856 }
6857
6858 void C2_MacroAssembler::evpmov_vec_to_mask(BasicType elem_bt, KRegister ktmp, XMMRegister src, XMMRegister xtmp1,
6859 XMMRegister xtmp2, int vlen_enc, bool xtmp2_hold_M1) {
6860 if (elem_bt == T_LONG) {
6861 evpmovq2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6862 } else {
6863 assert(elem_bt == T_INT, "");
6864 evpmovd2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6865 }
6866 }
6867
// Saturating signed addition/subtraction for int/long lanes on EVEX targets.
// Overflowing lanes saturate to MAX_VALUE when the first input is
// non-negative and to MIN_VALUE when it is negative.
void C2_MacroAssembler::vector_addsub_dq_saturating_evex(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                         XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
                                                         KRegister ktmp1, KRegister ktmp2, int vlen_enc) {
  assert(elem_bt == T_INT || elem_bt == T_LONG, "");
  // Addition/Subtraction happens over two's complement representation of numbers and is agnostic to signed'ness.
  // Overflow detection based on Hacker's delight section 2-13.
  if (ideal_opc == Op_SaturatingAddV) {
    // res = src1 + src2
    vpadd(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs if result polarity does not comply with equivalent polarity inputs.
    // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
    vpxor(xtmp1, dst, src1, vlen_enc);
    vpxor(xtmp2, dst, src2, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  } else {
    assert(ideal_opc == Op_SaturatingSubV, "");
    // res = src1 - src2
    vpsub(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs when both inputs have opposite polarity and
    // result polarity does not comply with first input polarity.
    // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1;
    vpxor(xtmp1, src1, src2, vlen_enc);
    vpxor(xtmp2, dst, src1, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  }

  // Compute overflow detection mask.
  evpmov_vec_to_mask(elem_bt, ktmp1, xtmp2, xtmp2, xtmp1, vlen_enc);
  // Note: xtmp1 hold -1 in all its lanes after above call.

  // Compute mask based on first input polarity.
  evpmov_vec_to_mask(elem_bt, ktmp2, src1, xtmp2, xtmp1, vlen_enc, true);

  vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);

  // Compose a vector of saturating (MAX/MIN) values, where lanes corresponding to
  // set bits in first input polarity mask holds a min value.
  evpblend(elem_bt, xtmp2, ktmp2, xtmp2, xtmp1, true, vlen_enc);
  // Blend destination lanes with saturated values using overflow detection mask.
  evpblend(elem_bt, dst, ktmp1, dst, xtmp2, true, vlen_enc);
}
6910
6911
// Saturating signed add/sub for T_INT/T_LONG lanes on AVX (no opmask
// registers): same Hacker's Delight overflow detection as the EVEX variant,
// but the overflow and polarity masks are kept as sign-extended vectors and
// applied with vpblendvb. xtmp1..xtmp4 are scratch.
void C2_MacroAssembler::vector_addsub_dq_saturating_avx(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                        XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
                                                        XMMRegister xtmp3, XMMRegister xtmp4, int vlen_enc) {
  assert(elem_bt == T_INT || elem_bt == T_LONG, "");
  // Addition/Subtraction happens over two's complement representation of numbers and is agnostic to signedness.
  // Overflow detection based on Hacker's Delight section 2-13.
  if (ideal_opc == Op_SaturatingAddV) {
    // res = src1 + src2
    vpadd(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs if result polarity does not comply with equivalent polarity inputs.
    // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
    vpxor(xtmp1, dst, src1, vlen_enc);
    vpxor(xtmp2, dst, src2, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  } else {
    assert(ideal_opc == Op_SaturatingSubV, "");
    // res = src1 - src2
    vpsub(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs when both inputs have opposite polarity and
    // result polarity does not comply with first input polarity.
    // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1;
    vpxor(xtmp1, src1, src2, vlen_enc);
    vpxor(xtmp2, dst, src1, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  }

  // Sign-extend to compute overflow detection mask.
  vpsign_extend_dq(elem_bt, xtmp3, xtmp2, vlen_enc);

  // xtmp1 = all-ones, used to generate the MAX/MIN saturation constants.
  vpcmpeqd(xtmp1, xtmp1, xtmp1, vlen_enc);
  vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc);
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);

  // Compose saturating min/max vector using first input polarity mask.
  vpsign_extend_dq(elem_bt, xtmp4, src1, vlen_enc);
  vpblendvb(xtmp1, xtmp2, xtmp1, xtmp4, vlen_enc);

  // Blend result with saturating vector using overflow detection mask.
  vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
}
6952
6953 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
6954 switch(elem_bt) {
6955 case T_BYTE:
6956 if (ideal_opc == Op_SaturatingAddV) {
6957 vpaddsb(dst, src1, src2, vlen_enc);
6958 } else {
6959 assert(ideal_opc == Op_SaturatingSubV, "");
6960 vpsubsb(dst, src1, src2, vlen_enc);
6961 }
6962 break;
6963 case T_SHORT:
6964 if (ideal_opc == Op_SaturatingAddV) {
6965 vpaddsw(dst, src1, src2, vlen_enc);
6966 } else {
6967 assert(ideal_opc == Op_SaturatingSubV, "");
6968 vpsubsw(dst, src1, src2, vlen_enc);
6969 }
6970 break;
6971 default:
6972 fatal("Unsupported type %s", type2name(elem_bt));
6973 break;
6974 }
6975 }
6976
6977 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
6978 switch(elem_bt) {
6979 case T_BYTE:
6980 if (ideal_opc == Op_SaturatingAddV) {
6981 vpaddusb(dst, src1, src2, vlen_enc);
6982 } else {
6983 assert(ideal_opc == Op_SaturatingSubV, "");
6984 vpsubusb(dst, src1, src2, vlen_enc);
6985 }
6986 break;
6987 case T_SHORT:
6988 if (ideal_opc == Op_SaturatingAddV) {
6989 vpaddusw(dst, src1, src2, vlen_enc);
6990 } else {
6991 assert(ideal_opc == Op_SaturatingSubV, "");
6992 vpsubusw(dst, src1, src2, vlen_enc);
6993 }
6994 break;
6995 default:
6996 fatal("Unsupported type %s", type2name(elem_bt));
6997 break;
6998 }
6999 }
7000
7001 void C2_MacroAssembler::select_from_two_vectors_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
7002 XMMRegister src2, int vlen_enc) {
7003 switch(elem_bt) {
7004 case T_BYTE:
7005 evpermi2b(dst, src1, src2, vlen_enc);
7006 break;
7007 case T_SHORT:
7008 evpermi2w(dst, src1, src2, vlen_enc);
7009 break;
7010 case T_INT:
7011 evpermi2d(dst, src1, src2, vlen_enc);
7012 break;
7013 case T_LONG:
7014 evpermi2q(dst, src1, src2, vlen_enc);
7015 break;
7016 case T_FLOAT:
7017 evpermi2ps(dst, src1, src2, vlen_enc);
7018 break;
7019 case T_DOUBLE:
7020 evpermi2pd(dst, src1, src2, vlen_enc);
7021 break;
7022 default:
7023 fatal("Unsupported type %s", type2name(elem_bt));
7024 break;
7025 }
7026 }
7027
7028 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, bool is_unsigned, int vlen_enc) {
7029 if (is_unsigned) {
7030 vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7031 } else {
7032 vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7033 }
7034 }
7035
7036 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, bool is_unsigned, int vlen_enc) {
7037 if (is_unsigned) {
7038 vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7039 } else {
7040 vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7041 }
7042 }
7043
7044 void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
7045 switch(opcode) {
7046 case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
7047 case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
7048 case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
7049 case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
7050 default: assert(false, "%s", NodeClassNames[opcode]); break;
7051 }
7052 }
7053
7054 void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
7055 switch(opcode) {
7056 case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
7057 case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
7058 case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
7059 case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
7060 default: assert(false, "%s", NodeClassNames[opcode]); break;
7061 }
7062 }
7063
// Scalar FP16 max/min: reuses the vector implementation at 128-bit width.
void C2_MacroAssembler::scalar_max_min_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                            KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2) {
  vector_max_min_fp16(opcode, dst, src1, src2, ktmp, xtmp1, xtmp2, Assembler::AVX_128bit);
}
7068
// Vector FP16 max/min with Java semantics for NaN and signed zeros.
// VMAXPH/VMINPH alone return the second operand when the inputs are +0.0/-0.0
// or when one input is NaN, so the operands are pre-swapped per-lane (using
// the sign bits of one input) and NaN lanes are patched up afterwards.
// ktmp, xtmp1 and xtmp2 are scratch.
void C2_MacroAssembler::vector_max_min_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                            KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  if (opcode == Op_MaxVHF || opcode == Op_MaxHF) {
    // Move sign bits of src2 to mask register.
    evpmovw2m(ktmp, src2, vlen_enc);
    // xtmp1 = src2 < 0 ? src2 : src1
    evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
    // xtmp2 = src2 < 0 ? src1 : src2
    evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
    // Idea behind above swapping is to make second source operand a +ve value.
    // As per instruction semantic, if the values being compared are both 0.0s (of either sign), the value in
    // the second source operand is returned. If only one value is a NaN (SNaN or QNaN) for this instruction,
    // the second source operand, either a NaN or a valid floating-point value, is returned
    // dst = max(xtmp1, xtmp2)
    evmaxph(dst, xtmp1, xtmp2, vlen_enc);
    // isNaN = is_unordered_quiet(xtmp1)
    evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
    // Final result is same as first source if it's a NaN value,
    // in case second operand holds a NaN value then as per above semantics
    // result is same as second operand.
    Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
  } else {
    assert(opcode == Op_MinVHF || opcode == Op_MinHF, "");
    // Move sign bits of src1 to mask register.
    evpmovw2m(ktmp, src1, vlen_enc);
    // xtmp1 = src1 < 0 ? src2 : src1
    evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
    // xtmp2 = src1 < 0 ? src1 : src2
    evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
    // Idea behind above swapping is to make second source operand a -ve value.
    // As per instruction semantics, if the values being compared are both 0.0s (of either sign), the value in
    // the second source operand is returned.
    // If only one value is a NaN (SNaN or QNaN) for this instruction, the second source operand, either a NaN
    // or a valid floating-point value, is written to the result.
    // dst = min(xtmp1, xtmp2)
    evminph(dst, xtmp1, xtmp2, vlen_enc);
    // isNaN = is_unordered_quiet(xtmp1)
    evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
    // Final result is same as first source if it's a NaN value,
    // in case second operand holds a NaN value then as per above semantics
    // result is same as second operand.
    Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
  }
}