1 /*
2 * Copyright (c) 2020, 2026, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
24
25 #include "asm/assembler.hpp"
26 #include "asm/assembler.inline.hpp"
27 #include "gc/shared/barrierSet.hpp"
28 #include "gc/shared/barrierSetAssembler.hpp"
29 #include "oops/methodData.hpp"
30 #include "opto/c2_MacroAssembler.hpp"
31 #include "opto/intrinsicnode.hpp"
32 #include "opto/output.hpp"
33 #include "opto/opcodes.hpp"
34 #include "opto/subnode.hpp"
35 #include "runtime/globals.hpp"
36 #include "runtime/objectMonitor.hpp"
37 #include "runtime/objectMonitorTable.hpp"
38 #include "runtime/stubRoutines.hpp"
39 #include "runtime/synchronizer.hpp"
40 #include "utilities/checkedCast.hpp"
41 #include "utilities/globalDefinitions.hpp"
42 #include "utilities/powerOfTwo.hpp"
43 #include "utilities/sizes.hpp"
44
45 #ifdef PRODUCT
46 #define BLOCK_COMMENT(str) /* nothing */
47 #define STOP(error) stop(error)
48 #else
49 #define BLOCK_COMMENT(str) block_comment(str)
50 #define STOP(error) block_comment(error); stop(error)
51 #endif
52
53 // C2 compiled method's prolog code.
54 void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {
55 assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
56
57 assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
58 // Remove word for return addr
59 framesize -= wordSize;
60 stack_bang_size -= wordSize;
61
62 // Calls to C2R adapters often do not accept exceptional returns.
63 // We require that their callers must bang for them. But be careful, because
64 // some VM calls (such as call site linkage) can use several kilobytes of
65 // stack. But the stack safety zone should account for that.
66 // See bugs 4446381, 4468289, 4497237.
67 if (stack_bang_size > 0) {
68 generate_stack_overflow_check(stack_bang_size);
69
70 // We always push rbp, so that on return to interpreter rbp, will be
71 // restored correctly and we can correct the stack.
72 push(rbp);
73 // Save caller's stack pointer into RBP if the frame pointer is preserved.
74 if (PreserveFramePointer) {
75 mov(rbp, rsp);
76 }
77 // Remove word for ebp
78 framesize -= wordSize;
79
80 // Create frame
81 if (framesize) {
82 subptr(rsp, framesize);
83 }
84 } else {
85 subptr(rsp, framesize);
86
87 // Save RBP register now.
88 framesize -= wordSize;
89 movptr(Address(rsp, framesize), rbp);
90 // Save caller's stack pointer into RBP if the frame pointer is preserved.
91 if (PreserveFramePointer) {
92 movptr(rbp, rsp);
93 if (framesize > 0) {
94 addptr(rbp, framesize);
95 }
96 }
97 }
98
99 if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
100 framesize -= wordSize;
101 movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
102 }
103
104 #ifdef ASSERT
105 if (VerifyStackAtCalls) {
106 Label L;
107 push(rax);
108 mov(rax, rsp);
109 andptr(rax, StackAlignmentInBytes-1);
110 cmpptr(rax, StackAlignmentInBytes-wordSize);
111 pop(rax);
112 jcc(Assembler::equal, L);
113 STOP("Stack is not properly aligned!");
114 bind(L);
115 }
116 #endif
117
118 if (!is_stub) {
119 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
120 // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
121 Label dummy_slow_path;
122 Label dummy_continuation;
123 Label* slow_path = &dummy_slow_path;
124 Label* continuation = &dummy_continuation;
125 if (!Compile::current()->output()->in_scratch_emit_size()) {
126 // Use real labels from actual stub when not emitting code for the purpose of measuring its size
127 C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
128 Compile::current()->output()->add_stub(stub);
129 slow_path = &stub->entry();
130 continuation = &stub->continuation();
131 }
132 bs->nmethod_entry_barrier(this, slow_path, continuation);
133 }
134 }
135
136 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
137 switch (vlen_in_bytes) {
138 case 4: // fall-through
139 case 8: // fall-through
140 case 16: return Assembler::AVX_128bit;
141 case 32: return Assembler::AVX_256bit;
142 case 64: return Assembler::AVX_512bit;
143
144 default: {
145 ShouldNotReachHere();
146 return Assembler::AVX_NoVec;
147 }
148 }
149 }
150
151 // fast_lock and fast_unlock used by C2
152
153 // Because the transitions from emitted code to the runtime
154 // monitorenter/exit helper stubs are so slow it's critical that
155 // we inline both the lock-stack fast path and the inflated fast path.
156 //
157 // See also: cmpFastLock and cmpFastUnlock.
158 //
159 // What follows is a specialized inline transliteration of the code
160 // in enter() and exit(). If we're concerned about I$ bloat another
161 // option would be to emit TrySlowEnter and TrySlowExit methods
162 // at startup-time. These methods would accept arguments as
163 // (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
164 // indications in the icc.ZFlag. fast_lock and fast_unlock would simply
165 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
166 // In practice, however, the # of lock sites is bounded and is usually small.
167 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
168 // if the processor uses simple bimodal branch predictors keyed by EIP
169 // Since the helper routines would be called from multiple synchronization
170 // sites.
171 //
172 // An even better approach would be write "MonitorEnter()" and "MonitorExit()"
173 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
174 // to those specialized methods. That'd give us a mostly platform-independent
175 // implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
177 // to park() or unpark() threads. We'd also need a few more unsafe operators
178 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
179 // (b) explicit barriers or fence operations.
180 //
181 // TODO:
182 //
183 // * Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
184 // This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
185 // Given TLAB allocation, Self is usually manifested in a register, so passing it into
186 // the lock operators would typically be faster than reifying Self.
187 //
188 // * Ideally I'd define the primitives as:
189 // fast_lock (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
190 // fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
191 // Unfortunately ADLC bugs prevent us from expressing the ideal form.
// Instead, we're stuck with rather awkward and brittle register assignments below.
193 // Furthermore the register assignments are overconstrained, possibly resulting in
194 // sub-optimal code near the synchronization site.
195 //
196 // * Eliminate the sp-proximity tests and just use "== Self" tests instead.
197 // Alternately, use a better sp-proximity test.
198 //
199 // * Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
200 // Either one is sufficient to uniquely identify a thread.
201 // TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
202 //
203 // * Intrinsify notify() and notifyAll() for the common cases where the
204 // object is locked by the calling thread but the waitlist is empty.
205 // avoid the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
206 //
207 // * use jccb and jmpb instead of jcc and jmp to improve code density.
208 // But beware of excessive branch density on AMD Opterons.
209 //
210 // * Both fast_lock and fast_unlock set the ICC.ZF to indicate success
211 // or failure of the fast path. If the fast path fails then we pass
212 // control to the slow path, typically in C. In fast_lock and
213 // fast_unlock we often branch to DONE_LABEL, just to find that C2
214 // will emit a conditional branch immediately after the node.
215 // So we have branches to branches and lots of ICC.ZF games.
216 // Instead, it might be better to have C2 pass a "FailureLabel"
217 // into fast_lock and fast_unlock. In the case of success, control
218 // will drop through the node. ICC.ZF is undefined at exit.
219 // In the case of failure, the node will branch directly to the
220 // FailureLabel
221
222 // obj: object to lock
223 // box: on-stack box address -- KILLED
224 // rax: tmp -- KILLED
225 // t : tmp -- KILLED
// Emit the C2 fast-path for monitorenter.
// On exit, ZF == 1 means the lock was acquired; ZF == 0 means the caller must
// take the runtime slow path (C2 branches on ZF immediately after this node).
// rax_reg must be rax (implicit CAS accumulator); box, rax and t are killed.
void C2_MacroAssembler::fast_lock(Register obj, Register box, Register rax_reg,
                                  Register t, Register thread) {
  assert(rax_reg == rax, "Used for CAS");
  assert_different_registers(obj, box, rax_reg, t, thread);

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. ZF value is irrelevant.
  Label locked;
  // Finish fast lock unsuccessfully. MUST jump with ZF == 0
  Label slow_path;

  if (UseObjectMonitorTable) {
    // Clear cache in case fast locking succeeds or we need to take the slow-path.
    movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), 0);
  }

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    // Synchronizing on a value-based class is diverted to the slow path so the
    // runtime can diagnose it.
    load_klass(rax_reg, obj, t);
    testb(Address(rax_reg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
    jcc(Assembler::notZero, slow_path);
  }

  const Register mark = t;

  { // Fast Lock

    Label push;

    const Register top = UseObjectMonitorTable ? rax_reg : box;

    // Load the mark.
    movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));

    // Prefetch top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    // Check for monitor (0b10).
    testptr(mark, markWord::monitor_value);
    jcc(Assembler::notZero, inflated);

    // Check if lock-stack is full.
    cmpl(top, LockStack::end_offset() - 1);
    jcc(Assembler::greater, slow_path);

    // Check if recursive: obj already on top of this thread's lock-stack.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    jccb(Assembler::equal, push);

    // Try to lock. Transition lock bits 0b01 => 0b00
    movptr(rax_reg, mark);
    orptr(rax_reg, markWord::unlocked_value);          // expected value: unlocked mark
    andptr(mark, ~(int32_t)markWord::unlocked_value);  // desired value: locked mark
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    jcc(Assembler::notEqual, slow_path);

    if (UseObjectMonitorTable) {
      // Need to reload top, clobbered by CAS.
      movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
    }
    bind(push);
    // After successful lock, push object on lock-stack.
    movptr(Address(thread, top), obj);
    addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
    jmp(locked);
  }

  { // Handle inflated monitor.
    bind(inflated);

    const Register monitor = t;

    if (!UseObjectMonitorTable) {
      assert(mark == monitor, "should be the same here");
    } else {
      const Register hash = t;
      Label monitor_found;

      // Look for the monitor in the om_cache.

      ByteSize cache_offset = JavaThread::om_cache_oops_offset();
      ByteSize monitor_offset = OMCache::oop_to_monitor_difference();
      const int num_unrolled = OMCache::CAPACITY;
      // Unrolled linear scan: load the candidate monitor before comparing the
      // cached oop so a hit falls through with monitor already loaded.
      for (int i = 0; i < num_unrolled; i++) {
        movptr(monitor, Address(thread, cache_offset + monitor_offset));
        cmpptr(obj, Address(thread, cache_offset));
        jccb(Assembler::equal, monitor_found);
        cache_offset = cache_offset + OMCache::oop_to_oop_difference();
      }

      // Look for the monitor in the table.

      // Get the hash code.
      movptr(hash, Address(obj, oopDesc::mark_offset_in_bytes()));
      shrq(hash, markWord::hash_shift);
      andq(hash, markWord::hash_mask);

      // Get the table and calculate the bucket's address.
      lea(rax_reg, ExternalAddress(ObjectMonitorTable::current_table_address()));
      movptr(rax_reg, Address(rax_reg));
      andq(hash, Address(rax_reg, ObjectMonitorTable::table_capacity_mask_offset()));
      movptr(rax_reg, Address(rax_reg, ObjectMonitorTable::table_buckets_offset()));

      // Read the monitor from the bucket.
      movptr(monitor, Address(rax_reg, hash, Address::times_ptr));

      // Check if the monitor in the bucket is special (empty, tombstone or removed)
      cmpptr(monitor, ObjectMonitorTable::SpecialPointerValues::below_is_special);
      jcc(Assembler::below, slow_path);

      // Check if object matches.
      movptr(rax_reg, Address(monitor, ObjectMonitor::object_offset()));
      BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
      bs_asm->try_resolve_weak_handle_in_c2(this, rax_reg, slow_path);
      cmpptr(rax_reg, obj);
      jcc(Assembler::notEqual, slow_path);

      bind(monitor_found);
    }
    // With the table, monitor holds a raw ObjectMonitor*; without it, mark
    // still carries the 0b10 tag, so subtract it from the field offsets.
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address recursions_address(monitor, ObjectMonitor::recursions_offset() - monitor_tag);
    const Address owner_address(monitor, ObjectMonitor::owner_offset() - monitor_tag);

    Label monitor_locked;
    // Lock the monitor.

    if (UseObjectMonitorTable) {
      // Cache the monitor for unlock before trashing box. On failure to acquire
      // the lock, the slow path will reset the entry accordingly (see CacheSetter).
      movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), monitor);
    }

    // Try to CAS owner (no owner => current thread's _monitor_owner_id).
    xorptr(rax_reg, rax_reg);
    movptr(box, Address(thread, JavaThread::monitor_owner_id_offset()));
    lock(); cmpxchgptr(box, owner_address);
    jccb(Assembler::equal, monitor_locked);

    // Check if recursive: CAS failed, rax now holds the current owner.
    cmpptr(box, rax_reg);
    jccb(Assembler::notEqual, slow_path);

    // Recursive.
    increment(recursions_address);

    bind(monitor_locked);
  }

  bind(locked);
  // Set ZF = 1
  xorl(rax_reg, rax_reg);

#ifdef ASSERT
  // Check that locked label is reached with ZF set.
  Label zf_correct;
  Label zf_bad_zero;
  jcc(Assembler::zero, zf_correct);
  jmp(zf_bad_zero);
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with ZF not set.
  jcc(Assembler::notZero, zf_correct);
  stop("Fast Lock ZF != 0");
  bind(zf_bad_zero);
  stop("Fast Lock ZF != 1");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}
397
398 // obj: object to lock
399 // rax: tmp -- KILLED
400 // t : tmp - cannot be obj nor rax -- KILLED
401 //
402 // Some commentary on balanced locking:
403 //
404 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
405 // Methods that don't have provably balanced locking are forced to run in the
406 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
407 // The interpreter provides two properties:
408 // I1: At return-time the interpreter automatically and quietly unlocks any
409 // objects acquired in the current activation (frame). Recall that the
410 // interpreter maintains an on-stack list of locks currently held by
411 // a frame.
412 // I2: If a method attempts to unlock an object that is not held by the
413 // frame the interpreter throws IMSX.
414 //
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
416 // B() doesn't have provably balanced locking so it runs in the interpreter.
417 // Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O
418 // is still locked by A().
419 //
420 // The only other source of unbalanced locking would be JNI. The "Java Native Interface
421 // Specification" states that an object locked by JNI's MonitorEnter should not be
422 // unlocked by "normal" java-level locking and vice-versa. The specification doesn't
423 // specify what will occur if a program engages in such mixed-mode locking, however.
424 // Arguably given that the spec legislates the JNI case as undefined our implementation
425 // could reasonably *avoid* checking owner in fast_unlock().
426 // In the interest of performance we elide m->Owner==Self check in unlock.
427 // A perfectly viable alternative is to elide the owner check except when
428 // Xcheck:jni is enabled.
429
// Emit the C2 fast-path for monitorexit.
// On exit, ZF == 1 means the unlock succeeded; ZF == 0 means the caller must
// take the runtime slow path. reg_rax must be rax (CAS accumulator); rax and
// t are killed.
void C2_MacroAssembler::fast_unlock(Register obj, Register reg_rax, Register t, Register thread) {
  assert(reg_rax == rax, "Used for CAS");
  assert_different_registers(obj, reg_rax, t);

  // Handle inflated monitor.
  Label inflated, inflated_check_lock_stack;
  // Finish fast unlock successfully. MUST jump with ZF == 1
  Label unlocked, slow_path;

  // mark, monitor and (with the table) top all alias t; their live ranges do
  // not overlap.
  const Register mark = t;
  const Register monitor = t;
  const Register top = UseObjectMonitorTable ? t : reg_rax;
  const Register box = reg_rax;

  Label dummy;
  C2FastUnlockStub* stub = nullptr;

  if (!Compile::current()->output()->in_scratch_emit_size()) {
    // Only create the out-of-line stub when emitting real code, not when
    // measuring code size.
    stub = new (Compile::current()->comp_arena()) C2FastUnlockStub(obj, mark, reg_rax, thread);
    Compile::current()->output()->add_stub(stub);
  }

  Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();

  { // Fast Unlock

    // Load top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    if (!UseObjectMonitorTable) {
      // Prefetch mark.
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }

    // Check if obj is top of lock-stack.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    // Top of lock stack was not obj. Must be monitor.
    jcc(Assembler::notEqual, inflated_check_lock_stack);

    // Pop lock-stack.
    DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
    subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);

    // Check if recursive: obj occurs again one slot below.
    cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
    jcc(Assembler::equal, unlocked);

    // We elide the monitor check, let the CAS fail instead.

    if (UseObjectMonitorTable) {
      // Load mark.
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }

    // Try to unlock. Transition lock bits 0b00 => 0b01
    movptr(reg_rax, mark);
    andptr(reg_rax, ~(int32_t)markWord::lock_mask);  // expected value: locked mark
    orptr(mark, markWord::unlocked_value);           // desired value: unlocked mark
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    // CAS failure (e.g. mark was inflated concurrently): re-push obj and go slow.
    jcc(Assembler::notEqual, push_and_slow_path);
    jmp(unlocked);
  }


  { // Handle inflated monitor.
    bind(inflated_check_lock_stack);
#ifdef ASSERT
    // Debug-only: verify obj is nowhere on the lock-stack and the mark really
    // is a monitor.
    Label check_done;
    subl(top, oopSize);
    cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
    jcc(Assembler::below, check_done);
    cmpptr(obj, Address(thread, top));
    jcc(Assembler::notEqual, inflated_check_lock_stack);
    stop("Fast Unlock lock on stack");
    bind(check_done);
    if (UseObjectMonitorTable) {
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }
    testptr(mark, markWord::monitor_value);
    jcc(Assembler::notZero, inflated);
    stop("Fast Unlock not monitor");
#endif

    bind(inflated);

    if (!UseObjectMonitorTable) {
      assert(mark == monitor, "should be the same here");
    } else {
      // Uses ObjectMonitorTable. Look for the monitor in our BasicLock on the stack.
      movptr(monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
      // null check with ZF == 0, no valid pointer below alignof(ObjectMonitor*)
      cmpptr(monitor, alignof(ObjectMonitor*));
      jcc(Assembler::below, slow_path);
    }
    // Without the table, monitor still carries the 0b10 tag; fold it into the
    // field offsets.
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag};
    const Address succ_address{monitor, ObjectMonitor::succ_offset() - monitor_tag};
    const Address entry_list_address{monitor, ObjectMonitor::entry_list_offset() - monitor_tag};
    const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag};

    Label recursive;

    // Check if recursive.
    cmpptr(recursions_address, 0);
    jcc(Assembler::notZero, recursive);

    // Set owner to null.
    // Release to satisfy the JMM
    movptr(owner_address, NULL_WORD);
    // We need a full fence after clearing owner to avoid stranding.
    // StoreLoad achieves this.
    membar(StoreLoad);

    // Check if the entry_list is empty.
    cmpptr(entry_list_address, NULL_WORD);
    jcc(Assembler::zero, unlocked); // If so we are done.

    // Check if there is a successor.
    cmpptr(succ_address, NULL_WORD);
    jcc(Assembler::notZero, unlocked); // If so we are done.

    // Save the monitor pointer in the current thread, so we can try to
    // reacquire the lock in SharedRuntime::monitor_exit_helper().
    if (!UseObjectMonitorTable) {
      andptr(monitor, ~(int32_t)markWord::monitor_value);
    }
    movptr(Address(thread, JavaThread::unlocked_inflated_monitor_offset()), monitor);

    orl(t, 1); // Fast Unlock ZF = 0
    jmpb(slow_path);

    // Recursive unlock.
    bind(recursive);
    decrement(recursions_address);
  }

  bind(unlocked);
  xorl(t, t); // Fast Unlock ZF = 1

#ifdef ASSERT
  // Check that unlocked label is reached with ZF set.
  Label zf_correct;
  Label zf_bad_zero;
  jcc(Assembler::zero, zf_correct);
  jmp(zf_bad_zero);
#endif

  bind(slow_path);
  if (stub != nullptr) {
    bind(stub->slow_path_continuation());
  }
#ifdef ASSERT
  // Check that stub->continuation() label is reached with ZF not set.
  jcc(Assembler::notZero, zf_correct);
  stop("Fast Unlock ZF != 0");
  bind(zf_bad_zero);
  stop("Fast Unlock ZF != 1");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}
591
// Out-of-line failure reporter for verify_int_in_range(): called from compiled
// code when a CastII node's value falls outside its declared [lo, hi] range.
// fatal() aborts the VM, so this does not return.
static void abort_verify_int_in_range(uint idx, jint val, jint lo, jint hi) {
  fatal("Invalid CastII, idx: %u, val: %d, lo: %d, hi: %d", idx, val, lo, hi);
}
595
596 static void reconstruct_frame_pointer_helper(MacroAssembler* masm, Register dst) {
597 const int framesize = Compile::current()->output()->frame_size_in_bytes();
598 masm->movptr(dst, rsp);
599 if (framesize > 2 * wordSize) {
600 masm->addptr(dst, framesize - 2 * wordSize);
601 }
602 }
603
604 void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) {
605 if (PreserveFramePointer) {
606 // frame pointer is valid
607 #ifdef ASSERT
608 // Verify frame pointer value in rbp.
609 reconstruct_frame_pointer_helper(this, rtmp);
610 Label L_success;
611 cmpq(rbp, rtmp);
612 jccb(Assembler::equal, L_success);
613 STOP("frame pointer mismatch");
614 bind(L_success);
615 #endif // ASSERT
616 } else {
617 reconstruct_frame_pointer_helper(this, rbp);
618 }
619 }
620
// Emit a runtime check that val lies within the CastII type range [lo, hi].
// Bounds equal to the full int range are not checked. On violation the code
// loads the arguments, rebuilds the frame pointer and calls
// abort_verify_int_in_range() (which aborts the VM); hlt() is unreachable
// padding after the non-returning call.
void C2_MacroAssembler::verify_int_in_range(uint idx, const TypeInt* t, Register val) {
  jint lo = t->_lo;
  jint hi = t->_hi;
  assert(lo < hi, "type should not be empty or constant, idx: %u, lo: %d, hi: %d", idx, lo, hi);
  if (t == TypeInt::INT) {
    // Full int range: nothing to verify.
    return;
  }

  BLOCK_COMMENT("CastII {");
  Label fail;
  Label succeed;

  if (lo != min_jint) {
    cmpl(val, lo);
    jccb(Assembler::less, fail);
  }
  if (hi != max_jint) {
    cmpl(val, hi);
    jccb(Assembler::greater, fail);
  }
  jmpb(succeed);

  bind(fail);
  // Marshal (idx, val, lo, hi) into the C calling convention registers.
  movl(c_rarg0, idx);
  movl(c_rarg1, val);
  movl(c_rarg2, lo);
  movl(c_rarg3, hi);
  reconstruct_frame_pointer(rscratch1);
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_int_in_range)));
  hlt();
  bind(succeed);
  BLOCK_COMMENT("} // CastII");
}
654
// Out-of-line failure reporter for verify_long_in_range(): called from
// compiled code when a CastLL node's value falls outside its declared
// [lo, hi] range. fatal() aborts the VM, so this does not return.
static void abort_verify_long_in_range(uint idx, jlong val, jlong lo, jlong hi) {
  fatal("Invalid CastLL, idx: %u, val: " JLONG_FORMAT ", lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, val, lo, hi);
}
658
// Emit a runtime check that val lies within the CastLL type range [lo, hi].
// Bounds equal to the full long range are not checked. tmp is only clobbered
// when a bound does not fit in a signed 32-bit immediate. On violation the
// code calls abort_verify_long_in_range() (which aborts the VM).
void C2_MacroAssembler::verify_long_in_range(uint idx, const TypeLong* t, Register val, Register tmp) {
  jlong lo = t->_lo;
  jlong hi = t->_hi;
  assert(lo < hi, "type should not be empty or constant, idx: %u, lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, lo, hi);
  if (t == TypeLong::LONG) {
    // Full long range: nothing to verify.
    return;
  }

  BLOCK_COMMENT("CastLL {");
  Label fail;
  Label succeed;

  // Compare val against a 64-bit bound: use an immediate when it fits in
  // 32 bits, otherwise materialize the bound in tmp first.
  auto cmp_val = [&](jlong bound) {
    if (is_simm32(bound)) {
      cmpq(val, checked_cast<int>(bound));
    } else {
      mov64(tmp, bound);
      cmpq(val, tmp);
    }
  };

  if (lo != min_jlong) {
    cmp_val(lo);
    jccb(Assembler::less, fail);
  }
  if (hi != max_jlong) {
    cmp_val(hi);
    jccb(Assembler::greater, fail);
  }
  jmpb(succeed);

  bind(fail);
  // Marshal (idx, val, lo, hi) into the C calling convention registers.
  movl(c_rarg0, idx);
  movq(c_rarg1, val);
  mov64(c_rarg2, lo);
  mov64(c_rarg3, hi);
  reconstruct_frame_pointer(rscratch1);
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_long_in_range)));
  hlt();
  bind(succeed);
  BLOCK_COMMENT("} // CastLL");
}
701
702 //-------------------------------------------------------------------------------------------
703 // Generic instructions support for use in .ad files C2 code generation
704
705 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
706 if (dst != src) {
707 movdqu(dst, src);
708 }
709 if (opcode == Op_AbsVD) {
710 andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
711 } else {
712 assert((opcode == Op_NegVD),"opcode should be Op_NegD");
713 xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
714 }
715 }
716
717 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
718 if (opcode == Op_AbsVD) {
719 vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
720 } else {
721 assert((opcode == Op_NegVD),"opcode should be Op_NegD");
722 vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
723 }
724 }
725
726 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
727 if (dst != src) {
728 movdqu(dst, src);
729 }
730 if (opcode == Op_AbsVF) {
731 andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
732 } else {
733 assert((opcode == Op_NegVF),"opcode should be Op_NegF");
734 xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
735 }
736 }
737
738 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
739 if (opcode == Op_AbsVF) {
740 vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
741 } else {
742 assert((opcode == Op_NegVF),"opcode should be Op_NegF");
743 vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
744 }
745 }
746
747 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
748 assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
749 assert(tmp == xnoreg || elem_bt == T_LONG, "unused");
750
751 if (opcode == Op_MinV) {
752 if (elem_bt == T_BYTE) {
753 pminsb(dst, src);
754 } else if (elem_bt == T_SHORT) {
755 pminsw(dst, src);
756 } else if (elem_bt == T_INT) {
757 pminsd(dst, src);
758 } else {
759 assert(elem_bt == T_LONG, "required");
760 assert(tmp == xmm0, "required");
761 assert_different_registers(dst, src, tmp);
762 movdqu(xmm0, dst);
763 pcmpgtq(xmm0, src);
764 blendvpd(dst, src); // xmm0 as mask
765 }
766 } else { // opcode == Op_MaxV
767 if (elem_bt == T_BYTE) {
768 pmaxsb(dst, src);
769 } else if (elem_bt == T_SHORT) {
770 pmaxsw(dst, src);
771 } else if (elem_bt == T_INT) {
772 pmaxsd(dst, src);
773 } else {
774 assert(elem_bt == T_LONG, "required");
775 assert(tmp == xmm0, "required");
776 assert_different_registers(dst, src, tmp);
777 movdqu(xmm0, src);
778 pcmpgtq(xmm0, dst);
779 blendvpd(dst, src); // xmm0 as mask
780 }
781 }
782 }
783
784 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
785 XMMRegister src1, Address src2, int vlen_enc) {
786 assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
787 if (opcode == Op_UMinV) {
788 switch(elem_bt) {
789 case T_BYTE: vpminub(dst, src1, src2, vlen_enc); break;
790 case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
791 case T_INT: vpminud(dst, src1, src2, vlen_enc); break;
792 case T_LONG: evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
793 default: fatal("Unsupported type %s", type2name(elem_bt)); break;
794 }
795 } else {
796 assert(opcode == Op_UMaxV, "required");
797 switch(elem_bt) {
798 case T_BYTE: vpmaxub(dst, src1, src2, vlen_enc); break;
799 case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
800 case T_INT: vpmaxud(dst, src1, src2, vlen_enc); break;
801 case T_LONG: evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
802 default: fatal("Unsupported type %s", type2name(elem_bt)); break;
803 }
804 }
805 }
806
// Emit unsigned 64-bit vector min/max (Op_UMinV/Op_UMaxV) when no direct
// unsigned-compare instruction is available for the target. xtmp1 and xtmp2
// are clobbered in the non-EVEX fallback.
void C2_MacroAssembler::vpuminmaxq(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  // For optimality, leverage a full vector width of 512 bits
  // for operations over smaller vector sizes on AVX512 targets.
  if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
    if (opcode == Op_UMaxV) {
      evpmaxuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
    } else {
      assert(opcode == Op_UMinV, "required");
      evpminuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
    }
  } else {
    // Fallback: bias both operands by 2^63 so the signed compare vpcmpgtq
    // yields the unsigned ordering.
    // T1 = -1
    vpcmpeqq(xtmp1, xtmp1, xtmp1, vlen_enc);
    // T1 = -1 << 63
    vpsllq(xtmp1, xtmp1, 63, vlen_enc);
    // Convert SRC2 to signed value i.e. T2 = T1 + SRC2
    vpaddq(xtmp2, xtmp1, src2, vlen_enc);
    // Convert SRC1 to signed value i.e. T1 = T1 + SRC1
    vpaddq(xtmp1, xtmp1, src1, vlen_enc);
    // Mask = T2 > T1
    vpcmpgtq(xtmp1, xtmp2, xtmp1, vlen_enc);
    if (opcode == Op_UMaxV) {
      // Res = Mask ? Src2 : Src1
      vpblendvb(dst, src1, src2, xtmp1, vlen_enc);
    } else {
      // Res = Mask ? Src1 : Src2
      vpblendvb(dst, src2, src1, xtmp1, vlen_enc);
    }
  }
}
837
838 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
839 XMMRegister src1, XMMRegister src2, int vlen_enc) {
840 assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
841 if (opcode == Op_UMinV) {
842 switch(elem_bt) {
843 case T_BYTE: vpminub(dst, src1, src2, vlen_enc); break;
844 case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
845 case T_INT: vpminud(dst, src1, src2, vlen_enc); break;
846 case T_LONG: evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
847 default: fatal("Unsupported type %s", type2name(elem_bt)); break;
848 }
849 } else {
850 assert(opcode == Op_UMaxV, "required");
851 switch(elem_bt) {
852 case T_BYTE: vpmaxub(dst, src1, src2, vlen_enc); break;
853 case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
854 case T_INT: vpmaxud(dst, src1, src2, vlen_enc); break;
855 case T_LONG: evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
856 default: fatal("Unsupported type %s", type2name(elem_bt)); break;
857 }
858 }
859 }
860
861 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
862 XMMRegister dst, XMMRegister src1, XMMRegister src2,
863 int vlen_enc) {
864 assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
865
866 if (opcode == Op_MinV) {
867 if (elem_bt == T_BYTE) {
868 vpminsb(dst, src1, src2, vlen_enc);
869 } else if (elem_bt == T_SHORT) {
870 vpminsw(dst, src1, src2, vlen_enc);
871 } else if (elem_bt == T_INT) {
872 vpminsd(dst, src1, src2, vlen_enc);
873 } else {
874 assert(elem_bt == T_LONG, "required");
875 if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
876 vpminsq(dst, src1, src2, vlen_enc);
877 } else {
878 assert_different_registers(dst, src1, src2);
879 vpcmpgtq(dst, src1, src2, vlen_enc);
880 vblendvpd(dst, src1, src2, dst, vlen_enc);
881 }
882 }
883 } else { // opcode == Op_MaxV
884 if (elem_bt == T_BYTE) {
885 vpmaxsb(dst, src1, src2, vlen_enc);
886 } else if (elem_bt == T_SHORT) {
887 vpmaxsw(dst, src1, src2, vlen_enc);
888 } else if (elem_bt == T_INT) {
889 vpmaxsd(dst, src1, src2, vlen_enc);
890 } else {
891 assert(elem_bt == T_LONG, "required");
892 if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
893 vpmaxsq(dst, src1, src2, vlen_enc);
894 } else {
895 assert_different_registers(dst, src1, src2);
896 vpcmpgtq(dst, src1, src2, vlen_enc);
897 vblendvpd(dst, src2, src1, dst, vlen_enc);
898 }
899 }
900 }
901 }
902
903 // Float/Double min max
904
// Float/Double vector min/max honoring Java semantics (NaN propagation and
// -0.0 < +0.0) using AVX1+ instructions only. tmp, atmp and btmp are
// clobbered scratch registers; dst may alias btmp (handled below).
void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
                                   XMMRegister dst, XMMRegister a, XMMRegister b,
                                   XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                   int vlen_enc) {
  assert(UseAVX > 0, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(a, tmp, atmp, btmp);
  assert_different_registers(b, tmp, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);

  /* Note on 'non-obvious' assembly sequence:
   *
   * While there are vminps/vmaxps instructions, there are two important differences between hardware
   * and Java on how they handle floats:
   *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
   *  b. NaN is not necessarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
   *
   * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
   *  a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
   *                (only useful when signs differ, noop otherwise)
   *  b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])

   * Following pseudo code describes the algorithm for max[FD] (Min algorithm is on similar lines):
   *   btmp = (b < +0.0) ? a : b
   *   atmp = (b < +0.0) ? b : a
   *   Tmp  = Max_Float(atmp , btmp)
   *   Res  = (atmp == NaN) ? atmp : Tmp
   */

  // Member-function pointers pick the float vs. double flavor of the blend,
  // min/max and compare instructions so the sequence below is written once.
  void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
  void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
  void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
  XMMRegister mask;

  // 'mask' is the operand whose sign drives the pre-blend: 'a' for min,
  // 'b' for max (see the pseudo code above).
  if (!is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vminps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (!is_double_word && !is_min) {
    mask = b;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vmaxps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vminpd;
    vcmp = &MacroAssembler::vcmppd;
  } else {
    assert(is_double_word && !is_min, "sanity");
    mask = b;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vmaxpd;
    vcmp = &MacroAssembler::vcmppd;
  }

  // Make sure EnableX86ECoreOpts isn't disabled on register overlaps
  XMMRegister maxmin, scratch;
  if (dst == btmp) {
    maxmin = btmp;
    scratch = tmp;
  } else {
    maxmin = tmp;
    scratch = btmp;
  }

  // On E-core targets, materialize the sign mask explicitly (into tmp) rather
  // than relying on the blend instruction's implicit top-bit test.
  bool precompute_mask = EnableX86ECoreOpts && UseAVX>1;
  if (precompute_mask && !is_double_word) {
    // Broadcast each float's sign bit across its 32-bit lane.
    vpsrad(tmp, mask, 32, vlen_enc);
    mask = tmp;
  } else if (precompute_mask && is_double_word) {
    // 0 > x is true exactly when x's sign bit is set, giving an all-ones lane.
    vpxor(tmp, tmp, tmp, vlen_enc);
    vpcmpgtq(tmp, tmp, mask, vlen_enc);
    mask = tmp;
  }

  (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);
  (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);
  (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
  // Unordered compare flags the lanes where atmp is NaN ...
  (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
  // ... and those lanes take atmp (the NaN) instead of the min/max result.
  (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch);
}
992
// AVX512 variant of vminmax_fp: same Java min/max semantics (NaN propagation,
// -0.0 < +0.0), but the sign test and the final NaN fix-up use an opmask
// register (ktmp) instead of vector blends. See the algorithm note above
// vminmax_fp. atmp/btmp are clobbered scratch registers.
void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
                                    XMMRegister dst, XMMRegister a, XMMRegister b,
                                    KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
                                    int vlen_enc) {
  assert(UseAVX > 2, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(dst, a, atmp, btmp);
  assert_different_registers(dst, b, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);
  bool merge = true;

  // Each arm: 1) ktmp = sign bits of the bias operand ('a' for min, 'b' for
  // max); 2) pre-blend inputs so equal-magnitude +/-0.0 resolves per Java;
  // 3) hardware min/max; 4) unordered self-compare finds NaN lanes in atmp;
  // 5) merge-masked move writes the NaN back into those lanes of dst.
  if (!is_double_word && is_min) {
    evpmovd2m(ktmp, a, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vminps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (!is_double_word && !is_min) {
    evpmovd2m(ktmp, b, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (is_double_word && is_min) {
    evpmovq2m(ktmp, a, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vminpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  } else {
    assert(is_double_word && !is_min, "sanity");
    evpmovq2m(ktmp, b, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  }
}
1039
1040 void C2_MacroAssembler::vminmax_fp_avx10_2(int opc, BasicType elem_bt, XMMRegister dst, KRegister mask,
1041 XMMRegister src1, XMMRegister src2, int vlen_enc) {
1042 assert(opc == Op_MinV || opc == Op_MinReductionV ||
1043 opc == Op_MaxV || opc == Op_MaxReductionV, "sanity");
1044
1045 int imm8 = (opc == Op_MinV || opc == Op_MinReductionV) ? AVX10_2_MINMAX_MIN_COMPARE_SIGN
1046 : AVX10_2_MINMAX_MAX_COMPARE_SIGN;
1047 if (elem_bt == T_FLOAT) {
1048 evminmaxps(dst, mask, src1, src2, true, imm8, vlen_enc);
1049 } else {
1050 assert(elem_bt == T_DOUBLE, "");
1051 evminmaxpd(dst, mask, src1, src2, true, imm8, vlen_enc);
1052 }
1053 }
1054
1055 void C2_MacroAssembler::sminmax_fp_avx10_2(int opc, BasicType elem_bt, XMMRegister dst, KRegister mask,
1056 XMMRegister src1, XMMRegister src2) {
1057 assert(opc == Op_MinF || opc == Op_MaxF ||
1058 opc == Op_MinD || opc == Op_MaxD, "sanity");
1059
1060 int imm8 = (opc == Op_MinF || opc == Op_MinD) ? AVX10_2_MINMAX_MIN_COMPARE_SIGN
1061 : AVX10_2_MINMAX_MAX_COMPARE_SIGN;
1062 if (elem_bt == T_FLOAT) {
1063 evminmaxss(dst, mask, src1, src2, true, imm8);
1064 } else {
1065 assert(elem_bt == T_DOUBLE, "");
1066 evminmaxsd(dst, mask, src1, src2, true, imm8);
1067 }
1068 }
1069
1070 // Float/Double signum
1071 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
1072 assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");
1073
1074 Label DONE_LABEL;
1075
1076 // Handle special cases +0.0/-0.0 and NaN, if argument is +0.0/-0.0 or NaN, return argument
1077 // If AVX10.2 (or newer) floating point comparison instructions used, SF=1 for equal and unordered cases
1078 // If other floating point comparison instructions used, ZF=1 for equal and unordered cases
1079 if (opcode == Op_SignumF) {
1080 if (VM_Version::supports_avx10_2()) {
1081 evucomxss(dst, zero);
1082 jcc(Assembler::negative, DONE_LABEL);
1083 } else {
1084 ucomiss(dst, zero);
1085 jcc(Assembler::equal, DONE_LABEL);
1086 }
1087 movflt(dst, one);
1088 jcc(Assembler::above, DONE_LABEL);
1089 xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
1090 } else if (opcode == Op_SignumD) {
1091 if (VM_Version::supports_avx10_2()) {
1092 evucomxsd(dst, zero);
1093 jcc(Assembler::negative, DONE_LABEL);
1094 } else {
1095 ucomisd(dst, zero);
1096 jcc(Assembler::equal, DONE_LABEL);
1097 }
1098 movdbl(dst, one);
1099 jcc(Assembler::above, DONE_LABEL);
1100 xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
1101 }
1102
1103 bind(DONE_LABEL);
1104 }
1105
1106 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1107 if (sign) {
1108 pmovsxbw(dst, src);
1109 } else {
1110 pmovzxbw(dst, src);
1111 }
1112 }
1113
1114 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1115 if (sign) {
1116 vpmovsxbw(dst, src, vector_len);
1117 } else {
1118 vpmovzxbw(dst, src, vector_len);
1119 }
1120 }
1121
1122 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1123 if (sign) {
1124 vpmovsxbd(dst, src, vector_len);
1125 } else {
1126 vpmovzxbd(dst, src, vector_len);
1127 }
1128 }
1129
1130 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1131 if (sign) {
1132 vpmovsxwd(dst, src, vector_len);
1133 } else {
1134 vpmovzxwd(dst, src, vector_len);
1135 }
1136 }
1137
1138 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1139 int shift, int vector_len) {
1140 if (opcode == Op_RotateLeftV) {
1141 if (etype == T_INT) {
1142 evprold(dst, src, shift, vector_len);
1143 } else {
1144 assert(etype == T_LONG, "expected type T_LONG");
1145 evprolq(dst, src, shift, vector_len);
1146 }
1147 } else {
1148 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1149 if (etype == T_INT) {
1150 evprord(dst, src, shift, vector_len);
1151 } else {
1152 assert(etype == T_LONG, "expected type T_LONG");
1153 evprorq(dst, src, shift, vector_len);
1154 }
1155 }
1156 }
1157
1158 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1159 XMMRegister shift, int vector_len) {
1160 if (opcode == Op_RotateLeftV) {
1161 if (etype == T_INT) {
1162 evprolvd(dst, src, shift, vector_len);
1163 } else {
1164 assert(etype == T_LONG, "expected type T_LONG");
1165 evprolvq(dst, src, shift, vector_len);
1166 }
1167 } else {
1168 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1169 if (etype == T_INT) {
1170 evprorvd(dst, src, shift, vector_len);
1171 } else {
1172 assert(etype == T_LONG, "expected type T_LONG");
1173 evprorvq(dst, src, shift, vector_len);
1174 }
1175 }
1176 }
1177
1178 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1179 if (opcode == Op_RShiftVI) {
1180 psrad(dst, shift);
1181 } else if (opcode == Op_LShiftVI) {
1182 pslld(dst, shift);
1183 } else {
1184 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1185 psrld(dst, shift);
1186 }
1187 }
1188
1189 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1190 switch (opcode) {
1191 case Op_RShiftVI: psrad(dst, shift); break;
1192 case Op_LShiftVI: pslld(dst, shift); break;
1193 case Op_URShiftVI: psrld(dst, shift); break;
1194
1195 default: assert(false, "%s", NodeClassNames[opcode]);
1196 }
1197 }
1198
1199 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1200 if (opcode == Op_RShiftVI) {
1201 vpsrad(dst, nds, shift, vector_len);
1202 } else if (opcode == Op_LShiftVI) {
1203 vpslld(dst, nds, shift, vector_len);
1204 } else {
1205 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1206 vpsrld(dst, nds, shift, vector_len);
1207 }
1208 }
1209
1210 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1211 switch (opcode) {
1212 case Op_RShiftVI: vpsrad(dst, src, shift, vlen_enc); break;
1213 case Op_LShiftVI: vpslld(dst, src, shift, vlen_enc); break;
1214 case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1215
1216 default: assert(false, "%s", NodeClassNames[opcode]);
1217 }
1218 }
1219
1220 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1221 switch (opcode) {
1222 case Op_RShiftVB: // fall-through
1223 case Op_RShiftVS: psraw(dst, shift); break;
1224
1225 case Op_LShiftVB: // fall-through
1226 case Op_LShiftVS: psllw(dst, shift); break;
1227
1228 case Op_URShiftVS: // fall-through
1229 case Op_URShiftVB: psrlw(dst, shift); break;
1230
1231 default: assert(false, "%s", NodeClassNames[opcode]);
1232 }
1233 }
1234
1235 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1236 switch (opcode) {
1237 case Op_RShiftVB: // fall-through
1238 case Op_RShiftVS: vpsraw(dst, src, shift, vlen_enc); break;
1239
1240 case Op_LShiftVB: // fall-through
1241 case Op_LShiftVS: vpsllw(dst, src, shift, vlen_enc); break;
1242
1243 case Op_URShiftVS: // fall-through
1244 case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1245
1246 default: assert(false, "%s", NodeClassNames[opcode]);
1247 }
1248 }
1249
1250 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1251 switch (opcode) {
1252 case Op_RShiftVL: psrlq(dst, shift); break; // using srl to implement sra on pre-avs512 systems
1253 case Op_LShiftVL: psllq(dst, shift); break;
1254 case Op_URShiftVL: psrlq(dst, shift); break;
1255
1256 default: assert(false, "%s", NodeClassNames[opcode]);
1257 }
1258 }
1259
1260 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1261 if (opcode == Op_RShiftVL) {
1262 psrlq(dst, shift); // using srl to implement sra on pre-avs512 systems
1263 } else if (opcode == Op_LShiftVL) {
1264 psllq(dst, shift);
1265 } else {
1266 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1267 psrlq(dst, shift);
1268 }
1269 }
1270
1271 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1272 switch (opcode) {
1273 case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1274 case Op_LShiftVL: vpsllq(dst, src, shift, vlen_enc); break;
1275 case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1276
1277 default: assert(false, "%s", NodeClassNames[opcode]);
1278 }
1279 }
1280
1281 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1282 if (opcode == Op_RShiftVL) {
1283 evpsraq(dst, nds, shift, vector_len);
1284 } else if (opcode == Op_LShiftVL) {
1285 vpsllq(dst, nds, shift, vector_len);
1286 } else {
1287 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1288 vpsrlq(dst, nds, shift, vector_len);
1289 }
1290 }
1291
1292 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1293 switch (opcode) {
1294 case Op_RShiftVB: // fall-through
1295 case Op_RShiftVS: // fall-through
1296 case Op_RShiftVI: vpsravd(dst, src, shift, vlen_enc); break;
1297
1298 case Op_LShiftVB: // fall-through
1299 case Op_LShiftVS: // fall-through
1300 case Op_LShiftVI: vpsllvd(dst, src, shift, vlen_enc); break;
1301
1302 case Op_URShiftVB: // fall-through
1303 case Op_URShiftVS: // fall-through
1304 case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1305
1306 default: assert(false, "%s", NodeClassNames[opcode]);
1307 }
1308 }
1309
1310 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1311 switch (opcode) {
1312 case Op_RShiftVB: // fall-through
1313 case Op_RShiftVS: evpsravw(dst, src, shift, vlen_enc); break;
1314
1315 case Op_LShiftVB: // fall-through
1316 case Op_LShiftVS: evpsllvw(dst, src, shift, vlen_enc); break;
1317
1318 case Op_URShiftVB: // fall-through
1319 case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1320
1321 default: assert(false, "%s", NodeClassNames[opcode]);
1322 }
1323 }
1324
// Per-lane variable quadword shift. On pre-AVX512 targets the signed right
// shift is emulated (tmp is required only in that case).
void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
  assert(UseAVX >= 2, "required");
  switch (opcode) {
    case Op_RShiftVL: {
      if (UseAVX > 2) {
        assert(tmp == xnoreg, "not used");
        if (!VM_Version::supports_avx512vl()) {
          vlen_enc = Assembler::AVX_512bit;
        }
        evpsravq(dst, src, shift, vlen_enc);
      } else {
        // AVX2 has no variable 64-bit 'sra'. Emulate it with the identity
        //   sra(x, s) == (srl(x, s) ^ m) - m,  where m = srl(sign_mask, s)
        // i.e. logically shift, then sign-extend via the shifted sign mask.
        vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
        vpsrlvq(dst, src, shift, vlen_enc);
        vpsrlvq(tmp, tmp, shift, vlen_enc);
        vpxor(dst, dst, tmp, vlen_enc);
        vpsubq(dst, dst, tmp, vlen_enc);
      }
      break;
    }
    case Op_LShiftVL: {
      assert(tmp == xnoreg, "not used");
      vpsllvq(dst, src, shift, vlen_enc);
      break;
    }
    case Op_URShiftVL: {
      assert(tmp == xnoreg, "not used");
      vpsrlvq(dst, src, shift, vlen_enc);
      break;
    }
    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}
1357
// Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst
// Byte lanes are widened to dwords, shifted with the dword variable-shift
// instructions, masked back to byte range, and repacked. 128-bit input only.
void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
  assert(opcode == Op_LShiftVB ||
         opcode == Op_RShiftVB ||
         opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
  // Signed right shift needs sign-extension when widening; the others do not.
  bool sign = (opcode != Op_URShiftVB);
  assert(vector_len == 0, "required");
  vextendbd(sign, dst, src, 1);
  vpmovzxbd(vtmp, shift, 1);
  varshiftd(opcode, dst, dst, vtmp, 1);
  // Keep only the low byte of each dword lane before repacking.
  vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
  vextracti128_high(vtmp, dst);
  // Pack the two dword halves down to words in the low 128 bits.
  vpackusdw(dst, dst, vtmp, 0);
}
1372
// Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst
// AVX512BW path: byte lanes are widened to words, shifted with the variable
// word-shift instructions, masked back to byte range, and repacked.
void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
  assert(opcode == Op_LShiftVB ||
         opcode == Op_RShiftVB ||
         opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
  // Signed right shift needs sign-extension when widening; the others do not.
  bool sign = (opcode != Op_URShiftVB);
  // Words take twice the space of bytes, so work one vector size up.
  int ext_vector_len = vector_len + 1;
  vextendbw(sign, dst, src, ext_vector_len);
  vpmovzxbw(vtmp, shift, ext_vector_len);
  varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
  // Keep only the low byte of each word lane before repacking.
  vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
  if (vector_len == 0) {
    vextracti128_high(vtmp, dst);
    vpackuswb(dst, dst, vtmp, vector_len);
  } else {
    vextracti64x4_high(vtmp, dst);
    vpackuswb(dst, dst, vtmp, vector_len);
    // vpackuswb interleaves 128-bit lanes; restore element order.
    vpermq(dst, dst, 0xD8, vector_len);
  }
}
1393
1394 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1395 switch(typ) {
1396 case T_BYTE:
1397 pinsrb(dst, val, idx);
1398 break;
1399 case T_SHORT:
1400 pinsrw(dst, val, idx);
1401 break;
1402 case T_INT:
1403 pinsrd(dst, val, idx);
1404 break;
1405 case T_LONG:
1406 pinsrq(dst, val, idx);
1407 break;
1408 default:
1409 assert(false,"Should not reach here.");
1410 break;
1411 }
1412 }
1413
1414 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1415 switch(typ) {
1416 case T_BYTE:
1417 vpinsrb(dst, src, val, idx);
1418 break;
1419 case T_SHORT:
1420 vpinsrw(dst, src, val, idx);
1421 break;
1422 case T_INT:
1423 vpinsrd(dst, src, val, idx);
1424 break;
1425 case T_LONG:
1426 vpinsrq(dst, src, val, idx);
1427 break;
1428 default:
1429 assert(false,"Should not reach here.");
1430 break;
1431 }
1432 }
1433
1434 void C2_MacroAssembler::vgather8b_masked(BasicType elem_bt, XMMRegister dst,
1435 Register base, Register idx_base,
1436 Register mask, Register mask_idx,
1437 Register rtmp, int vlen_enc) {
1438 vpxor(dst, dst, dst, vlen_enc);
1439 if (elem_bt == T_SHORT) {
1440 for (int i = 0; i < 4; i++) {
1441 // dst[i] = mask[i] ? src[idx_base[i]] : 0
1442 Label skip_load;
1443 btq(mask, mask_idx);
1444 jccb(Assembler::carryClear, skip_load);
1445 movl(rtmp, Address(idx_base, i * 4));
1446 pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1447 bind(skip_load);
1448 incq(mask_idx);
1449 }
1450 } else {
1451 assert(elem_bt == T_BYTE, "");
1452 for (int i = 0; i < 8; i++) {
1453 // dst[i] = mask[i] ? src[idx_base[i]] : 0
1454 Label skip_load;
1455 btq(mask, mask_idx);
1456 jccb(Assembler::carryClear, skip_load);
1457 movl(rtmp, Address(idx_base, i * 4));
1458 pinsrb(dst, Address(base, rtmp), i);
1459 bind(skip_load);
1460 incq(mask_idx);
1461 }
1462 }
1463 }
1464
1465 void C2_MacroAssembler::vgather8b(BasicType elem_bt, XMMRegister dst,
1466 Register base, Register idx_base,
1467 Register rtmp, int vlen_enc) {
1468 vpxor(dst, dst, dst, vlen_enc);
1469 if (elem_bt == T_SHORT) {
1470 for (int i = 0; i < 4; i++) {
1471 // dst[i] = src[idx_base[i]]
1472 movl(rtmp, Address(idx_base, i * 4));
1473 pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1474 }
1475 } else {
1476 assert(elem_bt == T_BYTE, "");
1477 for (int i = 0; i < 8; i++) {
1478 // dst[i] = src[idx_base[i]]
1479 movl(rtmp, Address(idx_base, i * 4));
1480 pinsrb(dst, Address(base, rtmp), i);
1481 }
1482 }
1483 }
1484
1485 /*
1486 * Gather using hybrid algorithm, first partially unroll scalar loop
1487 * to accumulate values from gather indices into a quad-word(64bit) slice.
1488 * A slice may hold 8 bytes or 4 short values. This is followed by a vector
1489 * permutation to place the slice into appropriate vector lane
1490 * locations in destination vector. Following pseudo code describes the
1491 * algorithm in detail:
1492 *
1493 * DST_VEC = ZERO_VEC
1494 * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..}
1495 * TWO_VEC = {2, 2, 2, 2, 2, 2, 2, 2, 2..}
1496 * FOREACH_ITER:
1497 * TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES
1498 * TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX
1499 * DST_VEC = DST_VEC OR TEMP_PERM_VEC
1500 * PERM_INDEX = PERM_INDEX - TWO_VEC
1501 *
1502 * With each iteration, doubleword permute indices (0,1) corresponding
1503 * to gathered quadword gets right shifted by two lane positions.
1504 *
1505 */
// Sub-word (byte/short) gather; see the algorithm description in the comment
// block above. Gathers 8 bytes per iteration via vgather8b[_masked], then
// permutes each 64-bit slice into its destination lane and ORs it in.
// Clobbers xtmp1, xtmp2, temp_dst, rtmp, mask_idx and length.
void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst,
                                        Register base, Register idx_base,
                                        Register mask, XMMRegister xtmp1,
                                        XMMRegister xtmp2, XMMRegister temp_dst,
                                        Register rtmp, Register mask_idx,
                                        Register length, int vector_len, int vlen_enc) {
  Label GATHER8_LOOP;
  assert(is_subword_type(elem_ty), "");
  movl(length, vector_len);
  vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...}
  vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...}
  // Build the constant vector {2, 2, ...} as (0 - (-1)) << 1.
  vallones(xtmp2, vlen_enc);
  vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc);
  vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...}
  load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...}

  bind(GATHER8_LOOP);
    // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES
    if (mask == noreg) {
      vgather8b(elem_ty, temp_dst, base, idx_base, rtmp, vlen_enc);
    } else {
      vgather8b_masked(elem_ty, temp_dst, base, idx_base, mask, mask_idx, rtmp, vlen_enc);
    }
    // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1)
    vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? vlen_enc : Assembler::AVX_256bit);
    // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2)
    vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc);
    // DST_VEC = DST_VEC OR TEMP_PERM_VEC
    vpor(dst, dst, temp_dst, vlen_enc);
    // Advance past the 8 bytes / 4 shorts consumed (indices are 4-byte ints).
    addptr(idx_base,  32 >> (type2aelembytes(elem_ty) - 1));
    subl(length, 8 >> (type2aelembytes(elem_ty) - 1));
    jcc(Assembler::notEqual, GATHER8_LOOP);
}
1539
1540 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1541 switch(typ) {
1542 case T_INT:
1543 vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1544 break;
1545 case T_FLOAT:
1546 vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1547 break;
1548 case T_LONG:
1549 vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1550 break;
1551 case T_DOUBLE:
1552 vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1553 break;
1554 default:
1555 assert(false,"Should not reach here.");
1556 break;
1557 }
1558 }
1559
1560 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1561 switch(typ) {
1562 case T_INT:
1563 evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1564 break;
1565 case T_FLOAT:
1566 evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1567 break;
1568 case T_LONG:
1569 evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1570 break;
1571 case T_DOUBLE:
1572 evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1573 break;
1574 default:
1575 assert(false,"Should not reach here.");
1576 break;
1577 }
1578 }
1579
1580 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1581 switch(typ) {
1582 case T_INT:
1583 evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1584 break;
1585 case T_FLOAT:
1586 evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1587 break;
1588 case T_LONG:
1589 evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1590 break;
1591 case T_DOUBLE:
1592 evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1593 break;
1594 default:
1595 assert(false,"Should not reach here.");
1596 break;
1597 }
1598 }
1599
// Expand a byte-per-lane boolean vector in src into a full vector mask in dst:
// each lane becomes all-ones when set, zero when clear, then is sign-extended
// to the element width. (Assumes src lanes hold 0 or 1 — the 0 - src trick
// maps 1 to 0xFF; TODO confirm against callers.)
void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
  if (vlen_in_bytes <= 16) {
    // dst = 0 - src, turning each nonzero byte into 0xFF.
    pxor (dst, dst);
    psubb(dst, src);
    switch (elem_bt) {
      case T_BYTE:   /* nothing to do */ break;
      case T_SHORT:  pmovsxbw(dst, dst); break;
      case T_INT:    pmovsxbd(dst, dst); break;
      case T_FLOAT:  pmovsxbd(dst, dst); break;
      case T_LONG:   pmovsxbq(dst, dst); break;
      case T_DOUBLE: pmovsxbq(dst, dst); break;

      default: assert(false, "%s", type2name(elem_bt));
    }
  } else {
    assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
    int vlen_enc = vector_length_encoding(vlen_in_bytes);

    vpxor (dst, dst, dst, vlen_enc);
    // Legacy (VEX-only) encodings of vpsubb top out at 256 bits.
    vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);

    switch (elem_bt) {
      case T_BYTE:   /* nothing to do */ break;
      case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
      case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
      case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
      case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
      case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;

      default: assert(false, "%s", type2name(elem_bt));
    }
  }
}
1633
// Expand a byte-per-lane boolean vector in src into an opmask register.
// 'novlbwdq' selects the fallback for AVX512 targets lacking VL/BW/DQ:
// widen to dwords and compare against the mask-bit pattern instead of using
// evpmovb2m. xtmp is a clobbered scratch register.
void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) {
  if (novlbwdq) {
    // Sign extend till size of dst is 32 bits, then test the sign bit pattern.
    vpmovsxbd(xtmp, src, vlen_enc);
    evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
            Assembler::eq, true, vlen_enc, noreg);
  } else {
    // dst_k = sign bits of (0 - src): nonzero lanes become all-ones first.
    vpxor(xtmp, xtmp, xtmp, vlen_enc);
    vpsubb(xtmp, xtmp, src, vlen_enc);
    evpmovb2m(dst, xtmp, vlen_enc);
  }
}
1645
1646 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, Address src, int vlen_in_bytes) {
1647 if (is_integral_type(bt)) {
1648 switch (vlen_in_bytes) {
1649 case 4: movdl(dst, src); break;
1650 case 8: movq(dst, src); break;
1651 case 16: movdqu(dst, src); break;
1652 case 32: vmovdqu(dst, src); break;
1653 case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break;
1654 default: ShouldNotReachHere();
1655 }
1656 } else {
1657 switch (vlen_in_bytes) {
1658 case 4: movflt(dst, src); break;
1659 case 8: movdbl(dst, src); break;
1660 case 16: movups(dst, src); break;
1661 case 32: vmovups(dst, src, Assembler::AVX_256bit); break;
1662 case 64: vmovups(dst, src, Assembler::AVX_512bit); break;
1663 default: ShouldNotReachHere();
1664 }
1665 }
1666 }
1667
1668 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) {
1669 assert(rscratch != noreg || always_reachable(src), "missing");
1670
1671 if (reachable(src)) {
1672 load_vector(bt, dst, as_Address(src), vlen_in_bytes);
1673 } else {
1674 lea(rscratch, src);
1675 load_vector(bt, dst, Address(rscratch, 0), vlen_in_bytes);
1676 }
1677 }
1678
1679 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) {
1680 int vlen_enc = vector_length_encoding(vlen);
1681 if (VM_Version::supports_avx()) {
1682 if (bt == T_LONG) {
1683 if (VM_Version::supports_avx2()) {
1684 vpbroadcastq(dst, src, vlen_enc);
1685 } else {
1686 vmovddup(dst, src, vlen_enc);
1687 }
1688 } else if (bt == T_DOUBLE) {
1689 if (vlen_enc != Assembler::AVX_128bit) {
1690 vbroadcastsd(dst, src, vlen_enc, noreg);
1691 } else {
1692 vmovddup(dst, src, vlen_enc);
1693 }
1694 } else {
1695 if (VM_Version::supports_avx2() && is_integral_type(bt)) {
1696 vpbroadcastd(dst, src, vlen_enc);
1697 } else {
1698 vbroadcastss(dst, src, vlen_enc);
1699 }
1700 }
1701 } else if (VM_Version::supports_sse3()) {
1702 movddup(dst, src);
1703 } else {
1704 load_vector(bt, dst, src, vlen);
1705 }
1706 }
1707
1708 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) {
1709 // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64.
1710 int offset = exact_log2(type2aelembytes(bt)) << 6;
1711 if (is_floating_point_type(bt)) {
1712 offset += 128;
1713 }
1714 ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset);
1715 load_vector(T_BYTE, dst, addr, vlen_in_bytes);
1716 }
1717
1718 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
1719
// Emit one elementwise combining step of a 128-bit reduction:
// dst = dst <op> src, where <op> is selected by the reduction opcode and,
// where the ISA distinguishes element widths, by typ. Two-operand SSE forms
// are used where available; AVX/AVX-512 forms where required.
void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
  int vector_len = Assembler::AVX_128bit;

  switch (opcode) {
    case Op_AndReductionV:  pand(dst, src); break;
    case Op_OrReductionV:   por (dst, src); break;
    case Op_XorReductionV:  pxor(dst, src); break;
    case Op_MinReductionV:
      switch (typ) {
        case T_BYTE:        pminsb(dst, src); break;
        case T_SHORT:       pminsw(dst, src); break;
        case T_INT:         pminsd(dst, src); break;
        // 64-bit signed min/max only exist in AVX-512 (EVEX) form.
        case T_LONG:        assert(UseAVX > 2, "required");
                            vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_MaxReductionV:
      switch (typ) {
        case T_BYTE:        pmaxsb(dst, src); break;
        case T_SHORT:       pmaxsw(dst, src); break;
        case T_INT:         pmaxsd(dst, src); break;
        case T_LONG:        assert(UseAVX > 2, "required");
                            vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_UMinReductionV:
      switch (typ) {
        case T_BYTE:        vpminub(dst, dst, src, Assembler::AVX_128bit); break;
        case T_SHORT:       vpminuw(dst, dst, src, Assembler::AVX_128bit); break;
        case T_INT:         vpminud(dst, dst, src, Assembler::AVX_128bit); break;
        // Unsigned 64-bit min/max is EVEX-only; k0 means "no write mask".
        case T_LONG:        evpminuq(dst, k0, dst, src, true, Assembler::AVX_128bit); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_UMaxReductionV:
      switch (typ) {
        case T_BYTE:        vpmaxub(dst, dst, src, Assembler::AVX_128bit); break;
        case T_SHORT:       vpmaxuw(dst, dst, src, Assembler::AVX_128bit); break;
        case T_INT:         vpmaxud(dst, dst, src, Assembler::AVX_128bit); break;
        case T_LONG:        evpmaxuq(dst, k0, dst, src, true, Assembler::AVX_128bit); break;
        default:            assert(false, "wrong type");
      }
      break;
    // FP adds/muls use the scalar forms: callers combine one lane at a time
    // to preserve strict ordering.
    case Op_AddReductionVF: addss(dst, src); break;
    case Op_AddReductionVD: addsd(dst, src); break;
    case Op_AddReductionVI:
      switch (typ) {
        case T_BYTE:        paddb(dst, src); break;
        case T_SHORT:       paddw(dst, src); break;
        case T_INT:         paddd(dst, src); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_AddReductionVL: paddq(dst, src); break;
    case Op_MulReductionVF: mulss(dst, src); break;
    case Op_MulReductionVD: mulsd(dst, src); break;
    case Op_MulReductionVI:
      switch (typ) {
        // Byte multiply reductions are widened to short first (see mulreduceB),
        // so only short/int appear here.
        case T_SHORT:       pmullw(dst, src); break;
        case T_INT:         pmulld(dst, src); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_MulReductionVL: assert(UseAVX > 2, "required");
                            evpmullq(dst, dst, src, vector_len); break;
    default:                assert(false, "wrong opcode");
  }
}
1790
// Packed (lane-parallel) combining step for unordered FP reductions:
// dst = dst <op> src on all 128 bits at once. Unlike reduce_operation_128
// this uses the packed forms, so lane combination order is not preserved.
void C2_MacroAssembler::unordered_reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
  switch (opcode) {
    case Op_AddReductionVF: addps(dst, src); break;
    case Op_AddReductionVD: addpd(dst, src); break;
    case Op_MulReductionVF: mulps(dst, src); break;
    case Op_MulReductionVD: mulpd(dst, src); break;
    default:                assert(false, "%s", NodeClassNames[opcode]);
  }
}
1800
// Emit one elementwise combining step of a 256-bit reduction using the
// three-operand AVX forms: dst = src1 <op> src2, with <op> selected by the
// reduction opcode and (where widths differ) by typ.
void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
  int vector_len = Assembler::AVX_256bit;

  switch (opcode) {
    case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
    case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
    case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
    case Op_MinReductionV:
      switch (typ) {
        case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
        case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
        case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
        // 64-bit signed min/max only exist in AVX-512 (EVEX) form.
        case T_LONG:        assert(UseAVX > 2, "required");
                            vpminsq(dst, src1, src2, vector_len); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_MaxReductionV:
      switch (typ) {
        case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
        case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
        case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
        case T_LONG:        assert(UseAVX > 2, "required");
                            vpmaxsq(dst, src1, src2, vector_len); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_UMinReductionV:
      switch (typ) {
        case T_BYTE:        vpminub(dst, src1, src2, vector_len); break;
        case T_SHORT:       vpminuw(dst, src1, src2, vector_len); break;
        case T_INT:         vpminud(dst, src1, src2, vector_len); break;
        // Unsigned 64-bit min/max is EVEX-only; k0 means "no write mask".
        case T_LONG:        evpminuq(dst, k0, src1, src2, true, vector_len); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_UMaxReductionV:
      switch (typ) {
        case T_BYTE:        vpmaxub(dst, src1, src2, vector_len); break;
        case T_SHORT:       vpmaxuw(dst, src1, src2, vector_len); break;
        case T_INT:         vpmaxud(dst, src1, src2, vector_len); break;
        case T_LONG:        evpmaxuq(dst, k0, src1, src2, true, vector_len); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_AddReductionVI:
      switch (typ) {
        case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
        case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
        case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
    case Op_MulReductionVI:
      switch (typ) {
        // Byte multiply reductions are widened to short first (see mulreduceB).
        case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
        case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break;
    default:                assert(false, "wrong opcode");
  }
}
1866
// Packed (lane-parallel) 256-bit combining step for unordered FP reductions:
// dst = src1 <op> src2 across all lanes using the three-operand AVX forms.
void C2_MacroAssembler::unordered_reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
  int vector_len = Assembler::AVX_256bit;

  switch (opcode) {
    case Op_AddReductionVF: vaddps(dst, src1, src2, vector_len); break;
    case Op_AddReductionVD: vaddpd(dst, src1, src2, vector_len); break;
    case Op_MulReductionVF: vmulps(dst, src1, src2, vector_len); break;
    case Op_MulReductionVD: vmulpd(dst, src1, src2, vector_len); break;
    default:                assert(false, "%s", NodeClassNames[opcode]);
  }
}
1878
1879 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1880 XMMRegister dst, XMMRegister src,
1881 XMMRegister vtmp1, XMMRegister vtmp2) {
1882 switch (opcode) {
1883 case Op_AddReductionVF:
1884 case Op_MulReductionVF:
1885 reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1886 break;
1887
1888 case Op_AddReductionVD:
1889 case Op_MulReductionVD:
1890 reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1891 break;
1892
1893 default: assert(false, "wrong opcode");
1894 }
1895 }
1896
1897 void C2_MacroAssembler::unordered_reduce_fp(int opcode, int vlen,
1898 XMMRegister dst, XMMRegister src,
1899 XMMRegister vtmp1, XMMRegister vtmp2) {
1900 switch (opcode) {
1901 case Op_AddReductionVF:
1902 case Op_MulReductionVF:
1903 unorderedReduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1904 break;
1905
1906 case Op_AddReductionVD:
1907 case Op_MulReductionVD:
1908 unorderedReduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1909 break;
1910
1911 default: assert(false, "%s", NodeClassNames[opcode]);
1912 }
1913 }
1914
1915 void C2_MacroAssembler::reduceB(int opcode, int vlen,
1916 Register dst, Register src1, XMMRegister src2,
1917 XMMRegister vtmp1, XMMRegister vtmp2) {
1918 switch (vlen) {
1919 case 8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1920 case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1921 case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1922 case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1923
1924 default: assert(false, "wrong vector length");
1925 }
1926 }
1927
1928 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
1929 Register dst, Register src1, XMMRegister src2,
1930 XMMRegister vtmp1, XMMRegister vtmp2) {
1931 switch (vlen) {
1932 case 8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1933 case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1934 case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1935 case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1936
1937 default: assert(false, "wrong vector length");
1938 }
1939 }
1940
1941 void C2_MacroAssembler::reduceS(int opcode, int vlen,
1942 Register dst, Register src1, XMMRegister src2,
1943 XMMRegister vtmp1, XMMRegister vtmp2) {
1944 switch (vlen) {
1945 case 4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1946 case 8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1947 case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1948 case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1949
1950 default: assert(false, "wrong vector length");
1951 }
1952 }
1953
1954 void C2_MacroAssembler::reduceI(int opcode, int vlen,
1955 Register dst, Register src1, XMMRegister src2,
1956 XMMRegister vtmp1, XMMRegister vtmp2) {
1957 switch (vlen) {
1958 case 2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1959 case 4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1960 case 8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1961 case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1962
1963 default: assert(false, "wrong vector length");
1964 }
1965 }
1966
1967 void C2_MacroAssembler::reduceL(int opcode, int vlen,
1968 Register dst, Register src1, XMMRegister src2,
1969 XMMRegister vtmp1, XMMRegister vtmp2) {
1970 switch (vlen) {
1971 case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1972 case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1973 case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1974
1975 default: assert(false, "wrong vector length");
1976 }
1977 }
1978
1979 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1980 switch (vlen) {
1981 case 2:
1982 assert(vtmp2 == xnoreg, "");
1983 reduce2F(opcode, dst, src, vtmp1);
1984 break;
1985 case 4:
1986 assert(vtmp2 == xnoreg, "");
1987 reduce4F(opcode, dst, src, vtmp1);
1988 break;
1989 case 8:
1990 reduce8F(opcode, dst, src, vtmp1, vtmp2);
1991 break;
1992 case 16:
1993 reduce16F(opcode, dst, src, vtmp1, vtmp2);
1994 break;
1995 default: assert(false, "wrong vector length");
1996 }
1997 }
1998
1999 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2000 switch (vlen) {
2001 case 2:
2002 assert(vtmp2 == xnoreg, "");
2003 reduce2D(opcode, dst, src, vtmp1);
2004 break;
2005 case 4:
2006 reduce4D(opcode, dst, src, vtmp1, vtmp2);
2007 break;
2008 case 8:
2009 reduce8D(opcode, dst, src, vtmp1, vtmp2);
2010 break;
2011 default: assert(false, "wrong vector length");
2012 }
2013 }
2014
2015 void C2_MacroAssembler::unorderedReduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2016 switch (vlen) {
2017 case 2:
2018 assert(vtmp1 == xnoreg, "");
2019 assert(vtmp2 == xnoreg, "");
2020 unorderedReduce2F(opcode, dst, src);
2021 break;
2022 case 4:
2023 assert(vtmp2 == xnoreg, "");
2024 unorderedReduce4F(opcode, dst, src, vtmp1);
2025 break;
2026 case 8:
2027 unorderedReduce8F(opcode, dst, src, vtmp1, vtmp2);
2028 break;
2029 case 16:
2030 unorderedReduce16F(opcode, dst, src, vtmp1, vtmp2);
2031 break;
2032 default: assert(false, "wrong vector length");
2033 }
2034 }
2035
2036 void C2_MacroAssembler::unorderedReduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2037 switch (vlen) {
2038 case 2:
2039 assert(vtmp1 == xnoreg, "");
2040 assert(vtmp2 == xnoreg, "");
2041 unorderedReduce2D(opcode, dst, src);
2042 break;
2043 case 4:
2044 assert(vtmp2 == xnoreg, "");
2045 unorderedReduce4D(opcode, dst, src, vtmp1);
2046 break;
2047 case 8:
2048 unorderedReduce8D(opcode, dst, src, vtmp1, vtmp2);
2049 break;
2050 default: assert(false, "wrong vector length");
2051 }
2052 }
2053
// Reduce a 2-element int vector and fold in the scalar accumulator:
// dst = src1 <op> src2[0] <op> src2[1].
void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    // Horizontal add folds the two lanes in a single instruction.
    phaddd(vtmp1, vtmp1);
  } else {
    // Bring lane 1 down to lane 0, then combine the two lanes.
    pshufd(vtmp1, src2, 0x1);
    reduce_operation_128(T_INT, opcode, vtmp1, src2);
  }
  // Fold in the scalar input and move the result back to a GP register.
  movdl(vtmp2, src1);
  reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
  movdl(dst, vtmp1);
}
2068
// Reduce a 4-element int vector: fold the four lanes down to two partial
// results, then finish with the 2-element reduction.
void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    // Pairwise horizontal add leaves the two partial sums in lanes 0-1.
    phaddd(vtmp1, src2);
    reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  } else {
    // Move the upper pair onto the lower pair and combine.
    pshufd(vtmp2, src2, 0xE);
    reduce_operation_128(T_INT, opcode, vtmp2, src2);
    reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
  }
}
2082
// Reduce an 8-element int vector (256-bit): fold the high 128 bits onto the
// low 128 bits, then continue with the narrower reductions.
void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    // vphaddd produces partial sums per 128-bit half; add the two halves,
    // then finish as a 2-element reduction.
    vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
    vextracti128_high(vtmp2, vtmp1);
    vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
    reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  } else {
    vextracti128_high(vtmp1, src2);
    reduce_operation_128(T_INT, opcode, vtmp1, src2);
    reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  }
}
2095
// Reduce a 16-element int vector (512-bit): fold the high 256 bits onto the
// low 256 bits, then reduce the remaining 8 elements.
void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti64x4_high(vtmp2, src2);
  reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
  reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
2101
// Reduce an 8-byte vector and fold in the scalar accumulator src1; the
// result is left sign-extended to int in dst.
void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  // Successively halve the live bytes: 8 -> 4 -> 2 -> 1 in lane 0.
  pshufd(vtmp2, src2, 0x1);
  reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
  movdqu(vtmp1, vtmp2);
  psrldq(vtmp1, 2);
  reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
  movdqu(vtmp2, vtmp1);
  psrldq(vtmp2, 1);
  reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
  movdl(vtmp2, src1);
  // Widen the byte result to int before combining with the int-typed
  // accumulator: zero-extend for unsigned min/max, sign-extend otherwise.
  if (opcode == Op_UMinReductionV || opcode == Op_UMaxReductionV) {
    pmovzxbd(vtmp1, vtmp1);
  } else {
    pmovsxbd(vtmp1, vtmp1);
  }
  reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
  pextrb(dst, vtmp1, 0x0);
  movsbl(dst, dst);
}
2121
// Reduce a 16-byte vector: combine the upper 8 bytes with the lower 8,
// then finish as an 8-byte reduction.
void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  pshufd(vtmp1, src2, 0xE);
  reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
  reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}
2127
// Reduce a 32-byte vector (256-bit): fold the high 128 bits onto the low
// 128 bits, then finish as a 16-byte reduction.
void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti128_high(vtmp2, src2);
  reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
  reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
2133
// Reduce a 64-byte vector (512-bit): fold the high 256 bits onto the low
// 256 bits, then finish as a 32-byte reduction.
void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti64x4_high(vtmp1, src2);
  reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
  reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}
2139
// Byte multiply reduction has no byte-wide multiply instruction, so widen
// the 8 bytes to shorts (sign-extended) and reduce as shorts.
void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  pmovsxbw(vtmp2, src2);
  reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
2144
// 16-byte multiply reduction via widening to shorts.
void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (UseAVX > 1) {
    // AVX2: widen all 16 bytes to shorts in one 256-bit register.
    int vector_len = Assembler::AVX_256bit;
    vpmovsxbw(vtmp1, src2, vector_len);
    reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  } else {
    // Pre-AVX2: process in two 8-byte halves, threading the partial result
    // through dst as the scalar accumulator of the second half.
    pmovsxbw(vtmp2, src2);
    reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
    pshufd(vtmp2, src2, 0xe);
    pmovsxbw(vtmp2, vtmp2);
    reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
  }
}
2158
// 32-byte multiply reduction via widening to shorts.
void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
    // AVX-512BW: widen all 32 bytes to shorts in one 512-bit register.
    int vector_len = Assembler::AVX_512bit;
    vpmovsxbw(vtmp1, src2, vector_len);
    reduce32S(opcode, dst, src1, vtmp1, vtmp2, vtmp1);
  } else {
    // AVX2 fallback: reduce the two 16-byte halves, threading the partial
    // result through dst as the scalar accumulator of the second half.
    assert(UseAVX >= 2,"Should not reach here.");
    mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
    vextracti128_high(vtmp2, src2);
    mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
  }
}
2171
// 64-byte multiply reduction: reduce the two 256-bit halves, threading the
// partial result through dst as the scalar accumulator of the second half.
void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
  vextracti64x4_high(vtmp2, src2);
  mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
}
2177
// Reduce a 4-element short vector and fold in the scalar accumulator src1;
// the result is left sign-extended to int in dst.
void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    // Two horizontal adds fold the four lanes to one.
    phaddw(vtmp1, vtmp1);
    phaddw(vtmp1, vtmp1);
  } else {
    // Successively halve: 4 -> 2 -> 1 shorts in lane 0.
    pshufd(vtmp2, src2, 0x1);
    reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
    movdqu(vtmp1, vtmp2);
    psrldq(vtmp1, 2);
    reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
  }
  movdl(vtmp2, src1);
  // Widen the short result to int before combining with the int-typed
  // accumulator: zero-extend for unsigned min/max, sign-extend otherwise.
  if (opcode == Op_UMinReductionV || opcode == Op_UMaxReductionV) {
    pmovzxwd(vtmp1, vtmp1);
  } else {
    pmovsxwd(vtmp1, vtmp1);
  }
  reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
  pextrw(dst, vtmp1, 0x0);
  movswl(dst, dst);
}
2202
// Reduce an 8-element short vector: fold eight lanes down to four, then
// finish as a 4-element reduction.
void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    // Pairwise horizontal add leaves four partial sums in lanes 0-3.
    phaddw(vtmp1, src2);
  } else {
    // Move the upper four shorts onto the lower four and combine.
    assert_different_registers(src2, vtmp1);
    pshufd(vtmp1, src2, 0xE);
    reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
  }
  reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}
2216
// Reduce a 16-element short vector (256-bit) down to an 8-element problem.
void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    int vector_len = Assembler::AVX_256bit;
    // vphaddw adds pairs within each 128-bit half; vpermq with 0xD8
    // gathers the partial sums into the low 128 bits.
    vphaddw(vtmp2, src2, src2, vector_len);
    vpermq(vtmp2, vtmp2, 0xD8, vector_len);
  } else {
    // Fold the high 128 bits onto the low 128 bits.
    assert_different_registers(src2, vtmp2);
    vextracti128_high(vtmp2, src2);
    reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
  }
  reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
2229
2230 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2231 assert_different_registers(src2, vtmp1);
2232 int vector_len = Assembler::AVX_256bit;
2233 vextracti64x4_high(vtmp1, src2);
2234 reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
2235 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2236 }
2237
// Reduce a 2-element long vector and fold in the scalar accumulator:
// dst = src1 <op> src2[0] <op> src2[1].
void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  // Bring quadword 1 down to quadword 0 and combine the two lanes.
  pshufd(vtmp2, src2, 0xE);
  reduce_operation_128(T_LONG, opcode, vtmp2, src2);
  // Fold in the 64-bit scalar input and move the result back to a GP register.
  movdq(vtmp1, src1);
  reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
  movdq(dst, vtmp1);
}
2245
// Reduce a 4-element long vector (256-bit): fold the high 128 bits onto the
// low 128 bits, then finish as a 2-element reduction.
void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti128_high(vtmp1, src2);
  reduce_operation_128(T_LONG, opcode, vtmp1, src2);
  reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}
2251
// Reduce an 8-element long vector (512-bit): fold the high 256 bits onto the
// low 256 bits, then finish as a 4-element reduction.
void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti64x4_high(vtmp2, src2);
  reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
  reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
2257
// Build an opmask in dst with the low `len` bits set, using `temp` as a
// GP scratch register.
void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
  mov64(temp, -1L);
  bzhiq(temp, temp, len);  // zero all bits at index >= len
  kmovql(dst, temp);
}
2263
// Strictly-ordered 2-element float reduction:
// dst = (dst <op> src[0]) <op> src[1], one lane at a time (scalar ops).
void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  reduce_operation_128(T_FLOAT, opcode, dst, src);
  pshufd(vtmp, src, 0x1);  // bring lane 1 down to lane 0
  reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
}
2269
// Strictly-ordered 4-element float reduction: combine lanes 0-1 via
// reduce2F, then fold in lanes 2 and 3 in order.
void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  reduce2F(opcode, dst, src, vtmp);
  pshufd(vtmp, src, 0x2);  // lane 2 -> lane 0
  reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
  pshufd(vtmp, src, 0x3);  // lane 3 -> lane 0
  reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
}
2277
// Strictly-ordered 8-element float reduction: reduce the low 128 bits
// first, then extract and reduce the high 128 bits.
void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce4F(opcode, dst, src, vtmp2);
  vextractf128_high(vtmp2, src);
  reduce4F(opcode, dst, vtmp2, vtmp1);
}
2283
// Strictly-ordered 16-element float reduction: reduce the low 256 bits
// first, then extract and reduce the high 256 bits.
void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce8F(opcode, dst, src, vtmp1, vtmp2);
  vextracti64x4_high(vtmp1, src);
  reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
}
2289
// Unordered 2-element float reduction: dst = src[1] <op> src[0].
// Lane combination order is not preserved (unordered semantics).
void C2_MacroAssembler::unorderedReduce2F(int opcode, XMMRegister dst, XMMRegister src) {
  pshufd(dst, src, 0x1);  // lane 1 -> lane 0 of dst
  reduce_operation_128(T_FLOAT, opcode, dst, src);
}
2294
// Unordered 4-element float reduction: combine the upper lane pair with the
// lower pair using a packed op, then finish with the 2-element step.
void C2_MacroAssembler::unorderedReduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  pshufd(vtmp, src, 0xE);  // lanes 2-3 -> lanes 0-1
  unordered_reduce_operation_128(T_FLOAT, opcode, vtmp, src);
  unorderedReduce2F(opcode, dst, vtmp);
}
2300
// Unordered 8-element float reduction: fold the high 128 bits onto the low
// 128 bits with a packed op, then finish as a 4-element reduction.
void C2_MacroAssembler::unorderedReduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextractf128_high(vtmp1, src);
  unordered_reduce_operation_128(T_FLOAT, opcode, vtmp1, src);
  unorderedReduce4F(opcode, dst, vtmp1, vtmp2);
}
2306
// Unordered 16-element float reduction: fold the high 256 bits onto the low
// 256 bits with a packed op, then finish as an 8-element reduction.
void C2_MacroAssembler::unorderedReduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextractf64x4_high(vtmp2, src);
  unordered_reduce_operation_256(T_FLOAT, opcode, vtmp2, vtmp2, src);
  unorderedReduce8F(opcode, dst, vtmp2, vtmp1, vtmp2);
}
2312
// Strictly-ordered 2-element double reduction:
// dst = (dst <op> src[0]) <op> src[1], one lane at a time (scalar ops).
void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  reduce_operation_128(T_DOUBLE, opcode, dst, src);
  pshufd(vtmp, src, 0xE);  // lane 1 -> lane 0
  reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
}
2318
// Strictly-ordered 4-element double reduction: reduce the low 128 bits
// first, then extract and reduce the high 128 bits.
void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce2D(opcode, dst, src, vtmp2);
  vextractf128_high(vtmp2, src);
  reduce2D(opcode, dst, vtmp2, vtmp1);
}
2324
// Strictly-ordered 8-element double reduction: reduce the low 256 bits
// first, then extract and reduce the high 256 bits.
void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce4D(opcode, dst, src, vtmp1, vtmp2);
  vextracti64x4_high(vtmp1, src);
  reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
}
2330
// Unordered 2-element double reduction: dst = src[1] <op> src[0].
void C2_MacroAssembler::unorderedReduce2D(int opcode, XMMRegister dst, XMMRegister src) {
  pshufd(dst, src, 0xE);  // lane 1 -> lane 0 of dst
  reduce_operation_128(T_DOUBLE, opcode, dst, src);
}
2335
// Unordered 4-element double reduction: fold the high 128 bits onto the low
// 128 bits with a packed op, then finish with the 2-element step.
void C2_MacroAssembler::unorderedReduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  vextractf128_high(vtmp, src);
  unordered_reduce_operation_128(T_DOUBLE, opcode, vtmp, src);
  unorderedReduce2D(opcode, dst, vtmp);
}
2341
// Unordered 8-element double reduction: fold the high 256 bits onto the low
// 256 bits with a packed op, then finish as a 4-element reduction.
void C2_MacroAssembler::unorderedReduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextractf64x4_high(vtmp2, src);
  unordered_reduce_operation_256(T_DOUBLE, opcode, vtmp2, vtmp2, src);
  unorderedReduce4D(opcode, dst, vtmp2, vtmp1);
}
2347
// Masked vector load (memory -> register): forward to the shared
// MacroAssembler implementation.
void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
  MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
}
2351
// Masked vector store (register -> memory): forward to the shared
// MacroAssembler implementation.
void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
  MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
}
2355
// Masked register-to-register vector move: forward to the shared
// MacroAssembler implementation.
void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, XMMRegister src, bool merge, int vector_len) {
  MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
}
2359
2360 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask,
2361 int vec_enc) {
2362 switch(elem_bt) {
2363 case T_INT:
2364 case T_FLOAT:
2365 vmaskmovps(dst, src, mask, vec_enc);
2366 break;
2367 case T_LONG:
2368 case T_DOUBLE:
2369 vmaskmovpd(dst, src, mask, vec_enc);
2370 break;
2371 default:
2372 fatal("Unsupported type %s", type2name(elem_bt));
2373 break;
2374 }
2375 }
2376
2377 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask,
2378 int vec_enc) {
2379 switch(elem_bt) {
2380 case T_INT:
2381 case T_FLOAT:
2382 vmaskmovps(dst, src, mask, vec_enc);
2383 break;
2384 case T_LONG:
2385 case T_DOUBLE:
2386 vmaskmovpd(dst, src, mask, vec_enc);
2387 break;
2388 default:
2389 fatal("Unsupported type %s", type2name(elem_bt));
2390 break;
2391 }
2392 }
2393
// Float min/max reduction. Successively halves the working vector: each
// iteration brings the "other" half (high 256, high 128, or a permuted lane
// pair) into wtmp and combines it with the working source via vminmax_fp
// (or the AVX10.2 single-instruction form). If is_dst_valid, dst holds an
// accumulator that is folded in as a final step; otherwise the last
// iteration writes straight into dst.
void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
                                          XMMRegister dst, XMMRegister src,
                                          XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                          XMMRegister xmm_0, XMMRegister xmm_1) {
  // vpermilps immediates for the final two within-128-bit steps:
  // 1 swaps adjacent lanes, 14 (0b1110) moves the upper pair down.
  const int permconst[] = {1, 14};
  XMMRegister wsrc = src;
  XMMRegister wdst = xmm_0;
  XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;

  int vlen_enc = Assembler::AVX_128bit;
  if (vlen == 16) {
    vlen_enc = Assembler::AVX_256bit;
  }

  for (int i = log2(vlen) - 1; i >=0; i--) {
    // On the last step, write directly to dst when there is no accumulator
    // left to fold in.
    if (i == 0 && !is_dst_valid) {
      wdst = dst;
    }
    if (i == 3) {
      vextracti64x4_high(wtmp, wsrc);   // 512 -> 256 bits
    } else if (i == 2) {
      vextracti128_high(wtmp, wsrc);    // 256 -> 128 bits
    } else { // i = [0,1]
      vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
    }

    if (VM_Version::supports_avx10_2()) {
      vminmax_fp_avx10_2(opcode, T_FLOAT, wdst, k0, wtmp, wsrc, vlen_enc);
    } else {
      vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
    }
    // After the first combine everything of interest fits in 128 bits.
    wsrc = wdst;
    vlen_enc = Assembler::AVX_128bit;
  }
  if (is_dst_valid) {
    // Fold the reduced vector value into the incoming accumulator in dst.
    if (VM_Version::supports_avx10_2()) {
      vminmax_fp_avx10_2(opcode, T_FLOAT, dst, k0, wdst, dst, Assembler::AVX_128bit);
    } else {
      vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
    }
  }
}
2436
// Double min/max reduction, same halving strategy as reduceFloatMinMax:
// each iteration combines the upper half (or the adjacent lane) with the
// working source; if is_dst_valid, the accumulator in dst is folded in at
// the end.
void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                           XMMRegister xmm_0, XMMRegister xmm_1) {
  XMMRegister wsrc = src;
  XMMRegister wdst = xmm_0;
  XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
  int vlen_enc = Assembler::AVX_128bit;
  if (vlen == 8) {
    vlen_enc = Assembler::AVX_256bit;
  }
  for (int i = log2(vlen) - 1; i >=0; i--) {
    // On the last step, write directly to dst when there is no accumulator
    // left to fold in.
    if (i == 0 && !is_dst_valid) {
      wdst = dst;
    }
    if (i == 1) {
      vextracti128_high(wtmp, wsrc);    // 256 -> 128 bits
    } else if (i == 2) {
      vextracti64x4_high(wtmp, wsrc);   // 512 -> 256 bits
    } else {
      assert(i == 0, "%d", i);
      vpermilpd(wtmp, wsrc, 1, vlen_enc);  // swap the two doubles in a lane
    }

    if (VM_Version::supports_avx10_2()) {
      vminmax_fp_avx10_2(opcode, T_DOUBLE, wdst, k0, wtmp, wsrc, vlen_enc);
    } else {
      vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
    }

    // After the first combine everything of interest fits in 128 bits.
    wsrc = wdst;
    vlen_enc = Assembler::AVX_128bit;
  }

  if (is_dst_valid) {
    // Fold the reduced vector value into the incoming accumulator in dst.
    if (VM_Version::supports_avx10_2()) {
      vminmax_fp_avx10_2(opcode, T_DOUBLE, dst, k0, wdst, dst, Assembler::AVX_128bit);
    } else {
      vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
    }
  }
}
2478
2479 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2480 switch (bt) {
2481 case T_BYTE: pextrb(dst, src, idx); break;
2482 case T_SHORT: pextrw(dst, src, idx); break;
2483 case T_INT: pextrd(dst, src, idx); break;
2484 case T_LONG: pextrq(dst, src, idx); break;
2485
2486 default:
2487 assert(false,"Should not reach here.");
2488 break;
2489 }
2490 }
2491
2492 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2493 int esize = type2aelembytes(typ);
2494 int elem_per_lane = 16/esize;
2495 int lane = elemindex / elem_per_lane;
2496 int eindex = elemindex % elem_per_lane;
2497
2498 if (lane >= 2) {
2499 assert(UseAVX > 2, "required");
2500 vextractf32x4(dst, src, lane & 3);
2501 return dst;
2502 } else if (lane > 0) {
2503 assert(UseAVX > 0, "required");
2504 vextractf128(dst, src, lane);
2505 return dst;
2506 } else {
2507 return src;
2508 }
2509 }
2510
2511 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) {
2512 if (typ == T_BYTE) {
2513 movsbl(dst, dst);
2514 } else if (typ == T_SHORT) {
2515 movswl(dst, dst);
2516 }
2517 }
2518
// Extract the integral element at `elemindex` into GP register dst,
// sign-extending sub-int types. Only the position within a 128-bit lane is
// used here; the caller is expected to have selected the lane already
// (see get_lane).
void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
  int esize = type2aelembytes(typ);
  int elem_per_lane = 16/esize;
  int eindex = elemindex % elem_per_lane;
  assert(is_integral_type(typ),"required");

  if (eindex == 0) {
    // Element 0 can be moved out directly without a pextr* instruction.
    if (typ == T_LONG) {
      movq(dst, src);
    } else {
      movdl(dst, src);
      movsxl(typ, dst);
    }
  } else {
    extract(typ, dst, src, eindex);
    movsxl(typ, dst);
  }
}
2537
// Extract the float/double element at `elemindex` into the low part of dst,
// zeroing the bits above the element (for T_FLOAT via a 32-bit mask).
// Only the position within a 128-bit lane is used here; the caller is
// expected to have selected the lane already (see get_lane). vtmp is
// required only on the no-AVX T_FLOAT path.
void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) {
  int esize = type2aelembytes(typ);
  int elem_per_lane = 16/esize;
  int eindex = elemindex % elem_per_lane;
  assert((typ == T_FLOAT || typ == T_DOUBLE),"required");

  if (eindex == 0) {
    // movq copies the low 64 bits and zeroes the rest.
    movq(dst, src);
  } else {
    if (typ == T_FLOAT) {
      // Replicate the wanted float into lane 0 via a shuffle.
      if (UseAVX == 0) {
        movdqu(dst, src);
        shufps(dst, dst, eindex);
      } else {
        vshufps(dst, src, src, eindex, Assembler::AVX_128bit);
      }
    } else {
      // Shift the wanted double down to the low quadword.
      if (UseAVX == 0) {
        movdqu(dst, src);
        psrldq(dst, eindex*esize);
      } else {
        vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
      }
      movq(dst, dst);  // clear bits above the low 64
    }
  }
  // Zero upper bits
  if (typ == T_FLOAT) {
    if (UseAVX == 0) {
      assert(vtmp != xnoreg, "required.");
      movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg);
      pand(dst, vtmp);
    } else {
      vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg);
    }
  }
}
2575
2576 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2577 switch(typ) {
2578 case T_BYTE:
2579 case T_BOOLEAN:
2580 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2581 break;
2582 case T_SHORT:
2583 case T_CHAR:
2584 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2585 break;
2586 case T_INT:
2587 case T_FLOAT:
2588 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2589 break;
2590 case T_LONG:
2591 case T_DOUBLE:
2592 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2593 break;
2594 default:
2595 assert(false,"Should not reach here.");
2596 break;
2597 }
2598 }
2599
2600 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) {
2601 assert(rscratch != noreg || always_reachable(src2), "missing");
2602
2603 switch(typ) {
2604 case T_BOOLEAN:
2605 case T_BYTE:
2606 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2607 break;
2608 case T_CHAR:
2609 case T_SHORT:
2610 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2611 break;
2612 case T_INT:
2613 case T_FLOAT:
2614 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2615 break;
2616 case T_LONG:
2617 case T_DOUBLE:
2618 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2619 break;
2620 default:
2621 assert(false,"Should not reach here.");
2622 break;
2623 }
2624 }
2625
2626 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2627 switch(typ) {
2628 case T_BYTE:
2629 evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2630 break;
2631 case T_SHORT:
2632 evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2633 break;
2634 case T_INT:
2635 case T_FLOAT:
2636 evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2637 break;
2638 case T_LONG:
2639 case T_DOUBLE:
2640 evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2641 break;
2642 default:
2643 assert(false,"Should not reach here.");
2644 break;
2645 }
2646 }
2647
// Set the condition flags from a whole-vector test of src1 against src2,
// using vtestps when the element size is at least 4 bytes (and AVX is
// available on the 128-bit path), otherwise ptest/vptest. For vectors
// shorter than 16 bytes the low part of src1 is duplicated into 'vtmp'
// first so the unused upper lanes cannot affect the flags; 'vtmp' must be
// xnoreg in the 16- and 32-byte cases.
void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) {
  assert(vlen_in_bytes <= 32, "");
  int esize = type2aelembytes(bt);
  if (vlen_in_bytes == 32) {
    assert(vtmp == xnoreg, "required.");  // 256-bit path tests src1 directly
    if (esize >= 4) {
      vtestps(src1, src2, AVX_256bit);
    } else {
      vptest(src1, src2, AVX_256bit);
    }
    return;
  }
  if (vlen_in_bytes < 16) {
    // Duplicate the lower part to fill the whole register,
    // Don't need to do so for src2
    assert(vtmp != xnoreg, "required");
    int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04;
    pshufd(vtmp, src1, shuffle_imm);
  } else {
    assert(vtmp == xnoreg, "required");
    vtmp = src1;  // full 128-bit vector: test src1 as-is
  }
  if (esize >= 4 && VM_Version::supports_avx()) {
    vtestps(vtmp, src2, AVX_128bit);
  } else {
    ptest(vtmp, src2);
  }
}
2676
2677 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
2678 #ifdef ASSERT
2679 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2680 bool is_bw_supported = VM_Version::supports_avx512bw();
2681 if (is_bw && !is_bw_supported) {
2682 assert(vlen_enc != Assembler::AVX_512bit, "required");
2683 assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16),
2684 "XMM register should be 0-15");
2685 }
2686 #endif // ASSERT
2687 switch (elem_bt) {
2688 case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return;
2689 case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return;
2690 case T_INT: vpaddd(dst, src1, src2, vlen_enc); return;
2691 case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return;
2692 case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return;
2693 case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return;
2694 default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2695 }
2696 }
2697
2698 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) {
2699 assert(UseAVX >= 2, "required");
2700 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2701 bool is_vl = vlen_enc != Assembler::AVX_512bit;
2702 if ((UseAVX > 2) &&
2703 (!is_bw || VM_Version::supports_avx512bw()) &&
2704 (!is_vl || VM_Version::supports_avx512vl())) {
2705 switch (elem_bt) {
2706 case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return;
2707 case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
2708 case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
2709 case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
2710 default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2711 }
2712 } else {
2713 assert(vlen_enc != Assembler::AVX_512bit, "required");
2714 assert((dst->encoding() < 16),"XMM register should be 0-15");
2715 switch (elem_bt) {
2716 case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return;
2717 case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return;
2718 case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return;
2719 case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
2720 case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
2721 case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
2722 default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2723 }
2724 }
2725 }
2726
2727 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
2728 switch (to_elem_bt) {
2729 case T_SHORT:
2730 vpmovsxbw(dst, src, vlen_enc);
2731 break;
2732 case T_INT:
2733 vpmovsxbd(dst, src, vlen_enc);
2734 break;
2735 case T_FLOAT:
2736 vpmovsxbd(dst, src, vlen_enc);
2737 vcvtdq2ps(dst, dst, vlen_enc);
2738 break;
2739 case T_LONG:
2740 vpmovsxbq(dst, src, vlen_enc);
2741 break;
2742 case T_DOUBLE: {
2743 int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
2744 vpmovsxbd(dst, src, mid_vlen_enc);
2745 vcvtdq2pd(dst, dst, vlen_enc);
2746 break;
2747 }
2748 default:
2749 fatal("Unsupported type %s", type2name(to_elem_bt));
2750 break;
2751 }
2752 }
2753
2754 //-------------------------------------------------------------------------------------------
2755
// IndexOf for constant substrings with size >= 8 chars
// which don't need to be loaded through stack.
//
// On return 'result' holds the element index of the match, or -1 when the
// substring does not occur. 'ae' is the StrIntrinsicNode encoding pair
// (LL/UU/UL); LU is rejected by the assert below. For UL the Latin-1
// substring is zero-extended to chars with pmovzxbw before each compare.
void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
                                         Register cnt1, Register cnt2,
                                         int int_cnt2, Register result,
                                         XMMRegister vec, Register tmp,
                                         int ae) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");

  // This method uses the pcmpestri instruction with bound registers
  // inputs:
  //   xmm - substring
  //   rax - substring length (elements count)
  //   mem - scanned string
  //   rdx - string length (elements count)
  //   0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
  //   0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
  // outputs:
  //   rcx - matched index in string
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
  int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
  int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
  Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
  Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;

  Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
        RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
        MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;
  assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");

  // Load substring.
  if (ae == StrIntrinsicNode::UL) {
    pmovzxbw(vec, Address(str2, 0));
  } else {
    movdqu(vec, Address(str2, 0));
  }
  movl(cnt2, int_cnt2);
  movptr(result, str1); // string addr

  if (int_cnt2 > stride) {
    jmpb(SCAN_TO_SUBSTR);

    // Reload substr for rescan, this code
    // is executed only for large substrings (> 8 chars)
    bind(RELOAD_SUBSTR);
    if (ae == StrIntrinsicNode::UL) {
      pmovzxbw(vec, Address(str2, 0));
    } else {
      movdqu(vec, Address(str2, 0));
    }
    negptr(cnt2); // Jumped here with negative cnt2, convert to positive

    bind(RELOAD_STR);
    // We came here after the beginning of the substring was
    // matched but the rest of it was not so we need to search
    // again. Start from the next element after the previous match.

    // cnt2 is number of substring remaining elements and
    // cnt1 is number of string remaining elements when cmp failed.
    // Restored cnt1 = cnt1 - cnt2 + int_cnt2
    subl(cnt1, cnt2);
    addl(cnt1, int_cnt2);
    movl(cnt2, int_cnt2); // Now restore cnt2

    decrementl(cnt1);     // Shift to next element
    cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring

    addptr(result, (1<<scale1));

  } // (int_cnt2 > 8)

  // Scan string for start of substr in 16-byte vectors
  bind(SCAN_TO_SUBSTR);
  pcmpestri(vec, Address(result, 0), mode);
  jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
  subl(cnt1, stride);
  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
  cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
  addptr(result, 16);
  jmpb(SCAN_TO_SUBSTR);

  // Found a potential substr
  bind(FOUND_CANDIDATE);
  // Matched whole vector if first element matched (tmp(rcx) == 0).
  if (int_cnt2 == stride) {
    jccb(Assembler::overflow, RET_FOUND);    // OF == 1
  } else { // int_cnt2 > 8
    jccb(Assembler::overflow, FOUND_SUBSTR);
  }
  // After pcmpestri tmp(rcx) contains matched element index
  // Compute start addr of substr
  lea(result, Address(result, tmp, scale1));

  // Make sure string is still long enough
  subl(cnt1, tmp);
  cmpl(cnt1, cnt2);
  if (int_cnt2 == stride) {
    jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
  } else { // int_cnt2 > 8
    jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
  }
  // Left less than substring.

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmp(EXIT);

  if (int_cnt2 > stride) {
    // This code is optimized for the case when whole substring
    // is matched if its head is matched.
    bind(MATCH_SUBSTR_HEAD);
    pcmpestri(vec, Address(result, 0), mode);
    // Reload only string if does not match
    jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0

    Label CONT_SCAN_SUBSTR;
    // Compare the rest of substring (> 8 chars).
    bind(FOUND_SUBSTR);
    // First 8 chars are already matched.
    negptr(cnt2);
    addptr(cnt2, stride);

    bind(SCAN_SUBSTR);
    subl(cnt1, stride);
    cmpl(cnt2, -stride); // Do not read beyond substring
    jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
    // Back-up strings to avoid reading beyond substring:
    // cnt1 = cnt1 - cnt2 + 8
    addl(cnt1, cnt2); // cnt2 is negative
    addl(cnt1, stride);
    movl(cnt2, stride); negptr(cnt2);
    bind(CONT_SCAN_SUBSTR);
    if (int_cnt2 < (int)G) {
      // Constant offsets fit in the displacement field.
      int tail_off1 = int_cnt2<<scale1;
      int tail_off2 = int_cnt2<<scale2;
      if (ae == StrIntrinsicNode::UL) {
        pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
      } else {
        movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
      }
      pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
    } else {
      // calculate index in register to avoid integer overflow (int_cnt2*2)
      movl(tmp, int_cnt2);
      addptr(tmp, cnt2);
      if (ae == StrIntrinsicNode::UL) {
        pmovzxbw(vec, Address(str2, tmp, scale2, 0));
      } else {
        movdqu(vec, Address(str2, tmp, scale2, 0));
      }
      pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
    }
    // Need to reload strings pointers if not matched whole vector
    jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
    addptr(cnt2, stride);
    jcc(Assembler::negative, SCAN_SUBSTR);
    // Fall through if found full substring

  } // (int_cnt2 > 8)

  bind(RET_FOUND);
  // Found result if we matched full small substring.
  // Compute substr offset
  subptr(result, str1);
  if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
    shrl(result, 1); // index is in chars, address difference is in bytes
  }
  bind(EXIT);

} // string_indexofC8
2934
// Small strings are loaded through stack if they cross page boundary.
//
// General indexOf: handles a small (< 8 chars) constant substring
// (int_cnt2 > 0) or a non-constant substring whose length is in 'cnt2'
// (int_cnt2 == -1). To allow full 16-byte vector loads, strings shorter
// than one stride that end too close to a page boundary are first copied
// onto the stack. Returns the match index in 'result', or -1.
void C2_MacroAssembler::string_indexof(Register str1, Register str2,
                                       Register cnt1, Register cnt2,
                                       int int_cnt2, Register result,
                                       XMMRegister vec, Register tmp,
                                       int ae) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");

  //
  // int_cnt2 is length of small (< 8 chars) constant substring
  // or (-1) for non constant substring in which case its length
  // is in cnt2 register.
  //
  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;
  //
  int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
  assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
  // This method uses the pcmpestri instruction with bound registers
  // inputs:
  //   xmm - substring
  //   rax - substring length (elements count)
  //   mem - scanned string
  //   rdx - string length (elements count)
  //   0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
  //   0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
  // outputs:
  //   rcx - matched index in string
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
  int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
  Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
  Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;

  Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
        RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
        FOUND_CANDIDATE;

  { //========================================================
    // We don't know where these strings are located
    // and we can't read beyond them. Load them through stack.
    Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;

    movptr(tmp, rsp); // save old SP

    if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
      if (int_cnt2 == (1>>scale2)) { // One byte
        assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
        load_unsigned_byte(result, Address(str2, 0));
        movdl(vec, result); // move 32 bits
      } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
        // Not enough header space in 32-bit VM: 12+3 = 15.
        movl(result, Address(str2, -1));
        shrl(result, 8);
        movdl(vec, result); // move 32 bits
      } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
        load_unsigned_short(result, Address(str2, 0));
        movdl(vec, result); // move 32 bits
      } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
        movdl(vec, Address(str2, 0)); // move 32 bits
      } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
        movq(vec, Address(str2, 0));  // move 64 bits
      } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7})
        // Array header size is 12 bytes in 32-bit VM
        // + 6 bytes for 3 chars == 18 bytes,
        // enough space to load vec and shift.
        assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
        if (ae == StrIntrinsicNode::UL) {
          // Load ending at the last substring byte, then shift the
          // leading garbage out of the vector.
          int tail_off = int_cnt2-8;
          pmovzxbw(vec, Address(str2, tail_off));
          psrldq(vec, -2*tail_off);
        }
        else {
          int tail_off = int_cnt2*(1<<scale2);
          movdqu(vec, Address(str2, tail_off-16));
          psrldq(vec, 16-tail_off);
        }
      }
    } else { // not constant substring
      cmpl(cnt2, stride);
      jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough

      // We can read beyond string if srt+16 does not cross page boundary
      // since heaps are aligned and mapped by pages.
      assert(os::vm_page_size() < (int)G, "default page should be small");
      movl(result, str2); // We need only low 32 bits
      andl(result, ((int)os::vm_page_size()-1));
      cmpl(result, ((int)os::vm_page_size()-16));
      jccb(Assembler::belowEqual, CHECK_STR);

      // Move small strings to stack to allow load 16 bytes into vec.
      subptr(rsp, 16);
      int stk_offset = wordSize-(1<<scale2);
      push(cnt2);

      // Copy substring backwards, element by element, onto the stack.
      bind(COPY_SUBSTR);
      if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
        load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
        movb(Address(rsp, cnt2, scale2, stk_offset), result);
      } else if (ae == StrIntrinsicNode::UU) {
        load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
        movw(Address(rsp, cnt2, scale2, stk_offset), result);
      }
      decrement(cnt2);
      jccb(Assembler::notZero, COPY_SUBSTR);

      pop(cnt2);
      movptr(str2, rsp);  // New substring address
    } // non constant

    bind(CHECK_STR);
    cmpl(cnt1, stride);
    jccb(Assembler::aboveEqual, BIG_STRINGS);

    // Check cross page boundary.
    movl(result, str1); // We need only low 32 bits
    andl(result, ((int)os::vm_page_size()-1));
    cmpl(result, ((int)os::vm_page_size()-16));
    jccb(Assembler::belowEqual, BIG_STRINGS);

    subptr(rsp, 16);
    int stk_offset = -(1<<scale1);
    if (int_cnt2 < 0) { // not constant
      push(cnt2);
      stk_offset += wordSize;
    }
    movl(cnt2, cnt1);

    // Copy the string backwards, element by element, onto the stack.
    bind(COPY_STR);
    if (ae == StrIntrinsicNode::LL) {
      load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
      movb(Address(rsp, cnt2, scale1, stk_offset), result);
    } else {
      load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
      movw(Address(rsp, cnt2, scale1, stk_offset), result);
    }
    decrement(cnt2);
    jccb(Assembler::notZero, COPY_STR);

    if (int_cnt2 < 0) { // not constant
      pop(cnt2);
    }
    movptr(str1, rsp);  // New string address

    bind(BIG_STRINGS);
    // Load substring.
    if (int_cnt2 < 0) { // -1
      if (ae == StrIntrinsicNode::UL) {
        pmovzxbw(vec, Address(str2, 0));
      } else {
        movdqu(vec, Address(str2, 0));
      }
      push(cnt2); // substr count
      push(str2); // substr addr
      push(str1); // string addr
    } else {
      // Small (< 8 chars) constant substrings are loaded already.
      movl(cnt2, int_cnt2);
    }
    push(tmp);  // original SP

  } // Finished loading

  //========================================================
  // Start search
  //

  movptr(result, str1); // string addr

  if (int_cnt2  < 0) {  // Only for non constant substring
    jmpb(SCAN_TO_SUBSTR);

    // SP saved at sp+0
    // String saved at sp+1*wordSize
    // Substr saved at sp+2*wordSize
    // Substr count saved at sp+3*wordSize

    // Reload substr for rescan, this code
    // is executed only for large substrings (> 8 chars)
    bind(RELOAD_SUBSTR);
    movptr(str2, Address(rsp, 2*wordSize));
    movl(cnt2, Address(rsp, 3*wordSize));
    if (ae == StrIntrinsicNode::UL) {
      pmovzxbw(vec, Address(str2, 0));
    } else {
      movdqu(vec, Address(str2, 0));
    }
    // We came here after the beginning of the substring was
    // matched but the rest of it was not so we need to search
    // again. Start from the next element after the previous match.
    subptr(str1, result); // Restore counter
    if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
      shrl(str1, 1);
    }
    addl(cnt1, str1);
    decrementl(cnt1);   // Shift to next element
    cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring

    addptr(result, (1<<scale1));
  } // non constant

  // Scan string for start of substr in 16-byte vectors
  bind(SCAN_TO_SUBSTR);
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
  pcmpestri(vec, Address(result, 0), mode);
  jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
  subl(cnt1, stride);
  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
  cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
  addptr(result, 16);

  bind(ADJUST_STR);
  cmpl(cnt1, stride); // Do not read beyond string
  jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
  // Back-up string to avoid reading beyond string.
  lea(result, Address(result, cnt1, scale1, -16));
  movl(cnt1, stride);
  jmpb(SCAN_TO_SUBSTR);

  // Found a potential substr
  bind(FOUND_CANDIDATE);
  // After pcmpestri tmp(rcx) contains matched element index

  // Make sure string is still long enough
  subl(cnt1, tmp);
  cmpl(cnt1, cnt2);
  jccb(Assembler::greaterEqual, FOUND_SUBSTR);
  // Left less than substring.

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmp(CLEANUP);

  bind(FOUND_SUBSTR);
  // Compute start addr of substr
  lea(result, Address(result, tmp, scale1));
  if (int_cnt2 > 0) { // Constant substring
    // Repeat search for small substring (< 8 chars)
    // from new point without reloading substring.
    // Have to check that we don't read beyond string.
    cmpl(tmp, stride-int_cnt2);
    jccb(Assembler::greater, ADJUST_STR);
    // Fall through if matched whole substring.
  } else { // non constant
    assert(int_cnt2 == -1, "should be != 0");

    addl(tmp, cnt2);
    // Found result if we matched whole substring.
    cmpl(tmp, stride);
    jcc(Assembler::lessEqual, RET_FOUND);

    // Repeat search for small substring (<= 8 chars)
    // from new point 'str1' without reloading substring.
    cmpl(cnt2, stride);
    // Have to check that we don't read beyond string.
    jccb(Assembler::lessEqual, ADJUST_STR);

    Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
    // Compare the rest of substring (> 8 chars).
    movptr(str1, result);

    cmpl(tmp, cnt2);
    // First 8 chars are already matched.
    jccb(Assembler::equal, CHECK_NEXT);

    bind(SCAN_SUBSTR);
    pcmpestri(vec, Address(str1, 0), mode);
    // Need to reload strings pointers if not matched whole vector
    jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0

    bind(CHECK_NEXT);
    subl(cnt2, stride);
    jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
    addptr(str1, 16);
    if (ae == StrIntrinsicNode::UL) {
      addptr(str2, 8);
    } else {
      addptr(str2, 16);
    }
    subl(cnt1, stride);
    cmpl(cnt2, stride); // Do not read beyond substring
    jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
    // Back-up strings to avoid reading beyond substring.

    if (ae == StrIntrinsicNode::UL) {
      lea(str2, Address(str2, cnt2, scale2, -8));
      lea(str1, Address(str1, cnt2, scale1, -16));
    } else {
      lea(str2, Address(str2, cnt2, scale2, -16));
      lea(str1, Address(str1, cnt2, scale1, -16));
    }
    subl(cnt1, cnt2);
    movl(cnt2, stride);
    addl(cnt1, stride);
    bind(CONT_SCAN_SUBSTR);
    if (ae == StrIntrinsicNode::UL) {
      pmovzxbw(vec, Address(str2, 0));
    } else {
      movdqu(vec, Address(str2, 0));
    }
    jmp(SCAN_SUBSTR);

    bind(RET_FOUND_LONG);
    movptr(str1, Address(rsp, wordSize));
  } // non constant

  bind(RET_FOUND);
  // Compute substr offset
  subptr(result, str1);
  if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
    shrl(result, 1); // index is in chars, address difference is in bytes
  }
  bind(CLEANUP);
  pop(rsp); // restore SP

} // string_indexof
3255
// Find the first occurrence of the UTF-16 char 'ch' in the 'cnt1'-char
// string at 'str1'; leaves the char index in 'result', or -1 if absent.
// Scans 16 chars at a time with AVX2 when available, then 8 chars with SSE,
// then a scalar tail loop.
void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
                                            XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");

  int stride = 8;

  Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
        SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
        RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
        FOUND_SEQ_CHAR, DONE_LABEL;

  movptr(result, str1);
  if (UseAVX >= 2) {
    cmpl(cnt1, stride);
    jcc(Assembler::less, SCAN_TO_CHAR);
    cmpl(cnt1, 2*stride);
    jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
    // Broadcast the search char into all 16 word lanes of vec1;
    // vec2 stays zero for the vptest "any lane matched?" check.
    movdl(vec1, ch);
    vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
    vpxor(vec2, vec2);
    movl(tmp, cnt1);
    andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
    andl(cnt1,0x0000000F);  //tail count (in chars)

    bind(SCAN_TO_16_CHAR_LOOP);
    vmovdqu(vec3, Address(result, 0));
    vpcmpeqw(vec3, vec3, vec1, 1);
    vptest(vec2, vec3);
    jcc(Assembler::carryClear, FOUND_CHAR);
    addptr(result, 32);
    subl(tmp, 2*stride);
    jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
    jmp(SCAN_TO_8_CHAR);
    bind(SCAN_TO_8_CHAR_INIT);
    // SSE-width broadcast of the search char for the 8-char loop.
    movdl(vec1, ch);
    pshuflw(vec1, vec1, 0x00);
    pshufd(vec1, vec1, 0);
    pxor(vec2, vec2);
  }
  bind(SCAN_TO_8_CHAR);
  cmpl(cnt1, stride);
  jcc(Assembler::less, SCAN_TO_CHAR);
  if (UseAVX < 2) {
    // No AVX2: broadcast the search char here instead.
    movdl(vec1, ch);
    pshuflw(vec1, vec1, 0x00);
    pshufd(vec1, vec1, 0);
    pxor(vec2, vec2);
  }
  movl(tmp, cnt1);
  andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
  andl(cnt1,0x00000007);  //tail count (in chars)

  bind(SCAN_TO_8_CHAR_LOOP);
  movdqu(vec3, Address(result, 0));
  pcmpeqw(vec3, vec1);
  ptest(vec2, vec3);
  jcc(Assembler::carryClear, FOUND_CHAR);
  addptr(result, 16);
  subl(tmp, stride);
  jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
  bind(SCAN_TO_CHAR);
  testl(cnt1, cnt1);
  jcc(Assembler::zero, RET_NOT_FOUND);
  // Scalar tail: compare one char at a time.
  bind(SCAN_TO_CHAR_LOOP);
  load_unsigned_short(tmp, Address(result, 0));
  cmpl(ch, tmp);
  jccb(Assembler::equal, FOUND_SEQ_CHAR);
  addptr(result, 2);
  subl(cnt1, 1);
  jccb(Assembler::zero, RET_NOT_FOUND);
  jmp(SCAN_TO_CHAR_LOOP);

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmpb(DONE_LABEL);

  bind(FOUND_CHAR);
  // Locate the matched byte within the compare-result vector.
  if (UseAVX >= 2) {
    vpmovmskb(tmp, vec3);
  } else {
    pmovmskb(tmp, vec3);
  }
  bsfl(ch, tmp);
  addptr(result, ch);

  bind(FOUND_SEQ_CHAR);
  subptr(result, str1);
  shrl(result, 1);  // byte offset -> char index

  bind(DONE_LABEL);
} // string_indexof_char
3348
// Latin-1 variant of string_indexof_char: find the first occurrence of the
// byte 'ch' in the 'cnt1'-byte string at 'str1'; leaves the byte index in
// 'result', or -1 if absent. Scans 32 bytes at a time with AVX2 when
// available, then 16 bytes with SSE, then a scalar tail loop.
void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");

  int stride = 16;

  Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
        SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
        RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
        FOUND_SEQ_CHAR, DONE_LABEL;

  movptr(result, str1);
  if (UseAVX >= 2) {
    cmpl(cnt1, stride);
    jcc(Assembler::less, SCAN_TO_CHAR_INIT);
    cmpl(cnt1, stride*2);
    jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
    // Broadcast the search byte into all 32 lanes of vec1;
    // vec2 stays zero for the vptest "any lane matched?" check.
    movdl(vec1, ch);
    vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
    vpxor(vec2, vec2);
    movl(tmp, cnt1);
    andl(tmp, 0xFFFFFFE0);  //vector count (in chars)
    andl(cnt1,0x0000001F);  //tail count (in chars)

    bind(SCAN_TO_32_CHAR_LOOP);
    vmovdqu(vec3, Address(result, 0));
    vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
    vptest(vec2, vec3);
    jcc(Assembler::carryClear, FOUND_CHAR);
    addptr(result, 32);
    subl(tmp, stride*2);
    jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
    jmp(SCAN_TO_16_CHAR);

    bind(SCAN_TO_16_CHAR_INIT);
    // SSE-width broadcast: pshufb with an all-zero mask replicates byte 0.
    movdl(vec1, ch);
    pxor(vec2, vec2);
    pshufb(vec1, vec2);
  }

  bind(SCAN_TO_16_CHAR);
  cmpl(cnt1, stride);
  jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left
  if (UseAVX < 2) {
    // No AVX2: broadcast the search byte here instead.
    movdl(vec1, ch);
    pxor(vec2, vec2);
    pshufb(vec1, vec2);
  }
  movl(tmp, cnt1);
  andl(tmp, 0xFFFFFFF0);  //vector count (in bytes)
  andl(cnt1,0x0000000F);  //tail count (in bytes)

  bind(SCAN_TO_16_CHAR_LOOP);
  movdqu(vec3, Address(result, 0));
  pcmpeqb(vec3, vec1);
  ptest(vec2, vec3);
  jcc(Assembler::carryClear, FOUND_CHAR);
  addptr(result, 16);
  subl(tmp, stride);
  jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items...

  bind(SCAN_TO_CHAR_INIT);
  testl(cnt1, cnt1);
  jcc(Assembler::zero, RET_NOT_FOUND);
  // Scalar tail: compare one byte at a time.
  bind(SCAN_TO_CHAR_LOOP);
  load_unsigned_byte(tmp, Address(result, 0));
  cmpl(ch, tmp);
  jccb(Assembler::equal, FOUND_SEQ_CHAR);
  addptr(result, 1);
  subl(cnt1, 1);
  jccb(Assembler::zero, RET_NOT_FOUND);
  jmp(SCAN_TO_CHAR_LOOP);

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmpb(DONE_LABEL);

  bind(FOUND_CHAR);
  // Locate the matched byte within the compare-result vector.
  if (UseAVX >= 2) {
    vpmovmskb(tmp, vec3);
  } else {
    pmovmskb(tmp, vec3);
  }
  bsfl(ch, tmp);
  addptr(result, ch);

  bind(FOUND_SEQ_CHAR);
  subptr(result, str1);

  bind(DONE_LABEL);
} // stringL_indexof_char
3441
3442 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
3443 switch (eltype) {
3444 case T_BOOLEAN: return sizeof(jboolean);
3445 case T_BYTE: return sizeof(jbyte);
3446 case T_SHORT: return sizeof(jshort);
3447 case T_CHAR: return sizeof(jchar);
3448 case T_INT: return sizeof(jint);
3449 default:
3450 ShouldNotReachHere();
3451 return -1;
3452 }
3453 }
3454
3455 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
3456 switch (eltype) {
3457 // T_BOOLEAN used as surrogate for unsigned byte
3458 case T_BOOLEAN: movzbl(dst, src); break;
3459 case T_BYTE: movsbl(dst, src); break;
3460 case T_SHORT: movswl(dst, src); break;
3461 case T_CHAR: movzwl(dst, src); break;
3462 case T_INT: movl(dst, src); break;
3463 default:
3464 ShouldNotReachHere();
3465 }
3466 }
3467
3468 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) {
3469 load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
3470 }
3471
3472 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) {
3473 load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
3474 }
3475
3476 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) {
3477 const int vlen = Assembler::AVX_256bit;
3478 switch (eltype) {
3479 case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT); break;
3480 case T_BYTE: vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT); break;
3481 case T_SHORT: vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3482 case T_CHAR: vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3483 case T_INT:
3484 // do nothing
3485 break;
3486 default:
3487 ShouldNotReachHere();
3488 }
3489 }
3490
// Computes the 31-based polynomial hash of the cnt1 elements of type 'eltype'
// starting at ary1, folded into the caller-supplied initial value in 'result':
//   result = result*31^cnt1 + a[0]*31^(cnt1-1) + ... + a[cnt1-1]
// (i.e. the standard Arrays.hashCode recurrence). Requires AVX2: the main
// loop consumes 32 elements per iteration in four 256-bit accumulators and
// multiplies by the 31^k coefficients from the stub-generated powers-of-31
// table; the remainder is handled by a scalar loop unrolled 2x.
// Clobbers ary1, cnt1, index, tmp2, tmp3 and all the XMM arguments.
void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result,
                                        Register index, Register tmp2, Register tmp3, XMMRegister vnext,
                                        XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3,
                                        XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3,
                                        XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3,
                                        BasicType eltype) {
  ShortBranchVerifier sbv(this);
  assert(UseAVX >= 2, "AVX2 intrinsics are required");
  assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3);
  assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3);

  // NOTE(review): the UNROLLED_SCALAR_* labels are declared but never bound
  // or branched to in this function.
  Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN,
        SHORT_UNROLLED_LOOP_EXIT,
        UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME,
        UNROLLED_VECTOR_LOOP_BEGIN,
        END;
  switch (eltype) {
  case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
  case T_CHAR:    BLOCK_COMMENT("arrays_hashcode(char) {");          break;
  case T_BYTE:    BLOCK_COMMENT("arrays_hashcode(byte) {");          break;
  case T_SHORT:   BLOCK_COMMENT("arrays_hashcode(short) {");         break;
  case T_INT:     BLOCK_COMMENT("arrays_hashcode(int) {");           break;
  default:        BLOCK_COMMENT("arrays_hashcode {");                break;
  }

  // For "renaming" for readibility of the code
  const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 },
                    vresult[] = { vresult0, vresult1, vresult2, vresult3 },
                    vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 };

  // Element size in bytes; scales all memory operands below.
  const int elsize = arrays_hashcode_elsize(eltype);

  /*
    if (cnt1 >= 2) {
      if (cnt1 >= 32) {
        UNROLLED VECTOR LOOP
      }
      UNROLLED SCALAR LOOP
    }
    SINGLE SCALAR
   */

  cmpl(cnt1, 32);
  jcc(Assembler::less, SHORT_UNROLLED_BEGIN);

  // cnt1 >= 32 && generate_vectorized_loop
  xorl(index, index);

  // vresult = IntVector.zero(I256);
  for (int idx = 0; idx < 4; idx++) {
    vpxor(vresult[idx], vresult[idx]);
  }
  // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]);
  Register bound = tmp2;
  Register next = tmp3;
  // Table entry 0 is the per-iteration multiplier (31^32 for a 32-element
  // step); broadcast it into all lanes of vnext.
  lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint))));
  movl(next, Address(tmp2, 0));
  movdl(vnext, next);
  vpbroadcastd(vnext, vnext, Assembler::AVX_256bit);

  // index = 0;
  // bound = cnt1 & ~(32 - 1);
  movl(bound, cnt1);
  andl(bound, ~(32 - 1));
  // for (; index < bound; index += 32) {
  bind(UNROLLED_VECTOR_LOOP_BEGIN);
  // result *= next;
  imull(result, next);
  // loop fission to upfront the cost of fetching from memory, OOO execution
  // can then hopefully do a better job of prefetching
  for (int idx = 0; idx < 4; idx++) {
    arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype);
  }
  // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7];
  for (int idx = 0; idx < 4; idx++) {
    vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit);
    arrays_hashcode_elvcast(vtmp[idx], eltype);
    vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit);
  }
  // index += 32;
  addl(index, 32);
  // index < bound;
  cmpl(index, bound);
  jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN);
  // }

  // Advance ary1 past the vector-processed prefix; the scalar loop below
  // only sees the remaining cnt1 - bound elements.
  lea(ary1, Address(ary1, bound, Address::times(elsize)));
  subl(cnt1, bound);
  // release bound

  // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1);
  // Weight each lane by its positional coefficient 31^k (table entries 1..32).
  for (int idx = 0; idx < 4; idx++) {
    lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint))));
    arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT);
    vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit);
  }
  // result += vresult.reduceLanes(ADD);
  for (int idx = 0; idx < 4; idx++) {
    reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]);
  }

  // } else if (cnt1 < 32) {

  bind(SHORT_UNROLLED_BEGIN);
  // int i = 1;
  movl(index, 1);
  cmpl(index, cnt1);
  jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT);

  // for (; i < cnt1 ; i += 2) {
  // Each iteration folds two elements:
  //   result = result*31^2 + a[i-1]*31 + a[i], with 961 == 31*31 and
  //   a[i-1]*31 computed as (a[i-1] << 5) - a[i-1].
  bind(SHORT_UNROLLED_LOOP_BEGIN);
  movl(tmp3, 961);
  imull(result, tmp3);
  arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype);
  movl(tmp3, tmp2);
  shll(tmp3, 5);
  subl(tmp3, tmp2);
  addl(result, tmp3);
  arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype);
  addl(result, tmp3);
  addl(index, 2);
  cmpl(index, cnt1);
  jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN);

  // }
  // if (i >= cnt1) {
  bind(SHORT_UNROLLED_LOOP_EXIT);
  // Flags still hold the last cmpl(index, cnt1): 'greater' means the 2x loop
  // consumed the final element; 'equal' means exactly one trailing element is
  // left, folded here as result = result*31 + a[i-1] (again via shift-sub).
  jccb(Assembler::greater, END);
  movl(tmp2, result);
  shll(result, 5);
  subl(result, tmp2);
  arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype);
  addl(result, tmp3);
  // }
  bind(END);

  BLOCK_COMMENT("} // arrays_hashcode");

} // arrays_hashcode
3630
3631 // helper function for string_compare
3632 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3633 Address::ScaleFactor scale, Address::ScaleFactor scale1,
3634 Address::ScaleFactor scale2, Register index, int ae) {
3635 if (ae == StrIntrinsicNode::LL) {
3636 load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3637 load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3638 } else if (ae == StrIntrinsicNode::UU) {
3639 load_unsigned_short(elem1, Address(str1, index, scale, 0));
3640 load_unsigned_short(elem2, Address(str2, index, scale, 0));
3641 } else {
3642 load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3643 load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3644 }
3645 }
3646
3647 // Compare strings, used for char[] and byte[].
// On exit, result is negative, zero or positive as str1 compares less than,
// equal to, or greater than str2: the difference of the first mismatching
// elements, or the (signed) length difference if one string is a prefix of
// the other. ae is the encoding pair (StrIntrinsicNode::LL/UU/LU/UL, with
// Latin1 = 1 byte/element, UTF16 = 2 bytes/element); cnt1/cnt2 are the
// lengths (for LU/UL, cnt2 arrives in bytes and is halved below). For UL the
// final result is negated -- presumably the caller swapped the operands to
// canonicalize to byte-str1/short-str2 form; confirm at the call sites.
// Requires result == rax, cnt2 == rdx, cnt1 == rcx (pcmpestri contract).
void C2_MacroAssembler::string_compare(Register str1, Register str2,
                                       Register cnt1, Register cnt2, Register result,
                                       XMMRegister vec1, int ae, KRegister mask) {
  ShortBranchVerifier sbv(this);
  Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
  Label COMPARE_WIDE_VECTORS_LOOP_FAILED; // used only AVX3
  int stride, stride2, adr_stride, adr_stride1, adr_stride2;
  int stride2x2 = 0x40;
  Address::ScaleFactor scale = Address::no_scale;
  Address::ScaleFactor scale1 = Address::no_scale;
  Address::ScaleFactor scale2 = Address::no_scale;

  if (ae != StrIntrinsicNode::LL) {
    stride2x2 = 0x20;
  }

  if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
    shrl(cnt2, 1);
  }
  // Compute the minimum of the string lengths and the
  // difference of the string lengths (stack).
  // Do the conditional move stuff
  movl(result, cnt1);
  subl(cnt1, cnt2);
  // The length difference stays on the stack for the whole function; it is
  // popped into result at LENGTH_DIFF_LABEL or discarded at POP_LABEL.
  push(cnt1);
  cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2)

  // Is the minimum length zero?
  testl(cnt2, cnt2);
  jcc(Assembler::zero, LENGTH_DIFF_LABEL);
  if (ae == StrIntrinsicNode::LL) {
    // Load first bytes
    load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
    load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
  } else if (ae == StrIntrinsicNode::UU) {
    // Load first characters
    load_unsigned_short(result, Address(str1, 0));
    load_unsigned_short(cnt1, Address(str2, 0));
  } else {
    load_unsigned_byte(result, Address(str1, 0));
    load_unsigned_short(cnt1, Address(str2, 0));
  }
  subl(result, cnt1);
  jcc(Assembler::notZero, POP_LABEL);

  if (ae == StrIntrinsicNode::UU) {
    // Divide length by 2 to get number of chars
    shrl(cnt2, 1);
  }
  cmpl(cnt2, 1);
  jcc(Assembler::equal, LENGTH_DIFF_LABEL);

  // Check if the strings start at the same location and setup scale and stride
  if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
    cmpptr(str1, str2);
    jcc(Assembler::equal, LENGTH_DIFF_LABEL);
    if (ae == StrIntrinsicNode::LL) {
      scale = Address::times_1;
      stride = 16;
    } else {
      scale = Address::times_2;
      stride = 8;
    }
  } else {
    scale1 = Address::times_1;
    scale2 = Address::times_2;
    // scale not used
    stride = 8;
  }

  if (UseAVX >= 2 && UseSSE42Intrinsics) {
    Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
    Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
    Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
    Label COMPARE_TAIL_LONG;
    Label COMPARE_WIDE_VECTORS_LOOP_AVX3; // used only AVX3

    // pcmpestri control byte; bit 0 selects word (1) vs byte (0) elements,
    // see the mode breakdown in the SSE4.2-only branch below.
    int pcmpmask = 0x19;
    if (ae == StrIntrinsicNode::LL) {
      pcmpmask &= ~0x01;
    }

    // Setup to compare 16-chars (32-bytes) vectors,
    // start from first character again because it has aligned address.
    if (ae == StrIntrinsicNode::LL) {
      stride2 = 32;
    } else {
      stride2 = 16;
    }
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      adr_stride = stride << scale;
    } else {
      adr_stride1 = 8;  //stride << scale1;
      adr_stride2 = 16; //stride << scale2;
    }

    assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
    // rax and rdx are used by pcmpestri as elements counters
    movl(result, cnt2);
    andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count
    jcc(Assembler::zero, COMPARE_TAIL_LONG);

    // fast path : compare first 2 8-char vectors.
    bind(COMPARE_16_CHARS);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, 0));
    } else {
      pmovzxbw(vec1, Address(str1, 0));
    }
    pcmpestri(vec1, Address(str2, 0), pcmpmask);
    jccb(Assembler::below, COMPARE_INDEX_CHAR);

    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, adr_stride));
      pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
    } else {
      pmovzxbw(vec1, Address(str1, adr_stride1));
      pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
    }
    jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
    addl(cnt1, stride);

    // Compare the characters at index in cnt1
    bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
    load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
    subl(result, cnt2);
    jmp(POP_LABEL);

    // Setup the registers to start vector comparison loop
    bind(COMPARE_WIDE_VECTORS);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    subl(result, stride2);
    subl(cnt2, stride2);
    jcc(Assembler::zero, COMPARE_WIDE_TAIL);
    negptr(result);

    // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
    bind(COMPARE_WIDE_VECTORS_LOOP);

    if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
      cmpl(cnt2, stride2x2);
      jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
      testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40

      bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
      if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
        evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
        evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
      } else {
        vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
        evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
      }
      kortestql(mask, mask);
      jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
      addptr(result, stride2x2);  // update since we already compared at this addr
      subl(cnt2, stride2x2);      // and sub the size too
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);

      vpxor(vec1, vec1);
      jmpb(COMPARE_WIDE_TAIL);
    }//if (VM_Version::supports_avx512vlbw())

    bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      vmovdqu(vec1, Address(str1, result, scale));
      vpxor(vec1, Address(str2, result, scale));
    } else {
      vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
      vpxor(vec1, Address(str2, result, scale2));
    }
    vptest(vec1, vec1);
    jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
    addptr(result, stride2);
    subl(cnt2, stride2);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);

    // compare wide vectors tail
    bind(COMPARE_WIDE_TAIL);
    testptr(result, result);
    jcc(Assembler::zero, LENGTH_DIFF_LABEL);

    movl(result, stride2);
    movl(cnt2, result);
    negptr(result);
    jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);

    // Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors.
    bind(VECTOR_NOT_EQUAL);
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    jmp(COMPARE_16_CHARS);

    // Compare tail chars, length between 1 to 15 chars
    bind(COMPARE_TAIL_LONG);
    movl(cnt2, result);
    cmpl(cnt2, stride);
    jcc(Assembler::less, COMPARE_SMALL_STR);

    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, 0));
    } else {
      pmovzxbw(vec1, Address(str1, 0));
    }
    pcmpestri(vec1, Address(str2, 0), pcmpmask);
    jcc(Assembler::below, COMPARE_INDEX_CHAR);
    subptr(cnt2, stride);
    jcc(Assembler::zero, LENGTH_DIFF_LABEL);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    negptr(cnt2);
    jmpb(WHILE_HEAD_LABEL);

    bind(COMPARE_SMALL_STR);
  } else if (UseSSE42Intrinsics) {
    Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
    int pcmpmask = 0x19;
    // Setup to compare 8-char (16-byte) vectors,
    // start from first character again because it has aligned address.
    movl(result, cnt2);
    andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
    if (ae == StrIntrinsicNode::LL) {
      pcmpmask &= ~0x01;
    }
    jcc(Assembler::zero, COMPARE_TAIL);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    negptr(result);

    // pcmpestri
    //   inputs:
    //     vec1- substring
    //     rax - negative string length (elements count)
    //     mem - scanned string
    //     rdx - string length (elements count)
    //     pcmpmask - cmp mode: 11000 (string compare with negated result)
    //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
    //   outputs:
    //     rcx - first mismatched element index
    assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");

    bind(COMPARE_WIDE_VECTORS);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, result, scale));
      pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
    } else {
      pmovzxbw(vec1, Address(str1, result, scale1));
      pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
    }
    // After pcmpestri cnt1(rcx) contains mismatched element index

    jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
    addptr(result, stride);
    subptr(cnt2, stride);
    jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);

    // compare wide vectors tail
    testptr(result, result);
    jcc(Assembler::zero, LENGTH_DIFF_LABEL);

    movl(cnt2, stride);
    movl(result, stride);
    negptr(result);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, result, scale));
      pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
    } else {
      pmovzxbw(vec1, Address(str1, result, scale1));
      pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
    }
    jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);

    // Mismatched characters in the vectors
    bind(VECTOR_NOT_EQUAL);
    addptr(cnt1, result);
    load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
    subl(result, cnt2);
    jmpb(POP_LABEL);

    bind(COMPARE_TAIL); // limit is zero
    movl(cnt2, result);
    // Fallthru to tail compare
  }
  // Shift str2 and str1 to the end of the arrays, negate min
  if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
    lea(str1, Address(str1, cnt2, scale));
    lea(str2, Address(str2, cnt2, scale));
  } else {
    lea(str1, Address(str1, cnt2, scale1));
    lea(str2, Address(str2, cnt2, scale2));
  }
  decrementl(cnt2);  // first character was compared already
  negptr(cnt2);

  // Compare the rest of the elements
  bind(WHILE_HEAD_LABEL);
  load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
  subl(result, cnt1);
  jccb(Assembler::notZero, POP_LABEL);
  increment(cnt2);
  jccb(Assembler::notZero, WHILE_HEAD_LABEL);

  // Strings are equal up to min length.  Return the length difference.
  bind(LENGTH_DIFF_LABEL);
  pop(result);
  if (ae == StrIntrinsicNode::UU) {
    // Divide diff by 2 to get number of chars
    sarl(result, 1);
  }
  jmpb(DONE_LABEL);

  if (VM_Version::supports_avx512vlbw()) {

    bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);

    // Locate the first mismatching byte: invert the lane-equality mask and
    // find its lowest set bit, then convert the byte offset to an element
    // offset for the 2-byte encodings.
    kmovql(cnt1, mask);
    notq(cnt1);
    bsfq(cnt2, cnt1);
    if (ae != StrIntrinsicNode::LL) {
      // Divide diff by 2 to get number of chars
      sarl(cnt2, 1);
    }
    addq(result, cnt2);
    if (ae == StrIntrinsicNode::LL) {
      load_unsigned_byte(cnt1, Address(str2, result));
      load_unsigned_byte(result, Address(str1, result));
    } else if (ae == StrIntrinsicNode::UU) {
      load_unsigned_short(cnt1, Address(str2, result, scale));
      load_unsigned_short(result, Address(str1, result, scale));
    } else {
      load_unsigned_short(cnt1, Address(str2, result, scale2));
      load_unsigned_byte(result, Address(str1, result, scale1));
    }
    subl(result, cnt1);
    jmpb(POP_LABEL);
  }//if (VM_Version::supports_avx512vlbw())

  // Discard the stored length difference
  bind(POP_LABEL);
  pop(cnt1);

  // That's it
  bind(DONE_LABEL);
  if(ae == StrIntrinsicNode::UL) {
    negl(result);
  }

}
4021
4022 // Search for Non-ASCII character (Negative byte value) in a byte array,
4023 // return the index of the first such character, otherwise the length
4024 // of the array segment searched.
4025 // ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
4026 // @IntrinsicCandidate
4027 // public static int countPositives(byte[] ba, int off, int len) {
4028 // for (int i = off; i < off + len; i++) {
4029 // if (ba[i] < 0) {
4030 // return i - off;
4031 // }
4032 // }
4033 // return len;
4034 // }
// See the countPositives pseudo-code above: sets 'result' to the index of the
// first negative byte in [ary1, ary1 + len), or to len if there is none.
// tmp1, vec1/vec2 (and mask1/mask2 on the AVX512 path) are clobbered
// temporaries; ary1 and len are consumed. Three code shapes are emitted
// depending on CPU features: AVX512 (64-byte masked compares), AVX2 (32-byte
// vptest), SSE4.2 (16-byte ptest), all falling through to a shared
// 4/2/1-byte scalar tail.
void C2_MacroAssembler::count_positives(Register ary1, Register len,
  Register result, Register tmp1,
  XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
  // rsi: byte array
  // rcx: len
  // rax: result
  ShortBranchVerifier sbv(this);
  assert_different_registers(ary1, len, result, tmp1);
  assert_different_registers(vec1, vec2);
  Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;

  movl(result, len); // copy
  // len == 0
  testl(len, len);
  jcc(Assembler::zero, DONE);

  if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
    VM_Version::supports_avx512vlbw() &&
    VM_Version::supports_bmi2()) {

    Label test_64_loop, test_tail, BREAK_LOOP;
    movl(tmp1, len);
    vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);

    andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F
    andl(len,  0xffffffc0); // vector count (in chars)
    jccb(Assembler::zero, test_tail);

    lea(ary1, Address(ary1, len, Address::times_1));
    negptr(len);

    bind(test_64_loop);
    // Check whether our 64 elements of size byte contain negatives
    evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
    kortestql(mask1, mask1);
    jcc(Assembler::notZero, BREAK_LOOP);

    addptr(len, 64);
    jccb(Assembler::notZero, test_64_loop);

    bind(test_tail);
    // bail out when there is nothing to be done
    testl(tmp1, -1); // sets ZF iff the tail count is zero
    jcc(Assembler::zero, DONE);


    // check the tail for absence of negatives
    // ~(~0 << len) applied up to two times (for 32-bit scenario)
    {
      Register tmp3_aliased = len;
      // Build a k-mask with the low tmp1 bits set, so the compare below only
      // considers the tail bytes.
      mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
      shlxq(tmp3_aliased, tmp3_aliased, tmp1);
      notq(tmp3_aliased);
      kmovql(mask2, tmp3_aliased);
    }

    evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
    ktestq(mask1, mask2);
    jcc(Assembler::zero, DONE);

    // do a full check for negative registers in the tail
    movl(len, tmp1); // tmp1 holds low 6-bit from original len;
                     // ary1 already pointing to the right place
    jmpb(TAIL_START);

    bind(BREAK_LOOP);
    // At least one byte in the last 64 byte block was negative.
    // Set up to look at the last 64 bytes as if they were a tail
    lea(ary1, Address(ary1, len, Address::times_1));
    addptr(result, len);
    // Ignore the very last byte: if all others are positive,
    // it must be negative, so we can skip right to the 2+1 byte
    // end comparison at this point
    orl(result, 63);
    movl(len, 63);
    // Fallthru to tail compare
  } else {

    if (UseAVX >= 2) {
      // With AVX2, use 32-byte vector compare
      Label COMPARE_WIDE_VECTORS, BREAK_LOOP;

      // Compare 32-byte vectors
      testl(len, 0xffffffe0);   // vector count (in bytes)
      jccb(Assembler::zero, TAIL_START);

      andl(len, 0xffffffe0);
      lea(ary1, Address(ary1, len, Address::times_1));
      negptr(len);

      movl(tmp1, 0x80808080);   // create mask to test for Unicode chars in vector
      movdl(vec2, tmp1);
      vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);

      bind(COMPARE_WIDE_VECTORS);
      vmovdqu(vec1, Address(ary1, len, Address::times_1));
      vptest(vec1, vec2);
      jccb(Assembler::notZero, BREAK_LOOP);
      addptr(len, 32);
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);

      testl(result, 0x0000001f);   // any bytes remaining?
      jcc(Assembler::zero, DONE);

      // Quick test using the already prepared vector mask
      movl(len, result);
      andl(len, 0x0000001f);
      vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
      vptest(vec1, vec2);
      jcc(Assembler::zero, DONE);
      // There are zeros, jump to the tail to determine exactly where
      jmpb(TAIL_START);

      bind(BREAK_LOOP);
      // At least one byte in the last 32-byte vector is negative.
      // Set up to look at the last 32 bytes as if they were a tail
      lea(ary1, Address(ary1, len, Address::times_1));
      addptr(result, len);
      // Ignore the very last byte: if all others are positive,
      // it must be negative, so we can skip right to the 2+1 byte
      // end comparison at this point
      orl(result, 31);
      movl(len, 31);
      // Fallthru to tail compare
    } else if (UseSSE42Intrinsics) {
      // With SSE4.2, use double quad vector compare
      Label COMPARE_WIDE_VECTORS, BREAK_LOOP;

      // Compare 16-byte vectors
      testl(len, 0xfffffff0);   // vector count (in bytes)
      jcc(Assembler::zero, TAIL_START);

      andl(len, 0xfffffff0);
      lea(ary1, Address(ary1, len, Address::times_1));
      negptr(len);

      movl(tmp1, 0x80808080);
      movdl(vec2, tmp1);
      pshufd(vec2, vec2, 0);

      bind(COMPARE_WIDE_VECTORS);
      movdqu(vec1, Address(ary1, len, Address::times_1));
      ptest(vec1, vec2);
      jccb(Assembler::notZero, BREAK_LOOP);
      addptr(len, 16);
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);

      testl(result, 0x0000000f); // len is zero, any bytes remaining?
      jcc(Assembler::zero, DONE);

      // Quick test using the already prepared vector mask
      movl(len, result);
      andl(len, 0x0000000f);   // tail count (in bytes)
      movdqu(vec1, Address(ary1, len, Address::times_1, -16));
      ptest(vec1, vec2);
      jcc(Assembler::zero, DONE);
      jmpb(TAIL_START);

      bind(BREAK_LOOP);
      // At least one byte in the last 16-byte vector is negative.
      // Set up and look at the last 16 bytes as if they were a tail
      lea(ary1, Address(ary1, len, Address::times_1));
      addptr(result, len);
      // Ignore the very last byte: if all others are positive,
      // it must be negative, so we can skip right to the 2+1 byte
      // end comparison at this point
      orl(result, 15);
      movl(len, 15);
      // Fallthru to tail compare
    }
  }

  bind(TAIL_START);
  // Compare 4-byte vectors
  andl(len, 0xfffffffc); // vector count (in bytes)
  jccb(Assembler::zero, COMPARE_CHAR);

  lea(ary1, Address(ary1, len, Address::times_1));
  negptr(len);

  bind(COMPARE_VECTORS);
  movl(tmp1, Address(ary1, len, Address::times_1));
  // 0x80 in any byte lane means that byte is negative.
  andl(tmp1, 0x80808080);
  jccb(Assembler::notZero, TAIL_ADJUST);
  addptr(len, 4);
  jccb(Assembler::notZero, COMPARE_VECTORS);

  // Compare trailing char (final 2-3 bytes), if any
  bind(COMPARE_CHAR);

  testl(result, 0x2);   // tail  char
  jccb(Assembler::zero, COMPARE_BYTE);
  load_unsigned_short(tmp1, Address(ary1, 0));
  andl(tmp1, 0x00008080);
  jccb(Assembler::notZero, CHAR_ADJUST);
  lea(ary1, Address(ary1, 2));

  bind(COMPARE_BYTE);
  testl(result, 0x1);   // tail  byte
  jccb(Assembler::zero, DONE);
  load_unsigned_byte(tmp1, Address(ary1, 0));
  testl(tmp1, 0x00000080);
  jccb(Assembler::zero, DONE);
  // Last byte is negative: exclude it from the count.
  subptr(result, 1);
  jmpb(DONE);

  bind(TAIL_ADJUST);
  // there are negative bits in the last 4 byte block.
  // Adjust result and check the next three bytes
  addptr(result, len);
  orl(result, 3);
  lea(ary1, Address(ary1, len, Address::times_1));
  jmpb(COMPARE_CHAR);

  bind(CHAR_ADJUST);
  // We are looking at a char + optional byte tail, and found that one
  // of the bytes in the char is negative. Adjust the result, check the
  // first byte and readjust if needed.
  andl(result, 0xfffffffc);
  testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
  jccb(Assembler::notZero, DONE);
  addptr(result, 1);

  // That's it
  bind(DONE);
  if (UseAVX >= 2) {
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    vpxor(vec2, vec2);
  }
}
4266
4267 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
4268 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
4269 Register limit, Register result, Register chr,
4270 XMMRegister vec1, XMMRegister vec2, bool is_char,
4271 KRegister mask, bool expand_ary2) {
4272 // for expand_ary2, limit is the (smaller) size of the second array.
4273 ShortBranchVerifier sbv(this);
4274 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
4275
4276 assert((!expand_ary2) || ((expand_ary2) && (UseAVX == 2)),
4277 "Expansion only implemented for AVX2");
4278
4279 int length_offset = arrayOopDesc::length_offset_in_bytes();
4280 int base_offset = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
4281
4282 Address::ScaleFactor scaleFactor = expand_ary2 ? Address::times_2 : Address::times_1;
4283 int scaleIncr = expand_ary2 ? 8 : 16;
4284
4285 if (is_array_equ) {
4286 // Check the input args
4287 cmpoop(ary1, ary2);
4288 jcc(Assembler::equal, TRUE_LABEL);
4289
4290 // Need additional checks for arrays_equals.
4291 testptr(ary1, ary1);
4292 jcc(Assembler::zero, FALSE_LABEL);
4293 testptr(ary2, ary2);
4294 jcc(Assembler::zero, FALSE_LABEL);
4295
4296 // Check the lengths
4297 movl(limit, Address(ary1, length_offset));
4298 cmpl(limit, Address(ary2, length_offset));
4299 jcc(Assembler::notEqual, FALSE_LABEL);
4300 }
4301
4302 // count == 0
4303 testl(limit, limit);
4304 jcc(Assembler::zero, TRUE_LABEL);
4305
4306 if (is_array_equ) {
4307 // Load array address
4308 lea(ary1, Address(ary1, base_offset));
4309 lea(ary2, Address(ary2, base_offset));
4310 }
4311
4312 if (is_array_equ && is_char) {
4313 // arrays_equals when used for char[].
4314 shll(limit, 1); // byte count != 0
4315 }
4316 movl(result, limit); // copy
4317
4318 if (UseAVX >= 2) {
4319 // With AVX2, use 32-byte vector compare
4320 Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16;
4321
4322 // Compare 32-byte vectors
4323 if (expand_ary2) {
4324 andl(result, 0x0000000f); // tail count (in bytes)
4325 andl(limit, 0xfffffff0); // vector count (in bytes)
4326 jcc(Assembler::zero, COMPARE_TAIL);
4327 } else {
4328 andl(result, 0x0000001f); // tail count (in bytes)
4329 andl(limit, 0xffffffe0); // vector count (in bytes)
4330 jcc(Assembler::zero, COMPARE_TAIL_16);
4331 }
4332
4333 lea(ary1, Address(ary1, limit, scaleFactor));
4334 lea(ary2, Address(ary2, limit, Address::times_1));
4335 negptr(limit);
4336
4337 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
4338 Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
4339
4340 cmpl(limit, -64);
4341 jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
4342
4343 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
4344
4345 evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
4346 evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
4347 kortestql(mask, mask);
4348 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare
4349 addptr(limit, 64); // update since we already compared at this addr
4350 cmpl(limit, -64);
4351 jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
4352
4353 // At this point we may still need to compare -limit+result bytes.
4354 // We could execute the next two instruction and just continue via non-wide path:
4355 // cmpl(limit, 0);
4356 // jcc(Assembler::equal, COMPARE_TAIL); // true
4357 // But since we stopped at the points ary{1,2}+limit which are
4358 // not farther than 64 bytes from the ends of arrays ary{1,2}+result
4359 // (|limit| <= 32 and result < 32),
4360 // we may just compare the last 64 bytes.
4361 //
4362 addptr(result, -64); // it is safe, bc we just came from this area
4363 evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
4364 evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
4365 kortestql(mask, mask);
4366 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare
4367
4368 jmp(TRUE_LABEL);
4369
4370 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
4371
4372 }//if (VM_Version::supports_avx512vlbw())
4373
4374 bind(COMPARE_WIDE_VECTORS);
4375 vmovdqu(vec1, Address(ary1, limit, scaleFactor));
4376 if (expand_ary2) {
4377 vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit);
4378 } else {
4379 vmovdqu(vec2, Address(ary2, limit, Address::times_1));
4380 }
4381 vpxor(vec1, vec2);
4382
4383 vptest(vec1, vec1);
4384 jcc(Assembler::notZero, FALSE_LABEL);
4385 addptr(limit, scaleIncr * 2);
4386 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4387
4388 testl(result, result);
4389 jcc(Assembler::zero, TRUE_LABEL);
4390
4391 vmovdqu(vec1, Address(ary1, result, scaleFactor, -32));
4392 if (expand_ary2) {
4393 vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit);
4394 } else {
4395 vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
4396 }
4397 vpxor(vec1, vec2);
4398
4399 vptest(vec1, vec1);
4400 jcc(Assembler::notZero, FALSE_LABEL);
4401 jmp(TRUE_LABEL);
4402
4403 bind(COMPARE_TAIL_16); // limit is zero
4404 movl(limit, result);
4405
4406 // Compare 16-byte chunks
4407 andl(result, 0x0000000f); // tail count (in bytes)
4408 andl(limit, 0xfffffff0); // vector count (in bytes)
4409 jcc(Assembler::zero, COMPARE_TAIL);
4410
4411 lea(ary1, Address(ary1, limit, scaleFactor));
4412 lea(ary2, Address(ary2, limit, Address::times_1));
4413 negptr(limit);
4414
4415 bind(COMPARE_WIDE_VECTORS_16);
4416 movdqu(vec1, Address(ary1, limit, scaleFactor));
4417 if (expand_ary2) {
4418 vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit);
4419 } else {
4420 movdqu(vec2, Address(ary2, limit, Address::times_1));
4421 }
4422 pxor(vec1, vec2);
4423
4424 ptest(vec1, vec1);
4425 jcc(Assembler::notZero, FALSE_LABEL);
4426 addptr(limit, scaleIncr);
4427 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16);
4428
4429 bind(COMPARE_TAIL); // limit is zero
4430 movl(limit, result);
4431 // Fallthru to tail compare
4432 } else if (UseSSE42Intrinsics) {
4433 // With SSE4.2, use double quad vector compare
4434 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4435
4436 // Compare 16-byte vectors
4437 andl(result, 0x0000000f); // tail count (in bytes)
4438 andl(limit, 0xfffffff0); // vector count (in bytes)
4439 jcc(Assembler::zero, COMPARE_TAIL);
4440
4441 lea(ary1, Address(ary1, limit, Address::times_1));
4442 lea(ary2, Address(ary2, limit, Address::times_1));
4443 negptr(limit);
4444
4445 bind(COMPARE_WIDE_VECTORS);
4446 movdqu(vec1, Address(ary1, limit, Address::times_1));
4447 movdqu(vec2, Address(ary2, limit, Address::times_1));
4448 pxor(vec1, vec2);
4449
4450 ptest(vec1, vec1);
4451 jcc(Assembler::notZero, FALSE_LABEL);
4452 addptr(limit, 16);
4453 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4454
4455 testl(result, result);
4456 jcc(Assembler::zero, TRUE_LABEL);
4457
4458 movdqu(vec1, Address(ary1, result, Address::times_1, -16));
4459 movdqu(vec2, Address(ary2, result, Address::times_1, -16));
4460 pxor(vec1, vec2);
4461
4462 ptest(vec1, vec1);
4463 jccb(Assembler::notZero, FALSE_LABEL);
4464 jmpb(TRUE_LABEL);
4465
4466 bind(COMPARE_TAIL); // limit is zero
4467 movl(limit, result);
4468 // Fallthru to tail compare
4469 }
4470
4471 // Compare 4-byte vectors
4472 if (expand_ary2) {
4473 testl(result, result);
4474 jccb(Assembler::zero, TRUE_LABEL);
4475 } else {
4476 andl(limit, 0xfffffffc); // vector count (in bytes)
4477 jccb(Assembler::zero, COMPARE_CHAR);
4478 }
4479
4480 lea(ary1, Address(ary1, limit, scaleFactor));
4481 lea(ary2, Address(ary2, limit, Address::times_1));
4482 negptr(limit);
4483
4484 bind(COMPARE_VECTORS);
4485 if (expand_ary2) {
4486 // There are no "vector" operations for bytes to shorts
4487 movzbl(chr, Address(ary2, limit, Address::times_1));
4488 cmpw(Address(ary1, limit, Address::times_2), chr);
4489 jccb(Assembler::notEqual, FALSE_LABEL);
4490 addptr(limit, 1);
4491 jcc(Assembler::notZero, COMPARE_VECTORS);
4492 jmp(TRUE_LABEL);
4493 } else {
4494 movl(chr, Address(ary1, limit, Address::times_1));
4495 cmpl(chr, Address(ary2, limit, Address::times_1));
4496 jccb(Assembler::notEqual, FALSE_LABEL);
4497 addptr(limit, 4);
4498 jcc(Assembler::notZero, COMPARE_VECTORS);
4499 }
4500
4501 // Compare trailing char (final 2 bytes), if any
4502 bind(COMPARE_CHAR);
4503 testl(result, 0x2); // tail char
4504 jccb(Assembler::zero, COMPARE_BYTE);
4505 load_unsigned_short(chr, Address(ary1, 0));
4506 load_unsigned_short(limit, Address(ary2, 0));
4507 cmpl(chr, limit);
4508 jccb(Assembler::notEqual, FALSE_LABEL);
4509
4510 if (is_array_equ && is_char) {
4511 bind(COMPARE_BYTE);
4512 } else {
4513 lea(ary1, Address(ary1, 2));
4514 lea(ary2, Address(ary2, 2));
4515
4516 bind(COMPARE_BYTE);
4517 testl(result, 0x1); // tail byte
4518 jccb(Assembler::zero, TRUE_LABEL);
4519 load_unsigned_byte(chr, Address(ary1, 0));
4520 load_unsigned_byte(limit, Address(ary2, 0));
4521 cmpl(chr, limit);
4522 jccb(Assembler::notEqual, FALSE_LABEL);
4523 }
4524 bind(TRUE_LABEL);
4525 movl(result, 1); // return true
4526 jmpb(DONE);
4527
4528 bind(FALSE_LABEL);
4529 xorl(result, result); // return false
4530
4531 // That's it
4532 bind(DONE);
4533 if (UseAVX >= 2) {
4534 // clean upper bits of YMM registers
4535 vpxor(vec1, vec1);
4536 vpxor(vec2, vec2);
4537 }
4538 }
4539
4540 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) {
4541 #define __ masm.
4542 Register dst = stub.data<0>();
4543 XMMRegister src = stub.data<1>();
4544 address target = stub.data<2>();
4545 __ bind(stub.entry());
4546 __ subptr(rsp, 8);
4547 __ movdbl(Address(rsp), src);
4548 __ call(RuntimeAddress(target));
4549 // APX REX2 encoding for pop(dst) increases the stub size by 1 byte.
4550 __ pop(dst);
4551 __ jmp(stub.continuation());
4552 #undef __
4553 }
4554
// Convert the float/double in 'src' to an int/long in 'dst' with Java
// semantics. The truncating cvtt* instructions deliver the x86 "integer
// indefinite" value (0x80000000 / 0x8000000000000000) when the source is NaN
// or out of range; we compare the result against that value and, on a match,
// branch to an out-of-line stub (convertF2I_slowpath) that calls the matching
// fixup routine to compute the Java-specified result.
void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) {
  assert(dst_bt == T_INT || dst_bt == T_LONG, "");
  assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, "");

  address slowpath_target;
  if (dst_bt == T_INT) {
    if (src_bt == T_FLOAT) {
      cvttss2sil(dst, src);
      cmpl(dst, 0x80000000);  // integer indefinite => may need fixup
      slowpath_target = StubRoutines::x86::f2i_fixup();
    } else {
      cvttsd2sil(dst, src);
      cmpl(dst, 0x80000000);
      slowpath_target = StubRoutines::x86::d2i_fixup();
    }
  } else {
    if (src_bt == T_FLOAT) {
      cvttss2siq(dst, src);
      // double_sign_flip() presumably holds 0x8000000000000000 (the 64-bit
      // indefinite value) — matches the cvtt*q invalid-conversion result.
      cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
      slowpath_target = StubRoutines::x86::f2l_fixup();
    } else {
      cvttsd2siq(dst, src);
      cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
      slowpath_target = StubRoutines::x86::d2l_fixup();
    }
  }

  // Using the APX extended general purpose registers increases the instruction encoding size by 1 byte.
  int max_size = 23 + (UseAPX ? 1 : 0);
  auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, max_size, convertF2I_slowpath);
  jcc(Assembler::equal, stub->entry());
  bind(stub->continuation());
}
4588
4589 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4590 XMMRegister src1, int imm8, bool merge, int vlen_enc) {
4591 switch(ideal_opc) {
4592 case Op_LShiftVS:
4593 Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
4594 case Op_LShiftVI:
4595 Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
4596 case Op_LShiftVL:
4597 Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
4598 case Op_RShiftVS:
4599 Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
4600 case Op_RShiftVI:
4601 Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
4602 case Op_RShiftVL:
4603 Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
4604 case Op_URShiftVS:
4605 Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
4606 case Op_URShiftVI:
4607 Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
4608 case Op_URShiftVL:
4609 Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
4610 case Op_RotateRightV:
4611 evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4612 case Op_RotateLeftV:
4613 evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
4614 default:
4615 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]);
4616 break;
4617 }
4618 }
4619
4620 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4621 XMMRegister src2, bool is_unsigned, bool merge, int vlen_enc) {
4622 if (is_unsigned) {
4623 evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4624 } else {
4625 evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4626 }
4627 }
4628
4629 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4630 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
4631 switch (elem_bt) {
4632 case T_BYTE:
4633 if (ideal_opc == Op_SaturatingAddV) {
4634 evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
4635 } else {
4636 assert(ideal_opc == Op_SaturatingSubV, "");
4637 evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
4638 }
4639 break;
4640 case T_SHORT:
4641 if (ideal_opc == Op_SaturatingAddV) {
4642 evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
4643 } else {
4644 assert(ideal_opc == Op_SaturatingSubV, "");
4645 evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
4646 }
4647 break;
4648 default:
4649 fatal("Unsupported type %s", type2name(elem_bt));
4650 break;
4651 }
4652 }
4653
4654 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4655 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
4656 switch (elem_bt) {
4657 case T_BYTE:
4658 if (ideal_opc == Op_SaturatingAddV) {
4659 evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
4660 } else {
4661 assert(ideal_opc == Op_SaturatingSubV, "");
4662 evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
4663 }
4664 break;
4665 case T_SHORT:
4666 if (ideal_opc == Op_SaturatingAddV) {
4667 evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
4668 } else {
4669 assert(ideal_opc == Op_SaturatingSubV, "");
4670 evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
4671 }
4672 break;
4673 default:
4674 fatal("Unsupported type %s", type2name(elem_bt));
4675 break;
4676 }
4677 }
4678
4679 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4680 Address src2, bool is_unsigned, bool merge, int vlen_enc) {
4681 if (is_unsigned) {
4682 evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4683 } else {
4684 evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4685 }
4686 }
4687
4688 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4689 XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4690 switch (elem_bt) {
4691 case T_BYTE:
4692 if (ideal_opc == Op_SaturatingAddV) {
4693 evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
4694 } else {
4695 assert(ideal_opc == Op_SaturatingSubV, "");
4696 evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
4697 }
4698 break;
4699 case T_SHORT:
4700 if (ideal_opc == Op_SaturatingAddV) {
4701 evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
4702 } else {
4703 assert(ideal_opc == Op_SaturatingSubV, "");
4704 evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
4705 }
4706 break;
4707 default:
4708 fatal("Unsupported type %s", type2name(elem_bt));
4709 break;
4710 }
4711 }
4712
4713 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4714 XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4715 switch (elem_bt) {
4716 case T_BYTE:
4717 if (ideal_opc == Op_SaturatingAddV) {
4718 evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
4719 } else {
4720 assert(ideal_opc == Op_SaturatingSubV, "");
4721 evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
4722 }
4723 break;
4724 case T_SHORT:
4725 if (ideal_opc == Op_SaturatingAddV) {
4726 evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
4727 } else {
4728 assert(ideal_opc == Op_SaturatingSubV, "");
4729 evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
4730 }
4731 break;
4732 default:
4733 fatal("Unsupported type %s", type2name(elem_bt));
4734 break;
4735 }
4736 }
4737
4738 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4739 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc,
4740 bool is_varshift) {
4741 switch (ideal_opc) {
4742 case Op_AddVB:
4743 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4744 case Op_AddVS:
4745 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4746 case Op_AddVI:
4747 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4748 case Op_AddVL:
4749 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4750 case Op_AddVF:
4751 evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4752 case Op_AddVD:
4753 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4754 case Op_SubVB:
4755 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4756 case Op_SubVS:
4757 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4758 case Op_SubVI:
4759 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4760 case Op_SubVL:
4761 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4762 case Op_SubVF:
4763 evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4764 case Op_SubVD:
4765 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4766 case Op_MulVS:
4767 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4768 case Op_MulVI:
4769 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4770 case Op_MulVL:
4771 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4772 case Op_MulVF:
4773 evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4774 case Op_MulVD:
4775 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4776 case Op_DivVF:
4777 evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4778 case Op_DivVD:
4779 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4780 case Op_SqrtVF:
4781 evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break;
4782 case Op_SqrtVD:
4783 evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break;
4784 case Op_AbsVB:
4785 evpabsb(dst, mask, src2, merge, vlen_enc); break;
4786 case Op_AbsVS:
4787 evpabsw(dst, mask, src2, merge, vlen_enc); break;
4788 case Op_AbsVI:
4789 evpabsd(dst, mask, src2, merge, vlen_enc); break;
4790 case Op_AbsVL:
4791 evpabsq(dst, mask, src2, merge, vlen_enc); break;
4792 case Op_FmaVF:
4793 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4794 case Op_FmaVD:
4795 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4796 case Op_VectorRearrange:
4797 evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break;
4798 case Op_LShiftVS:
4799 evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4800 case Op_LShiftVI:
4801 evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4802 case Op_LShiftVL:
4803 evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4804 case Op_RShiftVS:
4805 evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4806 case Op_RShiftVI:
4807 evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4808 case Op_RShiftVL:
4809 evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4810 case Op_URShiftVS:
4811 evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4812 case Op_URShiftVI:
4813 evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4814 case Op_URShiftVL:
4815 evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
4816 case Op_RotateLeftV:
4817 evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4818 case Op_RotateRightV:
4819 evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4820 case Op_MaxV:
4821 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4822 case Op_MinV:
4823 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4824 case Op_UMinV:
4825 evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4826 case Op_UMaxV:
4827 evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4828 case Op_XorV:
4829 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4830 case Op_OrV:
4831 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4832 case Op_AndV:
4833 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4834 default:
4835 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]);
4836 break;
4837 }
4838 }
4839
4840 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
4841 XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4842 switch (ideal_opc) {
4843 case Op_AddVB:
4844 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
4845 case Op_AddVS:
4846 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
4847 case Op_AddVI:
4848 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
4849 case Op_AddVL:
4850 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
4851 case Op_AddVF:
4852 evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
4853 case Op_AddVD:
4854 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
4855 case Op_SubVB:
4856 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
4857 case Op_SubVS:
4858 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
4859 case Op_SubVI:
4860 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
4861 case Op_SubVL:
4862 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
4863 case Op_SubVF:
4864 evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
4865 case Op_SubVD:
4866 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
4867 case Op_MulVS:
4868 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
4869 case Op_MulVI:
4870 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
4871 case Op_MulVL:
4872 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
4873 case Op_MulVF:
4874 evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
4875 case Op_MulVD:
4876 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
4877 case Op_DivVF:
4878 evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
4879 case Op_DivVD:
4880 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
4881 case Op_FmaVF:
4882 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
4883 case Op_FmaVD:
4884 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
4885 case Op_MaxV:
4886 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4887 case Op_MinV:
4888 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4889 case Op_UMaxV:
4890 evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4891 case Op_UMinV:
4892 evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4893 case Op_XorV:
4894 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4895 case Op_OrV:
4896 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4897 case Op_AndV:
4898 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
4899 default:
4900 fatal("Unsupported operation %s", NodeClassNames[ideal_opc]);
4901 break;
4902 }
4903 }
4904
4905 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
4906 KRegister src1, KRegister src2) {
4907 BasicType etype = T_ILLEGAL;
4908 switch(mask_len) {
4909 case 2:
4910 case 4:
4911 case 8: etype = T_BYTE; break;
4912 case 16: etype = T_SHORT; break;
4913 case 32: etype = T_INT; break;
4914 case 64: etype = T_LONG; break;
4915 default: fatal("Unsupported type"); break;
4916 }
4917 assert(etype != T_ILLEGAL, "");
4918 switch(ideal_opc) {
4919 case Op_AndVMask:
4920 kand(etype, dst, src1, src2); break;
4921 case Op_OrVMask:
4922 kor(etype, dst, src1, src2); break;
4923 case Op_XorVMask:
4924 kxor(etype, dst, src1, src2); break;
4925 default:
4926 fatal("Unsupported masked operation"); break;
4927 }
4928 }
4929
4930 /*
4931 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation.
4932 * If src is NaN, the result is 0.
4933 * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE,
4934 * the result is equal to the value of Integer.MIN_VALUE.
4935 * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE,
4936 * the result is equal to the value of Integer.MAX_VALUE.
4937 */
// AVX (non-EVEX) fix-up of special lanes after a float->int truncating
// convert; see the comment block above for the required Java semantics.
// Destination lanes equal to float_sign_flip (0x80000000) mark sources that
// were NaN/Inf/out-of-range. Clobbers xtmp1..xtmp4.
void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                   XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
                                                                   Register rscratch, AddressLiteral float_sign_flip,
                                                                   int vec_enc) {
  assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
  Label done;
  // xtmp2 = per-lane mask of dst lanes holding the special value; fast exit
  // when none do.
  vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch);
  vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
  vptest(xtmp2, xtmp2, vec_enc);
  jccb(Assembler::equal, done);

  // Flip all bits of float_sign_flip in xtmp1 (0x80000000 -> 0x7fffffff,
  // i.e. Integer.MAX_VALUE) via xor with all-ones.
  vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc);
  vpxor(xtmp1, xtmp1, xtmp4, vec_enc);

  // Zero destination lanes whose source is NaN (unordered self-compare).
  vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
  vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc);
  vblendvps(dst, dst, xtmp4, xtmp3, vec_enc);

  // Recompute the mask for remaining special value.
  vpxor(xtmp2, xtmp2, xtmp3, vec_enc);
  // Extract SRC values corresponding to TRUE mask lanes.
  vpand(xtmp4, xtmp2, src, vec_enc);
  // Flip mask bits so that MSB bit of MASK lanes corresponding to +ve special
  // values are set.
  vpxor(xtmp3, xtmp2, xtmp4, vec_enc);

  // Replace those lanes with Integer.MAX_VALUE from xtmp1.
  vblendvps(dst, dst, xtmp1, xtmp3, vec_enc);
  bind(done);
}
4967
// EVEX fix-up of special lanes after a float->int truncating convert, using
// opmask registers instead of vector blends. Destination lanes equal to
// float_sign_flip (0x80000000) mark sources that were NaN/Inf/out-of-range.
// Clobbers xtmp1, xtmp2, ktmp1, ktmp2.
void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                    XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
                                                                    Register rscratch, AddressLiteral float_sign_flip,
                                                                    int vec_enc) {
  assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
  Label done;
  // ktmp1 = mask of dst lanes equal to the special value; fast exit when empty.
  evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
  Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
  kortestwl(ktmp1, ktmp1);
  jccb(Assembler::equal, done);

  // Zero the destination lanes whose source is NaN (unordered self-compare).
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);

  // ktmp1 = special-but-not-NaN lanes; restrict the >= 0.0 compare to them.
  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  // xtmp2 = ~xtmp1 (imm 0x11 is the NOT-of-second-operand truth table),
  // i.e. ~0x80000000 = 0x7fffffff (Integer.MAX_VALUE) in every lane.
  vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}
4989
// EVEX fix-up of special lanes after a float->long truncating convert.
// Destination quadword lanes equal to double_sign_flip (the 64-bit sign-bit
// pattern) mark sources that were NaN/Inf/out-of-range. Note the NaN and
// sign compares use evcmpps because the *source* vector holds floats.
// Clobbers xtmp1, xtmp2, ktmp1, ktmp2.
void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
                                                                     Register rscratch, AddressLiteral double_sign_flip,
                                                                     int vec_enc) {
  assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");

  Label done;
  // ktmp1 = mask of dst lanes equal to the special value; fast exit when empty.
  evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
  Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc);
  kortestwl(ktmp1, ktmp1);
  jccb(Assembler::equal, done);

  // Zero the destination lanes whose source is NaN (unordered self-compare).
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);

  // ktmp1 = special-but-not-NaN lanes; restrict the >= 0.0 compare to them.
  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  // xtmp2 = ~xtmp1 (imm 0x11 is the NOT-of-second-operand truth table),
  // i.e. the bitwise complement of double_sign_flip (Long.MAX_VALUE pattern).
  vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}
5012
// EVEX fix-up of special lanes after a double->int truncating convert.
// Destination dword lanes equal to float_sign_flip (0x80000000) mark sources
// that were NaN/Inf/out-of-range. The NaN and sign compares use evcmppd
// because the *source* vector holds doubles. Clobbers xtmp1, xtmp2, ktmp1,
// ktmp2.
void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
                                                                     Register rscratch, AddressLiteral float_sign_flip,
                                                                     int vec_enc) {
  assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
  Label done;
  // ktmp1 = mask of dst lanes equal to the special value; fast exit when empty.
  evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
  Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
  kortestwl(ktmp1, ktmp1);
  jccb(Assembler::equal, done);

  // Zero the destination lanes whose source is NaN (unordered self-compare).
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);

  // ktmp1 = special-but-not-NaN lanes; restrict the >= 0.0 compare to them.
  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  // xtmp2 = ~xtmp1 (imm 0x11 is the NOT-of-second-operand truth table),
  // i.e. ~0x80000000 = 0x7fffffff (Integer.MAX_VALUE) per dword.
  vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}
5034
5035 /*
5036 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation.
5037 * If src is NaN, the result is 0.
5038 * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE,
5039 * the result is equal to the value of Long.MIN_VALUE.
5040 * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
5041 * the result is equal to the value of Long.MAX_VALUE.
5042 */
// EVEX fix-up of special lanes after a double->long truncating convert; see
// the comment block above for the required Java semantics. Destination
// quadword lanes equal to double_sign_flip (the 64-bit sign-bit pattern)
// mark sources that were NaN/Inf/out-of-range. Clobbers xtmp1, xtmp2, ktmp1,
// ktmp2.
void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
                                                                      Register rscratch, AddressLiteral double_sign_flip,
                                                                      int vec_enc) {
  assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");

  Label done;
  // ktmp1 = mask of dst lanes equal to the special value; fast exit when empty.
  evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
  evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
  kortestwl(ktmp1, ktmp1);
  jccb(Assembler::equal, done);

  // Zero the destination lanes whose source is NaN (unordered self-compare).
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);

  // ktmp1 = special-but-not-NaN lanes; restrict the >= 0.0 compare to them.
  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  // xtmp2 = ~xtmp1 (imm 0x11 is the NOT-of-second-operand truth table),
  // i.e. the bitwise complement of double_sign_flip (Long.MAX_VALUE pattern).
  vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}
5065
5066 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
5067 XMMRegister xtmp, int index, int vec_enc) {
5068 assert(vec_enc < Assembler::AVX_512bit, "");
5069 if (vec_enc == Assembler::AVX_256bit) {
5070 vextractf128_high(xtmp, src);
5071 vshufps(dst, src, xtmp, index, vec_enc);
5072 } else {
5073 vshufps(dst, src, zero, index, vec_enc);
5074 }
5075 }
5076
// AVX (non-EVEX) fix-up of special lanes after a double->int truncating
// convert. The converted result in 'dst' is at most 128 bits (one int per
// source double), while the compares run at the source width and are packed
// down before blending. Clobbers xtmp1..xtmp5.
void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                                                    XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
                                                                    AddressLiteral float_sign_flip, int src_vec_enc) {
  assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");

  Label done;
  // Compare the destination lanes with float_sign_flip
  // value to get mask for all special values.
  movdqu(xtmp1, float_sign_flip, rscratch);
  vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
  ptest(xtmp2, xtmp2);
  jccb(Assembler::equal, done);

  // Flip float_sign_flip to get max integer value.
  vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
  pxor(xtmp1, xtmp4);

  // Set destination lanes corresponding to unordered source lanes as zero.
  vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
  vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);

  // Shuffle mask vector and pack lower doublewords from each quadword lane.
  vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
  vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);

  // Recompute the mask for remaining special value.
  pxor(xtmp2, xtmp3);
  // Extract mask corresponding to non-negative source lanes.
  vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);

  // Shuffle mask vector and pack lower doublewords from each quadword lane.
  vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
  pand(xtmp3, xtmp2);

  // Replace destination lanes holding special value(0x80000000) with max int
  // if corresponding source lane holds a +ve value.
  vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
  bind(done);
}
5116
5117
// Narrow packed 32-bit integers in 'dst' down to shorts or bytes.
// Each input lane is first masked to the target element width so the
// saturating pack instructions (vpackusdw/vpackuswb) pass the low bits
// through unchanged; 'zero' must hold an all-zero vector for the unused pack
// operand. For 256-bit vectors the packs operate within 128-bit lanes, so a
// cross-lane doubleword shuffle (selector 0x44) restores element order.
void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
                                                   XMMRegister xtmp, Register rscratch, int vec_enc) {
  switch(to_elem_bt) {
    case T_SHORT:
      assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
      // Keep only the low 16 bits of each dword, then pack dwords to words.
      vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
      vpackusdw(dst, dst, zero, vec_enc);
      if (vec_enc == Assembler::AVX_256bit) {
        vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
      }
      break;
    case T_BYTE:
      assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
      // Keep only the low 8 bits of each dword, pack dwords to words, then
      // words to bytes.
      vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
      vpackusdw(dst, dst, zero, vec_enc);
      if (vec_enc == Assembler::AVX_256bit) {
        vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
      }
      vpackuswb(dst, dst, zero, vec_enc);
      break;
    default: assert(false, "Unexpected basic type for target of vector cast int to subword: %s", type2name(to_elem_bt));
  }
}
5141
5142 /*
 * Algorithm for vector D2L and F2I conversions (used when AVX10.2 is not supported):
5144 * a) Perform vector D2L/F2I cast.
5145 * b) Choose fast path if none of the result vector lane contains 0x80000000 value.
5146 * It signifies that source value could be any of the special floating point
5147 * values(NaN,-Inf,Inf,Max,-Min).
5148 * c) Set destination to zero if source is NaN value.
5149 * d) Replace 0x80000000 with MaxInt if source lane contains a +ve value.
5150 */
5151
5152 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5153 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
5154 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
5155 int to_elem_sz = type2aelembytes(to_elem_bt);
5156 assert(to_elem_sz <= 4, "");
5157 vcvttps2dq(dst, src, vec_enc);
5158 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc);
5159 if (to_elem_sz < 4) {
5160 vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
5161 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc);
5162 }
5163 }
5164
5165 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5166 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
5167 Register rscratch, int vec_enc) {
5168 int to_elem_sz = type2aelembytes(to_elem_bt);
5169 assert(to_elem_sz <= 4, "");
5170 vcvttps2dq(dst, src, vec_enc);
5171 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc);
5172 switch(to_elem_bt) {
5173 case T_INT:
5174 break;
5175 case T_SHORT:
5176 evpmovdw(dst, dst, vec_enc);
5177 break;
5178 case T_BYTE:
5179 evpmovdb(dst, dst, vec_enc);
5180 break;
5181 default: assert(false, "Unexpected basic type for target of vector castF2X EVEX: %s", type2name(to_elem_bt));
5182 }
5183 }
5184
// EVEX float -> long vector cast: evcvttps2qq truncates each float lane to a
// 64-bit integer, then NaN/out-of-range lanes (holding the 64-bit special
// value) are patched to Java semantics.
void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                            KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
                                            Register rscratch, int vec_enc) {
  evcvttps2qq(dst, src, vec_enc);
  vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc);
}
5191
5192 // Handling for downcasting from double to integer or sub-word types on AVX2.
// Handling for downcasting from double to integer or sub-word types on AVX2.
// Truncating convert, special-value lane repair, then (for short/byte) a
// 128-bit narrowing step — the convert already halves the element count.
void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                           XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5,
                                           AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
  int to_elem_sz = type2aelembytes(to_elem_bt);
  assert(to_elem_sz < 8, "");
  vcvttpd2dq(dst, src, vec_enc);
  vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch,
                                              float_sign_flip, vec_enc);
  if (to_elem_sz < 4) {
    // xtmp4 holds all zero lanes.
    vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit);
  }
}
5206
// Downcast from double to long/int/sub-word types on EVEX targets. With
// AVX512DQ the convert goes through 64-bit lanes (evcvttpd2qq) and narrows
// from there; otherwise it goes through 32-bit lanes (vcvttpd2dq). In both
// paths the special-value lanes are repaired before narrowing.
void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src,
                                            XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1,
                                            KRegister ktmp2, AddressLiteral sign_flip,
                                            Register rscratch, int vec_enc) {
  if (VM_Version::supports_avx512dq()) {
    // Convert via 64-bit integer lanes, then narrow as needed.
    evcvttpd2qq(dst, src, vec_enc);
    vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
    switch(to_elem_bt) {
      case T_LONG:
        break;
      case T_INT:
        evpmovsqd(dst, dst, vec_enc);
        break;
      case T_SHORT:
        evpmovsqd(dst, dst, vec_enc);
        evpmovdw(dst, dst, vec_enc);
        break;
      case T_BYTE:
        evpmovsqd(dst, dst, vec_enc);
        evpmovdb(dst, dst, vec_enc);
        break;
      default: assert(false, "Unexpected basic type for target of vector castD2X AVX512DQ EVEX: %s", type2name(to_elem_bt));
    }
  } else {
    // No AVX512DQ: convert via 32-bit integer lanes (long target unsupported).
    assert(type2aelembytes(to_elem_bt) <= 4, "");
    vcvttpd2dq(dst, src, vec_enc);
    vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
    switch(to_elem_bt) {
      case T_INT:
        break;
      case T_SHORT:
        evpmovdw(dst, dst, vec_enc);
        break;
      case T_BYTE:
        evpmovdb(dst, dst, vec_enc);
        break;
      default: assert(false, "Unexpected basic type for target of vector castD2X EVEX: %s", type2name(to_elem_bt));
    }
  }
}
5247
5248 void C2_MacroAssembler::vector_castF2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vec_enc) {
5249 switch(to_elem_bt) {
5250 case T_LONG:
5251 evcvttps2qqs(dst, src, vec_enc);
5252 break;
5253 case T_INT:
5254 evcvttps2dqs(dst, src, vec_enc);
5255 break;
5256 case T_SHORT:
5257 evcvttps2dqs(dst, src, vec_enc);
5258 evpmovdw(dst, dst, vec_enc);
5259 break;
5260 case T_BYTE:
5261 evcvttps2dqs(dst, src, vec_enc);
5262 evpmovdb(dst, dst, vec_enc);
5263 break;
5264 default: assert(false, "Unexpected basic type for target of vector castF2X AVX10 (reg src): %s", type2name(to_elem_bt));
5265 }
5266 }
5267
5268 void C2_MacroAssembler::vector_castF2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, Address src, int vec_enc) {
5269 switch(to_elem_bt) {
5270 case T_LONG:
5271 evcvttps2qqs(dst, src, vec_enc);
5272 break;
5273 case T_INT:
5274 evcvttps2dqs(dst, src, vec_enc);
5275 break;
5276 case T_SHORT:
5277 evcvttps2dqs(dst, src, vec_enc);
5278 evpmovdw(dst, dst, vec_enc);
5279 break;
5280 case T_BYTE:
5281 evcvttps2dqs(dst, src, vec_enc);
5282 evpmovdb(dst, dst, vec_enc);
5283 break;
5284 default: assert(false, "Unexpected basic type for target of vector castF2X AVX10 (mem src): %s", type2name(to_elem_bt));
5285 }
5286 }
5287
5288 void C2_MacroAssembler::vector_castD2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vec_enc) {
5289 switch(to_elem_bt) {
5290 case T_LONG:
5291 evcvttpd2qqs(dst, src, vec_enc);
5292 break;
5293 case T_INT:
5294 evcvttpd2dqs(dst, src, vec_enc);
5295 break;
5296 case T_SHORT:
5297 evcvttpd2dqs(dst, src, vec_enc);
5298 evpmovdw(dst, dst, vec_enc);
5299 break;
5300 case T_BYTE:
5301 evcvttpd2dqs(dst, src, vec_enc);
5302 evpmovdb(dst, dst, vec_enc);
5303 break;
5304 default: assert(false, "Unexpected basic type for target of vector castD2X AVX10 (reg src): %s", type2name(to_elem_bt));
5305 }
5306 }
5307
5308 void C2_MacroAssembler::vector_castD2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, Address src, int vec_enc) {
5309 switch(to_elem_bt) {
5310 case T_LONG:
5311 evcvttpd2qqs(dst, src, vec_enc);
5312 break;
5313 case T_INT:
5314 evcvttpd2dqs(dst, src, vec_enc);
5315 break;
5316 case T_SHORT:
5317 evcvttpd2dqs(dst, src, vec_enc);
5318 evpmovdw(dst, dst, vec_enc);
5319 break;
5320 case T_BYTE:
5321 evcvttpd2dqs(dst, src, vec_enc);
5322 evpmovdb(dst, dst, vec_enc);
5323 break;
5324 default: assert(false, "Unexpected basic type for target of vector castD2X AVX10 (mem src): %s", type2name(to_elem_bt));
5325 }
5326 }
5327
5328 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src,
5329 AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5330 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
5331 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf.
5332 // and re-instantiate original MXCSR.RC mode after that.
5333 ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5334
5335 mov64(tmp, julong_cast(0.5L));
5336 evpbroadcastq(xtmp1, tmp, vec_enc);
5337 vaddpd(xtmp1, src , xtmp1, vec_enc);
5338 evcvtpd2qq(dst, xtmp1, vec_enc);
5339 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5340 double_sign_flip, vec_enc);;
5341
5342 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5343 }
5344
// Vector round-to-nearest for floats (EVEX): floor(val + 0.5) computed under
// MXCSR.RC round-towards -inf, followed by special-value lane repair; the
// standard MXCSR state is restored before returning.
void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src,
                                                AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
                                                Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
  // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf.
  // and re-instantiate original MXCSR.RC mode after that.
  ldmxcsr(new_mxcsr, tmp /*rscratch*/);

  // Broadcast 0.5f and add it to every source lane before the convert.
  movl(tmp, jint_cast(0.5));
  movq(xtmp1, tmp);
  vbroadcastss(xtmp1, xtmp1, vec_enc);
  vaddps(xtmp1, src , xtmp1, vec_enc);
  vcvtps2dq(dst, xtmp1, vec_enc);
  // Repair lanes that held special FP values (NaN / out-of-range).
  vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
                                              float_sign_flip, vec_enc);

  ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
}
5362
// Vector round-to-nearest for floats (AVX): floor(val + 0.5) computed under
// MXCSR.RC round-towards -inf, followed by special-value lane repair; the
// standard MXCSR state is restored before returning.
void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src,
                                               AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
                                               Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) {
  // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf.
  // and re-instantiate original MXCSR.RC mode after that.
  ldmxcsr(new_mxcsr, tmp /*rscratch*/);

  // Broadcast 0.5f and add it to every source lane before the convert.
  movl(tmp, jint_cast(0.5));
  movq(xtmp1, tmp);
  vbroadcastss(xtmp1, xtmp1, vec_enc);
  vaddps(xtmp1, src , xtmp1, vec_enc);
  vcvtps2dq(dst, xtmp1, vec_enc);
  // Repair lanes that held special FP values (NaN / out-of-range).
  vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc);

  ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
}
5379
5380 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5381 BasicType from_elem_bt, BasicType to_elem_bt) {
5382 switch (from_elem_bt) {
5383 case T_BYTE:
5384 switch (to_elem_bt) {
5385 case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break;
5386 case T_INT: vpmovzxbd(dst, src, vlen_enc); break;
5387 case T_LONG: vpmovzxbq(dst, src, vlen_enc); break;
5388 default: ShouldNotReachHere();
5389 }
5390 break;
5391 case T_SHORT:
5392 switch (to_elem_bt) {
5393 case T_INT: vpmovzxwd(dst, src, vlen_enc); break;
5394 case T_LONG: vpmovzxwq(dst, src, vlen_enc); break;
5395 default: ShouldNotReachHere();
5396 }
5397 break;
5398 case T_INT:
5399 assert(to_elem_bt == T_LONG, "");
5400 vpmovzxdq(dst, src, vlen_enc);
5401 break;
5402 default:
5403 ShouldNotReachHere();
5404 }
5405 }
5406
5407 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5408 BasicType from_elem_bt, BasicType to_elem_bt) {
5409 switch (from_elem_bt) {
5410 case T_BYTE:
5411 switch (to_elem_bt) {
5412 case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break;
5413 case T_INT: vpmovsxbd(dst, src, vlen_enc); break;
5414 case T_LONG: vpmovsxbq(dst, src, vlen_enc); break;
5415 default: ShouldNotReachHere();
5416 }
5417 break;
5418 case T_SHORT:
5419 switch (to_elem_bt) {
5420 case T_INT: vpmovsxwd(dst, src, vlen_enc); break;
5421 case T_LONG: vpmovsxwq(dst, src, vlen_enc); break;
5422 default: ShouldNotReachHere();
5423 }
5424 break;
5425 case T_INT:
5426 assert(to_elem_bt == T_LONG, "");
5427 vpmovsxdq(dst, src, vlen_enc);
5428 break;
5429 default:
5430 ShouldNotReachHere();
5431 }
5432 }
5433
// Convert a vector mask between lane sizes. Widening sign-extends (preserving
// 0 / -1 lane values); narrowing packs with signed saturation, using vpermq to
// compact the 128-bit halves of 256-bit intermediates. AVX_512bit encodings
// are not handled here.
void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src,
                                         BasicType dst_bt, BasicType src_bt, int vlen) {
  // Encoding is derived from the wider of the two lane sizes.
  int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen);
  assert(vlen_enc != AVX_512bit, "");

  int dst_bt_size = type2aelembytes(dst_bt);
  int src_bt_size = type2aelembytes(src_bt);
  if (dst_bt_size > src_bt_size) {
    // Widening: dispatch on the lane-size ratio.
    switch (dst_bt_size / src_bt_size) {
      case 2: vpmovsxbw(dst, src, vlen_enc); break;
      case 4: vpmovsxbd(dst, src, vlen_enc); break;
      case 8: vpmovsxbq(dst, src, vlen_enc); break;
      default: ShouldNotReachHere();
    }
  } else {
    assert(dst_bt_size < src_bt_size, "");
    // Narrowing: dispatch on the lane-size ratio; each extra factor of two
    // adds another pack step, and 256-bit forms need a cross-lane permute.
    switch (src_bt_size / dst_bt_size) {
      case 2: {
        if (vlen_enc == AVX_128bit) {
          vpacksswb(dst, src, src, vlen_enc);
        } else {
          vpacksswb(dst, src, src, vlen_enc);
          vpermq(dst, dst, 0x08, vlen_enc);
        }
        break;
      }
      case 4: {
        if (vlen_enc == AVX_128bit) {
          vpackssdw(dst, src, src, vlen_enc);
          vpacksswb(dst, dst, dst, vlen_enc);
        } else {
          vpackssdw(dst, src, src, vlen_enc);
          vpermq(dst, dst, 0x08, vlen_enc);
          vpacksswb(dst, dst, dst, AVX_128bit);
        }
        break;
      }
      case 8: {
        if (vlen_enc == AVX_128bit) {
          vpshufd(dst, src, 0x08, vlen_enc);
          vpackssdw(dst, dst, dst, vlen_enc);
          vpacksswb(dst, dst, dst, vlen_enc);
        } else {
          vpshufd(dst, src, 0x08, vlen_enc);
          vpermq(dst, dst, 0x08, vlen_enc);
          vpackssdw(dst, dst, dst, AVX_128bit);
          vpacksswb(dst, dst, dst, AVX_128bit);
        }
        break;
      }
      default: ShouldNotReachHere();
    }
  }
}
5488
5489 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
5490 bool merge, BasicType bt, int vlen_enc) {
5491 if (bt == T_INT) {
5492 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5493 } else {
5494 assert(bt == T_LONG, "");
5495 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5496 }
5497 }
5498
5499 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
5500 bool merge, BasicType bt, int vlen_enc) {
5501 if (bt == T_INT) {
5502 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5503 } else {
5504 assert(bt == T_LONG, "");
5505 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5506 }
5507 }
5508
// Expand the mask bits held in GPR 'src' into a byte-per-bit vector in 'dst'.
// Bits are materialized eight at a time: pdepq with the 0x0101... pattern
// deposits one source bit into the low bit of each byte of a 64-bit chunk,
// and completed 128-bit pieces are inserted into dst.
void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
                                               Register rtmp2, XMMRegister xtmp, int mask_len,
                                               int vec_enc) {
  int index = 0;
  int vindex = 0;
  // Scatter pattern: one mask bit per destination byte.
  mov64(rtmp1, 0x0101010101010101L);
  pdepq(rtmp1, src, rtmp1);
  if (mask_len > 8) {
    // More than one 8-bit chunk: keep the remaining bits in rtmp2 and
    // accumulate 128-bit pieces in xtmp.
    movq(rtmp2, src);
    vpxor(xtmp, xtmp, xtmp, vec_enc);
    movq(xtmp, rtmp1);
  }
  movq(dst, rtmp1);

  mask_len -= 8;
  while (mask_len > 0) {
    assert ((mask_len & 0x7) == 0, "mask must be multiple of 8");
    index++;
    if ((index % 2) == 0) {
      // Starting a fresh 128-bit accumulator.
      pxor(xtmp, xtmp);
    }
    // Expand the next 8 mask bits into the next 64-bit chunk.
    mov64(rtmp1, 0x0101010101010101L);
    shrq(rtmp2, 8);
    pdepq(rtmp1, rtmp2, rtmp1);
    pinsrq(xtmp, rtmp1, index % 2);
    vindex = index / 2;
    if (vindex) {
      // Write entire 16 byte vector when both 64 bit
      // lanes are update to save redundant instructions.
      if (index % 2) {
        vinsertf128(dst, dst, xtmp, vindex);
      }
    } else {
      vmovdqu(dst, xtmp);
    }
    mask_len -= 8;
  }
}
5547
// Compute the scalar result of a vector-mask query from the mask bits already
// materialized in GPR 'tmp': true-bit count, index of the last/first set bit,
// or pass-through for toLong. 'masklen' is the number of valid mask bits.
void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
  switch(opc) {
    case Op_VectorMaskTrueCount:
      popcntq(dst, tmp);
      break;
    case Op_VectorMaskLastTrue:
      // Last set bit index = 63 - lzcnt; without LZCNT fall back to BSR,
      // which leaves dst at -1 when no bit is set.
      if (VM_Version::supports_lzcnt()) {
        lzcntq(tmp, tmp);
        movl(dst, 63);
        subl(dst, tmp);
      } else {
        movl(dst, -1);
        bsrq(tmp, tmp);
        cmov32(Assembler::notZero, dst, tmp);
      }
      break;
    case Op_VectorMaskFirstTrue:
      // First set bit index; a sentinel bit at position 'masklen' (or an
      // explicit fallback value) yields 'masklen' when no mask bit is set.
      if (VM_Version::supports_bmi1()) {
        if (masklen < 32) {
          orl(tmp, 1 << masklen);
          tzcntl(dst, tmp);
        } else if (masklen == 32) {
          tzcntl(dst, tmp);
        } else {
          assert(masklen == 64, "");
          tzcntq(dst, tmp);
        }
      } else {
        if (masklen < 32) {
          orl(tmp, 1 << masklen);
          bsfl(dst, tmp);
        } else {
          assert(masklen == 32 || masklen == 64, "");
          movl(dst, masklen);
          if (masklen == 32) {
            bsfl(tmp, tmp);
          } else {
            bsfq(tmp, tmp);
          }
          cmov32(Assembler::notZero, dst, tmp);
        }
      }
      break;
    case Op_VectorMaskToLong:
      assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
      break;
    default: assert(false, "Unhandled mask operation");
  }
}
5597
// Vector-mask query for an opmask (KRegister) source: move the mask bits to a
// GPR, clip stray high bits if needed, then compute the result via the helper.
void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
                                              int masklen, int masksize, int vec_enc) {
  assert(VM_Version::supports_popcnt(), "");

  if(VM_Version::supports_avx512bw()) {
    kmovql(tmp, mask);
  } else {
    // Without AVX512BW only 16-bit opmask moves are available.
    assert(masklen <= 16, "");
    kmovwl(tmp, mask);
  }

  // Mask generated out of partial vector comparisons/replicate/mask manipulation
  // operations needs to be clipped.
  if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
    andq(tmp, (1 << masklen) - 1);
  }

  vector_mask_operation_helper(opc, dst, tmp, masklen);
}
5617
// Vector-mask query for a vector-register mask source (AVX/AVX2): extract the
// per-lane mask bits into a GPR with a movmsk variant chosen by element type,
// clip stray high bits if needed, then compute the result via the helper.
void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
                                              Register tmp, int masklen, BasicType bt, int vec_enc) {
  assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) ||
         (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), "");
  assert(VM_Version::supports_popcnt(), "");

  bool need_clip = false;
  switch(bt) {
    case T_BOOLEAN:
      // While masks of other types contain 0, -1; boolean masks contain lane values of 0, 1
      // Negate (0 - lane) to get 0 / -1 before extracting the sign bits.
      vpxor(xtmp, xtmp, xtmp, vec_enc);
      vpsubb(xtmp, xtmp, mask, vec_enc);
      vpmovmskb(tmp, xtmp, vec_enc);
      need_clip = masklen < 16;
      break;
    case T_BYTE:
      vpmovmskb(tmp, mask, vec_enc);
      need_clip = masklen < 16;
      break;
    case T_SHORT:
      // Pack word lanes down to bytes first, compacting 256-bit results
      // with a cross-lane permute, then extract byte sign bits.
      vpacksswb(xtmp, mask, mask, vec_enc);
      if (masklen >= 16) {
        vpermpd(xtmp, xtmp, 8, vec_enc);
      }
      vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
      need_clip = masklen < 16;
      break;
    case T_INT:
    case T_FLOAT:
      vmovmskps(tmp, mask, vec_enc);
      need_clip = masklen < 4;
      break;
    case T_LONG:
    case T_DOUBLE:
      vmovmskpd(tmp, mask, vec_enc);
      need_clip = masklen < 2;
      break;
    default: assert(false, "Unhandled type, %s", type2name(bt));
  }

  // Mask generated out of partial vector comparisons/replicate/mask manipulation
  // operations needs to be clipped.
  if (need_clip && opc != Op_VectorMaskFirstTrue) {
    // need_clip implies masklen < 32
    andq(tmp, (1 << masklen) - 1);
  }

  vector_mask_operation_helper(opc, dst, tmp, masklen);
}
5667
// Compress an opmask: produce in dst a contiguous run of popcount(src) low
// set bits. Implemented by clearing bits above mask_len and then pext-ing an
// all-ones value by the mask, which packs one result bit per set mask bit.
void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
                                             Register rtmp2, int mask_len) {
  kmov(rtmp1, src);
  andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));
  mov64(rtmp2, -1L);
  pextq(rtmp2, rtmp2, rtmp1);
  kmov(dst, rtmp2);
}
5676
// AVX2 emulation of vector compress/expand for 4- and 8-byte lanes: the lane
// mask is turned into a table index (via movmsk), the matching 32-byte permute
// row is loaded from a stub-generated table, and the source is permuted; the
// same row then doubles as a blend mask to zero the unselected lanes.
void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src,
                                                    XMMRegister mask, Register rtmp, Register rscratch,
                                                    XMMRegister permv, XMMRegister xtmp, BasicType bt,
                                                    int vec_enc) {
  assert(type2aelembytes(bt) >= 4, "");
  assert(opcode == Op_CompressV || opcode == Op_ExpandV, "");
  address compress_perm_table = nullptr;
  address expand_perm_table = nullptr;
  if (type2aelembytes(bt) == 8) {
    compress_perm_table = StubRoutines::x86::compress_perm_table64();
    expand_perm_table = StubRoutines::x86::expand_perm_table64();
    vmovmskpd(rtmp, mask, vec_enc);
  } else {
    compress_perm_table = StubRoutines::x86::compress_perm_table32();
    expand_perm_table = StubRoutines::x86::expand_perm_table32();
    vmovmskps(rtmp, mask, vec_enc);
  }
  shlq(rtmp, 5); // for 32 byte permute row.
  if (opcode == Op_CompressV) {
    lea(rscratch, ExternalAddress(compress_perm_table));
  } else {
    lea(rscratch, ExternalAddress(expand_perm_table));
  }
  addptr(rtmp, rscratch);
  vmovdqu(permv, Address(rtmp));
  vpermps(dst, permv, src, Assembler::AVX_256bit);
  vpxor(xtmp, xtmp, xtmp, vec_enc);
  // Blend the result with zero vector using permute mask, each column entry
  // in a permute table row contains either a valid permute index or a -1 (default)
  // value, this can potentially be used as a blending mask after
  // compressing/expanding the source vector lanes.
  vblendvps(dst, dst, xtmp, permv, vec_enc, true, permv);
}
5710
// AVX512 vector compress/expand under an opmask: dispatch to the
// type-specific EVEX compress or expand instruction.
void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
                                               bool merge, BasicType bt, int vec_enc) {
  if (opcode == Op_CompressV) {
    switch(bt) {
      case T_BYTE:
        evpcompressb(dst, mask, src, merge, vec_enc);
        break;
      case T_CHAR:
      case T_SHORT:
        evpcompressw(dst, mask, src, merge, vec_enc);
        break;
      case T_INT:
        evpcompressd(dst, mask, src, merge, vec_enc);
        break;
      case T_FLOAT:
        evcompressps(dst, mask, src, merge, vec_enc);
        break;
      case T_LONG:
        evpcompressq(dst, mask, src, merge, vec_enc);
        break;
      case T_DOUBLE:
        evcompresspd(dst, mask, src, merge, vec_enc);
        break;
      default:
        fatal("Unsupported type %s", type2name(bt));
        break;
    }
  } else {
    assert(opcode == Op_ExpandV, "");
    switch(bt) {
      case T_BYTE:
        evpexpandb(dst, mask, src, merge, vec_enc);
        break;
      case T_CHAR:
      case T_SHORT:
        evpexpandw(dst, mask, src, merge, vec_enc);
        break;
      case T_INT:
        evpexpandd(dst, mask, src, merge, vec_enc);
        break;
      case T_FLOAT:
        evexpandps(dst, mask, src, merge, vec_enc);
        break;
      case T_LONG:
        evpexpandq(dst, mask, src, merge, vec_enc);
        break;
      case T_DOUBLE:
        evexpandpd(dst, mask, src, merge, vec_enc);
        break;
      default:
        fatal("Unsupported type %s", type2name(bt));
        break;
    }
  }
}
5766
// Vector signum (EVEX): per lane, produce -1.0 for negative inputs, 1.0 for
// positive inputs, and the input itself for NaN, -0.0 and 0.0. 'zero' and
// 'one' must hold broadcast 0.0 and 1.0 respectively.
void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
                                           KRegister ktmp1, int vec_enc) {
  if (opcode == Op_SignumVD) {
    // dst = 0.0 - 1.0 = -1.0 in every lane.
    vsubpd(dst, zero, one, vec_enc);
    // if src < 0 ? -1 : 1
    evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
    evblendmpd(dst, ktmp1, one, dst, true, vec_enc);
    // if src == NaN, -0.0 or 0.0 return src.
    evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
    evblendmpd(dst, ktmp1, dst, src, true, vec_enc);
  } else {
    assert(opcode == Op_SignumVF, "");
    // dst = 0.0f - 1.0f = -1.0f in every lane.
    vsubps(dst, zero, one, vec_enc);
    // if src < 0 ? -1 : 1
    evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
    evblendmps(dst, ktmp1, one, dst, true, vec_enc);
    // if src == NaN, -0.0 or 0.0 return src.
    evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
    evblendmps(dst, ktmp1, dst, src, true, vec_enc);
  }
}
5788
// Vector signum (AVX): per lane, produce -1.0 for negative inputs, 1.0 for
// positive inputs, and the input itself for NaN, -0.0 and 0.0. 'zero' and
// 'one' must hold broadcast 0.0 and 1.0 respectively.
void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
                                          XMMRegister xtmp1, int vec_enc) {
  if (opcode == Op_SignumVD) {
    // dst = 0.0 - 1.0 = -1.0 in every lane.
    vsubpd(dst, zero, one, vec_enc);
    // if src < 0 ? -1 : 1
    vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1);
    // if src == NaN, -0.0 or 0.0 return src.
    vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
    vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
  } else {
    assert(opcode == Op_SignumVF, "");
    // dst = 0.0f - 1.0f = -1.0f in every lane.
    vsubps(dst, zero, one, vec_enc);
    // if src < 0 ? -1 : 1
    vblendvps(dst, one, dst, src, vec_enc, true, xtmp1);
    // if src == NaN, -0.0 or 0.0 return src.
    vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
    vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
  }
}
5808
// Broadcast the scalar condition in 'src' into an opmask of mask_len bits:
// move the GPR into the opmask register, then shift right so only the low
// mask_len bits remain set.
void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
  if (VM_Version::supports_avx512bw()) {
    if (mask_len > 32) {
      kmovql(dst, src);
    } else {
      kmovdl(dst, src);
      if (mask_len != 32) {
        // Drop the bits above mask_len.
        kshiftrdl(dst, dst, 32 - mask_len);
      }
    }
  } else {
    // Without AVX512BW only 16-bit opmask moves are available.
    assert(mask_len <= 16, "");
    kmovwl(dst, src);
    if (mask_len != 16) {
      // Drop the bits above mask_len.
      kshiftrwl(dst, dst, 16 - mask_len);
    }
  }
}
5827
5828 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
5829 int lane_size = type2aelembytes(bt);
5830 if ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
5831 (is_subword_type(bt) && VM_Version::supports_avx512vlbw())) {
5832 movptr(rtmp, imm32);
5833 switch(lane_size) {
5834 case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
5835 case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
5836 case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
5837 case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
5838 fatal("Unsupported lane size %d", lane_size);
5839 break;
5840 }
5841 } else {
5842 movptr(rtmp, imm32);
5843 movq(dst, rtmp);
5844 switch(lane_size) {
5845 case 1 : vpbroadcastb(dst, dst, vec_enc); break;
5846 case 2 : vpbroadcastw(dst, dst, vec_enc); break;
5847 case 4 : vpbroadcastd(dst, dst, vec_enc); break;
5848 case 8 : vpbroadcastq(dst, dst, vec_enc); break;
5849 fatal("Unsupported lane size %d", lane_size);
5850 break;
5851 }
5852 }
5853 }
5854
5855 //
5856 // Following is lookup table based popcount computation algorithm:-
5857 // Index Bit set count
5858 // [ 0000 -> 0,
5859 // 0001 -> 1,
5860 // 0010 -> 1,
5861 // 0011 -> 2,
5862 // 0100 -> 1,
5863 // 0101 -> 2,
5864 // 0110 -> 2,
5865 // 0111 -> 3,
5866 // 1000 -> 1,
5867 // 1001 -> 2,
5868 // 1010 -> 3,
5869 // 1011 -> 3,
5870 // 1100 -> 2,
5871 // 1101 -> 3,
5872 // 1111 -> 4 ]
5873 // a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as
5874 // shuffle indices for lookup table access.
5875 // b. Right shift each byte of vector lane by 4 positions.
5876 // c. Count the number of 1s in 4 MSB bits each byte. These bits are used as
5877 // shuffle indices for lookup table access.
5878 // d. Add the bitset count of upper and lower 4 bits of each byte.
5879 // e. Unpack double words to quad words and compute sum of absolute difference of bitset
5880 // count of all the bytes of a quadword.
5881 // f. Perform step e. for upper 128bit vector lane.
5882 // g. Pack the bitset count of quadwords back to double word.
5883 // h. Unpacking and packing operations are not needed for 64bit vector lane.
5884
// Per-byte popcount via the lookup table described above (steps a-d):
// count set bits of the low and high nibbles via vpshufb against the LUT,
// then add the two partial counts.
void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
  assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
  // xtmp1 = 0x0F in every byte, used to isolate nibbles.
  vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);
  vpsrlw(dst, src, 4, vec_enc);
  vpand(dst, dst, xtmp1, vec_enc);      // dst   = high nibbles
  vpand(xtmp1, src, xtmp1, vec_enc);    // xtmp1 = low nibbles
  vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg);
  vpshufb(xtmp1, xtmp2, xtmp1, vec_enc);
  vpshufb(dst, xtmp2, dst, vec_enc);
  vpaddb(dst, dst, xtmp1, vec_enc);
}
5897
// Per-int popcount: byte-wise popcount, then sum byte counts per dword
// (steps e, f, g and h of the algorithm above).
void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                            XMMRegister xtmp2, Register rtmp, int vec_enc) {
  vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
  // Following code is as per steps e,f,g and h of above algorithm.
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  vpunpckhdq(dst, xtmp1, xtmp2, vec_enc);
  vpsadbw(dst, dst, xtmp2, vec_enc);
  vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc);
  vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc);
  vpackuswb(dst, xtmp1, dst, vec_enc);
}
5909
// Per-short popcount: byte-wise popcount, then add the counts of the upper
// and lower byte of each word.
void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
  vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
  // Add the popcount of upper and lower bytes of word.
  vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc);
  vpsrlw(dst, xtmp1, 8, vec_enc);
  vpand(xtmp1, xtmp1, xtmp2, vec_enc);
  vpaddw(dst, dst, xtmp1, vec_enc);
}
5919
// Per-long popcount: byte-wise popcount, then vpsadbw against zero sums the
// byte counts of each quadword.
void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
  vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  vpsadbw(dst, xtmp1, xtmp2, vec_enc);
}
5926
5927 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5928 XMMRegister xtmp2, Register rtmp, int vec_enc) {
5929 switch(bt) {
5930 case T_LONG:
5931 vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5932 break;
5933 case T_INT:
5934 vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5935 break;
5936 case T_CHAR:
5937 case T_SHORT:
5938 vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5939 break;
5940 case T_BYTE:
5941 case T_BOOLEAN:
5942 vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5943 break;
5944 default:
5945 fatal("Unsupported type %s", type2name(bt));
5946 break;
5947 }
5948 }
5949
// Hardware popcount via the AVX512 VPOPCNTDQ / BITALG instructions, under an
// opmask. int/long lanes require VPOPCNTDQ; sub-word lanes require BITALG.
void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
                                                      KRegister mask, bool merge, int vec_enc) {
  assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
  switch(bt) {
    case T_LONG:
      assert(VM_Version::supports_avx512_vpopcntdq(), "");
      evpopcntq(dst, mask, src, merge, vec_enc);
      break;
    case T_INT:
      assert(VM_Version::supports_avx512_vpopcntdq(), "");
      evpopcntd(dst, mask, src, merge, vec_enc);
      break;
    case T_CHAR:
    case T_SHORT:
      assert(VM_Version::supports_avx512_bitalg(), "");
      evpopcntw(dst, mask, src, merge, vec_enc);
      break;
    case T_BYTE:
    case T_BOOLEAN:
      assert(VM_Version::supports_avx512_bitalg(), "");
      evpopcntb(dst, mask, src, merge, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
}
5977
5978 // Bit reversal algorithm first reverses the bits of each byte followed by
5979 // a byte level reversal for multi-byte primitive types (short/int/long).
5980 // Algorithm performs a lookup table access to get reverse bit sequence
5981 // corresponding to a 4 bit value. Thus a reverse bit sequence for a byte
5982 // is obtained by swapping the reverse bit sequences of upper and lower
5983 // nibble of a byte.
// Reverse the bits of every lane (see algorithm comment above): a LUT-based
// per-byte bit reversal followed by a byte-order reversal for multi-byte
// types. Three code shapes: AVX512VLBW (LUT with EVEX logic ops), 512-bit
// without VLBW (shift-based nibble/bit-pair swaps), and plain AVX (LUT).
void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                           XMMRegister xtmp2, Register rtmp, int vec_enc) {
  if (VM_Version::supports_avx512vlbw()) {

    // Get the reverse bit sequence of lower nibble of each byte.
    vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
    vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
    evpandq(dst, xtmp2, src, vec_enc);
    vpshufb(dst, xtmp1, dst, vec_enc);
    vpsllq(dst, dst, 4, vec_enc);

    // Get the reverse bit sequence of upper nibble of each byte.
    vpandn(xtmp2, xtmp2, src, vec_enc);
    vpsrlq(xtmp2, xtmp2, 4, vec_enc);
    vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);

    // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
    // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
    evporq(xtmp2, dst, xtmp2, vec_enc);
    vector_reverse_byte(bt, dst, xtmp2, vec_enc);

  } else if(vec_enc == Assembler::AVX_512bit) {
    // Shift based bit reversal.
    assert(bt == T_LONG || bt == T_INT, "");

    // Swap lower and upper nibble of each byte.
    vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc);

    // Swap two least and most significant bits of each nibble.
    vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc);

    // Swap adjacent pair of bits.
    evmovdqul(xtmp1, k0, dst, true, vec_enc);
    vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc);

    evmovdqul(xtmp1, k0, dst, true, vec_enc);
    vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);
  } else {
    vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp);
    vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);

    // Get the reverse bit sequence of lower nibble of each byte.
    vpand(dst, xtmp2, src, vec_enc);
    vpshufb(dst, xtmp1, dst, vec_enc);
    vpsllq(dst, dst, 4, vec_enc);

    // Get the reverse bit sequence of upper nibble of each byte.
    vpandn(xtmp2, xtmp2, src, vec_enc);
    vpsrlq(xtmp2, xtmp2, 4, vec_enc);
    vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);

    // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
    // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
    vpor(xtmp2, dst, xtmp2, vec_enc);
    vector_reverse_byte(bt, dst, xtmp2, vec_enc);
  }
}
6041
// Bit reversal using a single GF(2^8) affine transform per byte, then a
// byte-order reversal for multi-byte types. 'mask' holds the 64-bit affine
// matrix constant that reverses the bits of each byte.
void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc,
                                                XMMRegister xtmp, Register rscratch) {
  assert(VM_Version::supports_gfni(), "");
  assert(rscratch != noreg || always_reachable(mask), "missing");

  // Galois field instruction based bit reversal based on following algorithm.
  // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
  vpbroadcastq(xtmp, mask, vec_enc, rscratch);
  vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
  vector_reverse_byte(bt, dst, xtmp, vec_enc);
}
6053
// Swap adjacent nbits-wide bit groups in each quadword lane: the fields
// selected by 'bitmask' are shifted left by nbits, the complementary fields
// are shifted right by nbits, and the two halves are OR-ed together.
void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
                                          XMMRegister xtmp1, Register rtmp, int vec_enc) {
  vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
  evpandq(dst, xtmp1, src, vec_enc);
  vpsllq(dst, dst, nbits, vec_enc);
  vpandn(xtmp1, xtmp1, src, vec_enc);
  vpsrlq(xtmp1, xtmp1, nbits, vec_enc);
  evporq(dst, dst, xtmp1, vec_enc);
}
6063
// Byte-order reversal within each lane using rotates and an 8-bit swap
// (no shuffle table); used on EVEX targets. T_BYTE is a plain copy.
void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
  // Shift based bit reversal.
  assert(VM_Version::supports_evex(), "");
  switch(bt) {
    case T_LONG:
      // Swap upper and lower double word of each quad word.
      evprorq(xtmp1, k0, src, 32, true, vec_enc);
      evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);
      vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
      break;
    case T_INT:
      // Swap upper and lower word of each double word.
      evprord(xtmp1, k0, src, 16, true, vec_enc);
      vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
      break;
    case T_CHAR:
    case T_SHORT:
      // Swap upper and lower byte of each word.
      vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc);
      break;
    case T_BYTE:
      evmovdquq(dst, k0, src, true, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
}
6093
// Reverses the byte order within each vector element. Multi-byte element types
// are handled by shuffling with a pre-computed per-type permutation table;
// byte elements degenerate to a plain copy.
void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) {
  if (bt == T_BYTE) {
    // Byte reversal of a byte vector is the identity; just copy src to dst.
    if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
      evmovdquq(dst, k0, src, true, vec_enc);
    } else {
      vmovdqu(dst, src);
    }
    return;
  }
  // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
  // pre-computed shuffle indices.
  switch(bt) {
    case T_LONG:
      vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg);
      break;
    case T_INT:
      vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg);
      break;
    case T_CHAR:
    case T_SHORT:
      vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
  // Shuffle src's bytes according to the permutation mask loaded into dst above.
  vpshufb(dst, src, dst, vec_enc);
}
6122
// Vector leading zero count for AVX512CD targets. int/long elements use
// VPLZCNT directly; short elements are widened to double words first; byte
// elements combine two nibble-wise lookups of a 4-bit leading-zero table.
void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
                                                        KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
  assert(is_integral_type(bt), "");
  assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
  assert(VM_Version::supports_avx512cd(), "");
  switch(bt) {
    case T_LONG:
      evplzcntq(dst, ktmp, src, merge, vec_enc);
      break;
    case T_INT:
      evplzcntd(dst, ktmp, src, merge, vec_enc);
      break;
    case T_SHORT:
      // xtmp1 = all ones. Interleaving all-ones words below each src word
      // widens words to double words whose dword lzcnt equals the word lzcnt
      // (capped at 16 when the source word is zero).
      vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);
      vpunpcklwd(xtmp2, xtmp1, src, vec_enc);
      evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
      vpunpckhwd(dst, xtmp1, src, vec_enc);
      evplzcntd(dst, ktmp, dst, merge, vec_enc);
      // Narrow the per-dword counts back to words.
      vpackusdw(dst, xtmp2, dst, vec_enc);
      break;
    case T_BYTE:
      // T1 = Compute leading zero counts of 4 LSB bits of each byte by
      // accessing the lookup table.
      // T2 = Compute leading zero counts of 4 MSB bits of each byte by
      // accessing the lookup table.
      // Add T1 to T2 if 4 MSB bits of byte are all zeros.
      assert(VM_Version::supports_avx512bw(), "");
      evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
      // dst = 0x0F in every byte, to isolate nibbles.
      vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
      vpand(xtmp2, dst, src, vec_enc);
      vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
      vpsrlw(xtmp3, src, 4, vec_enc);
      vpand(xtmp3, dst, xtmp3, vec_enc);
      vpshufb(dst, xtmp1, xtmp3, vec_enc);
      vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
      // ktmp selects bytes whose high nibble is zero; only those add T1.
      evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);
      evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
}
6167
// Vector leading zero count for byte elements on AVX (pre-AVX512CD) targets,
// using a 16-entry lookup table of 4-bit leading zero counts applied to each
// nibble. Leaves xtmp1 zeroed on exit (callers rely on this, see
// vector_count_leading_zeros_short_avx).
void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                            XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
  // xtmp2 = 0x0F in every byte, to isolate nibbles.
  vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
  // T1 = Compute leading zero counts of 4 LSB bits of each byte by
  // accessing the lookup table.
  vpand(dst, xtmp2, src, vec_enc);
  vpshufb(dst, xtmp1, dst, vec_enc);
  // T2 = Compute leading zero counts of 4 MSB bits of each byte by
  // accessing the lookup table.
  vpsrlw(xtmp3, src, 4, vec_enc);
  vpand(xtmp3, xtmp2, xtmp3, vec_enc);
  vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
  // Add T1 to T2 if 4 MSB bits of byte are all zeros.
  vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
  // xtmp3 = per-byte mask: all ones where the high nibble is zero.
  vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
  vpaddb(dst, dst, xtmp2, vec_enc);
  // Select T1 + T2 where the high nibble is zero, T2 otherwise.
  vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
}
6187
// Vector leading zero count for short elements on AVX (pre-AVX512CD) targets,
// built from the per-byte counts by combining each word's two byte counts.
void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
  // Add zero counts of lower byte and upper byte of a word if
  // upper byte holds a zero value.
  vpsrlw(xtmp3, src, 8, vec_enc);
  // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
  vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
  vpsllw(xtmp2, dst, 8, vec_enc);
  vpaddw(xtmp2, xtmp2, dst, vec_enc);
  // Per word: take upper-byte count only, or upper + lower count when the
  // upper byte is zero; the result ends up in the word's upper byte.
  vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
  // Move the final count into the low byte of each word.
  vpsrlw(dst, dst, 8, vec_enc);
}
6201
// Vector leading zero count for int elements on AVX (pre-AVX512CD) targets.
// Derives the count from the biased exponent of an int->float conversion,
// with fixups for zero and negative inputs (see inline comments).
void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                           XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
  // By converting the integer to a float, we can obtain the number of leading zeros based on the exponent of the float.
  // As the float exponent contains a bias of 127 for nonzero values, the bias must be removed before interpreting the
  // exponent as the leading zero count.

  // Remove the bit to the right of the highest set bit ensuring that the conversion to float cannot round up to a higher
  // power of 2, which has a higher exponent than the input. This transformation is valid as only the highest set bit
  // contributes to the leading number of zeros.
  vpsrld(dst, src, 1, vec_enc);
  vpandn(dst, dst, src, vec_enc);

  vcvtdq2ps(dst, dst, vec_enc);

  // By comparing the register to itself, all the bits in the destination are set.
  vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);

  // Move the biased exponent to the low end of the lane and mask with 0xFF to discard the sign bit.
  // xtmp2 = 0x000000FF in each lane.
  vpsrld(xtmp2, xtmp1, 24, vec_enc);
  vpsrld(dst, dst, 23, vec_enc);
  vpand(dst, xtmp2, dst, vec_enc);

  // Subtract 127 from the exponent, which removes the bias from the exponent.
  // xtmp2 = 127 in each lane.
  vpsrld(xtmp2, xtmp1, 25, vec_enc);
  vpsubd(dst, dst, xtmp2, vec_enc);

  // xtmp2 = 31 in each lane.
  vpsrld(xtmp2, xtmp1, 27, vec_enc);

  // If the original value is 0 the exponent would not have bias, so the subtraction creates a negative number. If this
  // is found in any of the lanes, replace the lane with -1 from xtmp1.
  vblendvps(dst, dst, xtmp1, dst, vec_enc, true, xtmp3);

  // If the original value is negative, replace the lane with 31.
  vblendvps(dst, dst, xtmp2, src, vec_enc, true, xtmp3);

  // Subtract the exponent from 31, giving the final result. For 0, the result is 32 as the exponent was replaced with -1,
  // and for negative numbers the result is 0 as the exponent was replaced with 31.
  vpsubd(dst, xtmp2, dst, vec_enc);
}
6241
// Vector leading zero count for long elements on AVX (pre-AVX512CD) targets,
// built from the 32-bit counts of the two halves of each quad word.
// NOTE(review): rtmp appears unused in this body — presumably kept for
// signature symmetry with the other helpers; confirm before removing.
void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                            XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  // Find the leading zeros of the top and bottom halves of the long individually.
  vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);

  // Move the top half result to the bottom half of xtmp1, setting the top half to 0.
  vpsrlq(xtmp1, dst, 32, vec_enc);
  // By moving the top half result to the right by 6 bits, if the top half was empty (i.e. 32 is returned) the result bit will
  // be in the most significant position of the bottom half.
  vpsrlq(xtmp2, dst, 6, vec_enc);

  // In the bottom half, add the top half and bottom half results.
  vpaddq(dst, xtmp1, dst, vec_enc);

  // For the bottom half, choose between the values using the most significant bit of xtmp2.
  // If the MSB is set, then bottom+top in dst is the resulting value. If the top half is less than 32 xtmp1 is chosen,
  // which contains only the top half result.
  // In the top half the MSB is always zero, so the value in xtmp1 is always chosen. This value is always 0, which clears
  // the lane as required.
  vblendvps(dst, xtmp1, dst, xtmp2, vec_enc, true, xtmp3);
}
6263
6264 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
6265 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6266 Register rtmp, int vec_enc) {
6267 assert(is_integral_type(bt), "unexpected type");
6268 assert(vec_enc < Assembler::AVX_512bit, "");
6269 switch(bt) {
6270 case T_LONG:
6271 vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6272 break;
6273 case T_INT:
6274 vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
6275 break;
6276 case T_SHORT:
6277 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6278 break;
6279 case T_BYTE:
6280 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6281 break;
6282 default:
6283 fatal("Unsupported type %s", type2name(bt));
6284 break;
6285 }
6286 }
6287
6288 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
6289 switch(bt) {
6290 case T_BYTE:
6291 vpsubb(dst, src1, src2, vec_enc);
6292 break;
6293 case T_SHORT:
6294 vpsubw(dst, src1, src2, vec_enc);
6295 break;
6296 case T_INT:
6297 vpsubd(dst, src1, src2, vec_enc);
6298 break;
6299 case T_LONG:
6300 vpsubq(dst, src1, src2, vec_enc);
6301 break;
6302 default:
6303 fatal("Unsupported type %s", type2name(bt));
6304 break;
6305 }
6306 }
6307
// Trailing zero count computation is based on leading zero count operation as per
// following equation. All AVX3 targets support AVX512CD feature which offers
// direct vector instruction to compute leading zero count.
// CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
                                                         XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "");
  // xtmp = -1
  vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
  // xtmp = xtmp + src  (i.e. src - 1)
  vpadd(bt, xtmp4, xtmp4, src, vec_enc);
  // xtmp = xtmp & ~src  (ternary-logic imm 0x40)
  vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
  vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
  // xtmp4 = element width in bits; CTZ = width - CLZ.
  vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
  vpsub(bt, dst, xtmp4, dst, vec_enc);
}
6326
// Trailing zero count computation for AVX2 targets is based on popcount operation as per following equation
// CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                        XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "");
  // xtmp = 0
  vpxor(xtmp3 , xtmp3, xtmp3, vec_enc);
  // xtmp = 0 - src
  vpsub(bt, xtmp3, xtmp3, src, vec_enc);
  // xtmp = xtmp | src  (x | -x keeps only the lowest set bit and everything above it)
  vpor(xtmp3, xtmp3, src, vec_enc);
  vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
  // xtmp1 = element width in bits; CTZ = width - POPC.
  vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
  vpsub(bt, dst, xtmp1, dst, vec_enc);
}
6342
// Unsigned 32-bit division: dividend in rax, quotient left in rax, rdx
// clobbered. For divisor < 0 (i.e. >= 2^31 unsigned) the quotient can only
// be 0 or 1, so a compare-based fast path avoids the expensive DIV.
void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // Common path: zero-extend the dividend into rdx:rax and use hardware DIV.
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
  movl(rdx, rax);
  subl(rdx, divisor);
  if (VM_Version::supports_bmi1()) {
    // ANDN: rax = ~rdx & rax in a single instruction.
    andnl(rax, rdx, rax);
  } else {
    notl(rdx);
    andl(rax, rdx);
  }
  // Quotient is the sign bit: 1 iff dividend >=u divisor.
  shrl(rax, 31);
  bind(done);
}
6366
// Unsigned 32-bit remainder: dividend in rax, remainder left in rdx, rax
// clobbered. Negative divisors (>= 2^31 unsigned) take a DIV-free fast path.
void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // Common path: zero-extend the dividend into rdx:rax; DIV leaves the remainder in rdx.
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath when divisor < 0:
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
  movl(rdx, rax);          // save the dividend
  subl(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rax, rdx);
  } else {
    notl(rax);
    andl(rax, rdx);
  }
  // Broadcast the sign bit, then subtract divisor iff dividend >=u divisor.
  sarl(rax, 31);
  andl(rax, divisor);
  subl(rdx, rax);
  bind(done);
}
6392
// Combined unsigned 32-bit division and remainder: dividend in rax; quotient
// left in rax, remainder in rdx; tmp is clobbered. Negative divisors take the
// same DIV-free fast path as udivI/umodI.
void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
  Label done;
  Label neg_divisor_fastpath;

  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // Common path: DIV leaves the quotient in rax and the remainder in rdx.
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
  // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
  movl(rdx, rax);          // save the dividend
  subl(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rax, rdx);
  } else {
    notl(rax);
    andl(rax, rdx);
  }
  movl(tmp, rax);
  shrl(rax, 31); // quotient
  sarl(tmp, 31);
  andl(tmp, divisor);
  subl(rdx, tmp); // remainder
  bind(done);
}
6423
// Reverses the bit order of a 32-bit value: dst = bit_reverse(src).
// With GFNI, bytes are bit-reversed in one affine instruction and BSWAP
// reverses the byte order; otherwise a classic shift/mask swap ladder is used.
// Clobbers rtmp and (on the GFNI path) xtmp1/xtmp2.
void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
                                 XMMRegister xtmp2, Register rtmp) {
  if(VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal based on following algorithm.
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
    mov64(rtmp, 0x8040201008040201L);
    movq(xtmp1, src);
    movq(xtmp2, rtmp);
    gf2p8affineqb(xtmp1, xtmp2, 0);
    movq(dst, xtmp1);
  } else {
    // Swap even and odd numbered bits.
    movl(rtmp, src);
    andl(rtmp, 0x55555555);
    shll(rtmp, 1);
    movl(dst, src);
    andl(dst, 0xAAAAAAAA);
    shrl(dst, 1);
    orl(dst, rtmp);

    // Swap LSB and MSB 2 bits of each nibble.
    movl(rtmp, dst);
    andl(rtmp, 0x33333333);
    shll(rtmp, 2);
    andl(dst, 0xCCCCCCCC);
    shrl(dst, 2);
    orl(dst, rtmp);

    // Swap LSB and MSB 4 bits of each byte.
    movl(rtmp, dst);
    andl(rtmp, 0x0F0F0F0F);
    shll(rtmp, 4);
    andl(dst, 0xF0F0F0F0);
    shrl(dst, 4);
    orl(dst, rtmp);
  }
  // Bits within each byte are now reversed; reverse the byte order too.
  bswapl(dst);
}
6462
// Reverses the bit order of a 64-bit value: dst = bit_reverse(src).
// Same structure as reverseI but on quad words; the non-GFNI path needs a
// second scratch GPR for the 64-bit masks. Clobbers rtmp1, rtmp2 and (on the
// GFNI path) xtmp1/xtmp2.
void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
                                 XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
  if(VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal based on following algorithm.
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
    mov64(rtmp1, 0x8040201008040201L);
    movq(xtmp1, src);
    movq(xtmp2, rtmp1);
    gf2p8affineqb(xtmp1, xtmp2, 0);
    movq(dst, xtmp1);
  } else {
    // Swap even and odd numbered bits.
    movq(rtmp1, src);
    mov64(rtmp2, 0x5555555555555555L);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 1);
    movq(dst, src);
    notq(rtmp2);             // 0xAAAA...AA
    andq(dst, rtmp2);
    shrq(dst, 1);
    orq(dst, rtmp1);

    // Swap LSB and MSB 2 bits of each nibble.
    movq(rtmp1, dst);
    mov64(rtmp2, 0x3333333333333333L);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 2);
    notq(rtmp2);             // 0xCCCC...CC
    andq(dst, rtmp2);
    shrq(dst, 2);
    orq(dst, rtmp1);

    // Swap LSB and MSB 4 bits of each byte.
    movq(rtmp1, dst);
    mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 4);
    notq(rtmp2);             // 0xF0F0...F0
    andq(dst, rtmp2);
    shrq(dst, 4);
    orq(dst, rtmp1);
  }
  // Bits within each byte are now reversed; reverse the byte order too.
  bswapq(dst);
}
6507
// Unsigned 64-bit division: dividend in rax, quotient left in rax, rdx
// clobbered. For divisor < 0 (>= 2^63 unsigned) the quotient can only be
// 0 or 1, so a compare-based fast path avoids the expensive DIV.
void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // Common path: zero-extend the dividend into rdx:rax and use hardware DIV.
  xorl(rdx, rdx);
  divq(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
  movq(rdx, rax);
  subq(rdx, divisor);
  if (VM_Version::supports_bmi1()) {
    // ANDN: rax = ~rdx & rax in a single instruction.
    andnq(rax, rdx, rax);
  } else {
    notq(rdx);
    andq(rax, rdx);
  }
  // Quotient is the sign bit: 1 iff dividend >=u divisor.
  shrq(rax, 63);
  bind(done);
}
6531
// Unsigned 64-bit remainder: dividend in rax, remainder left in rdx, rax
// clobbered. Negative divisors (>= 2^63 unsigned) take a DIV-free fast path.
void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // Common path: zero-extend the dividend into rdx:rax; DIV leaves the remainder in rdx.
  xorq(rdx, rdx);
  divq(divisor);
  jmp(done);
  bind(neg_divisor_fastpath);
  // Fastpath when divisor < 0:
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
  movq(rdx, rax);          // save the dividend
  subq(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rax, rdx);
  } else {
    notq(rax);
    andq(rax, rdx);
  }
  // Broadcast the sign bit, then subtract divisor iff dividend >=u divisor.
  sarq(rax, 63);
  andq(rax, divisor);
  subq(rdx, rax);
  bind(done);
}
6557
// Combined unsigned 64-bit division and remainder: dividend in rax; quotient
// left in rax, remainder in rdx; tmp is clobbered. Negative divisors take the
// same DIV-free fast path as udivL/umodL.
void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // Common path: DIV leaves the quotient in rax and the remainder in rdx.
  xorq(rdx, rdx);
  divq(divisor);
  jmp(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
  // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
  movq(rdx, rax);          // save the dividend
  subq(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rax, rdx);
  } else {
    notq(rax);
    andq(rax, rdx);
  }
  movq(tmp, rax);
  shrq(rax, 63); // quotient
  sarq(tmp, 63);
  andq(tmp, divisor);
  subq(rdx, tmp); // remainder
  bind(done);
}
6587
// Cross-lane byte rearrangement for 512-bit vectors: dst[i] = src[shuffle[i]]
// for shuffle indices in 0..63. VPSHUFB only shuffles within 128-bit lanes, so
// each of the four source lanes is broadcast across the vector in turn and the
// shuffle result is merged under a mask selecting the indices that fall into
// that lane.
void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
                                        XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
                                        int vlen_enc) {
  assert(VM_Version::supports_avx512bw(), "");
  // Byte shuffles are inlane operations and indices are determined using
  // lower 4 bit of each shuffle lane, thus all shuffle indices are
  // normalized to index range 0-15. This makes sure that all the multiples
  // of an index value are placed at same relative position in 128 bit
  // lane i.e. elements corresponding to shuffle indices 16, 32 and 64
  // will be 16th element in their respective 128 bit lanes.
  movl(rtmp, 16);
  evpbroadcastb(xtmp1, rtmp, vlen_enc);

  // Compute a mask for shuffle vector by comparing indices with expression INDEX < 16,
  // Broadcast first 128 bit lane across entire vector, shuffle the vector lanes using
  // original shuffle indices and move the shuffled lanes corresponding to true
  // mask to destination vector.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
  evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);

  // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
  // and broadcasting second 128 bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
  // xtmp2 = 32 in every byte (each 0x10 byte doubled by the quad word shift).
  vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
  // and broadcasting third 128 bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp2, Assembler::nlt, true, vlen_enc);
  // xtmp1 = 48 in every byte.
  vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
  // and broadcasting fourth 128 bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
  // xtmp2 = 64 in every byte.
  vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
}
6633
6634 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
6635 XMMRegister shuffle, XMMRegister src, int vlen_enc) {
6636 if (vlen_enc == AVX_128bit) {
6637 vpermilps(dst, src, shuffle, vlen_enc);
6638 } else if (bt == T_INT) {
6639 vpermd(dst, shuffle, src, vlen_enc);
6640 } else {
6641 assert(bt == T_FLOAT, "");
6642 vpermps(dst, shuffle, src, vlen_enc);
6643 }
6644 }
6645
6646 void C2_MacroAssembler::efp16sh(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
6647 switch(opcode) {
6648 case Op_AddHF: vaddsh(dst, src1, src2); break;
6649 case Op_SubHF: vsubsh(dst, src1, src2); break;
6650 case Op_MulHF: vmulsh(dst, src1, src2); break;
6651 case Op_DivHF: vdivsh(dst, src1, src2); break;
6652 default: assert(false, "%s", NodeClassNames[opcode]); break;
6653 }
6654 }
6655
6656 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6657 switch(elem_bt) {
6658 case T_BYTE:
6659 if (ideal_opc == Op_SaturatingAddV) {
6660 vpaddsb(dst, src1, src2, vlen_enc);
6661 } else {
6662 assert(ideal_opc == Op_SaturatingSubV, "");
6663 vpsubsb(dst, src1, src2, vlen_enc);
6664 }
6665 break;
6666 case T_SHORT:
6667 if (ideal_opc == Op_SaturatingAddV) {
6668 vpaddsw(dst, src1, src2, vlen_enc);
6669 } else {
6670 assert(ideal_opc == Op_SaturatingSubV, "");
6671 vpsubsw(dst, src1, src2, vlen_enc);
6672 }
6673 break;
6674 default:
6675 fatal("Unsupported type %s", type2name(elem_bt));
6676 break;
6677 }
6678 }
6679
6680 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6681 switch(elem_bt) {
6682 case T_BYTE:
6683 if (ideal_opc == Op_SaturatingAddV) {
6684 vpaddusb(dst, src1, src2, vlen_enc);
6685 } else {
6686 assert(ideal_opc == Op_SaturatingSubV, "");
6687 vpsubusb(dst, src1, src2, vlen_enc);
6688 }
6689 break;
6690 case T_SHORT:
6691 if (ideal_opc == Op_SaturatingAddV) {
6692 vpaddusw(dst, src1, src2, vlen_enc);
6693 } else {
6694 assert(ideal_opc == Op_SaturatingSubV, "");
6695 vpsubusw(dst, src1, src2, vlen_enc);
6696 }
6697 break;
6698 default:
6699 fatal("Unsupported type %s", type2name(elem_bt));
6700 break;
6701 }
6702 }
6703
// Unsigned saturating subtraction for int/long elements using EVEX masking:
// lanes that would underflow are clamped to zero.
void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                               XMMRegister src2, KRegister ktmp, int vlen_enc) {
  // For unsigned subtraction, overflow happens when magnitude of second input is greater than first input.
  // overflow_mask = Inp1 <u Inp2
  evpcmpu(elem_bt, ktmp, src2, src1, Assembler::lt, vlen_enc);
  // Res = overflow_mask ? Zero : INP1 - INP2 (non-commutative and non-associative)
  evmasked_op(elem_bt == T_INT ? Op_SubVI : Op_SubVL, elem_bt, ktmp, dst, src1, src2, false, vlen_enc, false);
}
6712
// Unsigned saturating subtraction for int/long elements on AVX targets (no
// opmask registers): underflowing lanes are clamped to zero via a blend.
void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                                              XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  // Emulate unsigned comparison using signed comparison
  // Mask = Inp1 <u Inp2 => Inp1 + MIN_VALUE < Inp2 + MIN_VALUE
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc, true);
  vpadd(elem_bt, xtmp2, src1, xtmp1, vlen_enc);
  vpadd(elem_bt, xtmp1, src2, xtmp1, vlen_enc);

  // xtmp2 = all ones in lanes where src2 (biased) > src1 (biased), i.e. underflow.
  vpcmpgt(elem_bt, xtmp2, xtmp1, xtmp2, vlen_enc);

  // Res = INP1 - INP2 (non-commutative and non-associative)
  vpsub(elem_bt, dst, src1, src2, vlen_enc);
  // Res = Mask ? Zero : Res
  vpxor(xtmp1, xtmp1, xtmp1, vlen_enc);
  vpblendvb(dst, dst, xtmp1, xtmp2, vlen_enc);
}
6729
// Unsigned saturating addition for int/long elements using EVEX masking:
// overflowing lanes are clamped to the maximum unsigned value (all ones).
void C2_MacroAssembler::vector_add_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                                               XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp, int vlen_enc) {
  // Unsigned values ranges comprise of only +ve numbers, thus there exist only an upper bound saturation.
  // overflow_mask = (SRC1 + SRC2) <u (SRC1 | SRC2)
  // Res = Signed Add INP1, INP2
  vpadd(elem_bt, dst, src1, src2, vlen_enc);
  // T1 = SRC1 | SRC2
  vpor(xtmp1, src1, src2, vlen_enc);
  // Max_Unsigned = -1
  vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
  // Unsigned compare: Mask = Res <u T1
  evpcmpu(elem_bt, ktmp, dst, xtmp1, Assembler::lt, vlen_enc);
  // res = Mask ? Max_Unsigned : Res
  evpblend(elem_bt, dst, ktmp, dst, xtmp2, true, vlen_enc);
}
6745
6746 //
6747 // Section 2-13 Hacker's Delight list following overflow detection check for saturating
6748 // unsigned addition operation.
6749 // overflow_mask = ((a & b) | ((a | b) & ~( a + b))) >>> 31 == 1
6750 //
6751 // We empirically determined its semantic equivalence to following reduced expression
6752 // overflow_mask = (a + b) <u (a | b)
6753 //
6754 // and also verified it though Alive2 solver.
6755 // (https://alive2.llvm.org/ce/z/XDQ7dY)
6756 //
6757
// Unsigned saturating addition for int/long elements on AVX targets (no
// opmask registers). Overflow is detected with the reduced expression from
// the comment block above, using signed compares on bias-adjusted values;
// overflowing lanes are clamped to all ones (unsigned max).
void C2_MacroAssembler::vector_add_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                                              XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, int vlen_enc) {
  // Res = Signed Add INP1, INP2
  vpadd(elem_bt, dst, src1, src2, vlen_enc);
  // Compute T1 = INP1 | INP2
  vpor(xtmp3, src1, src2, vlen_enc);
  // xtmp2 = minimum signed value; xtmp1 is left holding all ones (-1).
  vpgenmin_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
  // Convert T1 to signed value, T1 = T1 + MIN_VALUE
  vpadd(elem_bt, xtmp3, xtmp3, xtmp2, vlen_enc);
  // Convert Res to signed value, Res<s> = Res + MIN_VALUE
  vpadd(elem_bt, xtmp2, xtmp2, dst, vlen_enc);
  // Compute overflow detection mask = Res<s> <s T1
  if (elem_bt == T_INT) {
    vpcmpgtd(xtmp3, xtmp3, xtmp2, vlen_enc);
  } else {
    assert(elem_bt == T_LONG, "");
    vpcmpgtq(xtmp3, xtmp3, xtmp2, vlen_enc);
  }
  // On overflow replace the lane with all ones (unsigned max) from xtmp1.
  vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
}
6779
// Copies the sign bit of each quad word element of src into mask register
// ktmp. Uses VPMOVQ2M directly when AVX512DQ is available; otherwise emulates
// it by broadcasting the sign via arithmetic shift and comparing against -1.
// xtmp2_hold_M1 indicates xtmp2 already contains all ones, skipping its setup.
// On the emulated path xtmp2 is left holding -1 in all lanes.
void C2_MacroAssembler::evpmovq2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                      int vlen_enc, bool xtmp2_hold_M1) {
  if (VM_Version::supports_avx512dq()) {
    evpmovq2m(ktmp, src, vlen_enc);
  } else {
    assert(VM_Version::supports_evex(), "");
    if (!xtmp2_hold_M1) {
      // xtmp2 = -1 in all lanes.
      vpternlogq(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
    }
    // xtmp1 = all ones for negative elements, all zeros otherwise.
    evpsraq(xtmp1, src, 63, vlen_enc);
    evpcmpeqq(ktmp, k0, xtmp1, xtmp2, vlen_enc);
  }
}
6793
// Copies the sign bit of each double word element of src into mask register
// ktmp. Uses VPMOVD2M directly when AVX512DQ is available; otherwise emulates
// it by broadcasting the sign via arithmetic shift and comparing against -1.
// xtmp2_hold_M1 indicates xtmp2 already contains all ones, skipping its setup.
// On the emulated path xtmp2 is left holding -1 in all lanes.
void C2_MacroAssembler::evpmovd2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                      int vlen_enc, bool xtmp2_hold_M1) {
  if (VM_Version::supports_avx512dq()) {
    evpmovd2m(ktmp, src, vlen_enc);
  } else {
    assert(VM_Version::supports_evex(), "");
    if (!xtmp2_hold_M1) {
      // xtmp2 = -1 in all lanes.
      vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
    }
    // xtmp1 = all ones for negative elements, all zeros otherwise.
    vpsrad(xtmp1, src, 31, vlen_enc);
    Assembler::evpcmpeqd(ktmp, k0, xtmp1, xtmp2, vlen_enc);
  }
}
6807
6808
// Fills each int/long lane of dst with the sign of the corresponding src lane
// (all zeros or all ones), i.e. an arithmetic shift by element width - 1.
void C2_MacroAssembler::vpsign_extend_dq(BasicType elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
  if (elem_bt == T_LONG) {
    if (VM_Version::supports_evex()) {
      evpsraq(dst, src, 63, vlen_enc);
    } else {
      // No packed 64-bit arithmetic shift without EVEX: shift dwords by 31,
      // then replicate the upper (sign-carrying) dword of each quad word
      // (shuffle control 0xF5 selects dword lanes 1,1,3,3).
      vpsrad(dst, src, 31, vlen_enc);
      vpshufd(dst, dst, 0xF5, vlen_enc);
    }
  } else {
    assert(elem_bt == T_INT, "");
    vpsrad(dst, src, 31, vlen_enc);
  }
}
6822
// Broadcasts the maximum signed value (0x7FFF...FF) for int/long elements into
// dst by logically shifting an all-ones vector right by one. 'allones' is a
// scratch register filled with -1 unless compute_allones is false (caller
// guarantees it already holds all ones); it still holds -1 on return.
void C2_MacroAssembler::vpgenmax_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
  if (compute_allones) {
    if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
      vpternlogd(allones, 0xff, allones, allones, vlen_enc);
    } else {
      vpcmpeqq(allones, allones, allones, vlen_enc);
    }
  }
  if (elem_bt == T_LONG) {
    vpsrlq(dst, allones, 1, vlen_enc);
  } else {
    assert(elem_bt == T_INT, "");
    vpsrld(dst, allones, 1, vlen_enc);
  }
}
6838
// Broadcasts the minimum signed value (0x8000...00) for int/long elements into
// dst by shifting an all-ones vector left by width - 1. 'allones' is a scratch
// register filled with -1 unless compute_allones is false (caller guarantees
// it already holds all ones); it still holds -1 on return.
void C2_MacroAssembler::vpgenmin_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
  if (compute_allones) {
    if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
      vpternlogd(allones, 0xff, allones, allones, vlen_enc);
    } else {
      vpcmpeqq(allones, allones, allones, vlen_enc);
    }
  }
  if (elem_bt == T_LONG) {
    vpsllq(dst, allones, 63, vlen_enc);
  } else {
    assert(elem_bt == T_INT, "");
    vpslld(dst, allones, 31, vlen_enc);
  }
}
6854
6855 void C2_MacroAssembler::evpcmpu(BasicType elem_bt, KRegister kmask, XMMRegister src1, XMMRegister src2,
6856 Assembler::ComparisonPredicate cond, int vlen_enc) {
6857 switch(elem_bt) {
6858 case T_LONG: evpcmpuq(kmask, src1, src2, cond, vlen_enc); break;
6859 case T_INT: evpcmpud(kmask, src1, src2, cond, vlen_enc); break;
6860 case T_SHORT: evpcmpuw(kmask, src1, src2, cond, vlen_enc); break;
6861 case T_BYTE: evpcmpub(kmask, src1, src2, cond, vlen_enc); break;
6862 default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6863 }
6864 }
6865
6866 void C2_MacroAssembler::vpcmpgt(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6867 switch(elem_bt) {
6868 case T_LONG: vpcmpgtq(dst, src1, src2, vlen_enc); break;
6869 case T_INT: vpcmpgtd(dst, src1, src2, vlen_enc); break;
6870 case T_SHORT: vpcmpgtw(dst, src1, src2, vlen_enc); break;
6871 case T_BYTE: vpcmpgtb(dst, src1, src2, vlen_enc); break;
6872 default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6873 }
6874 }
6875
// Copies the per-element sign bit of an int/long vector into mask register
// ktmp, dispatching to the matching VPMOV*2M emulation. On the emulated
// (non-AVX512DQ) path xtmp2 is left holding -1 in all lanes (see
// evpmovq2m_emu/evpmovd2m_emu); some callers rely on this.
void C2_MacroAssembler::evpmov_vec_to_mask(BasicType elem_bt, KRegister ktmp, XMMRegister src, XMMRegister xtmp1,
                                           XMMRegister xtmp2, int vlen_enc, bool xtmp2_hold_M1) {
  if (elem_bt == T_LONG) {
    evpmovq2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
  } else {
    assert(elem_bt == T_INT, "");
    evpmovd2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
  }
}
6885
// Saturating add/subtract for 32/64-bit integer lanes on EVEX-capable targets.
// Computes the wrapping result, detects per-lane signed overflow, then blends
// the saturation constant (MAX_VALUE or MIN_VALUE, chosen by the first input's
// sign) into the overflowed lanes.
// Clobbers xtmp1, xtmp2, ktmp1 and ktmp2.
void C2_MacroAssembler::vector_addsub_dq_saturating_evex(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                         XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
                                                         KRegister ktmp1, KRegister ktmp2, int vlen_enc) {
  assert(elem_bt == T_INT || elem_bt == T_LONG, "");
  // Addition/Subtraction happens over two's complement representation of numbers and is agnostic to signedness.
  // Overflow detection based on Hacker's Delight section 2-13.
  if (ideal_opc == Op_SaturatingAddV) {
    // res = src1 + src2
    vpadd(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs if the result's sign differs from both same-signed inputs.
    // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
    vpxor(xtmp1, dst, src1, vlen_enc);
    vpxor(xtmp2, dst, src2, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  } else {
    assert(ideal_opc == Op_SaturatingSubV, "");
    // res = src1 - src2
    vpsub(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs when both inputs have opposite polarity and
    // the result's sign differs from the first input's sign.
    // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1;
    vpxor(xtmp1, src1, src2, vlen_enc);
    vpxor(xtmp2, dst, src1, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  }

  // Compute overflow detection mask (ktmp1 = sign bits of xtmp2).
  evpmov_vec_to_mask(elem_bt, ktmp1, xtmp2, xtmp2, xtmp1, vlen_enc);
  // Note: xtmp1 hold -1 in all its lanes after above call.

  // Compute mask based on first input polarity (ktmp2 = sign bits of src1);
  // passing true reuses the -1 vector already in xtmp1.
  evpmov_vec_to_mask(elem_bt, ktmp2, src1, xtmp2, xtmp1, vlen_enc, true);

  // xtmp2 = per-lane MAX_VALUE, xtmp1 = per-lane MIN_VALUE.
  vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);

  // Compose a vector of saturating (MAX/MIN) values, where lanes corresponding to
  // set bits in first input polarity mask holds a min value.
  evpblend(elem_bt, xtmp2, ktmp2, xtmp2, xtmp1, true, vlen_enc);
  // Blend destination lanes with saturated values using overflow detection mask.
  evpblend(elem_bt, dst, ktmp1, dst, xtmp2, true, vlen_enc);
}
6928
6929
// Saturating add/subtract for 32/64-bit integer lanes on AVX targets without
// opmask registers. Same algorithm as the EVEX flavor, but overflow and
// polarity masks are kept as sign-extended vectors and applied with
// vpblendvb instead of mask-register blends.
// Clobbers xtmp1 through xtmp4.
void C2_MacroAssembler::vector_addsub_dq_saturating_avx(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                        XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
                                                        XMMRegister xtmp3, XMMRegister xtmp4, int vlen_enc) {
  assert(elem_bt == T_INT || elem_bt == T_LONG, "");
  // Addition/Subtraction happens over two's complement representation of numbers and is agnostic to signedness.
  // Overflow detection based on Hacker's Delight section 2-13.
  if (ideal_opc == Op_SaturatingAddV) {
    // res = src1 + src2
    vpadd(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs if the result's sign differs from both same-signed inputs.
    // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
    vpxor(xtmp1, dst, src1, vlen_enc);
    vpxor(xtmp2, dst, src2, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  } else {
    assert(ideal_opc == Op_SaturatingSubV, "");
    // res = src1 - src2
    vpsub(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs when both inputs have opposite polarity and
    // the result's sign differs from the first input's sign.
    // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1;
    vpxor(xtmp1, src1, src2, vlen_enc);
    vpxor(xtmp2, dst, src1, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  }

  // Sign-extend to compute overflow detection mask (xtmp3 = all-ones in
  // overflowed lanes).
  vpsign_extend_dq(elem_bt, xtmp3, xtmp2, vlen_enc);

  // xtmp1 = -1 in all lanes; then xtmp2 = per-lane MAX_VALUE,
  // xtmp1 = per-lane MIN_VALUE.
  vpcmpeqd(xtmp1, xtmp1, xtmp1, vlen_enc);
  vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc);
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);

  // Compose saturating min/max vector using first input polarity mask:
  // lanes where src1 is negative take MIN_VALUE, others MAX_VALUE.
  vpsign_extend_dq(elem_bt, xtmp4, src1, vlen_enc);
  vpblendvb(xtmp1, xtmp2, xtmp1, xtmp4, vlen_enc);

  // Blend result with saturating vector using overflow detection mask.
  vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
}
6970
6971 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
6972 switch(elem_bt) {
6973 case T_BYTE:
6974 if (ideal_opc == Op_SaturatingAddV) {
6975 vpaddsb(dst, src1, src2, vlen_enc);
6976 } else {
6977 assert(ideal_opc == Op_SaturatingSubV, "");
6978 vpsubsb(dst, src1, src2, vlen_enc);
6979 }
6980 break;
6981 case T_SHORT:
6982 if (ideal_opc == Op_SaturatingAddV) {
6983 vpaddsw(dst, src1, src2, vlen_enc);
6984 } else {
6985 assert(ideal_opc == Op_SaturatingSubV, "");
6986 vpsubsw(dst, src1, src2, vlen_enc);
6987 }
6988 break;
6989 default:
6990 fatal("Unsupported type %s", type2name(elem_bt));
6991 break;
6992 }
6993 }
6994
6995 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
6996 switch(elem_bt) {
6997 case T_BYTE:
6998 if (ideal_opc == Op_SaturatingAddV) {
6999 vpaddusb(dst, src1, src2, vlen_enc);
7000 } else {
7001 assert(ideal_opc == Op_SaturatingSubV, "");
7002 vpsubusb(dst, src1, src2, vlen_enc);
7003 }
7004 break;
7005 case T_SHORT:
7006 if (ideal_opc == Op_SaturatingAddV) {
7007 vpaddusw(dst, src1, src2, vlen_enc);
7008 } else {
7009 assert(ideal_opc == Op_SaturatingSubV, "");
7010 vpsubusw(dst, src1, src2, vlen_enc);
7011 }
7012 break;
7013 default:
7014 fatal("Unsupported type %s", type2name(elem_bt));
7015 break;
7016 }
7017 }
7018
7019 void C2_MacroAssembler::select_from_two_vectors_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
7020 XMMRegister src2, int vlen_enc) {
7021 switch(elem_bt) {
7022 case T_BYTE:
7023 evpermi2b(dst, src1, src2, vlen_enc);
7024 break;
7025 case T_SHORT:
7026 evpermi2w(dst, src1, src2, vlen_enc);
7027 break;
7028 case T_INT:
7029 evpermi2d(dst, src1, src2, vlen_enc);
7030 break;
7031 case T_LONG:
7032 evpermi2q(dst, src1, src2, vlen_enc);
7033 break;
7034 case T_FLOAT:
7035 evpermi2ps(dst, src1, src2, vlen_enc);
7036 break;
7037 case T_DOUBLE:
7038 evpermi2pd(dst, src1, src2, vlen_enc);
7039 break;
7040 default:
7041 fatal("Unsupported type %s", type2name(elem_bt));
7042 break;
7043 }
7044 }
7045
7046 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, bool is_unsigned, int vlen_enc) {
7047 if (is_unsigned) {
7048 vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7049 } else {
7050 vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7051 }
7052 }
7053
7054 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, bool is_unsigned, int vlen_enc) {
7055 if (is_unsigned) {
7056 vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7057 } else {
7058 vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7059 }
7060 }
7061
7062 void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
7063 switch(opcode) {
7064 case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
7065 case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
7066 case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
7067 case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
7068 default: assert(false, "%s", NodeClassNames[opcode]); break;
7069 }
7070 }
7071
7072 void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
7073 switch(opcode) {
7074 case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
7075 case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
7076 case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
7077 case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
7078 default: assert(false, "%s", NodeClassNames[opcode]); break;
7079 }
7080 }
7081
// Scalar FP16 min/max: delegates to the vector helper with a 128-bit
// vector length encoding (only lane 0 is of interest to callers).
void C2_MacroAssembler::sminmax_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                     KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2) {
  vminmax_fp16(opcode, dst, src1, src2, ktmp, xtmp1, xtmp2, Assembler::AVX_128bit);
}
7086
7087 void C2_MacroAssembler::sminmax_fp16_avx10_2(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
7088 KRegister ktmp) {
7089 if (opcode == Op_MaxHF) {
7090 // dst = max(src1, src2)
7091 evminmaxsh(dst, ktmp, src1, src2, true, AVX10_2_MINMAX_MAX_COMPARE_SIGN);
7092 } else {
7093 assert(opcode == Op_MinHF, "");
7094 // dst = min(src1, src2)
7095 evminmaxsh(dst, ktmp, src1, src2, true, AVX10_2_MINMAX_MIN_COMPARE_SIGN);
7096 }
7097 }
7098
// Vector FP16 min/max with Java semantics: NaN in either input propagates to
// the result, and -0.0 is ordered below +0.0. Achieved by pre-swapping the
// operands with a sign-bit mask so the hardware VMINPH/VMAXPH tie-breaking
// rules (which favor the second source) produce the desired answer, then
// patching NaN lanes afterwards.
// Clobbers ktmp, xtmp1 and xtmp2.
void C2_MacroAssembler::vminmax_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                     KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  if (opcode == Op_MaxVHF || opcode == Op_MaxHF) {
    // Move sign bits of src2 to mask register.
    evpmovw2m(ktmp, src2, vlen_enc);
    // xtmp1 = src2 < 0 ? src2 : src1
    evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
    // xtmp2 = src2 < 0 ? src1 : src2
    evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
    // Idea behind above swapping is to make the second source operand a non-negative value.
    // As per instruction semantic, if the values being compared are both 0.0s (of either sign), the value in
    // the second source operand is returned. If only one value is a NaN (SNaN or QNaN) for this instruction,
    // the second source operand, either a NaN or a valid floating-point value, is returned
    // dst = max(xtmp1, xtmp2)
    evmaxph(dst, xtmp1, xtmp2, vlen_enc);
    // isNaN = is_unordered_quiet(xtmp1)
    evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
    // Final result is same as first source if it is a NaN value;
    // in case the second operand holds a NaN value then as per above semantics
    // the result is same as the second operand.
    Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
  } else {
    assert(opcode == Op_MinVHF || opcode == Op_MinHF, "");
    // Move sign bits of src1 to mask register.
    evpmovw2m(ktmp, src1, vlen_enc);
    // xtmp1 = src1 < 0 ? src2 : src1
    evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
    // xtmp2 = src1 < 0 ? src1 : src2
    evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
    // Idea behind above swapping is to make the second source operand a negative value.
    // As per instruction semantics, if the values being compared are both 0.0s (of either sign), the value in
    // the second source operand is returned.
    // If only one value is a NaN (SNaN or QNaN) for this instruction, the second source operand, either a NaN
    // or a valid floating-point value, is written to the result.
    // dst = min(xtmp1, xtmp2)
    evminph(dst, xtmp1, xtmp2, vlen_enc);
    // isNaN = is_unordered_quiet(xtmp1)
    evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
    // Final result is same as first source if it is a NaN value;
    // in case the second operand holds a NaN value then as per above semantics
    // the result is same as the second operand.
    Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
  }
}
7143
7144 void C2_MacroAssembler::vminmax_fp16_avx10_2(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
7145 KRegister ktmp, int vlen_enc) {
7146 if (opcode == Op_MaxVHF) {
7147 // dst = max(src1, src2)
7148 evminmaxph(dst, ktmp, src1, src2, true, AVX10_2_MINMAX_MAX_COMPARE_SIGN, vlen_enc);
7149 } else {
7150 assert(opcode == Op_MinVHF, "");
7151 // dst = min(src1, src2)
7152 evminmaxph(dst, ktmp, src1, src2, true, AVX10_2_MINMAX_MIN_COMPARE_SIGN, vlen_enc);
7153 }
7154 }
7155
7156 void C2_MacroAssembler::vminmax_fp16_avx10_2(int opcode, XMMRegister dst, XMMRegister src1, Address src2,
7157 KRegister ktmp, int vlen_enc) {
7158 if (opcode == Op_MaxVHF) {
7159 // dst = max(src1, src2)
7160 evminmaxph(dst, ktmp, src1, src2, true, AVX10_2_MINMAX_MAX_COMPARE_SIGN, vlen_enc);
7161 } else {
7162 assert(opcode == Op_MinVHF, "");
7163 // dst = min(src1, src2)
7164 evminmaxph(dst, ktmp, src1, src2, true, AVX10_2_MINMAX_MIN_COMPARE_SIGN, vlen_enc);
7165 }
7166 }