1 /*
2 * Copyright (c) 2020, 2026, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
24
25 #include "asm/assembler.hpp"
26 #include "asm/assembler.inline.hpp"
27 #include "gc/shared/barrierSet.hpp"
28 #include "gc/shared/barrierSetAssembler.hpp"
29 #include "oops/methodData.hpp"
30 #include "opto/c2_MacroAssembler.hpp"
31 #include "opto/intrinsicnode.hpp"
32 #include "opto/output.hpp"
33 #include "opto/opcodes.hpp"
34 #include "opto/subnode.hpp"
35 #include "runtime/globals.hpp"
36 #include "runtime/objectMonitor.hpp"
37 #include "runtime/objectMonitorTable.hpp"
38 #include "runtime/stubRoutines.hpp"
39 #include "runtime/synchronizer.hpp"
40 #include "utilities/checkedCast.hpp"
41 #include "utilities/globalDefinitions.hpp"
42 #include "utilities/powerOfTwo.hpp"
43 #include "utilities/sizes.hpp"
44
45 #ifdef PRODUCT
46 #define BLOCK_COMMENT(str) /* nothing */
47 #define STOP(error) stop(error)
48 #else
49 #define BLOCK_COMMENT(str) block_comment(str)
50 #define STOP(error) block_comment(error); stop(error)
51 #endif
52
53 // C2 compiled method's prolog code.
54 // Beware! This sp_inc is NOT the same as the one mentioned in MacroAssembler::remove_frame but only the size
55 // of the extension space + the additional copy of the return address. That means, it doesn't contain the
56 // frame size (where the local and sp_inc are) and the saved RBP.
// Emits the C2 verified-entry prolog: optional class-initialization barrier,
// stack-bang overflow check, saving of rbp, and frame allocation.
// On exit rsp points at the bottom of the newly created frame.
void C2_MacroAssembler::verified_entry(Compile* C, int sp_inc) {
  if (C->clinit_barrier_on_entry()) {
    // Holder class may still be initializing: threads other than the
    // initializer thread must be diverted to the wrong-method stub until
    // initialization completes.
    assert(VM_Version::supports_fast_class_init_checks(), "sanity");
    assert(!C->method()->holder()->is_not_initialized(), "initialization should have been started");

    Label L_skip_barrier;
    Register klass = rscratch1;

    mov_metadata(klass, C->method()->holder()->constant_encoding());
    clinit_barrier(klass, &L_skip_barrier /*L_fast_path*/);

    jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path

    bind(L_skip_barrier);
  }

  int framesize = C->output()->frame_size_in_bytes();
  int bangsize = C->output()->bang_size_in_bytes();
  bool fp_mode_24b = false;
  int stack_bang_size = C->output()->need_stack_bang(bangsize) ? bangsize : 0;

  assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");

  assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  // Remove word for return addr
  framesize -= wordSize;
  stack_bang_size -= wordSize;

  // Calls to C2R adapters often do not accept exceptional returns.
  // We require that their callers must bang for them. But be careful, because
  // some VM calls (such as call site linkage) can use several kilobytes of
  // stack. But the stack safety zone should account for that.
  // See bugs 4446381, 4468289, 4497237.
  if (stack_bang_size > 0) {
    generate_stack_overflow_check(stack_bang_size);

    // We always push rbp, so that on return to interpreter rbp, will be
    // restored correctly and we can correct the stack.
    push(rbp);
#ifdef ASSERT
    if (sp_inc > 0) {
      // Poison the two stack slots at the top of the extension space so a
      // stale read is caught in debug builds.
      movl(Address(rsp, 0), badRegWordVal);
      movl(Address(rsp, VMRegImpl::stack_slot_size), badRegWordVal);
    }
#endif
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      mov(rbp, rsp);
    }
    // Remove word for ebp
    framesize -= wordSize;

    // Create frame
    if (framesize) {
      subptr(rsp, framesize);
    }
  } else {
    // No stack bang needed: allocate the frame first, then store rbp into its
    // slot instead of pushing it.
    subptr(rsp, framesize);

    // Save RBP register now.
    framesize -= wordSize;
    movptr(Address(rsp, framesize), rbp);
#ifdef ASSERT
    if (sp_inc > 0) {
      // Poison the extension-space slots (debug only), mirroring the
      // stack-bang branch above.
      movl(Address(rsp, framesize), badRegWordVal);
      movl(Address(rsp, framesize + VMRegImpl::stack_slot_size), badRegWordVal);
    }
#endif
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      movptr(rbp, rsp);
      if (framesize > 0) {
        addptr(rbp, framesize);
      }
    }
  }

  if (C->needs_stack_repair()) {
    // Save stack increment just below the saved rbp (also account for fixed framesize and rbp)
    assert((sp_inc & (StackAlignmentInBytes-1)) == 0, "stack increment not aligned");
    movptr(Address(rsp, framesize - wordSize), sp_inc + framesize);
  }

  if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
    framesize -= wordSize;
    movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
  }

#ifdef ASSERT
  if (VerifyStackAtCalls) {
    // Verify rsp is StackAlignment-aligned minus the return-address word.
    Label L;
    push(rax);
    mov(rax, rsp);
    andptr(rax, StackAlignmentInBytes-1);
    cmpptr(rax, StackAlignmentInBytes-wordSize);
    pop(rax);
    jcc(Assembler::equal, L);
    STOP("Stack is not properly aligned!");
    bind(L);
  }
#endif
}
159
160 void C2_MacroAssembler::entry_barrier() {
161 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
162 // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
163 Label dummy_slow_path;
164 Label dummy_continuation;
165 Label* slow_path = &dummy_slow_path;
166 Label* continuation = &dummy_continuation;
167 if (!Compile::current()->output()->in_scratch_emit_size()) {
168 // Use real labels from actual stub when not emitting code for the purpose of measuring its size
169 C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
170 Compile::current()->output()->add_stub(stub);
171 slow_path = &stub->entry();
172 continuation = &stub->continuation();
173 }
174 bs->nmethod_entry_barrier(this, slow_path, continuation);
175 }
176
177 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
178 switch (vlen_in_bytes) {
179 case 4: // fall-through
180 case 8: // fall-through
181 case 16: return Assembler::AVX_128bit;
182 case 32: return Assembler::AVX_256bit;
183 case 64: return Assembler::AVX_512bit;
184
185 default: {
186 ShouldNotReachHere();
187 return Assembler::AVX_NoVec;
188 }
189 }
190 }
191
192 // fast_lock and fast_unlock used by C2
193
194 // Because the transitions from emitted code to the runtime
195 // monitorenter/exit helper stubs are so slow it's critical that
196 // we inline both the lock-stack fast path and the inflated fast path.
197 //
198 // See also: cmpFastLock and cmpFastUnlock.
199 //
200 // What follows is a specialized inline transliteration of the code
201 // in enter() and exit(). If we're concerned about I$ bloat another
202 // option would be to emit TrySlowEnter and TrySlowExit methods
203 // at startup-time. These methods would accept arguments as
// (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
205 // indications in the icc.ZFlag. fast_lock and fast_unlock would simply
206 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
207 // In practice, however, the # of lock sites is bounded and is usually small.
208 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
209 // if the processor uses simple bimodal branch predictors keyed by EIP
210 // Since the helper routines would be called from multiple synchronization
211 // sites.
212 //
213 // An even better approach would be write "MonitorEnter()" and "MonitorExit()"
214 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
215 // to those specialized methods. That'd give us a mostly platform-independent
216 // implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
218 // to park() or unpark() threads. We'd also need a few more unsafe operators
219 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
220 // (b) explicit barriers or fence operations.
221 //
222 // TODO:
223 //
224 // * Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
225 // This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
226 // Given TLAB allocation, Self is usually manifested in a register, so passing it into
227 // the lock operators would typically be faster than reifying Self.
228 //
229 // * Ideally I'd define the primitives as:
230 // fast_lock (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
231 // fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
232 // Unfortunately ADLC bugs prevent us from expressing the ideal form.
233 // Instead, we're stuck with a rather awkward and brittle register assignments below.
234 // Furthermore the register assignments are overconstrained, possibly resulting in
235 // sub-optimal code near the synchronization site.
236 //
237 // * Eliminate the sp-proximity tests and just use "== Self" tests instead.
238 // Alternately, use a better sp-proximity test.
239 //
240 // * Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
241 // Either one is sufficient to uniquely identify a thread.
242 // TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
243 //
244 // * Intrinsify notify() and notifyAll() for the common cases where the
245 // object is locked by the calling thread but the waitlist is empty.
246 // avoid the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
247 //
248 // * use jccb and jmpb instead of jcc and jmp to improve code density.
249 // But beware of excessive branch density on AMD Opterons.
250 //
251 // * Both fast_lock and fast_unlock set the ICC.ZF to indicate success
252 // or failure of the fast path. If the fast path fails then we pass
253 // control to the slow path, typically in C. In fast_lock and
254 // fast_unlock we often branch to DONE_LABEL, just to find that C2
255 // will emit a conditional branch immediately after the node.
256 // So we have branches to branches and lots of ICC.ZF games.
257 // Instead, it might be better to have C2 pass a "FailureLabel"
258 // into fast_lock and fast_unlock. In the case of success, control
259 // will drop through the node. ICC.ZF is undefined at exit.
260 // In the case of failure, the node will branch directly to the
261 // FailureLabel
262
263 // obj: object to lock
264 // box: on-stack box address -- KILLED
265 // rax: tmp -- KILLED
266 // t : tmp -- KILLED
// Emits the C2 fast-lock sequence: first the lock-stack (lightweight) fast
// path, then the inflated-monitor fast path. Exits with ZF == 1 on success
// and ZF == 0 when the slow path (runtime monitorenter) must be taken.
void C2_MacroAssembler::fast_lock(Register obj, Register box, Register rax_reg,
                                  Register t, Register thread) {
  assert(rax_reg == rax, "Used for CAS");
  assert_different_registers(obj, box, rax_reg, t, thread);

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. ZF value is irrelevant.
  Label locked;
  // Finish fast lock unsuccessfully. MUST jump with ZF == 0
  Label slow_path;

  if (UseObjectMonitorTable) {
    // Clear cache in case fast locking succeeds or we need to take the slow-path.
    movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), 0);
  }

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    // Value-based classes must never be synchronized on; route them to the
    // slow path so the runtime can diagnose/report.
    load_klass(rax_reg, obj, t);
    testb(Address(rax_reg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
    jcc(Assembler::notZero, slow_path);
  }

  const Register mark = t;

  { // Fast Lock

    Label push;

    const Register top = UseObjectMonitorTable ? rax_reg : box;

    // Load the mark.
    movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));

    // Prefetch top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    // Check for monitor (0b10).
    testptr(mark, markWord::monitor_value);
    jcc(Assembler::notZero, inflated);

    // Check if lock-stack is full.
    cmpl(top, LockStack::end_offset() - 1);
    jcc(Assembler::greater, slow_path);

    // Check if recursive.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    jccb(Assembler::equal, push);

    // Try to lock. Transition lock bits 0b01 => 0b00
    movptr(rax_reg, mark);
    orptr(rax_reg, markWord::unlocked_value);
    andptr(mark, ~(int32_t)markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    jcc(Assembler::notEqual, slow_path);

    if (UseObjectMonitorTable) {
      // Need to reload top, clobbered by CAS.
      movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
    }
    bind(push);
    // After successful lock, push object on lock-stack.
    movptr(Address(thread, top), obj);
    addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
    jmp(locked);
  }

  { // Handle inflated monitor.
    bind(inflated);

    const Register monitor = t;

    if (!UseObjectMonitorTable) {
      // Without the table, the tagged monitor pointer is the mark word itself.
      assert(mark == monitor, "should be the same here");
    } else {
      const Register hash = t;
      Label monitor_found;

      // Look for the monitor in the om_cache.

      ByteSize cache_offset = JavaThread::om_cache_oops_offset();
      ByteSize monitor_offset = OMCache::oop_to_monitor_difference();
      const int num_unrolled = OMCache::CAPACITY;
      // Unrolled linear scan of the per-thread (oop, monitor) cache entries.
      for (int i = 0; i < num_unrolled; i++) {
        movptr(monitor, Address(thread, cache_offset + monitor_offset));
        cmpptr(obj, Address(thread, cache_offset));
        jccb(Assembler::equal, monitor_found);
        cache_offset = cache_offset + OMCache::oop_to_oop_difference();
      }

      // Look for the monitor in the table.

      // Get the hash code.
      movptr(hash, Address(obj, oopDesc::mark_offset_in_bytes()));
      shrq(hash, markWord::hash_shift);
      andq(hash, markWord::hash_mask);

      // Get the table and calculate the bucket's address.
      lea(rax_reg, ExternalAddress(ObjectMonitorTable::current_table_address()));
      movptr(rax_reg, Address(rax_reg));
      andq(hash, Address(rax_reg, ObjectMonitorTable::table_capacity_mask_offset()));
      movptr(rax_reg, Address(rax_reg, ObjectMonitorTable::table_buckets_offset()));

      // Read the monitor from the bucket.
      movptr(monitor, Address(rax_reg, hash, Address::times_ptr));

      // Check if the monitor in the bucket is special (empty, tombstone or removed)
      cmpptr(monitor, ObjectMonitorTable::SpecialPointerValues::below_is_special);
      jcc(Assembler::below, slow_path);

      // Check if object matches.
      movptr(rax_reg, Address(monitor, ObjectMonitor::object_offset()));
      BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
      bs_asm->try_resolve_weak_handle_in_c2(this, rax_reg, slow_path);
      cmpptr(rax_reg, obj);
      jcc(Assembler::notEqual, slow_path);

      bind(monitor_found);
    }
    // With the table, 'monitor' is an untagged ObjectMonitor*; otherwise it
    // still carries the mark-word monitor tag, which the offsets subtract out.
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address recursions_address(monitor, ObjectMonitor::recursions_offset() - monitor_tag);
    const Address owner_address(monitor, ObjectMonitor::owner_offset() - monitor_tag);

    Label monitor_locked;
    // Lock the monitor.

    if (UseObjectMonitorTable) {
      // Cache the monitor for unlock before trashing box. On failure to acquire
      // the lock, the slow path will reset the entry accordingly (see CacheSetter).
      movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), monitor);
    }

    // Try to CAS owner (no owner => current thread's _monitor_owner_id).
    xorptr(rax_reg, rax_reg);
    movptr(box, Address(thread, JavaThread::monitor_owner_id_offset()));
    lock(); cmpxchgptr(box, owner_address);
    jccb(Assembler::equal, monitor_locked);

    // Check if recursive.
    cmpptr(box, rax_reg);
    jccb(Assembler::notEqual, slow_path);

    // Recursive.
    increment(recursions_address);

    bind(monitor_locked);
  }

  bind(locked);
  // Set ZF = 1
  xorl(rax_reg, rax_reg);

#ifdef ASSERT
  // Check that locked label is reached with ZF set.
  Label zf_correct;
  Label zf_bad_zero;
  jcc(Assembler::zero, zf_correct);
  jmp(zf_bad_zero);
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with ZF not set.
  jcc(Assembler::notZero, zf_correct);
  stop("Fast Lock ZF != 0");
  bind(zf_bad_zero);
  stop("Fast Lock ZF != 1");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}
438
439 // obj: object to lock
440 // rax: tmp -- KILLED
441 // t : tmp - cannot be obj nor rax -- KILLED
442 //
443 // Some commentary on balanced locking:
444 //
445 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
446 // Methods that don't have provably balanced locking are forced to run in the
447 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
448 // The interpreter provides two properties:
449 // I1: At return-time the interpreter automatically and quietly unlocks any
450 // objects acquired in the current activation (frame). Recall that the
451 // interpreter maintains an on-stack list of locks currently held by
452 // a frame.
453 // I2: If a method attempts to unlock an object that is not held by the
454 // frame the interpreter throws IMSX.
455 //
456 // Lets say A(), which has provably balanced locking, acquires O and then calls B().
457 // B() doesn't have provably balanced locking so it runs in the interpreter.
458 // Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O
459 // is still locked by A().
460 //
461 // The only other source of unbalanced locking would be JNI. The "Java Native Interface
462 // Specification" states that an object locked by JNI's MonitorEnter should not be
463 // unlocked by "normal" java-level locking and vice-versa. The specification doesn't
464 // specify what will occur if a program engages in such mixed-mode locking, however.
465 // Arguably given that the spec legislates the JNI case as undefined our implementation
466 // could reasonably *avoid* checking owner in fast_unlock().
467 // In the interest of performance we elide m->Owner==Self check in unlock.
468 // A perfectly viable alternative is to elide the owner check except when
469 // Xcheck:jni is enabled.
470
// Emits the C2 fast-unlock sequence: the lock-stack (lightweight) fast path
// first, then the inflated-monitor exit path. Exits with ZF == 1 on success
// and ZF == 0 when the runtime monitorexit slow path must be taken.
void C2_MacroAssembler::fast_unlock(Register obj, Register reg_rax, Register t, Register thread) {
  assert(reg_rax == rax, "Used for CAS");
  assert_different_registers(obj, reg_rax, t);

  // Handle inflated monitor.
  Label inflated, inflated_check_lock_stack;
  // Finish fast unlock successfully. MUST jump with ZF == 1
  Label unlocked, slow_path;

  // Register aliasing: mark/monitor/top share t or rax depending on mode.
  const Register mark = t;
  const Register monitor = t;
  const Register top = UseObjectMonitorTable ? t : reg_rax;
  const Register box = reg_rax;

  Label dummy;
  C2FastUnlockStub* stub = nullptr;

  if (!Compile::current()->output()->in_scratch_emit_size()) {
    // Real emission (not size measurement): create the out-of-line stub that
    // restores state and enters the slow path on CAS failure.
    stub = new (Compile::current()->comp_arena()) C2FastUnlockStub(obj, mark, reg_rax, thread);
    Compile::current()->output()->add_stub(stub);
  }

  Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();

  { // Fast Unlock

    // Load top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    if (!UseObjectMonitorTable) {
      // Prefetch mark.
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }

    // Check if obj is top of lock-stack.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    // Top of lock stack was not obj. Must be monitor.
    jcc(Assembler::notEqual, inflated_check_lock_stack);

    // Pop lock-stack.
    DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
    subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);

    // Check if recursive.
    cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
    jcc(Assembler::equal, unlocked);

    // We elide the monitor check, let the CAS fail instead.

    if (UseObjectMonitorTable) {
      // Load mark.
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }

    // Try to unlock. Transition lock bits 0b00 => 0b01
    movptr(reg_rax, mark);
    andptr(reg_rax, ~(int32_t)markWord::lock_mask_in_place);
    orptr(mark, markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    // CAS failed (e.g. mark was a monitor): stub re-pushes obj and goes slow.
    jcc(Assembler::notEqual, push_and_slow_path);
    jmp(unlocked);
  }


  { // Handle inflated monitor.
    bind(inflated_check_lock_stack);
#ifdef ASSERT
    // Debug-only: verify obj does not appear anywhere on the lock-stack;
    // an inflated-locked object must not be lightweight-locked too.
    Label check_done;
    subl(top, oopSize);
    cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
    jcc(Assembler::below, check_done);
    cmpptr(obj, Address(thread, top));
    jcc(Assembler::notEqual, inflated_check_lock_stack);
    stop("Fast Unlock lock on stack");
    bind(check_done);
    if (UseObjectMonitorTable) {
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }
    testptr(mark, markWord::monitor_value);
    jcc(Assembler::notZero, inflated);
    stop("Fast Unlock not monitor");
#endif

    bind(inflated);

    if (!UseObjectMonitorTable) {
      // Without the table, the tagged monitor pointer is the mark word itself.
      assert(mark == monitor, "should be the same here");
    } else {
      // Uses ObjectMonitorTable. Look for the monitor in our BasicLock on the stack.
      movptr(monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
      // null check with ZF == 0, no valid pointer below alignof(ObjectMonitor*)
      cmpptr(monitor, alignof(ObjectMonitor*));
      jcc(Assembler::below, slow_path);
    }
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag};
    const Address succ_address{monitor, ObjectMonitor::succ_offset() - monitor_tag};
    const Address entry_list_address{monitor, ObjectMonitor::entry_list_offset() - monitor_tag};
    const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag};

    Label recursive;

    // Check if recursive.
    cmpptr(recursions_address, 0);
    jcc(Assembler::notZero, recursive);

    // Set owner to null.
    // Release to satisfy the JMM
    movptr(owner_address, NULL_WORD);
    // We need a full fence after clearing owner to avoid stranding.
    // StoreLoad achieves this.
    membar(StoreLoad);

    // Check if the entry_list is empty.
    cmpptr(entry_list_address, NULL_WORD);
    jcc(Assembler::zero, unlocked); // If so we are done.

    // Check if there is a successor.
    cmpptr(succ_address, NULL_WORD);
    jcc(Assembler::notZero, unlocked); // If so we are done.

    // Save the monitor pointer in the current thread, so we can try to
    // reacquire the lock in SharedRuntime::monitor_exit_helper().
    if (!UseObjectMonitorTable) {
      // Strip the monitor tag so an untagged pointer is stored.
      andptr(monitor, ~(int32_t)markWord::monitor_value);
    }
    movptr(Address(thread, JavaThread::unlocked_inflated_monitor_offset()), monitor);

    orl(t, 1); // Fast Unlock ZF = 0
    jmpb(slow_path);

    // Recursive unlock.
    bind(recursive);
    decrement(recursions_address);
  }

  bind(unlocked);
  xorl(t, t); // Fast Unlock ZF = 1

#ifdef ASSERT
  // Check that unlocked label is reached with ZF set.
  Label zf_correct;
  Label zf_bad_zero;
  jcc(Assembler::zero, zf_correct);
  jmp(zf_bad_zero);
#endif

  bind(slow_path);
  if (stub != nullptr) {
    bind(stub->slow_path_continuation());
  }
#ifdef ASSERT
  // Check that stub->continuation() label is reached with ZF not set.
  jcc(Assembler::notZero, zf_correct);
  stop("Fast Unlock ZF != 0");
  bind(zf_bad_zero);
  stop("Fast Unlock ZF != 1");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}
632
// Runtime target called from verify_int_in_range() when a CastII value falls
// outside its type's [lo, hi] bounds; aborts the VM with diagnostics.
static void abort_verify_int_in_range(uint idx, jint val, jint lo, jint hi) {
  fatal("Invalid CastII, idx: %u, val: %d, lo: %d, hi: %d", idx, val, lo, hi);
}
636
637 static void reconstruct_frame_pointer_helper(MacroAssembler* masm, Register dst) {
638 const int framesize = Compile::current()->output()->frame_size_in_bytes();
639 masm->movptr(dst, rsp);
640 if (framesize > 2 * wordSize) {
641 masm->addptr(dst, framesize - 2 * wordSize);
642 }
643 }
644
// Ensure rbp holds a valid frame pointer before calling out of compiled code.
// With PreserveFramePointer, rbp is already valid and (in debug builds) is
// cross-checked against a recomputed value; otherwise rbp is reconstructed
// from rsp. rtmp is only clobbered in debug builds.
void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) {
  if (PreserveFramePointer) {
    // frame pointer is valid
#ifdef ASSERT
    // Verify frame pointer value in rbp.
    reconstruct_frame_pointer_helper(this, rtmp);
    Label L_success;
    cmpq(rbp, rtmp);
    jccb(Assembler::equal, L_success);
    STOP("frame pointer mismatch");
    bind(L_success);
#endif // ASSERT
  } else {
    reconstruct_frame_pointer_helper(this, rbp);
  }
}
661
// Emits a debug range check for a CastII node: verifies at runtime that val
// lies within the type's [lo, hi] bounds, calling abort_verify_int_in_range
// (which aborts the VM) on violation. Emits nothing for the unconstrained
// TypeInt::INT. val is preserved; rscratch1 may be clobbered on the failure
// path only.
void C2_MacroAssembler::verify_int_in_range(uint idx, const TypeInt* t, Register val) {
  jint lo = t->_lo;
  jint hi = t->_hi;
  assert(lo < hi, "type should not be empty or constant, idx: %u, lo: %d, hi: %d", idx, lo, hi);
  if (t == TypeInt::INT) {
    return;
  }

  BLOCK_COMMENT("CastII {");
  Label fail;
  Label succeed;

  // Only emit the compares for bounds that actually constrain the value.
  if (lo != min_jint) {
    cmpl(val, lo);
    jccb(Assembler::less, fail);
  }
  if (hi != max_jint) {
    cmpl(val, hi);
    jccb(Assembler::greater, fail);
  }
  jmpb(succeed);

  bind(fail);
  // Marshal (idx, val, lo, hi) into the C calling convention and abort.
  movl(c_rarg0, idx);
  movl(c_rarg1, val);
  movl(c_rarg2, lo);
  movl(c_rarg3, hi);
  // Make rbp valid so the abort path produces a walkable stack.
  reconstruct_frame_pointer(rscratch1);
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_int_in_range)));
  hlt(); // fatal() above never returns; trap if it somehow does
  bind(succeed);
  BLOCK_COMMENT("} // CastII");
}
695
// Runtime target called from verify_long_in_range() when a CastLL value falls
// outside its type's [lo, hi] bounds; aborts the VM with diagnostics.
static void abort_verify_long_in_range(uint idx, jlong val, jlong lo, jlong hi) {
  fatal("Invalid CastLL, idx: %u, val: " JLONG_FORMAT ", lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, val, lo, hi);
}
699
// Emits a debug range check for a CastLL node: verifies at runtime that val
// lies within the type's [lo, hi] bounds, calling abort_verify_long_in_range
// (which aborts the VM) on violation. Emits nothing for the unconstrained
// TypeLong::LONG. tmp is used only for bounds that do not fit a 32-bit
// immediate; rscratch1 may be clobbered on the failure path.
void C2_MacroAssembler::verify_long_in_range(uint idx, const TypeLong* t, Register val, Register tmp) {
  jlong lo = t->_lo;
  jlong hi = t->_hi;
  assert(lo < hi, "type should not be empty or constant, idx: %u, lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, lo, hi);
  if (t == TypeLong::LONG) {
    return;
  }

  BLOCK_COMMENT("CastLL {");
  Label fail;
  Label succeed;

  // Compare val against a 64-bit bound: use an immediate compare when the
  // bound fits in a sign-extended 32-bit immediate, else materialize it in tmp.
  auto cmp_val = [&](jlong bound) {
    if (is_simm32(bound)) {
      cmpq(val, checked_cast<int>(bound));
    } else {
      mov64(tmp, bound);
      cmpq(val, tmp);
    }
  };

  // Only emit the compares for bounds that actually constrain the value.
  if (lo != min_jlong) {
    cmp_val(lo);
    jccb(Assembler::less, fail);
  }
  if (hi != max_jlong) {
    cmp_val(hi);
    jccb(Assembler::greater, fail);
  }
  jmpb(succeed);

  bind(fail);
  // Marshal (idx, val, lo, hi) into the C calling convention and abort.
  movl(c_rarg0, idx);
  movq(c_rarg1, val);
  mov64(c_rarg2, lo);
  mov64(c_rarg3, hi);
  // Make rbp valid so the abort path produces a walkable stack.
  reconstruct_frame_pointer(rscratch1);
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_long_in_range)));
  hlt(); // fatal() above never returns; trap if it somehow does
  bind(succeed);
  BLOCK_COMMENT("} // CastLL");
}
742
743 //-------------------------------------------------------------------------------------------
744 // Generic instructions support for use in .ad files C2 code generation
745
746 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
747 if (dst != src) {
748 movdqu(dst, src);
749 }
750 if (opcode == Op_AbsVD) {
751 andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
752 } else {
753 assert((opcode == Op_NegVD),"opcode should be Op_NegD");
754 xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
755 }
756 }
757
758 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
759 if (opcode == Op_AbsVD) {
760 vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
761 } else {
762 assert((opcode == Op_NegVD),"opcode should be Op_NegD");
763 vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
764 }
765 }
766
767 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
768 if (dst != src) {
769 movdqu(dst, src);
770 }
771 if (opcode == Op_AbsVF) {
772 andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
773 } else {
774 assert((opcode == Op_NegVF),"opcode should be Op_NegF");
775 xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
776 }
777 }
778
779 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
780 if (opcode == Op_AbsVF) {
781 vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
782 } else {
783 assert((opcode == Op_NegVF),"opcode should be Op_NegF");
784 vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
785 }
786 }
787
788 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
789 assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
790 assert(tmp == xnoreg || elem_bt == T_LONG, "unused");
791
792 if (opcode == Op_MinV) {
793 if (elem_bt == T_BYTE) {
794 pminsb(dst, src);
795 } else if (elem_bt == T_SHORT) {
796 pminsw(dst, src);
797 } else if (elem_bt == T_INT) {
798 pminsd(dst, src);
799 } else {
800 assert(elem_bt == T_LONG, "required");
801 assert(tmp == xmm0, "required");
802 assert_different_registers(dst, src, tmp);
803 movdqu(xmm0, dst);
804 pcmpgtq(xmm0, src);
805 blendvpd(dst, src); // xmm0 as mask
806 }
807 } else { // opcode == Op_MaxV
808 if (elem_bt == T_BYTE) {
809 pmaxsb(dst, src);
810 } else if (elem_bt == T_SHORT) {
811 pmaxsw(dst, src);
812 } else if (elem_bt == T_INT) {
813 pmaxsd(dst, src);
814 } else {
815 assert(elem_bt == T_LONG, "required");
816 assert(tmp == xmm0, "required");
817 assert_different_registers(dst, src, tmp);
818 movdqu(xmm0, src);
819 pcmpgtq(xmm0, dst);
820 blendvpd(dst, src); // xmm0 as mask
821 }
822 }
823 }
824
825 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
826 XMMRegister src1, Address src2, int vlen_enc) {
827 assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
828 if (opcode == Op_UMinV) {
829 switch(elem_bt) {
830 case T_BYTE: vpminub(dst, src1, src2, vlen_enc); break;
831 case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
832 case T_INT: vpminud(dst, src1, src2, vlen_enc); break;
833 case T_LONG: evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
834 default: fatal("Unsupported type %s", type2name(elem_bt)); break;
835 }
836 } else {
837 assert(opcode == Op_UMaxV, "required");
838 switch(elem_bt) {
839 case T_BYTE: vpmaxub(dst, src1, src2, vlen_enc); break;
840 case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
841 case T_INT: vpmaxud(dst, src1, src2, vlen_enc); break;
842 case T_LONG: evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
843 default: fatal("Unsupported type %s", type2name(elem_bt)); break;
844 }
845 }
846 }
847
// Unsigned 64-bit element min/max without AVX512VL: either widen to the full
// 512-bit EVEX instruction, or emulate with signed compares by biasing both
// operands by 2^63 (unsigned order equals signed order after flipping the
// sign bit). Clobbers xtmp1 and xtmp2 on the emulation path.
void C2_MacroAssembler::vpuminmaxq(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  // For optimality, leverage a full vector width of 512 bits
  // for operations over smaller vector sizes on AVX512 targets.
  if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
    if (opcode == Op_UMaxV) {
      evpmaxuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
    } else {
      assert(opcode == Op_UMinV, "required");
      evpminuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
    }
  } else {
    // T1 = -1
    vpcmpeqq(xtmp1, xtmp1, xtmp1, vlen_enc);
    // T1 = -1 << 63 (i.e. the sign-bit mask 0x8000...0 in each lane)
    vpsllq(xtmp1, xtmp1, 63, vlen_enc);
    // Convert SRC2 to signed value i.e. T2 = T1 + SRC2
    vpaddq(xtmp2, xtmp1, src2, vlen_enc);
    // Convert SRC1 to signed value i.e. T1 = T1 + SRC1
    vpaddq(xtmp1, xtmp1, src1, vlen_enc);
    // Mask = T2 > T1 (signed compare of the biased values == unsigned src2 > src1)
    vpcmpgtq(xtmp1, xtmp2, xtmp1, vlen_enc);
    if (opcode == Op_UMaxV) {
      // Res = Mask ? Src2 : Src1
      vpblendvb(dst, src1, src2, xtmp1, vlen_enc);
    } else {
      // Res = Mask ? Src1 : Src2
      vpblendvb(dst, src2, src1, xtmp1, vlen_enc);
    }
  }
}
878
879 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
880 XMMRegister src1, XMMRegister src2, int vlen_enc) {
881 assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
882 if (opcode == Op_UMinV) {
883 switch(elem_bt) {
884 case T_BYTE: vpminub(dst, src1, src2, vlen_enc); break;
885 case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
886 case T_INT: vpminud(dst, src1, src2, vlen_enc); break;
887 case T_LONG: evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
888 default: fatal("Unsupported type %s", type2name(elem_bt)); break;
889 }
890 } else {
891 assert(opcode == Op_UMaxV, "required");
892 switch(elem_bt) {
893 case T_BYTE: vpmaxub(dst, src1, src2, vlen_enc); break;
894 case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
895 case T_INT: vpmaxud(dst, src1, src2, vlen_enc); break;
896 case T_LONG: evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
897 default: fatal("Unsupported type %s", type2name(elem_bt)); break;
898 }
899 }
900 }
901
902 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
903 XMMRegister dst, XMMRegister src1, XMMRegister src2,
904 int vlen_enc) {
905 assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
906
907 if (opcode == Op_MinV) {
908 if (elem_bt == T_BYTE) {
909 vpminsb(dst, src1, src2, vlen_enc);
910 } else if (elem_bt == T_SHORT) {
911 vpminsw(dst, src1, src2, vlen_enc);
912 } else if (elem_bt == T_INT) {
913 vpminsd(dst, src1, src2, vlen_enc);
914 } else {
915 assert(elem_bt == T_LONG, "required");
916 if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
917 vpminsq(dst, src1, src2, vlen_enc);
918 } else {
919 assert_different_registers(dst, src1, src2);
920 vpcmpgtq(dst, src1, src2, vlen_enc);
921 vblendvpd(dst, src1, src2, dst, vlen_enc);
922 }
923 }
924 } else { // opcode == Op_MaxV
925 if (elem_bt == T_BYTE) {
926 vpmaxsb(dst, src1, src2, vlen_enc);
927 } else if (elem_bt == T_SHORT) {
928 vpmaxsw(dst, src1, src2, vlen_enc);
929 } else if (elem_bt == T_INT) {
930 vpmaxsd(dst, src1, src2, vlen_enc);
931 } else {
932 assert(elem_bt == T_LONG, "required");
933 if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
934 vpmaxsq(dst, src1, src2, vlen_enc);
935 } else {
936 assert_different_registers(dst, src1, src2);
937 vpcmpgtq(dst, src1, src2, vlen_enc);
938 vblendvpd(dst, src2, src1, dst, vlen_enc);
939 }
940 }
941 }
942 }
943
944 // Float/Double min max
945
void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
                                   XMMRegister dst, XMMRegister a, XMMRegister b,
                                   XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                   int vlen_enc) {
  assert(UseAVX > 0, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(a, tmp, atmp, btmp);
  assert_different_registers(b, tmp, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);

  /* Note on 'non-obvious' assembly sequence:
   *
   * While there are vminps/vmaxps instructions, there are two important differences between hardware
   * and Java on how they handle floats:
   *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
   *  b. NaN is not necessarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
   *
   * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
   *  a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
   *                (only useful when signs differ, noop otherwise)
   *  b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])

   * Following pseudo code describes the algorithm for max[FD] (Min algorithm is on similar lines):
   *   btmp = (b < +0.0) ? a : b
   *   atmp = (b < +0.0) ? b : a
   *   Tmp  = Max_Float(atmp , btmp)
   *   Res  = (atmp == NaN) ? atmp : Tmp
   */

  // Dispatch table: pick single- vs double-precision variants of the blend,
  // min/max, and compare emitters; 'mask' is the operand whose sign bit
  // drives the pre-blend (a for min, b for max, per the pseudo code above).
  void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
  void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
  void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
  XMMRegister mask;

  if (!is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vminps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (!is_double_word && !is_min) {
    mask = b;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vmaxps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vminpd;
    vcmp = &MacroAssembler::vcmppd;
  } else {
    assert(is_double_word && !is_min, "sanity");
    mask = b;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vmaxpd;
    vcmp = &MacroAssembler::vcmppd;
  }

  // Make sure EnableX86ECoreOpts isn't disabled on register overlaps
  XMMRegister maxmin, scratch;
  if (dst == btmp) {
    maxmin = btmp;
    scratch = tmp;
  } else {
    maxmin = tmp;
    scratch = btmp;
  }

  // E-core optimized path: materialize the sign-derived blend mask into tmp
  // up front (arithmetic shift / signed compare against zero), so the blends
  // below can take the cheaper non-sign-bit variant.
  bool precompute_mask = EnableX86ECoreOpts && UseAVX>1;
  if (precompute_mask && !is_double_word) {
    vpsrad(tmp, mask, 32, vlen_enc);
    mask = tmp;
  } else if (precompute_mask && is_double_word) {
    vpxor(tmp, tmp, tmp, vlen_enc);
    vpcmpgtq(tmp, tmp, mask, vlen_enc);
    mask = tmp;
  }

  (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);
  (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);
  (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
  // If atmp was NaN, propagate it over the hardware min/max result.
  (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
  (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch);
}
1033
// AVX-512 variant of vminmax_fp: same Java-semantics algorithm (bias on sign,
// then NaN check on the first biased operand) expressed with opmask-driven
// blends. evpmov[dq]2m extracts the per-lane sign bits into ktmp, the
// evblendm* pair swaps operands so the sign-interesting one is first, and the
// final masked move propagates NaN lanes from atmp over the min/max result.
void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
                                    XMMRegister dst, XMMRegister a, XMMRegister b,
                                    KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
                                    int vlen_enc) {
  assert(UseAVX > 2, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(dst, a, atmp, btmp);
  assert_different_registers(dst, b, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);
  bool merge = true;

  if (!is_double_word && is_min) {
    evpmovd2m(ktmp, a, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vminps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (!is_double_word && !is_min) {
    evpmovd2m(ktmp, b, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (is_double_word && is_min) {
    evpmovq2m(ktmp, a, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vminpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  } else {
    assert(is_double_word && !is_min, "sanity");
    evpmovq2m(ktmp, b, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  }
}
1080
1081 void C2_MacroAssembler::vminmax_fp_avx10_2(int opc, BasicType elem_bt, XMMRegister dst, KRegister mask,
1082 XMMRegister src1, XMMRegister src2, int vlen_enc) {
1083 assert(opc == Op_MinV || opc == Op_MinReductionV ||
1084 opc == Op_MaxV || opc == Op_MaxReductionV, "sanity");
1085
1086 int imm8 = (opc == Op_MinV || opc == Op_MinReductionV) ? AVX10_2_MINMAX_MIN_COMPARE_SIGN
1087 : AVX10_2_MINMAX_MAX_COMPARE_SIGN;
1088 if (elem_bt == T_FLOAT) {
1089 evminmaxps(dst, mask, src1, src2, true, imm8, vlen_enc);
1090 } else {
1091 assert(elem_bt == T_DOUBLE, "");
1092 evminmaxpd(dst, mask, src1, src2, true, imm8, vlen_enc);
1093 }
1094 }
1095
1096 void C2_MacroAssembler::sminmax_fp_avx10_2(int opc, BasicType elem_bt, XMMRegister dst, KRegister mask,
1097 XMMRegister src1, XMMRegister src2) {
1098 assert(opc == Op_MinF || opc == Op_MaxF ||
1099 opc == Op_MinD || opc == Op_MaxD, "sanity");
1100
1101 int imm8 = (opc == Op_MinF || opc == Op_MinD) ? AVX10_2_MINMAX_MIN_COMPARE_SIGN
1102 : AVX10_2_MINMAX_MAX_COMPARE_SIGN;
1103 if (elem_bt == T_FLOAT) {
1104 evminmaxss(dst, mask, src1, src2, true, imm8);
1105 } else {
1106 assert(elem_bt == T_DOUBLE, "");
1107 evminmaxsd(dst, mask, src1, src2, true, imm8);
1108 }
1109 }
1110
1111 // Float/Double signum
// Compute Math.signum for a scalar float/double in dst:
//   NaN / +-0.0 -> argument unchanged; > 0 -> +1.0; < 0 -> -1.0.
// 'zero' holds 0.0 and 'one' holds 1.0 of the matching width.
void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
  assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");

  Label DONE_LABEL;

  // Handle special cases +0.0/-0.0 and NaN, if argument is +0.0/-0.0 or NaN, return argument
  // If AVX10.2 (or newer) floating point comparison instructions used, SF=1 for equal and unordered cases
  // If other floating point comparison instructions used, ZF=1 for equal and unordered cases
  if (opcode == Op_SignumF) {
    if (VM_Version::supports_avx10_2()) {
      evucomxss(dst, zero);
      jcc(Assembler::negative, DONE_LABEL);
    } else {
      ucomiss(dst, zero);
      jcc(Assembler::equal, DONE_LABEL);
    }
    // Load +1.0; movflt does not touch EFLAGS, so the next jcc still tests
    // the comparison against zero above: 'above' means the argument was > 0.
    movflt(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    // Argument was negative: flip the sign bit of +1.0 to produce -1.0.
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
  } else if (opcode == Op_SignumD) {
    if (VM_Version::supports_avx10_2()) {
      evucomxsd(dst, zero);
      jcc(Assembler::negative, DONE_LABEL);
    } else {
      ucomisd(dst, zero);
      jcc(Assembler::equal, DONE_LABEL);
    }
    // Same flag-reuse pattern as the float path, at double width.
    movdbl(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
  }

  bind(DONE_LABEL);
}
1146
1147 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1148 if (sign) {
1149 pmovsxbw(dst, src);
1150 } else {
1151 pmovzxbw(dst, src);
1152 }
1153 }
1154
1155 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1156 if (sign) {
1157 vpmovsxbw(dst, src, vector_len);
1158 } else {
1159 vpmovzxbw(dst, src, vector_len);
1160 }
1161 }
1162
1163 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1164 if (sign) {
1165 vpmovsxbd(dst, src, vector_len);
1166 } else {
1167 vpmovzxbd(dst, src, vector_len);
1168 }
1169 }
1170
1171 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1172 if (sign) {
1173 vpmovsxwd(dst, src, vector_len);
1174 } else {
1175 vpmovzxwd(dst, src, vector_len);
1176 }
1177 }
1178
1179 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1180 int shift, int vector_len) {
1181 if (opcode == Op_RotateLeftV) {
1182 if (etype == T_INT) {
1183 evprold(dst, src, shift, vector_len);
1184 } else {
1185 assert(etype == T_LONG, "expected type T_LONG");
1186 evprolq(dst, src, shift, vector_len);
1187 }
1188 } else {
1189 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1190 if (etype == T_INT) {
1191 evprord(dst, src, shift, vector_len);
1192 } else {
1193 assert(etype == T_LONG, "expected type T_LONG");
1194 evprorq(dst, src, shift, vector_len);
1195 }
1196 }
1197 }
1198
1199 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1200 XMMRegister shift, int vector_len) {
1201 if (opcode == Op_RotateLeftV) {
1202 if (etype == T_INT) {
1203 evprolvd(dst, src, shift, vector_len);
1204 } else {
1205 assert(etype == T_LONG, "expected type T_LONG");
1206 evprolvq(dst, src, shift, vector_len);
1207 }
1208 } else {
1209 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1210 if (etype == T_INT) {
1211 evprorvd(dst, src, shift, vector_len);
1212 } else {
1213 assert(etype == T_LONG, "expected type T_LONG");
1214 evprorvq(dst, src, shift, vector_len);
1215 }
1216 }
1217 }
1218
1219 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1220 if (opcode == Op_RShiftVI) {
1221 psrad(dst, shift);
1222 } else if (opcode == Op_LShiftVI) {
1223 pslld(dst, shift);
1224 } else {
1225 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1226 psrld(dst, shift);
1227 }
1228 }
1229
1230 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1231 switch (opcode) {
1232 case Op_RShiftVI: psrad(dst, shift); break;
1233 case Op_LShiftVI: pslld(dst, shift); break;
1234 case Op_URShiftVI: psrld(dst, shift); break;
1235
1236 default: assert(false, "%s", NodeClassNames[opcode]);
1237 }
1238 }
1239
1240 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1241 if (opcode == Op_RShiftVI) {
1242 vpsrad(dst, nds, shift, vector_len);
1243 } else if (opcode == Op_LShiftVI) {
1244 vpslld(dst, nds, shift, vector_len);
1245 } else {
1246 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1247 vpsrld(dst, nds, shift, vector_len);
1248 }
1249 }
1250
1251 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1252 switch (opcode) {
1253 case Op_RShiftVI: vpsrad(dst, src, shift, vlen_enc); break;
1254 case Op_LShiftVI: vpslld(dst, src, shift, vlen_enc); break;
1255 case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1256
1257 default: assert(false, "%s", NodeClassNames[opcode]);
1258 }
1259 }
1260
1261 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1262 switch (opcode) {
1263 case Op_RShiftVB: // fall-through
1264 case Op_RShiftVS: psraw(dst, shift); break;
1265
1266 case Op_LShiftVB: // fall-through
1267 case Op_LShiftVS: psllw(dst, shift); break;
1268
1269 case Op_URShiftVS: // fall-through
1270 case Op_URShiftVB: psrlw(dst, shift); break;
1271
1272 default: assert(false, "%s", NodeClassNames[opcode]);
1273 }
1274 }
1275
1276 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1277 switch (opcode) {
1278 case Op_RShiftVB: // fall-through
1279 case Op_RShiftVS: vpsraw(dst, src, shift, vlen_enc); break;
1280
1281 case Op_LShiftVB: // fall-through
1282 case Op_LShiftVS: vpsllw(dst, src, shift, vlen_enc); break;
1283
1284 case Op_URShiftVS: // fall-through
1285 case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1286
1287 default: assert(false, "%s", NodeClassNames[opcode]);
1288 }
1289 }
1290
// Quadword shift, destructive form, with the count in an XMM register.
void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVL: psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
    case Op_LShiftVL: psllq(dst, shift); break;
    case Op_URShiftVL: psrlq(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}
1300
1301 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1302 if (opcode == Op_RShiftVL) {
1303 psrlq(dst, shift); // using srl to implement sra on pre-avs512 systems
1304 } else if (opcode == Op_LShiftVL) {
1305 psllq(dst, shift);
1306 } else {
1307 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1308 psrlq(dst, shift);
1309 }
1310 }
1311
1312 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1313 switch (opcode) {
1314 case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1315 case Op_LShiftVL: vpsllq(dst, src, shift, vlen_enc); break;
1316 case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1317
1318 default: assert(false, "%s", NodeClassNames[opcode]);
1319 }
1320 }
1321
1322 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1323 if (opcode == Op_RShiftVL) {
1324 evpsraq(dst, nds, shift, vector_len);
1325 } else if (opcode == Op_LShiftVL) {
1326 vpsllq(dst, nds, shift, vector_len);
1327 } else {
1328 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1329 vpsrlq(dst, nds, shift, vector_len);
1330 }
1331 }
1332
1333 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1334 switch (opcode) {
1335 case Op_RShiftVB: // fall-through
1336 case Op_RShiftVS: // fall-through
1337 case Op_RShiftVI: vpsravd(dst, src, shift, vlen_enc); break;
1338
1339 case Op_LShiftVB: // fall-through
1340 case Op_LShiftVS: // fall-through
1341 case Op_LShiftVI: vpsllvd(dst, src, shift, vlen_enc); break;
1342
1343 case Op_URShiftVB: // fall-through
1344 case Op_URShiftVS: // fall-through
1345 case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1346
1347 default: assert(false, "%s", NodeClassNames[opcode]);
1348 }
1349 }
1350
1351 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1352 switch (opcode) {
1353 case Op_RShiftVB: // fall-through
1354 case Op_RShiftVS: evpsravw(dst, src, shift, vlen_enc); break;
1355
1356 case Op_LShiftVB: // fall-through
1357 case Op_LShiftVS: evpsllvw(dst, src, shift, vlen_enc); break;
1358
1359 case Op_URShiftVB: // fall-through
1360 case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1361
1362 default: assert(false, "%s", NodeClassNames[opcode]);
1363 }
1364 }
1365
// Variable (per-lane) quadword shift.
void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
  assert(UseAVX >= 2, "required");
  switch (opcode) {
    case Op_RShiftVL: {
      if (UseAVX > 2) {
        assert(tmp == xnoreg, "not used");
        if (!VM_Version::supports_avx512vl()) {
          // Without AVX512VL the EVEX form is only legal at 512 bits.
          vlen_enc = Assembler::AVX_512bit;
        }
        evpsravq(dst, src, shift, vlen_enc);
      } else {
        // AVX2 has no arithmetic quadword shift; emulate it as
        //   dst = ((src >>> s) ^ m) - m   with m = sign_mask >>> s,
        // which sign-extends the logically shifted value.
        vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
        vpsrlvq(dst, src, shift, vlen_enc);
        vpsrlvq(tmp, tmp, shift, vlen_enc);
        vpxor(dst, dst, tmp, vlen_enc);
        vpsubq(dst, dst, tmp, vlen_enc);
      }
      break;
    }
    case Op_LShiftVL: {
      assert(tmp == xnoreg, "not used");
      vpsllvq(dst, src, shift, vlen_enc);
      break;
    }
    case Op_URShiftVL: {
      assert(tmp == xnoreg, "not used");
      vpsrlvq(dst, src, shift, vlen_enc);
      break;
    }
    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}
1398
// Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst
void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
  assert(opcode == Op_LShiftVB ||
         opcode == Op_RShiftVB ||
         opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
  bool sign = (opcode != Op_URShiftVB);
  assert(vector_len == 0, "required");
  // Widen bytes to dwords (sign- or zero-extended to match the shift kind)
  // and perform the shift in the dword domain.
  vextendbd(sign, dst, src, 1);
  vpmovzxbd(vtmp, shift, 1);
  varshiftd(opcode, dst, dst, vtmp, 1);
  // Mask each dword down to its low-byte value, then pack dwords into words.
  vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
  vextracti128_high(vtmp, dst);
  vpackusdw(dst, dst, vtmp, 0);
}
1413
// Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst
void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
  assert(opcode == Op_LShiftVB ||
         opcode == Op_RShiftVB ||
         opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
  bool sign = (opcode != Op_URShiftVB);
  // Work one vector size up: bytes are widened to words for the shift.
  int ext_vector_len = vector_len + 1;
  vextendbw(sign, dst, src, ext_vector_len);
  vpmovzxbw(vtmp, shift, ext_vector_len);
  varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
  // Clear the high byte of every word, then pack words back into bytes.
  vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
  if (vector_len == 0) {
    vextracti128_high(vtmp, dst);
    vpackuswb(dst, dst, vtmp, vector_len);
  } else {
    vextracti64x4_high(vtmp, dst);
    vpackuswb(dst, dst, vtmp, vector_len);
    // vpackuswb packs within each 128-bit lane; vpermq restores linear order.
    vpermq(dst, dst, 0xD8, vector_len);
  }
}
1434
1435 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1436 switch(typ) {
1437 case T_BYTE:
1438 pinsrb(dst, val, idx);
1439 break;
1440 case T_SHORT:
1441 pinsrw(dst, val, idx);
1442 break;
1443 case T_INT:
1444 pinsrd(dst, val, idx);
1445 break;
1446 case T_LONG:
1447 pinsrq(dst, val, idx);
1448 break;
1449 default:
1450 assert(false,"Should not reach here.");
1451 break;
1452 }
1453 }
1454
1455 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1456 switch(typ) {
1457 case T_BYTE:
1458 vpinsrb(dst, src, val, idx);
1459 break;
1460 case T_SHORT:
1461 vpinsrw(dst, src, val, idx);
1462 break;
1463 case T_INT:
1464 vpinsrd(dst, src, val, idx);
1465 break;
1466 case T_LONG:
1467 vpinsrq(dst, src, val, idx);
1468 break;
1469 default:
1470 assert(false,"Should not reach here.");
1471 break;
1472 }
1473 }
1474
// Gather one 64-bit slice (4 shorts or 8 bytes) under a bit-mask: lanes
// whose mask bit is clear are left zero. mask_idx tracks the current bit
// position within 'mask' and is advanced for every lane, loaded or skipped.
// Indices are read as 32-bit values from idx_base.
void C2_MacroAssembler::vgather8b_masked(BasicType elem_bt, XMMRegister dst,
                                         Register base, Register idx_base,
                                         Register mask, Register mask_idx,
                                         Register rtmp, int vlen_enc) {
  vpxor(dst, dst, dst, vlen_enc);
  if (elem_bt == T_SHORT) {
    for (int i = 0; i < 4; i++) {
      // dst[i] = mask[i] ? src[idx_base[i]] : 0
      Label skip_load;
      // btq copies the tested bit into CF; skip the lane when it is clear.
      btq(mask, mask_idx);
      jccb(Assembler::carryClear, skip_load);
      movl(rtmp, Address(idx_base, i * 4));
      pinsrw(dst, Address(base, rtmp, Address::times_2), i);
      bind(skip_load);
      incq(mask_idx);
    }
  } else {
    assert(elem_bt == T_BYTE, "");
    for (int i = 0; i < 8; i++) {
      // dst[i] = mask[i] ? src[idx_base[i]] : 0
      Label skip_load;
      btq(mask, mask_idx);
      jccb(Assembler::carryClear, skip_load);
      movl(rtmp, Address(idx_base, i * 4));
      pinsrb(dst, Address(base, rtmp), i);
      bind(skip_load);
      incq(mask_idx);
    }
  }
}
1505
1506 void C2_MacroAssembler::vgather8b(BasicType elem_bt, XMMRegister dst,
1507 Register base, Register idx_base,
1508 Register rtmp, int vlen_enc) {
1509 vpxor(dst, dst, dst, vlen_enc);
1510 if (elem_bt == T_SHORT) {
1511 for (int i = 0; i < 4; i++) {
1512 // dst[i] = src[idx_base[i]]
1513 movl(rtmp, Address(idx_base, i * 4));
1514 pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1515 }
1516 } else {
1517 assert(elem_bt == T_BYTE, "");
1518 for (int i = 0; i < 8; i++) {
1519 // dst[i] = src[idx_base[i]]
1520 movl(rtmp, Address(idx_base, i * 4));
1521 pinsrb(dst, Address(base, rtmp), i);
1522 }
1523 }
1524 }
1525
1526 /*
1527 * Gather using hybrid algorithm, first partially unroll scalar loop
1528 * to accumulate values from gather indices into a quad-word(64bit) slice.
1529 * A slice may hold 8 bytes or 4 short values. This is followed by a vector
1530 * permutation to place the slice into appropriate vector lane
1531 * locations in destination vector. Following pseudo code describes the
1532 * algorithm in detail:
1533 *
1534 * DST_VEC = ZERO_VEC
1535 * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..}
1536 * TWO_VEC = {2, 2, 2, 2, 2, 2, 2, 2, 2..}
1537 * FOREACH_ITER:
1538 * TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES
1539 * TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX
1540 * DST_VEC = DST_VEC OR TEMP_PERM_VEC
1541 * PERM_INDEX = PERM_INDEX - TWO_VEC
1542 *
1543 * With each iteration, doubleword permute indices (0,1) corresponding
1544 * to gathered quadword gets right shifted by two lane positions.
1545 *
1546 */
void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst,
                                        Register base, Register idx_base,
                                        Register mask, XMMRegister xtmp1,
                                        XMMRegister xtmp2, XMMRegister temp_dst,
                                        Register rtmp, Register mask_idx,
                                        Register length, int vector_len, int vlen_enc) {
  Label GATHER8_LOOP;
  assert(is_subword_type(elem_ty), "");
  movl(length, vector_len);
  vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...}
  vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...}
  // Build the constant {2, 2, ...}: all-ones is -1, 0 - (-1) = 1, 1 << 1 = 2.
  vallones(xtmp2, vlen_enc);
  vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc);
  vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...}
  load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...}

  bind(GATHER8_LOOP);
  // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES
  // mask == noreg selects the unmasked gather variant.
  if (mask == noreg) {
    vgather8b(elem_ty, temp_dst, base, idx_base, rtmp, vlen_enc);
  } else {
    vgather8b_masked(elem_ty, temp_dst, base, idx_base, mask, mask_idx, rtmp, vlen_enc);
  }
  // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1)
  vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? vlen_enc : Assembler::AVX_256bit);
  // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2)
  vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc);
  // DST_VEC = DST_VEC OR TEMP_PERM_VEC
  vpor(dst, dst, temp_dst, vlen_enc);
  // Each iteration consumes 8 bytes or 4 shorts worth of 32-bit indices:
  // advance idx_base by 32 (bytes) / 16 (shorts), decrement length by 8 / 4.
  addptr(idx_base, 32 >> (type2aelembytes(elem_ty) - 1));
  subl(length, 8 >> (type2aelembytes(elem_ty) - 1));
  jcc(Assembler::notEqual, GATHER8_LOOP);
}
1580
1581 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1582 switch(typ) {
1583 case T_INT:
1584 vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1585 break;
1586 case T_FLOAT:
1587 vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1588 break;
1589 case T_LONG:
1590 vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1591 break;
1592 case T_DOUBLE:
1593 vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1594 break;
1595 default:
1596 assert(false,"Should not reach here.");
1597 break;
1598 }
1599 }
1600
1601 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1602 switch(typ) {
1603 case T_INT:
1604 evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1605 break;
1606 case T_FLOAT:
1607 evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1608 break;
1609 case T_LONG:
1610 evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1611 break;
1612 case T_DOUBLE:
1613 evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1614 break;
1615 default:
1616 assert(false,"Should not reach here.");
1617 break;
1618 }
1619 }
1620
1621 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1622 switch(typ) {
1623 case T_INT:
1624 evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1625 break;
1626 case T_FLOAT:
1627 evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1628 break;
1629 case T_LONG:
1630 evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1631 break;
1632 case T_DOUBLE:
1633 evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1634 break;
1635 default:
1636 assert(false,"Should not reach here.");
1637 break;
1638 }
1639 }
1640
// Expand a boolean vector (one byte per element, value 0 or 1) into a full
// lane mask of the requested element type: 0 - x turns each 0/1 byte into
// 0x00/0xFF, which is then sign-extended to the element width so every lane
// becomes all-zeroes or all-ones.
void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
  if (vlen_in_bytes <= 16) {
    pxor (dst, dst);
    psubb(dst, src);
    switch (elem_bt) {
      case T_BYTE:   /* nothing to do */ break;
      case T_SHORT:  pmovsxbw(dst, dst); break;
      case T_INT:    pmovsxbd(dst, dst); break;
      case T_FLOAT:  pmovsxbd(dst, dst); break;
      case T_LONG:   pmovsxbq(dst, dst); break;
      case T_DOUBLE: pmovsxbq(dst, dst); break;

      default: assert(false, "%s", type2name(elem_bt));
    }
  } else {
    assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
    int vlen_enc = vector_length_encoding(vlen_in_bytes);

    vpxor (dst, dst, dst, vlen_enc);
    // In the legacy (VEX-only) case the byte source is at most 32 bytes wide.
    vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);

    switch (elem_bt) {
      case T_BYTE:   /* nothing to do */ break;
      case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
      case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
      case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
      case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
      case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;

      default: assert(false, "%s", type2name(elem_bt));
    }
  }
}
1674
// Materialize a boolean vector (one byte per element, 0 or 1) as an opmask
// register. 'novlbwdq' selects the fallback path (per the flag name, for
// targets lacking the AVX512 VL/BW/DQ extensions — confirm at call sites).
void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) {
  if (novlbwdq) {
    // Widen the booleans to dwords and derive the opmask by comparing for
    // equality against the mask-bits constant from the stub area.
    vpmovsxbd(xtmp, src, vlen_enc);
    evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
            Assembler::eq, true, vlen_enc, noreg);
  } else {
    // 0 - x turns each 0/1 byte into 0x00/0xFF; evpmovb2m then copies the
    // per-byte sign bits straight into the opmask.
    vpxor(xtmp, xtmp, xtmp, vlen_enc);
    vpsubb(xtmp, xtmp, src, vlen_enc);
    evpmovb2m(dst, xtmp, vlen_enc);
  }
}
1686
1687 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, Address src, int vlen_in_bytes) {
1688 if (is_integral_type(bt)) {
1689 switch (vlen_in_bytes) {
1690 case 4: movdl(dst, src); break;
1691 case 8: movq(dst, src); break;
1692 case 16: movdqu(dst, src); break;
1693 case 32: vmovdqu(dst, src); break;
1694 case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break;
1695 default: ShouldNotReachHere();
1696 }
1697 } else {
1698 switch (vlen_in_bytes) {
1699 case 4: movflt(dst, src); break;
1700 case 8: movdbl(dst, src); break;
1701 case 16: movups(dst, src); break;
1702 case 32: vmovups(dst, src, Assembler::AVX_256bit); break;
1703 case 64: vmovups(dst, src, Assembler::AVX_512bit); break;
1704 default: ShouldNotReachHere();
1705 }
1706 }
1707 }
1708
1709 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) {
1710 assert(rscratch != noreg || always_reachable(src), "missing");
1711
1712 if (reachable(src)) {
1713 load_vector(bt, dst, as_Address(src), vlen_in_bytes);
1714 } else {
1715 lea(rscratch, src);
1716 load_vector(bt, dst, Address(rscratch, 0), vlen_in_bytes);
1717 }
1718 }
1719
1720 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) {
1721 int vlen_enc = vector_length_encoding(vlen);
1722 if (VM_Version::supports_avx()) {
1723 if (bt == T_LONG) {
1724 if (VM_Version::supports_avx2()) {
1725 vpbroadcastq(dst, src, vlen_enc);
1726 } else {
1727 vmovddup(dst, src, vlen_enc);
1728 }
1729 } else if (bt == T_DOUBLE) {
1730 if (vlen_enc != Assembler::AVX_128bit) {
1731 vbroadcastsd(dst, src, vlen_enc, noreg);
1732 } else {
1733 vmovddup(dst, src, vlen_enc);
1734 }
1735 } else {
1736 if (VM_Version::supports_avx2() && is_integral_type(bt)) {
1737 vpbroadcastd(dst, src, vlen_enc);
1738 } else {
1739 vbroadcastss(dst, src, vlen_enc);
1740 }
1741 }
1742 } else if (VM_Version::supports_sse3()) {
1743 movddup(dst, src);
1744 } else {
1745 load_vector(bt, dst, src, vlen);
1746 }
1747 }
1748
1749 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) {
1750 // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64.
1751 int offset = exact_log2(type2aelembytes(bt)) << 6;
1752 if (is_floating_point_type(bt)) {
1753 offset += 128;
1754 }
1755 ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset);
1756 load_vector(T_BYTE, dst, addr, vlen_in_bytes);
1757 }
1758
1759 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
1760
// Emit one 128-bit combining step of a cross-lane reduction:
// dst = dst <op> src, lane-wise, where <op> is selected by opcode and — for
// the polymorphic integer opcodes — refined by typ. The 64-bit signed
// min/max and multiply forms require AVX-512 (UseAVX > 2); the unsigned
// 64-bit min/max use unmasked EVEX encodings (k0, merge = true).
void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
  int vector_len = Assembler::AVX_128bit;  // only consumed by evpmullq below

  switch (opcode) {
    case Op_AndReductionV:  pand(dst, src); break;
    case Op_OrReductionV:   por (dst, src); break;
    case Op_XorReductionV:  pxor(dst, src); break;
    case Op_MinReductionV:
      switch (typ) {
        case T_BYTE:        pminsb(dst, src); break;
        case T_SHORT:       pminsw(dst, src); break;
        case T_INT:         pminsd(dst, src); break;
        case T_LONG:        assert(UseAVX > 2, "required");
                            vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_MaxReductionV:
      switch (typ) {
        case T_BYTE:        pmaxsb(dst, src); break;
        case T_SHORT:       pmaxsw(dst, src); break;
        case T_INT:         pmaxsd(dst, src); break;
        case T_LONG:        assert(UseAVX > 2, "required");
                            vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_UMinReductionV:
      // Unsigned min: VEX/EVEX three-operand forms with dst as first source.
      switch (typ) {
        case T_BYTE:        vpminub(dst, dst, src, Assembler::AVX_128bit); break;
        case T_SHORT:       vpminuw(dst, dst, src, Assembler::AVX_128bit); break;
        case T_INT:         vpminud(dst, dst, src, Assembler::AVX_128bit); break;
        case T_LONG:        evpminuq(dst, k0, dst, src, true, Assembler::AVX_128bit); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_UMaxReductionV:
      switch (typ) {
        case T_BYTE:        vpmaxub(dst, dst, src, Assembler::AVX_128bit); break;
        case T_SHORT:       vpmaxuw(dst, dst, src, Assembler::AVX_128bit); break;
        case T_INT:         vpmaxud(dst, dst, src, Assembler::AVX_128bit); break;
        case T_LONG:        evpmaxuq(dst, k0, dst, src, true, Assembler::AVX_128bit); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_AddReductionVF: addss(dst, src); break;  // scalar: ordered FP add
    case Op_AddReductionVD: addsd(dst, src); break;
    case Op_AddReductionVI:
      switch (typ) {
        case T_BYTE:        paddb(dst, src); break;
        case T_SHORT:       paddw(dst, src); break;
        case T_INT:         paddd(dst, src); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_AddReductionVL: paddq(dst, src); break;
    case Op_MulReductionVF: mulss(dst, src); break;  // scalar: ordered FP mul
    case Op_MulReductionVD: mulsd(dst, src); break;
    case Op_MulReductionVI:
      // Note: byte multiply is handled separately via mulreduceB (widening).
      switch (typ) {
        case T_SHORT:       pmullw(dst, src); break;
        case T_INT:         pmulld(dst, src); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_MulReductionVL: assert(UseAVX > 2, "required");
                            evpmullq(dst, dst, src, vector_len); break;
    default:                assert(false, "wrong opcode");
  }
}
1831
1832 void C2_MacroAssembler::unordered_reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1833 switch (opcode) {
1834 case Op_AddReductionVF: addps(dst, src); break;
1835 case Op_AddReductionVD: addpd(dst, src); break;
1836 case Op_MulReductionVF: mulps(dst, src); break;
1837 case Op_MulReductionVD: mulpd(dst, src); break;
1838 default: assert(false, "%s", NodeClassNames[opcode]);
1839 }
1840 }
1841
// Emit one 256-bit combining step of a cross-lane reduction:
// dst = src1 <op> src2, lane-wise, with <op> selected by opcode (refined by
// typ for the polymorphic integer opcodes). The 64-bit signed min/max and
// multiply forms require AVX-512 (UseAVX > 2); the unsigned 64-bit min/max
// use unmasked EVEX encodings (k0, merge = true).
void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
  int vector_len = Assembler::AVX_256bit;

  switch (opcode) {
    case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
    case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
    case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
    case Op_MinReductionV:
      switch (typ) {
        case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
        case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
        case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
        case T_LONG:        assert(UseAVX > 2, "required");
                            vpminsq(dst, src1, src2, vector_len); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_MaxReductionV:
      switch (typ) {
        case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
        case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
        case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
        case T_LONG:        assert(UseAVX > 2, "required");
                            vpmaxsq(dst, src1, src2, vector_len); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_UMinReductionV:
      switch (typ) {
        case T_BYTE:        vpminub(dst, src1, src2, vector_len); break;
        case T_SHORT:       vpminuw(dst, src1, src2, vector_len); break;
        case T_INT:         vpminud(dst, src1, src2, vector_len); break;
        case T_LONG:        evpminuq(dst, k0, src1, src2, true, vector_len); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_UMaxReductionV:
      switch (typ) {
        case T_BYTE:        vpmaxub(dst, src1, src2, vector_len); break;
        case T_SHORT:       vpmaxuw(dst, src1, src2, vector_len); break;
        case T_INT:         vpmaxud(dst, src1, src2, vector_len); break;
        case T_LONG:        evpmaxuq(dst, k0, src1, src2, true, vector_len); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_AddReductionVI:
      switch (typ) {
        case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
        case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
        case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
    case Op_MulReductionVI:
      // Note: byte multiply is handled separately via mulreduceB (widening).
      switch (typ) {
        case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
        case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break;
    default:                assert(false, "wrong opcode");
  }
}
1907
1908 void C2_MacroAssembler::unordered_reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
1909 int vector_len = Assembler::AVX_256bit;
1910
1911 switch (opcode) {
1912 case Op_AddReductionVF: vaddps(dst, src1, src2, vector_len); break;
1913 case Op_AddReductionVD: vaddpd(dst, src1, src2, vector_len); break;
1914 case Op_MulReductionVF: vmulps(dst, src1, src2, vector_len); break;
1915 case Op_MulReductionVD: vmulpd(dst, src1, src2, vector_len); break;
1916 default: assert(false, "%s", NodeClassNames[opcode]);
1917 }
1918 }
1919
1920 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1921 XMMRegister dst, XMMRegister src,
1922 XMMRegister vtmp1, XMMRegister vtmp2) {
1923 switch (opcode) {
1924 case Op_AddReductionVF:
1925 case Op_MulReductionVF:
1926 reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1927 break;
1928
1929 case Op_AddReductionVD:
1930 case Op_MulReductionVD:
1931 reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1932 break;
1933
1934 default: assert(false, "wrong opcode");
1935 }
1936 }
1937
1938 void C2_MacroAssembler::unordered_reduce_fp(int opcode, int vlen,
1939 XMMRegister dst, XMMRegister src,
1940 XMMRegister vtmp1, XMMRegister vtmp2) {
1941 switch (opcode) {
1942 case Op_AddReductionVF:
1943 case Op_MulReductionVF:
1944 unorderedReduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1945 break;
1946
1947 case Op_AddReductionVD:
1948 case Op_MulReductionVD:
1949 unorderedReduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1950 break;
1951
1952 default: assert(false, "%s", NodeClassNames[opcode]);
1953 }
1954 }
1955
1956 void C2_MacroAssembler::reduceB(int opcode, int vlen,
1957 Register dst, Register src1, XMMRegister src2,
1958 XMMRegister vtmp1, XMMRegister vtmp2) {
1959 switch (vlen) {
1960 case 8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1961 case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1962 case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1963 case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1964
1965 default: assert(false, "wrong vector length");
1966 }
1967 }
1968
1969 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
1970 Register dst, Register src1, XMMRegister src2,
1971 XMMRegister vtmp1, XMMRegister vtmp2) {
1972 switch (vlen) {
1973 case 8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1974 case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1975 case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1976 case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1977
1978 default: assert(false, "wrong vector length");
1979 }
1980 }
1981
1982 void C2_MacroAssembler::reduceS(int opcode, int vlen,
1983 Register dst, Register src1, XMMRegister src2,
1984 XMMRegister vtmp1, XMMRegister vtmp2) {
1985 switch (vlen) {
1986 case 4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1987 case 8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1988 case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1989 case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1990
1991 default: assert(false, "wrong vector length");
1992 }
1993 }
1994
1995 void C2_MacroAssembler::reduceI(int opcode, int vlen,
1996 Register dst, Register src1, XMMRegister src2,
1997 XMMRegister vtmp1, XMMRegister vtmp2) {
1998 switch (vlen) {
1999 case 2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2000 case 4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2001 case 8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
2002 case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2003
2004 default: assert(false, "wrong vector length");
2005 }
2006 }
2007
2008 void C2_MacroAssembler::reduceL(int opcode, int vlen,
2009 Register dst, Register src1, XMMRegister src2,
2010 XMMRegister vtmp1, XMMRegister vtmp2) {
2011 switch (vlen) {
2012 case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2013 case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2014 case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
2015
2016 default: assert(false, "wrong vector length");
2017 }
2018 }
2019
2020 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2021 switch (vlen) {
2022 case 2:
2023 assert(vtmp2 == xnoreg, "");
2024 reduce2F(opcode, dst, src, vtmp1);
2025 break;
2026 case 4:
2027 assert(vtmp2 == xnoreg, "");
2028 reduce4F(opcode, dst, src, vtmp1);
2029 break;
2030 case 8:
2031 reduce8F(opcode, dst, src, vtmp1, vtmp2);
2032 break;
2033 case 16:
2034 reduce16F(opcode, dst, src, vtmp1, vtmp2);
2035 break;
2036 default: assert(false, "wrong vector length");
2037 }
2038 }
2039
2040 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2041 switch (vlen) {
2042 case 2:
2043 assert(vtmp2 == xnoreg, "");
2044 reduce2D(opcode, dst, src, vtmp1);
2045 break;
2046 case 4:
2047 reduce4D(opcode, dst, src, vtmp1, vtmp2);
2048 break;
2049 case 8:
2050 reduce8D(opcode, dst, src, vtmp1, vtmp2);
2051 break;
2052 default: assert(false, "wrong vector length");
2053 }
2054 }
2055
2056 void C2_MacroAssembler::unorderedReduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2057 switch (vlen) {
2058 case 2:
2059 assert(vtmp1 == xnoreg, "");
2060 assert(vtmp2 == xnoreg, "");
2061 unorderedReduce2F(opcode, dst, src);
2062 break;
2063 case 4:
2064 assert(vtmp2 == xnoreg, "");
2065 unorderedReduce4F(opcode, dst, src, vtmp1);
2066 break;
2067 case 8:
2068 unorderedReduce8F(opcode, dst, src, vtmp1, vtmp2);
2069 break;
2070 case 16:
2071 unorderedReduce16F(opcode, dst, src, vtmp1, vtmp2);
2072 break;
2073 default: assert(false, "wrong vector length");
2074 }
2075 }
2076
2077 void C2_MacroAssembler::unorderedReduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2078 switch (vlen) {
2079 case 2:
2080 assert(vtmp1 == xnoreg, "");
2081 assert(vtmp2 == xnoreg, "");
2082 unorderedReduce2D(opcode, dst, src);
2083 break;
2084 case 4:
2085 assert(vtmp2 == xnoreg, "");
2086 unorderedReduce4D(opcode, dst, src, vtmp1);
2087 break;
2088 case 8:
2089 unorderedReduce8D(opcode, dst, src, vtmp1, vtmp2);
2090 break;
2091 default: assert(false, "wrong vector length");
2092 }
2093 }
2094
// Reduce 2 int lanes of src2, fold in the scalar accumulator src1, and
// leave the scalar result in dst.
void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    phaddd(vtmp1, vtmp1);  // horizontal add leaves lane0+lane1 in lane 0
  } else {
    pshufd(vtmp1, src2, 0x1);  // bring lane 1 down to lane 0
    reduce_operation_128(T_INT, opcode, vtmp1, src2);
  }
  movdl(vtmp2, src1);  // accumulator into a vector register
  reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
  movdl(dst, vtmp1);
}
2109
// Reduce 4 int lanes: halve the vector once, then finish through reduce2I.
void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    phaddd(vtmp1, src2);  // pairwise adds leave two partial sums in the low lanes
    reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  } else {
    pshufd(vtmp2, src2, 0xE);  // move the upper 64 bits down
    reduce_operation_128(T_INT, opcode, vtmp2, src2);
    reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
  }
}
2123
// Reduce 8 int lanes held in a 256-bit register.
void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);  // pairwise adds within each 128-bit half
    vextracti128_high(vtmp2, vtmp1);
    vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit); // combine the two halves
    reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  } else {
    vextracti128_high(vtmp1, src2);
    reduce_operation_128(T_INT, opcode, vtmp1, src2);   // fold high half onto low half
    reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  }
}
2136
// Reduce 16 int lanes (512-bit): fold the upper 256 bits onto the lower
// 256 bits, then finish through reduce8I.
void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti64x4_high(vtmp2, src2);
  reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
  reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
2142
// Reduce 8 byte lanes of src2 and fold in the int accumulator src1. The byte
// lanes are combined pairwise with byte ops, then the surviving byte is
// widened to int (zero-extended for unsigned min/max, sign-extended
// otherwise) before combining with the accumulator.
void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  pshufd(vtmp2, src2, 0x1);  // bytes 4..7 down next to bytes 0..3
  reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
  movdqu(vtmp1, vtmp2);
  psrldq(vtmp1, 2);          // shift remaining 4 bytes by 2
  reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
  movdqu(vtmp2, vtmp1);
  psrldq(vtmp2, 1);          // shift remaining 2 bytes by 1
  reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
  movdl(vtmp2, src1);        // accumulator into a vector register
  if (opcode == Op_UMinReductionV || opcode == Op_UMaxReductionV) {
    pmovzxbd(vtmp1, vtmp1);  // unsigned compare: zero-extend before int op
  } else {
    pmovsxbd(vtmp1, vtmp1);
  }
  reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
  pextrb(dst, vtmp1, 0x0);
  movsbl(dst, dst);          // result returned as a sign-extended byte
}
2162
// Reduce 16 byte lanes: fold the upper 8 bytes onto the lower 8, then reduce8B.
void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  pshufd(vtmp1, src2, 0xE);  // upper 64 bits down
  reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
  reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}
2168
// Reduce 32 byte lanes (256-bit): fold the high 128 bits onto the low 128,
// then finish through reduce16B.
void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti128_high(vtmp2, src2);
  reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
  reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
2174
// Reduce 64 byte lanes (512-bit): fold the high 256 bits onto the low 256,
// then finish through reduce32B.
void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti64x4_high(vtmp1, src2);
  reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
  reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}
2180
// Multiply-reduce 8 bytes: there is no packed byte multiply, so sign-extend
// the bytes to shorts and reuse the short reduction.
void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  pmovsxbw(vtmp2, src2);
  reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
2185
// Multiply-reduce 16 bytes in the 16-bit domain (no packed byte multiply).
void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (UseAVX > 1) {
    // Widen all 16 bytes to shorts in one 256-bit register.
    int vector_len = Assembler::AVX_256bit;
    vpmovsxbw(vtmp1, src2, vector_len);
    reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  } else {
    // SSE only: reduce the low 8 bytes, then fold the high 8 bytes into dst.
    pmovsxbw(vtmp2, src2);
    reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
    pshufd(vtmp2, src2, 0xe);
    pmovsxbw(vtmp2, vtmp2);
    reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
  }
}
2199
// Multiply-reduce 32 bytes. With AVX512BW the bytes are widened to shorts in
// one 512-bit register; otherwise the two 128-bit halves are reduced via
// mulreduce16B, chaining the scalar accumulator through dst.
void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
    int vector_len = Assembler::AVX_512bit;
    vpmovsxbw(vtmp1, src2, vector_len);
    reduce32S(opcode, dst, src1, vtmp1, vtmp2, vtmp1);
  } else {
    assert(UseAVX >= 2,"Should not reach here.");
    mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
    vextracti128_high(vtmp2, src2);
    mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
  }
}
2212
// Multiply-reduce 64 bytes: reduce the low 256 bits, then fold the high
// 256 bits in through dst as the running accumulator.
void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
  vextracti64x4_high(vtmp2, src2);
  mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
}
2218
// Reduce 4 short lanes of src2 and fold in the int accumulator src1. The
// surviving short is widened to int (zero-extended for unsigned min/max,
// sign-extended otherwise) before combining with the accumulator.
void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    // Two horizontal adds collapse the 4 shorts into lane 0.
    phaddw(vtmp1, vtmp1);
    phaddw(vtmp1, vtmp1);
  } else {
    // Generic path: fold the upper pair onto the lower pair, then the last pair.
    pshufd(vtmp2, src2, 0x1);
    reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
    movdqu(vtmp1, vtmp2);
    psrldq(vtmp1, 2);
    reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
  }
  movdl(vtmp2, src1);
  if (opcode == Op_UMinReductionV || opcode == Op_UMaxReductionV) {
    pmovzxwd(vtmp1, vtmp1);  // unsigned compare: zero-extend before int op
  } else {
    pmovsxwd(vtmp1, vtmp1);
  }
  reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
  pextrw(dst, vtmp1, 0x0);
  movswl(dst, dst);          // result returned as a sign-extended short
}
2243
// Reduce 8 short lanes: halve the vector once, then finish through reduce4S.
void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    phaddw(vtmp1, src2);  // pairwise adds leave 4 partial sums in the low lanes
  } else {
    // pshufd would read a partially clobbered vtmp1 if it aliased src2.
    assert_different_registers(src2, vtmp1);
    pshufd(vtmp1, src2, 0xE);  // upper 64 bits down
    reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
  }
  reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}
2257
// Reduce 16 short lanes held in a 256-bit register.
void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    int vector_len = Assembler::AVX_256bit;
    vphaddw(vtmp2, src2, src2, vector_len);    // pairwise adds within each 128-bit half
    vpermq(vtmp2, vtmp2, 0xD8, vector_len);    // interleave the halves' partial sums
  } else {
    assert_different_registers(src2, vtmp2);
    vextracti128_high(vtmp2, src2);
    reduce_operation_128(T_SHORT, opcode, vtmp2, src2);  // fold high half onto low half
  }
  reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
2270
2271 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2272 assert_different_registers(src2, vtmp1);
2273 int vector_len = Assembler::AVX_256bit;
2274 vextracti64x4_high(vtmp1, src2);
2275 reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
2276 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2277 }
2278
// Reduce 2 long lanes of src2, fold in the scalar accumulator src1, and
// leave the scalar result in dst.
void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  pshufd(vtmp2, src2, 0xE);  // bring lane 1 down to lane 0
  reduce_operation_128(T_LONG, opcode, vtmp2, src2);
  movdq(vtmp1, src1);        // accumulator into a vector register
  reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
  movdq(dst, vtmp1);
}
2286
// Reduce 4 long lanes (256-bit): fold the high 128 bits onto the low 128,
// then finish through reduce2L.
void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti128_high(vtmp1, src2);
  reduce_operation_128(T_LONG, opcode, vtmp1, src2);
  reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}
2292
// Reduce 8 long lanes (512-bit): fold the high 256 bits onto the low 256,
// then finish through reduce4L.
void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti64x4_high(vtmp2, src2);
  reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
  reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
2298
// Build an opmask with the low 'len' bits set: start from all-ones, clear
// bits at position len and above (bzhiq, a BMI2 instruction), then move the
// result into the mask register.
void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
  mov64(temp, -1L);
  bzhiq(temp, temp, len);
  kmovql(dst, temp);
}
2304
// Ordered (strictly sequential) float reduction over 2 lanes:
// dst = (dst op src[0]) op src[1].
void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  reduce_operation_128(T_FLOAT, opcode, dst, src);  // fold in lane 0 (scalar op)
  pshufd(vtmp, src, 0x1);                           // lane 1 -> lane 0
  reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
}
2310
// Ordered float reduction over 4 lanes: reduce lanes 0-1, then fold in
// lanes 2 and 3 one at a time to preserve the strict evaluation order.
void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  reduce2F(opcode, dst, src, vtmp);
  pshufd(vtmp, src, 0x2);  // lane 2 -> lane 0
  reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
  pshufd(vtmp, src, 0x3);  // lane 3 -> lane 0
  reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
}
2318
// Ordered float reduction over 8 lanes: low 128-bit half first, then the
// extracted high half, preserving lane order.
void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce4F(opcode, dst, src, vtmp2);
  vextractf128_high(vtmp2, src);
  reduce4F(opcode, dst, vtmp2, vtmp1);
}
2324
// Ordered float reduction over 16 lanes: low 256-bit half first, then the
// extracted high half, preserving lane order.
void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce8F(opcode, dst, src, vtmp1, vtmp2);
  vextracti64x4_high(vtmp1, src);
  reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
}
2330
// Unordered float reduction over 2 lanes: combine lane 1 with lane 0
// directly (no accumulator ordering guarantee).
void C2_MacroAssembler::unorderedReduce2F(int opcode, XMMRegister dst, XMMRegister src) {
  pshufd(dst, src, 0x1);  // lane 1 -> lane 0 of dst
  reduce_operation_128(T_FLOAT, opcode, dst, src);
}
2335
// Unordered float reduction over 4 lanes: pairwise tree — fold the upper
// pair onto the lower pair, then combine the remaining two lanes.
void C2_MacroAssembler::unorderedReduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  pshufd(vtmp, src, 0xE);  // upper 64 bits down
  unordered_reduce_operation_128(T_FLOAT, opcode, vtmp, src);
  unorderedReduce2F(opcode, dst, vtmp);
}
2341
// Unordered float reduction over 8 lanes: fold the high 128 bits onto the
// low 128, then continue the pairwise tree.
void C2_MacroAssembler::unorderedReduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextractf128_high(vtmp1, src);
  unordered_reduce_operation_128(T_FLOAT, opcode, vtmp1, src);
  unorderedReduce4F(opcode, dst, vtmp1, vtmp2);
}
2347
// Unordered float reduction over 16 lanes: fold the high 256 bits onto the
// low 256, then continue the pairwise tree.
void C2_MacroAssembler::unorderedReduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextractf64x4_high(vtmp2, src);
  unordered_reduce_operation_256(T_FLOAT, opcode, vtmp2, vtmp2, src);
  unorderedReduce8F(opcode, dst, vtmp2, vtmp1, vtmp2);
}
2353
// Ordered (strictly sequential) double reduction over 2 lanes:
// dst = (dst op src[0]) op src[1].
void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  reduce_operation_128(T_DOUBLE, opcode, dst, src);  // fold in lane 0 (scalar op)
  pshufd(vtmp, src, 0xE);                            // lane 1 -> lane 0
  reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
}
2359
// Ordered double reduction over 4 lanes: low 128-bit half first, then the
// extracted high half, preserving lane order.
void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce2D(opcode, dst, src, vtmp2);
  vextractf128_high(vtmp2, src);
  reduce2D(opcode, dst, vtmp2, vtmp1);
}
2365
// Ordered double reduction over 8 lanes: low 256-bit half first, then the
// extracted high half, preserving lane order.
void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce4D(opcode, dst, src, vtmp1, vtmp2);
  vextracti64x4_high(vtmp1, src);
  reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
}
2371
// Unordered double reduction over 2 lanes: combine lane 1 with lane 0
// directly (no accumulator ordering guarantee).
void C2_MacroAssembler::unorderedReduce2D(int opcode, XMMRegister dst, XMMRegister src) {
  pshufd(dst, src, 0xE);  // lane 1 -> lane 0 of dst
  reduce_operation_128(T_DOUBLE, opcode, dst, src);
}
2376
// Unordered double reduction over 4 lanes: fold the high 128 bits onto the
// low 128, then combine the remaining two lanes.
void C2_MacroAssembler::unorderedReduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  vextractf128_high(vtmp, src);
  unordered_reduce_operation_128(T_DOUBLE, opcode, vtmp, src);
  unorderedReduce2D(opcode, dst, vtmp);
}
2382
// Unordered double reduction over 8 lanes: fold the high 256 bits onto the
// low 256, then continue the pairwise tree.
void C2_MacroAssembler::unorderedReduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextractf64x4_high(vtmp2, src);
  unordered_reduce_operation_256(T_DOUBLE, opcode, vtmp2, vtmp2, src);
  unorderedReduce4D(opcode, dst, vtmp2, vtmp1);
}
2388
// Masked vector load (memory -> register): forwards to the shared
// MacroAssembler implementation.
void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
  MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
}
2392
// Masked vector store (register -> memory): forwards to the shared
// MacroAssembler implementation.
void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
  MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
}
2396
// Masked register-to-register vector move: forwards to the shared
// MacroAssembler implementation.
void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, XMMRegister src, bool merge, int vector_len) {
  MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
}
2400
2401 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask,
2402 int vec_enc) {
2403 switch(elem_bt) {
2404 case T_INT:
2405 case T_FLOAT:
2406 vmaskmovps(dst, src, mask, vec_enc);
2407 break;
2408 case T_LONG:
2409 case T_DOUBLE:
2410 vmaskmovpd(dst, src, mask, vec_enc);
2411 break;
2412 default:
2413 fatal("Unsupported type %s", type2name(elem_bt));
2414 break;
2415 }
2416 }
2417
2418 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask,
2419 int vec_enc) {
2420 switch(elem_bt) {
2421 case T_INT:
2422 case T_FLOAT:
2423 vmaskmovps(dst, src, mask, vec_enc);
2424 break;
2425 case T_LONG:
2426 case T_DOUBLE:
2427 vmaskmovpd(dst, src, mask, vec_enc);
2428 break;
2429 default:
2430 fatal("Unsupported type %s", type2name(elem_bt));
2431 break;
2432 }
2433 }
2434
// Cross-lane float min/max reduction. Each loop iteration halves the live
// width of the partial result: the upper half of wsrc is brought down
// (vextract* for 512/256-bit halves, vpermilps inside a 128-bit lane) and
// combined with wsrc via vminmax_fp (or the AVX10.2 single-instruction
// form). When is_dst_valid, the incoming dst (the scalar accumulator) is
// folded in as a final step. xmm_0/xmm_1 are scratch registers; with a
// single scratch register wtmp aliases wdst.
void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
                                          XMMRegister dst, XMMRegister src,
                                          XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                          XMMRegister xmm_0, XMMRegister xmm_1) {
  // Shuffle immediates for the last two steps (widths 2 and 4 within a lane).
  const int permconst[] = {1, 14};
  XMMRegister wsrc = src;
  XMMRegister wdst = xmm_0;
  XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;

  int vlen_enc = Assembler::AVX_128bit;
  if (vlen == 16) {
    vlen_enc = Assembler::AVX_256bit;
  }

  for (int i = log2(vlen) - 1; i >=0; i--) {
    if (i == 0 && !is_dst_valid) {
      // Last step and no accumulator to fold: write the result directly to dst.
      wdst = dst;
    }
    if (i == 3) {
      vextracti64x4_high(wtmp, wsrc);   // fold 512 -> 256 bits
    } else if (i == 2) {
      vextracti128_high(wtmp, wsrc);    // fold 256 -> 128 bits
    } else { // i = [0,1]
      vpermilps(wtmp, wsrc, permconst[i], vlen_enc);  // shuffle within a lane
    }

    if (VM_Version::supports_avx10_2()) {
      vminmax_fp_avx10_2(opcode, T_FLOAT, wdst, k0, wtmp, wsrc, vlen_enc);
    } else {
      vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
    }
    wsrc = wdst;  // partial result feeds the next iteration
    vlen_enc = Assembler::AVX_128bit;
  }
  if (is_dst_valid) {
    // Fold the scalar accumulator that arrived in dst into the vector result.
    if (VM_Version::supports_avx10_2()) {
      vminmax_fp_avx10_2(opcode, T_FLOAT, dst, k0, wdst, dst, Assembler::AVX_128bit);
    } else {
      vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
    }
  }
}
2477
// Cross-lane double min/max reduction; same halving scheme as
// reduceFloatMinMax but with double-width steps (vextract* for 512/256-bit
// halves, vpermilpd for the final in-lane swap). When is_dst_valid, the
// incoming dst (the scalar accumulator) is folded in last. xmm_0/xmm_1 are
// scratch registers; with a single scratch register wtmp aliases wdst.
void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                           XMMRegister xmm_0, XMMRegister xmm_1) {
  XMMRegister wsrc = src;
  XMMRegister wdst = xmm_0;
  XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
  int vlen_enc = Assembler::AVX_128bit;
  if (vlen == 8) {
    vlen_enc = Assembler::AVX_256bit;
  }
  for (int i = log2(vlen) - 1; i >=0; i--) {
    if (i == 0 && !is_dst_valid) {
      // Last step and no accumulator to fold: write the result directly to dst.
      wdst = dst;
    }
    if (i == 1) {
      vextracti128_high(wtmp, wsrc);    // fold 256 -> 128 bits
    } else if (i == 2) {
      vextracti64x4_high(wtmp, wsrc);   // fold 512 -> 256 bits
    } else {
      assert(i == 0, "%d", i);
      vpermilpd(wtmp, wsrc, 1, vlen_enc);  // swap the two doubles in the lane
    }

    if (VM_Version::supports_avx10_2()) {
      vminmax_fp_avx10_2(opcode, T_DOUBLE, wdst, k0, wtmp, wsrc, vlen_enc);
    } else {
      vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
    }

    wsrc = wdst;  // partial result feeds the next iteration
    vlen_enc = Assembler::AVX_128bit;
  }

  if (is_dst_valid) {
    // Fold the scalar accumulator that arrived in dst into the vector result.
    if (VM_Version::supports_avx10_2()) {
      vminmax_fp_avx10_2(opcode, T_DOUBLE, dst, k0, wdst, dst, Assembler::AVX_128bit);
    } else {
      vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
    }
  }
}
2519
2520 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2521 switch (bt) {
2522 case T_BYTE: pextrb(dst, src, idx); break;
2523 case T_SHORT: pextrw(dst, src, idx); break;
2524 case T_INT: pextrd(dst, src, idx); break;
2525 case T_LONG: pextrq(dst, src, idx); break;
2526
2527 default:
2528 assert(false,"Should not reach here.");
2529 break;
2530 }
2531 }
2532
2533 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2534 int esize = type2aelembytes(typ);
2535 int elem_per_lane = 16/esize;
2536 int lane = elemindex / elem_per_lane;
2537 int eindex = elemindex % elem_per_lane;
2538
2539 if (lane >= 2) {
2540 assert(UseAVX > 2, "required");
2541 vextractf32x4(dst, src, lane & 3);
2542 return dst;
2543 } else if (lane > 0) {
2544 assert(UseAVX > 0, "required");
2545 vextractf128(dst, src, lane);
2546 return dst;
2547 } else {
2548 return src;
2549 }
2550 }
2551
2552 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) {
2553 if (typ == T_BYTE) {
2554 movsbl(dst, dst);
2555 } else if (typ == T_SHORT) {
2556 movswl(dst, dst);
2557 }
2558 }
2559
2560 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
2561 int esize = type2aelembytes(typ);
2562 int elem_per_lane = 16/esize;
2563 int eindex = elemindex % elem_per_lane;
2564 assert(is_integral_type(typ),"required");
2565
2566 if (eindex == 0) {
2567 if (typ == T_LONG) {
2568 movq(dst, src);
2569 } else {
2570 movdl(dst, src);
2571 movsxl(typ, dst);
2572 }
2573 } else {
2574 extract(typ, dst, src, eindex);
2575 movsxl(typ, dst);
2576 }
2577 }
2578
// Move FP element eindex (an index within a 128-bit lane) of src into dst as
// a scalar, with all bits above the element cleared. vtmp is only needed on
// the pre-AVX float path.
void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) {
  int esize = type2aelembytes(typ);
  int elem_per_lane = 16/esize;
  int eindex = elemindex % elem_per_lane;
  assert((typ == T_FLOAT || typ == T_DOUBLE),"required");

  if (eindex == 0) {
    // Element 0: movq copies the low 64 bits and clears the rest.
    movq(dst, src);
  } else {
    if (typ == T_FLOAT) {
      // Replicate the selected float into lane 0 via a shuffle.
      if (UseAVX == 0) {
        movdqu(dst, src);
        shufps(dst, dst, eindex);
      } else {
        vshufps(dst, src, src, eindex, Assembler::AVX_128bit);
      }
    } else {
      // Double: byte-shift the selected element down to offset 0.
      if (UseAVX == 0) {
        movdqu(dst, src);
        psrldq(dst, eindex*esize);
      } else {
        vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
      }
      movq(dst, dst);  // clear bits 127:64 above the extracted double
    }
  }
  // Zero upper bits
  if (typ == T_FLOAT) {
    // Mask down to the low 32 bits (the shuffle left copies in other lanes).
    if (UseAVX == 0) {
      assert(vtmp != xnoreg, "required.");
      movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg);
      pand(dst, vtmp);
    } else {
      vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg);
    }
  }
}
2616
2617 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2618 switch(typ) {
2619 case T_BYTE:
2620 case T_BOOLEAN:
2621 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2622 break;
2623 case T_SHORT:
2624 case T_CHAR:
2625 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2626 break;
2627 case T_INT:
2628 case T_FLOAT:
2629 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2630 break;
2631 case T_LONG:
2632 case T_DOUBLE:
2633 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2634 break;
2635 default:
2636 assert(false,"Should not reach here.");
2637 break;
2638 }
2639 }
2640
2641 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) {
2642 assert(rscratch != noreg || always_reachable(src2), "missing");
2643
2644 switch(typ) {
2645 case T_BOOLEAN:
2646 case T_BYTE:
2647 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2648 break;
2649 case T_CHAR:
2650 case T_SHORT:
2651 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2652 break;
2653 case T_INT:
2654 case T_FLOAT:
2655 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2656 break;
2657 case T_LONG:
2658 case T_DOUBLE:
2659 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2660 break;
2661 default:
2662 assert(false,"Should not reach here.");
2663 break;
2664 }
2665 }
2666
2667 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2668 switch(typ) {
2669 case T_BYTE:
2670 evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2671 break;
2672 case T_SHORT:
2673 evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2674 break;
2675 case T_INT:
2676 case T_FLOAT:
2677 evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2678 break;
2679 case T_LONG:
2680 case T_DOUBLE:
2681 evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2682 break;
2683 default:
2684 assert(false,"Should not reach here.");
2685 break;
2686 }
2687 }
2688
// Vector test of src1 against src2, setting flags via ptest/vptest (or the
// FP variant vtestps when elements are >= 4 bytes wide and AVX is present).
// For sub-128-bit vectors the low part of src1 is duplicated into 'vtmp'
// so the full 128-bit test only observes valid data; src2 is assumed to be
// a full-width mask and needs no duplication.
void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) {
  assert(vlen_in_bytes <= 32, "");
  int esize = type2aelembytes(bt);
  if (vlen_in_bytes == 32) {
    // Full 256-bit test; no temp needed.
    assert(vtmp == xnoreg, "required.");
    if (esize >= 4) {
      vtestps(src1, src2, AVX_256bit);
    } else {
      vptest(src1, src2, AVX_256bit);
    }
    return;
  }
  if (vlen_in_bytes < 16) {
    // Duplicate the lower part to fill the whole register,
    // Don't need to do so for src2
    assert(vtmp != xnoreg, "required");
    // 0x00 replicates dword 0 everywhere (4-byte vector); 0x04 replicates
    // dwords {0,1} into both halves (8-byte vector).
    int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04;
    pshufd(vtmp, src1, shuffle_imm);
  } else {
    // Exactly 128 bits: test src1 directly (reuse the vtmp variable as an
    // alias; no copy is made).
    assert(vtmp == xnoreg, "required");
    vtmp = src1;
  }
  if (esize >= 4 && VM_Version::supports_avx()) {
    vtestps(vtmp, src2, AVX_128bit);
  } else {
    ptest(vtmp, src2);
  }
}
2717
// Element-wise vector add dispatch: dst = src1 + src2 for the given
// element type, using the packed-integer or packed-FP add as appropriate.
void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
#ifdef ASSERT
  // Without AVX512BW, byte/short ops have no 512-bit form and the EVEX
  // encoding cannot reach registers xmm16-xmm31.
  bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
  bool is_bw_supported = VM_Version::supports_avx512bw();
  if (is_bw && !is_bw_supported) {
    assert(vlen_enc != Assembler::AVX_512bit, "required");
    assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16),
           "XMM register should be 0-15");
  }
#endif // ASSERT
  switch (elem_bt) {
    case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return;
    case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return;
    case T_INT: vpaddd(dst, src1, src2, vlen_enc); return;
    case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return;
    case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return;
    case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return;
    default: fatal("Unsupported type %s", type2name(elem_bt)); return;
  }
}
2738
// Broadcast the scalar in GPR 'src' to all elements of vector 'dst'.
// Prefers the EVEX GPR-source broadcasts when the CPU supports them for
// this element type and vector length; otherwise falls back to a
// movd/movq into the XMM register followed by an AVX2 register-source
// broadcast (which is limited to xmm0-xmm15 and <= 256-bit vectors).
void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) {
  assert(UseAVX >= 2, "required");
  bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
  bool is_vl = vlen_enc != Assembler::AVX_512bit;
  if ((UseAVX > 2) &&
      // byte/short EVEX broadcasts need AVX512BW; sub-512-bit EVEX needs AVX512VL
      (!is_bw || VM_Version::supports_avx512bw()) &&
      (!is_vl || VM_Version::supports_avx512vl())) {
    switch (elem_bt) {
      case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return;
      case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
      case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
      case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
      default: fatal("Unsupported type %s", type2name(elem_bt)); return;
    }
  } else {
    // AVX2 path: move scalar into xmm, then broadcast register-to-register.
    assert(vlen_enc != Assembler::AVX_512bit, "required");
    assert((dst->encoding() < 16),"XMM register should be 0-15");
    switch (elem_bt) {
      case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return;
      case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return;
      case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return;
      case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
      case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
      case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
      default: fatal("Unsupported type %s", type2name(elem_bt)); return;
    }
  }
}
2767
// Convert a vector of bytes in 'src' to a vector of 'to_elem_bt' elements
// in 'dst': sign-extend for integral targets, sign-extend then int->FP
// convert for T_FLOAT/T_DOUBLE.
void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
  switch (to_elem_bt) {
    case T_SHORT:
      vpmovsxbw(dst, src, vlen_enc);
      break;
    case T_INT:
      vpmovsxbd(dst, src, vlen_enc);
      break;
    case T_FLOAT:
      vpmovsxbd(dst, src, vlen_enc);
      vcvtdq2ps(dst, dst, vlen_enc);
      break;
    case T_LONG:
      vpmovsxbq(dst, src, vlen_enc);
      break;
    case T_DOUBLE: {
      // vcvtdq2pd doubles the element width, so the intermediate int
      // vector only occupies half the final vector length.
      int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
      vpmovsxbd(dst, src, mid_vlen_enc);
      vcvtdq2pd(dst, dst, vlen_enc);
      break;
    }
    default:
      fatal("Unsupported type %s", type2name(to_elem_bt));
      break;
  }
}
2794
2795 //-------------------------------------------------------------------------------------------
2796
// IndexOf for constant substrings with size >= 8 chars
// which don't need to be loaded through stack.
//
// Searches str1 (length cnt1, in elements) for the constant substring
// str2 whose length int_cnt2 is known at compile time and is at least one
// full pcmpestri stride. On exit 'result' holds the element index of the
// first match, or -1. 'ae' is the StrIntrinsicNode encoding pair (LL, UU
// or UL); LU is rejected below. Register conventions are dictated by
// pcmpestri (see comment block below). tmp/rcx, cnt1/rdx and cnt2/rax are
// clobbered.
void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
                                         Register cnt1, Register cnt2,
                                         int int_cnt2, Register result,
                                         XMMRegister vec, Register tmp,
                                         int ae) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");

  // This method uses the pcmpestri instruction with bound registers
  //   inputs:
  //     xmm - substring
  //     rax - substring length (elements count)
  //     mem - scanned string
  //     rdx - string length (elements count)
  //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
  //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
  //   outputs:
  //     rcx - matched index in string
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
  int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
  int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
  Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
  Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;

  Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
        RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
        MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;
  assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");

  // Load substring.
  if (ae == StrIntrinsicNode::UL) {
    // UL: latin1 substring widened to UTF-16 on load.
    pmovzxbw(vec, Address(str2, 0));
  } else {
    movdqu(vec, Address(str2, 0));
  }
  movl(cnt2, int_cnt2);
  movptr(result, str1); // string addr

  if (int_cnt2 > stride) {
    jmpb(SCAN_TO_SUBSTR);

    // Reload substr for rescan, this code
    // is executed only for large substrings (> 8 chars)
    bind(RELOAD_SUBSTR);
    if (ae == StrIntrinsicNode::UL) {
      pmovzxbw(vec, Address(str2, 0));
    } else {
      movdqu(vec, Address(str2, 0));
    }
    negptr(cnt2); // Jumped here with negative cnt2, convert to positive

    bind(RELOAD_STR);
    // We came here after the beginning of the substring was
    // matched but the rest of it was not so we need to search
    // again. Start from the next element after the previous match.

    // cnt2 is number of substring reminding elements and
    // cnt1 is number of string reminding elements when cmp failed.
    // Restored cnt1 = cnt1 - cnt2 + int_cnt2
    subl(cnt1, cnt2);
    addl(cnt1, int_cnt2);
    movl(cnt2, int_cnt2); // Now restore cnt2

    decrementl(cnt1);     // Shift to next element
    cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND); // Left less then substring

    addptr(result, (1<<scale1));

  } // (int_cnt2 > 8)

  // Scan string for start of substr in 16-byte vectors
  bind(SCAN_TO_SUBSTR);
  pcmpestri(vec, Address(result, 0), mode);
  jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
  subl(cnt1, stride);
  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
  cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less then substring
  addptr(result, 16);
  jmpb(SCAN_TO_SUBSTR);

  // Found a potential substr
  bind(FOUND_CANDIDATE);
  // Matched whole vector if first element matched (tmp(rcx) == 0).
  if (int_cnt2 == stride) {
    jccb(Assembler::overflow, RET_FOUND);    // OF == 1
  } else { // int_cnt2 > 8
    jccb(Assembler::overflow, FOUND_SUBSTR);
  }
  // After pcmpestri tmp(rcx) contains matched element index
  // Compute start addr of substr
  lea(result, Address(result, tmp, scale1));

  // Make sure string is still long enough
  subl(cnt1, tmp);
  cmpl(cnt1, cnt2);
  if (int_cnt2 == stride) {
    jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
  } else { // int_cnt2 > 8
    jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
  }
  // Left less then substring.

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmp(EXIT);

  if (int_cnt2 > stride) {
    // This code is optimized for the case when whole substring
    // is matched if its head is matched.
    bind(MATCH_SUBSTR_HEAD);
    pcmpestri(vec, Address(result, 0), mode);
    // Reload only string if does not match
    jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0

    Label CONT_SCAN_SUBSTR;
    // Compare the rest of substring (> 8 chars).
    bind(FOUND_SUBSTR);
    // First 8 chars are already matched.
    negptr(cnt2);
    addptr(cnt2, stride);

    bind(SCAN_SUBSTR);
    subl(cnt1, stride);
    cmpl(cnt2, -stride); // Do not read beyond substring
    jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
    // Back-up strings to avoid reading beyond substring:
    // cnt1 = cnt1 - cnt2 + 8
    addl(cnt1, cnt2); // cnt2 is negative
    addl(cnt1, stride);
    movl(cnt2, stride); negptr(cnt2);
    bind(CONT_SCAN_SUBSTR);
    if (int_cnt2 < (int)G) {
      // Offsets fit in an immediate displacement.
      int tail_off1 = int_cnt2<<scale1;
      int tail_off2 = int_cnt2<<scale2;
      if (ae == StrIntrinsicNode::UL) {
        pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
      } else {
        movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
      }
      pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
    } else {
      // calculate index in register to avoid integer overflow (int_cnt2*2)
      movl(tmp, int_cnt2);
      addptr(tmp, cnt2);
      if (ae == StrIntrinsicNode::UL) {
        pmovzxbw(vec, Address(str2, tmp, scale2, 0));
      } else {
        movdqu(vec, Address(str2, tmp, scale2, 0));
      }
      pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
    }
    // Need to reload strings pointers if not matched whole vector
    jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
    addptr(cnt2, stride);
    jcc(Assembler::negative, SCAN_SUBSTR);
    // Fall through if found full substring

  } // (int_cnt2 > 8)

  bind(RET_FOUND);
  // Found result if we matched full small substring.
  // Compute substr offset
  subptr(result, str1);
  if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
    shrl(result, 1); // index
  }
  bind(EXIT);

} // string_indexofC8
2975
// Small strings are loaded through stack if they cross page boundary.
//
// General indexOf: searches str1 (length cnt1) for str2, whose length is
// either the compile-time constant int_cnt2 (0 < int_cnt2 < stride) or,
// when int_cnt2 == -1, the runtime value in cnt2. Strings shorter than a
// full 16-byte vector that sit too close to a page boundary are first
// copied onto the stack so a full vector load cannot fault. On exit
// 'result' holds the element index of the first match or -1. Register
// bindings (rax/rdx/rcx) are forced by pcmpestri; rsp is saved in 'tmp'
// and restored at CLEANUP.
void C2_MacroAssembler::string_indexof(Register str1, Register str2,
                                       Register cnt1, Register cnt2,
                                       int int_cnt2, Register result,
                                       XMMRegister vec, Register tmp,
                                       int ae) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");

  //
  // int_cnt2 is length of small (< 8 chars) constant substring
  // or (-1) for non constant substring in which case its length
  // is in cnt2 register.
  //
  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;
  //
  int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
  assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
  // This method uses the pcmpestri instruction with bound registers
  //   inputs:
  //     xmm - substring
  //     rax - substring length (elements count)
  //     mem - scanned string
  //     rdx - string length (elements count)
  //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
  //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
  //   outputs:
  //     rcx - matched index in string
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
  int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
  Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
  Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;

  Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
        RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
        FOUND_CANDIDATE;

  { //========================================================
    // We don't know where these strings are located
    // and we can't read beyond them. Load them through stack.
    Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;

    movptr(tmp, rsp); // save old SP

    if (int_cnt2 > 0) { // small (< 8 chars) constant substring
      if (int_cnt2 == (1>>scale2)) { // One byte
        assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
        load_unsigned_byte(result, Address(str2, 0));
        movdl(vec, result); // move 32 bits
      } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) { // Three bytes
        // Not enough header space in 32-bit VM: 12+3 = 15.
        movl(result, Address(str2, -1));
        shrl(result, 8);
        movdl(vec, result); // move 32 bits
      } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) { // One char
        load_unsigned_short(result, Address(str2, 0));
        movdl(vec, result); // move 32 bits
      } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
        movdl(vec, Address(str2, 0)); // move 32 bits
      } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
        movq(vec, Address(str2, 0)); // move 64 bits
      } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7})
        // Array header size is 12 bytes in 32-bit VM
        // + 6 bytes for 3 chars == 18 bytes,
        // enough space to load vec and shift.
        assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
        if (ae == StrIntrinsicNode::UL) {
          // Load ending at the last byte, widen to chars, then shift the
          // tail down so the substring starts at element 0 of vec.
          int tail_off = int_cnt2-8;
          pmovzxbw(vec, Address(str2, tail_off));
          psrldq(vec, -2*tail_off);
        }
        else {
          int tail_off = int_cnt2*(1<<scale2);
          movdqu(vec, Address(str2, tail_off-16));
          psrldq(vec, 16-tail_off);
        }
      }
    } else { // not constant substring
      cmpl(cnt2, stride);
      jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough

      // We can read beyond string if srt+16 does not cross page boundary
      // since heaps are aligned and mapped by pages.
      assert(os::vm_page_size() < (int)G, "default page should be small");
      movl(result, str2); // We need only low 32 bits
      andl(result, ((int)os::vm_page_size()-1));
      cmpl(result, ((int)os::vm_page_size()-16));
      jccb(Assembler::belowEqual, CHECK_STR);

      // Move small strings to stack to allow load 16 bytes into vec.
      subptr(rsp, 16);
      int stk_offset = wordSize-(1<<scale2);
      push(cnt2);

      // Copy the substring element-by-element, back to front.
      bind(COPY_SUBSTR);
      if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
        load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
        movb(Address(rsp, cnt2, scale2, stk_offset), result);
      } else if (ae == StrIntrinsicNode::UU) {
        load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
        movw(Address(rsp, cnt2, scale2, stk_offset), result);
      }
      decrement(cnt2);
      jccb(Assembler::notZero, COPY_SUBSTR);

      pop(cnt2);
      movptr(str2, rsp); // New substring address
    } // non constant

    bind(CHECK_STR);
    cmpl(cnt1, stride);
    jccb(Assembler::aboveEqual, BIG_STRINGS);

    // Check cross page boundary.
    movl(result, str1); // We need only low 32 bits
    andl(result, ((int)os::vm_page_size()-1));
    cmpl(result, ((int)os::vm_page_size()-16));
    jccb(Assembler::belowEqual, BIG_STRINGS);

    subptr(rsp, 16);
    int stk_offset = -(1<<scale1);
    if (int_cnt2 < 0) { // not constant
      // The substring count pushed above sits between rsp and the buffer.
      push(cnt2);
      stk_offset += wordSize;
    }
    movl(cnt2, cnt1);

    // Copy the string onto the stack, back to front.
    bind(COPY_STR);
    if (ae == StrIntrinsicNode::LL) {
      load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
      movb(Address(rsp, cnt2, scale1, stk_offset), result);
    } else {
      load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
      movw(Address(rsp, cnt2, scale1, stk_offset), result);
    }
    decrement(cnt2);
    jccb(Assembler::notZero, COPY_STR);

    if (int_cnt2 < 0) { // not constant
      pop(cnt2);
    }
    movptr(str1, rsp); // New string address

    bind(BIG_STRINGS);
    // Load substring.
    if (int_cnt2 < 0) { // -1
      if (ae == StrIntrinsicNode::UL) {
        pmovzxbw(vec, Address(str2, 0));
      } else {
        movdqu(vec, Address(str2, 0));
      }
      push(cnt2); // substr count
      push(str2); // substr addr
      push(str1); // string addr
    } else {
      // Small (< 8 chars) constant substrings are loaded already.
      movl(cnt2, int_cnt2);
    }
    push(tmp); // original SP

  } // Finished loading

  //========================================================
  // Start search
  //

  movptr(result, str1); // string addr

  if (int_cnt2 < 0) { // Only for non constant substring
    jmpb(SCAN_TO_SUBSTR);

    // SP saved at sp+0
    // String saved at sp+1*wordSize
    // Substr saved at sp+2*wordSize
    // Substr count saved at sp+3*wordSize

    // Reload substr for rescan, this code
    // is executed only for large substrings (> 8 chars)
    bind(RELOAD_SUBSTR);
    movptr(str2, Address(rsp, 2*wordSize));
    movl(cnt2, Address(rsp, 3*wordSize));
    if (ae == StrIntrinsicNode::UL) {
      pmovzxbw(vec, Address(str2, 0));
    } else {
      movdqu(vec, Address(str2, 0));
    }
    // We came here after the beginning of the substring was
    // matched but the rest of it was not so we need to search
    // again. Start from the next element after the previous match.
    subptr(str1, result); // Restore counter
    if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
      shrl(str1, 1);
    }
    addl(cnt1, str1);
    decrementl(cnt1); // Shift to next element
    cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND); // Left less then substring

    addptr(result, (1<<scale1));
  } // non constant

  // Scan string for start of substr in 16-byte vectors
  bind(SCAN_TO_SUBSTR);
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
  pcmpestri(vec, Address(result, 0), mode);
  jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
  subl(cnt1, stride);
  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
  cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less then substring
  addptr(result, 16);

  bind(ADJUST_STR);
  cmpl(cnt1, stride); // Do not read beyond string
  jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
  // Back-up string to avoid reading beyond string.
  lea(result, Address(result, cnt1, scale1, -16));
  movl(cnt1, stride);
  jmpb(SCAN_TO_SUBSTR);

  // Found a potential substr
  bind(FOUND_CANDIDATE);
  // After pcmpestri tmp(rcx) contains matched element index

  // Make sure string is still long enough
  subl(cnt1, tmp);
  cmpl(cnt1, cnt2);
  jccb(Assembler::greaterEqual, FOUND_SUBSTR);
  // Left less then substring.

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmp(CLEANUP);

  bind(FOUND_SUBSTR);
  // Compute start addr of substr
  lea(result, Address(result, tmp, scale1));
  if (int_cnt2 > 0) { // Constant substring
    // Repeat search for small substring (< 8 chars)
    // from new point without reloading substring.
    // Have to check that we don't read beyond string.
    cmpl(tmp, stride-int_cnt2);
    jccb(Assembler::greater, ADJUST_STR);
    // Fall through if matched whole substring.
  } else { // non constant
    assert(int_cnt2 == -1, "should be != 0");

    addl(tmp, cnt2);
    // Found result if we matched whole substring.
    cmpl(tmp, stride);
    jcc(Assembler::lessEqual, RET_FOUND);

    // Repeat search for small substring (<= 8 chars)
    // from new point 'str1' without reloading substring.
    cmpl(cnt2, stride);
    // Have to check that we don't read beyond string.
    jccb(Assembler::lessEqual, ADJUST_STR);

    Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
    // Compare the rest of substring (> 8 chars).
    movptr(str1, result);

    cmpl(tmp, cnt2);
    // First 8 chars are already matched.
    jccb(Assembler::equal, CHECK_NEXT);

    bind(SCAN_SUBSTR);
    pcmpestri(vec, Address(str1, 0), mode);
    // Need to reload strings pointers if not matched whole vector
    jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0

    bind(CHECK_NEXT);
    subl(cnt2, stride);
    jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
    addptr(str1, 16);
    if (ae == StrIntrinsicNode::UL) {
      // Latin1 substring advances 8 bytes per 8-char stride.
      addptr(str2, 8);
    } else {
      addptr(str2, 16);
    }
    subl(cnt1, stride);
    cmpl(cnt2, stride); // Do not read beyond substring
    jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
    // Back-up strings to avoid reading beyond substring.

    if (ae == StrIntrinsicNode::UL) {
      lea(str2, Address(str2, cnt2, scale2, -8));
      lea(str1, Address(str1, cnt2, scale1, -16));
    } else {
      lea(str2, Address(str2, cnt2, scale2, -16));
      lea(str1, Address(str1, cnt2, scale1, -16));
    }
    subl(cnt1, cnt2);
    movl(cnt2, stride);
    addl(cnt1, stride);
    bind(CONT_SCAN_SUBSTR);
    if (ae == StrIntrinsicNode::UL) {
      pmovzxbw(vec, Address(str2, 0));
    } else {
      movdqu(vec, Address(str2, 0));
    }
    jmp(SCAN_SUBSTR);

    bind(RET_FOUND_LONG);
    // Restore the original string pointer saved on the stack above.
    movptr(str1, Address(rsp, wordSize));
  } // non constant

  bind(RET_FOUND);
  // Compute substr offset
  subptr(result, str1);
  if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
    shrl(result, 1); // index
  }
  bind(CLEANUP);
  pop(rsp); // restore SP

} // string_indexof
3296
// Find the first occurrence of the UTF-16 char 'ch' in the char array at
// 'str1' of length cnt1 (in chars). On exit 'result' holds the char index
// of the match or -1. Scans 16 chars at a time with AVX2, 8 chars at a
// time with SSE, and finishes the tail with a scalar loop. 'ch' and 'tmp'
// are clobbered.
void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
                                            XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");

  int stride = 8;

  Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
        SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
        RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
        FOUND_SEQ_CHAR, DONE_LABEL;

  movptr(result, str1);
  if (UseAVX >= 2) {
    cmpl(cnt1, stride);
    jcc(Assembler::less, SCAN_TO_CHAR);
    cmpl(cnt1, 2*stride);
    jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
    // Broadcast the target char to all 16 word lanes of vec1;
    // vec2 stays zero and is used by vptest as an all-match probe.
    movdl(vec1, ch);
    vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
    vpxor(vec2, vec2);
    movl(tmp, cnt1);
    andl(tmp, 0xFFFFFFF0); //vector count (in chars)
    andl(cnt1,0x0000000F); //tail count (in chars)

    bind(SCAN_TO_16_CHAR_LOOP);
    vmovdqu(vec3, Address(result, 0));
    vpcmpeqw(vec3, vec3, vec1, 1);
    vptest(vec2, vec3);
    jcc(Assembler::carryClear, FOUND_CHAR); // CF clear => some lane matched
    addptr(result, 32);
    subl(tmp, 2*stride);
    jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
    jmp(SCAN_TO_8_CHAR);
    bind(SCAN_TO_8_CHAR_INIT);
    // SSE-width broadcast of 'ch' for the 8-char loop.
    movdl(vec1, ch);
    pshuflw(vec1, vec1, 0x00);
    pshufd(vec1, vec1, 0);
    pxor(vec2, vec2);
  }
  bind(SCAN_TO_8_CHAR);
  cmpl(cnt1, stride);
  jcc(Assembler::less, SCAN_TO_CHAR);
  if (UseAVX < 2) {
    movdl(vec1, ch);
    pshuflw(vec1, vec1, 0x00);
    pshufd(vec1, vec1, 0);
    pxor(vec2, vec2);
  }
  movl(tmp, cnt1);
  andl(tmp, 0xFFFFFFF8); //vector count (in chars)
  andl(cnt1,0x00000007); //tail count (in chars)

  bind(SCAN_TO_8_CHAR_LOOP);
  movdqu(vec3, Address(result, 0));
  pcmpeqw(vec3, vec1);
  ptest(vec2, vec3);
  jcc(Assembler::carryClear, FOUND_CHAR);
  addptr(result, 16);
  subl(tmp, stride);
  jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
  bind(SCAN_TO_CHAR);
  testl(cnt1, cnt1);
  jcc(Assembler::zero, RET_NOT_FOUND);
  // Scalar tail: compare one char at a time.
  bind(SCAN_TO_CHAR_LOOP);
  load_unsigned_short(tmp, Address(result, 0));
  cmpl(ch, tmp);
  jccb(Assembler::equal, FOUND_SEQ_CHAR);
  addptr(result, 2);
  subl(cnt1, 1);
  jccb(Assembler::zero, RET_NOT_FOUND);
  jmp(SCAN_TO_CHAR_LOOP);

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmpb(DONE_LABEL);

  bind(FOUND_CHAR);
  // Extract the byte offset of the first matching lane from the
  // comparison mask.
  if (UseAVX >= 2) {
    vpmovmskb(tmp, vec3);
  } else {
    pmovmskb(tmp, vec3);
  }
  bsfl(ch, tmp);
  addptr(result, ch);

  bind(FOUND_SEQ_CHAR);
  subptr(result, str1);
  shrl(result, 1); // byte offset -> char index

  bind(DONE_LABEL);
} // string_indexof_char
3389
// Latin1 variant of string_indexof_char: find the first occurrence of the
// byte 'ch' in the byte array at 'str1' of length cnt1 (in bytes). On exit
// 'result' holds the byte index of the match or -1. Scans 32 bytes at a
// time with AVX2, 16 bytes at a time with SSE, then a scalar tail loop.
// 'ch' and 'tmp' are clobbered.
void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");

  int stride = 16;

  Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
        SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
        RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
        FOUND_SEQ_CHAR, DONE_LABEL;

  movptr(result, str1);
  if (UseAVX >= 2) {
    cmpl(cnt1, stride);
    jcc(Assembler::less, SCAN_TO_CHAR_INIT);
    cmpl(cnt1, stride*2);
    jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
    // Broadcast the target byte to all 32 byte lanes of vec1;
    // vec2 stays zero and is used by vptest as an all-match probe.
    movdl(vec1, ch);
    vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
    vpxor(vec2, vec2);
    movl(tmp, cnt1);
    andl(tmp, 0xFFFFFFE0); //vector count (in chars)
    andl(cnt1,0x0000001F); //tail count (in chars)

    bind(SCAN_TO_32_CHAR_LOOP);
    vmovdqu(vec3, Address(result, 0));
    vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
    vptest(vec2, vec3);
    jcc(Assembler::carryClear, FOUND_CHAR); // CF clear => some lane matched
    addptr(result, 32);
    subl(tmp, stride*2);
    jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
    jmp(SCAN_TO_16_CHAR);

    bind(SCAN_TO_16_CHAR_INIT);
    // SSE-width broadcast of 'ch' via pshufb against a zero mask.
    movdl(vec1, ch);
    pxor(vec2, vec2);
    pshufb(vec1, vec2);
  }

  bind(SCAN_TO_16_CHAR);
  cmpl(cnt1, stride);
  jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left
  if (UseAVX < 2) {
    movdl(vec1, ch);
    pxor(vec2, vec2);
    pshufb(vec1, vec2);
  }
  movl(tmp, cnt1);
  andl(tmp, 0xFFFFFFF0); //vector count (in bytes)
  andl(cnt1,0x0000000F); //tail count (in bytes)

  bind(SCAN_TO_16_CHAR_LOOP);
  movdqu(vec3, Address(result, 0));
  pcmpeqb(vec3, vec1);
  ptest(vec2, vec3);
  jcc(Assembler::carryClear, FOUND_CHAR);
  addptr(result, 16);
  subl(tmp, stride);
  jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items...

  bind(SCAN_TO_CHAR_INIT);
  testl(cnt1, cnt1);
  jcc(Assembler::zero, RET_NOT_FOUND);
  // Scalar tail: compare one byte at a time.
  bind(SCAN_TO_CHAR_LOOP);
  load_unsigned_byte(tmp, Address(result, 0));
  cmpl(ch, tmp);
  jccb(Assembler::equal, FOUND_SEQ_CHAR);
  addptr(result, 1);
  subl(cnt1, 1);
  jccb(Assembler::zero, RET_NOT_FOUND);
  jmp(SCAN_TO_CHAR_LOOP);

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmpb(DONE_LABEL);

  bind(FOUND_CHAR);
  // Extract the byte offset of the first matching lane from the
  // comparison mask.
  if (UseAVX >= 2) {
    vpmovmskb(tmp, vec3);
  } else {
    pmovmskb(tmp, vec3);
  }
  bsfl(ch, tmp);
  addptr(result, ch);

  bind(FOUND_SEQ_CHAR);
  subptr(result, str1); // byte offset == element index for latin1

  bind(DONE_LABEL);
} // stringL_indexof_char
3482
3483 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
3484 switch (eltype) {
3485 case T_BOOLEAN: return sizeof(jboolean);
3486 case T_BYTE: return sizeof(jbyte);
3487 case T_SHORT: return sizeof(jshort);
3488 case T_CHAR: return sizeof(jchar);
3489 case T_INT: return sizeof(jint);
3490 default:
3491 ShouldNotReachHere();
3492 return -1;
3493 }
3494 }
3495
3496 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
3497 switch (eltype) {
3498 // T_BOOLEAN used as surrogate for unsigned byte
3499 case T_BOOLEAN: movzbl(dst, src); break;
3500 case T_BYTE: movsbl(dst, src); break;
3501 case T_SHORT: movswl(dst, src); break;
3502 case T_CHAR: movzwl(dst, src); break;
3503 case T_INT: movl(dst, src); break;
3504 default:
3505 ShouldNotReachHere();
3506 }
3507 }
3508
3509 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) {
3510 load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
3511 }
3512
3513 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) {
3514 load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
3515 }
3516
3517 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) {
3518 const int vlen = Assembler::AVX_256bit;
3519 switch (eltype) {
3520 case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT); break;
3521 case T_BYTE: vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT); break;
3522 case T_SHORT: vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3523 case T_CHAR: vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break;
3524 case T_INT:
3525 // do nothing
3526 break;
3527 default:
3528 ShouldNotReachHere();
3529 }
3530 }
3531
// Intrinsic generator for the Arrays.hashCode family: accumulates the
// 31-based polynomial hash of ary1[0..cnt1) into 'result', i.e.
//   result = result*31^cnt1 + ary1[0]*31^(cnt1-1) + ... + ary1[cnt1-1]
// (the scalar loop below computes result = result*31*31 + a[i-1]*31 + a[i]
// per iteration; 961 == 31*31 and (x<<5)-x == x*31).
// Requires AVX2. 'eltype' selects element width and signedness
// (T_BOOLEAN is the surrogate for an unsigned byte).
// cnt1 >= 32 takes the vectorized path (32 elements per iteration in
// four 256-bit accumulators), then falls through to the scalar loop for
// the remainder. Clobbers cnt1, index, tmp2, tmp3, ary1 and all vector
// arguments.
void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result,
                                        Register index, Register tmp2, Register tmp3, XMMRegister vnext,
                                        XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3,
                                        XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3,
                                        XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3,
                                        BasicType eltype) {
  ShortBranchVerifier sbv(this);
  assert(UseAVX >= 2, "AVX2 intrinsics are required");
  assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3);
  assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3);

  Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN,
        SHORT_UNROLLED_LOOP_EXIT,
        UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME,
        UNROLLED_VECTOR_LOOP_BEGIN,
        END;
  switch (eltype) {
  case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
  case T_CHAR:    BLOCK_COMMENT("arrays_hashcode(char) {");          break;
  case T_BYTE:    BLOCK_COMMENT("arrays_hashcode(byte) {");          break;
  case T_SHORT:   BLOCK_COMMENT("arrays_hashcode(short) {");         break;
  case T_INT:     BLOCK_COMMENT("arrays_hashcode(int) {");           break;
  default:        BLOCK_COMMENT("arrays_hashcode {");                break;
  }

  // For "renaming" for readability of the code
  const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 },
                    vresult[] = { vresult0, vresult1, vresult2, vresult3 },
                    vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 };

  const int elsize = arrays_hashcode_elsize(eltype);

  /*
    if (cnt1 >= 2) {
      if (cnt1 >= 32) {
        UNROLLED VECTOR LOOP
      }
      UNROLLED SCALAR LOOP
    }
    SINGLE SCALAR
   */

  cmpl(cnt1, 32);
  jcc(Assembler::less, SHORT_UNROLLED_BEGIN);

  // cnt1 >= 32 && generate_vectorized_loop
  xorl(index, index);

  // vresult = IntVector.zero(I256);
  for (int idx = 0; idx < 4; idx++) {
    vpxor(vresult[idx], vresult[idx]);
  }
  // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]);
  // NOTE: tmp2 doubles as 'bound' and tmp3 as 'next' from here on.
  Register bound = tmp2;
  Register next = tmp3;
  lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint))));
  movl(next, Address(tmp2, 0));
  movdl(vnext, next);
  vpbroadcastd(vnext, vnext, Assembler::AVX_256bit);

  // index = 0;
  // bound = cnt1 & ~(32 - 1);
  movl(bound, cnt1);
  andl(bound, ~(32 - 1));
  // for (; index < bound; index += 32) {
  bind(UNROLLED_VECTOR_LOOP_BEGIN);
  // result *= next;
  imull(result, next);
  // loop fission to upfront the cost of fetching from memory, OOO execution
  // can then hopefully do a better job of prefetching
  for (int idx = 0; idx < 4; idx++) {
    arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype);
  }
  // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7];
  for (int idx = 0; idx < 4; idx++) {
    vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit);
    arrays_hashcode_elvcast(vtmp[idx], eltype);
    vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit);
  }
  // index += 32;
  addl(index, 32);
  // index < bound;
  cmpl(index, bound);
  jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN);
  // }

  // Advance the array pointer past the vector-processed prefix and keep
  // only the scalar remainder (< 32 elements) in cnt1.
  lea(ary1, Address(ary1, bound, Address::times(elsize)));
  subl(cnt1, bound);
  // release bound

  // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1);
  for (int idx = 0; idx < 4; idx++) {
    lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint))));
    arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT);
    vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit);
  }
  // result += vresult.reduceLanes(ADD);
  // 256/(sizeof(jint)*8) == 8 int lanes per 256-bit accumulator.
  for (int idx = 0; idx < 4; idx++) {
    reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]);
  }

  // } else if (cnt1 < 32) {

  bind(SHORT_UNROLLED_BEGIN);
  // int i = 1;
  movl(index, 1);
  cmpl(index, cnt1);
  jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT);

  // for (; i < cnt1 ; i += 2) {
  // Two elements per iteration: result = result*31*31 + a[i-1]*31 + a[i].
  bind(SHORT_UNROLLED_LOOP_BEGIN);
  movl(tmp3, 961);            // 961 == 31*31
  imull(result, tmp3);
  arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype);
  movl(tmp3, tmp2);
  shll(tmp3, 5);              // tmp3 = tmp2*32 ...
  subl(tmp3, tmp2);           // ... - tmp2 == tmp2*31
  addl(result, tmp3);
  arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype);
  addl(result, tmp3);
  addl(index, 2);
  cmpl(index, cnt1);
  jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN);

  // }
  // if (i >= cnt1) {
  bind(SHORT_UNROLLED_LOOP_EXIT);
  // The branch below consumes the flags of the last cmpl(index, cnt1)
  // (from either of the two paths reaching this label): 'greater' means
  // the loop consumed an even count and no element is left; 'equal'
  // means exactly one trailing element remains and is hashed here.
  jccb(Assembler::greater, END);
  movl(tmp2, result);
  shll(result, 5);            // result*31 via (result<<5) - result
  subl(result, tmp2);
  arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype);
  addl(result, tmp3);
  // }
  bind(END);

  BLOCK_COMMENT("} // arrays_hashcode");

} // arrays_hashcode
3671
3672 // helper function for string_compare
3673 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3674 Address::ScaleFactor scale, Address::ScaleFactor scale1,
3675 Address::ScaleFactor scale2, Register index, int ae) {
3676 if (ae == StrIntrinsicNode::LL) {
3677 load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3678 load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3679 } else if (ae == StrIntrinsicNode::UU) {
3680 load_unsigned_short(elem1, Address(str1, index, scale, 0));
3681 load_unsigned_short(elem2, Address(str2, index, scale, 0));
3682 } else {
3683 load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3684 load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3685 }
3686 }
3687
// Compare strings, used for char[] and byte[].
// Emits code that lexicographically compares str1[0..cnt1) with
// str2[0..cnt2) and leaves a signed comparison value in 'result'
// (negative, zero, or positive). 'ae' (StrIntrinsicNode::LL/UU/LU/UL)
// encodes the element width of each string. Register constraints:
// result == rax, cnt2 == rdx, cnt1 == rcx are required by pcmpestri
// (asserted below). Clobbers cnt1, cnt2, vec1 and, on the AVX512 path,
// 'mask'. The length difference is kept on the stack across the body
// and popped at LENGTH_DIFF_LABEL / POP_LABEL.
void C2_MacroAssembler::string_compare(Register str1, Register str2,
                                       Register cnt1, Register cnt2, Register result,
                                       XMMRegister vec1, int ae, KRegister mask) {
  ShortBranchVerifier sbv(this);
  Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
  Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only AVX3
  int stride, stride2, adr_stride, adr_stride1, adr_stride2;
  int stride2x2 = 0x40;
  Address::ScaleFactor scale = Address::no_scale;
  Address::ScaleFactor scale1 = Address::no_scale;
  Address::ScaleFactor scale2 = Address::no_scale;

  if (ae != StrIntrinsicNode::LL) {
    stride2x2 = 0x20;
  }

  if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
    shrl(cnt2, 1);
  }
  // Compute the minimum of the string lengths and the
  // difference of the string lengths (stack).
  // Do the conditional move stuff
  movl(result, cnt1);
  subl(cnt1, cnt2);
  push(cnt1);   // saved length difference; popped at LENGTH_DIFF/POP below
  cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)

  // Is the minimum length zero?
  testl(cnt2, cnt2);
  jcc(Assembler::zero, LENGTH_DIFF_LABEL);
  if (ae == StrIntrinsicNode::LL) {
    // Load first bytes
    load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
    load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
  } else if (ae == StrIntrinsicNode::UU) {
    // Load first characters
    load_unsigned_short(result, Address(str1, 0));
    load_unsigned_short(cnt1, Address(str2, 0));
  } else {
    load_unsigned_byte(result, Address(str1, 0));
    load_unsigned_short(cnt1, Address(str2, 0));
  }
  subl(result, cnt1);
  jcc(Assembler::notZero,  POP_LABEL);

  if (ae == StrIntrinsicNode::UU) {
    // Divide length by 2 to get number of chars
    shrl(cnt2, 1);
  }
  cmpl(cnt2, 1);
  jcc(Assembler::equal, LENGTH_DIFF_LABEL);

  // Check if the strings start at the same location and setup scale and stride
  if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
    cmpptr(str1, str2);
    jcc(Assembler::equal, LENGTH_DIFF_LABEL);
    if (ae == StrIntrinsicNode::LL) {
      scale = Address::times_1;
      stride = 16;
    } else {
      scale = Address::times_2;
      stride = 8;
    }
  } else {
    scale1 = Address::times_1;
    scale2 = Address::times_2;
    // scale not used
    stride = 8;
  }

  if (UseAVX >= 2 && UseSSE42Intrinsics) {
    Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
    Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
    Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
    Label COMPARE_TAIL_LONG;
    Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only AVX3

    // pcmpestri imm8 0x19: equal-each aggregation, negated result,
    // unsigned word elements; bit 0 is cleared for LL to select
    // unsigned byte elements (see the SSE4.2 path comment below).
    int pcmpmask = 0x19;
    if (ae == StrIntrinsicNode::LL) {
      pcmpmask &= ~0x01;
    }

    // Setup to compare 16-chars (32-bytes) vectors,
    // start from first character again because it has aligned address.
    if (ae == StrIntrinsicNode::LL) {
      stride2 = 32;
    } else {
      stride2 = 16;
    }
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      adr_stride = stride << scale;
    } else {
      adr_stride1 = 8;  //stride << scale1;
      adr_stride2 = 16; //stride << scale2;
    }

    assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
    // rax and rdx are used by pcmpestri as elements counters
    movl(result, cnt2);
    andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
    jcc(Assembler::zero, COMPARE_TAIL_LONG);

    // fast path : compare first 2 8-char vectors.
    bind(COMPARE_16_CHARS);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, 0));
    } else {
      pmovzxbw(vec1, Address(str1, 0));
    }
    pcmpestri(vec1, Address(str2, 0), pcmpmask);
    jccb(Assembler::below, COMPARE_INDEX_CHAR);

    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, adr_stride));
      pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
    } else {
      pmovzxbw(vec1, Address(str1, adr_stride1));
      pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
    }
    jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
    addl(cnt1, stride);   // mismatch index is relative to the second vector

    // Compare the characters at index in cnt1
    bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
    load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
    subl(result, cnt2);
    jmp(POP_LABEL);

    // Setup the registers to start vector comparison loop
    bind(COMPARE_WIDE_VECTORS);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    subl(result, stride2);
    subl(cnt2, stride2);
    jcc(Assembler::zero, COMPARE_WIDE_TAIL);
    negptr(result);   // result becomes a negative running index from the end

    //  In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
    bind(COMPARE_WIDE_VECTORS_LOOP);

    if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
      cmpl(cnt2, stride2x2);
      jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
      testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40

      bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
      if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
        evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
        evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
      } else {
        vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
        evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
      }
      kortestql(mask, mask);
      jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
      addptr(result, stride2x2);  // update since we already compared at this addr
      subl(cnt2, stride2x2);      // and sub the size too
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);

      vpxor(vec1, vec1);
      jmpb(COMPARE_WIDE_TAIL);
    }//if (VM_Version::supports_avx512vlbw())

    bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      vmovdqu(vec1, Address(str1, result, scale));
      vpxor(vec1, Address(str2, result, scale));
    } else {
      vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
      vpxor(vec1, Address(str2, result, scale2));
    }
    vptest(vec1, vec1);
    jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
    addptr(result, stride2);
    subl(cnt2, stride2);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);

    // compare wide vectors tail
    bind(COMPARE_WIDE_TAIL);
    testptr(result, result);
    jcc(Assembler::zero, LENGTH_DIFF_LABEL);

    // Re-run the AVX2 loop once over the last full-width chunk.
    movl(result, stride2);
    movl(cnt2, result);
    negptr(result);
    jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);

    // Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors.
    bind(VECTOR_NOT_EQUAL);
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    jmp(COMPARE_16_CHARS);

    // Compare tail chars, length between 1 to 15 chars
    bind(COMPARE_TAIL_LONG);
    movl(cnt2, result);
    cmpl(cnt2, stride);
    jcc(Assembler::less, COMPARE_SMALL_STR);

    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, 0));
    } else {
      pmovzxbw(vec1, Address(str1, 0));
    }
    pcmpestri(vec1, Address(str2, 0), pcmpmask);
    jcc(Assembler::below, COMPARE_INDEX_CHAR);
    subptr(cnt2, stride);
    jcc(Assembler::zero, LENGTH_DIFF_LABEL);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    negptr(cnt2);
    jmpb(WHILE_HEAD_LABEL);

    bind(COMPARE_SMALL_STR);
  } else if (UseSSE42Intrinsics) {
    Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
    int pcmpmask = 0x19;
    // Setup to compare 8-char (16-byte) vectors,
    // start from first character again because it has aligned address.
    movl(result, cnt2);
    andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
    if (ae == StrIntrinsicNode::LL) {
      pcmpmask &= ~0x01;
    }
    jcc(Assembler::zero, COMPARE_TAIL);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    negptr(result);

    // pcmpestri
    //   inputs:
    //     vec1- substring
    //     rax - negative string length (elements count)
    //     mem - scanned string
    //     rdx - string length (elements count)
    //     pcmpmask - cmp mode: 11000 (string compare with negated result)
    //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
    //   outputs:
    //     rcx - first mismatched element index
    assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");

    bind(COMPARE_WIDE_VECTORS);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, result, scale));
      pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
    } else {
      pmovzxbw(vec1, Address(str1, result, scale1));
      pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
    }
    // After pcmpestri cnt1(rcx) contains mismatched element index

    jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
    addptr(result, stride);
    subptr(cnt2, stride);
    jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);

    // compare wide vectors tail
    testptr(result, result);
    jcc(Assembler::zero, LENGTH_DIFF_LABEL);

    // Compare the last full-width vector once more to cover the tail.
    movl(cnt2, stride);
    movl(result, stride);
    negptr(result);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, result, scale));
      pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
    } else {
      pmovzxbw(vec1, Address(str1, result, scale1));
      pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
    }
    jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);

    // Mismatched characters in the vectors
    bind(VECTOR_NOT_EQUAL);
    addptr(cnt1, result);   // absolute index of the mismatching element
    load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
    subl(result, cnt2);
    jmpb(POP_LABEL);

    bind(COMPARE_TAIL); // limit is zero
    movl(cnt2, result);
    // Fallthru to tail compare
  }
  // Shift str2 and str1 to the end of the arrays, negate min
  if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
    lea(str1, Address(str1, cnt2, scale));
    lea(str2, Address(str2, cnt2, scale));
  } else {
    lea(str1, Address(str1, cnt2, scale1));
    lea(str2, Address(str2, cnt2, scale2));
  }
  decrementl(cnt2);  // first character was compared already
  negptr(cnt2);

  // Compare the rest of the elements
  bind(WHILE_HEAD_LABEL);
  load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
  subl(result, cnt1);
  jccb(Assembler::notZero, POP_LABEL);
  increment(cnt2);
  jccb(Assembler::notZero, WHILE_HEAD_LABEL);

  // Strings are equal up to min length.  Return the length difference.
  bind(LENGTH_DIFF_LABEL);
  pop(result);   // restore the saved (cnt1 - cnt2) length difference
  if (ae == StrIntrinsicNode::UU) {
    // Divide diff by 2 to get number of chars
    sarl(result, 1);
  }
  jmpb(DONE_LABEL);

  if (VM_Version::supports_avx512vlbw()) {

    // Only reached from the AVX3 64-byte loop above (additionally
    // guarded there by AVX3Threshold == 0); 'mask' holds the per-byte
    // equality mask of the miscomparing 64-byte chunk.
    bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);

    kmovql(cnt1, mask);
    notq(cnt1);           // invert: set bits now mark mismatching bytes
    bsfq(cnt2, cnt1);     // index of the first mismatching byte
    if (ae != StrIntrinsicNode::LL) {
      // Divide diff by 2 to get number of chars
      sarl(cnt2, 1);
    }
    addq(result, cnt2);
    if (ae == StrIntrinsicNode::LL) {
      load_unsigned_byte(cnt1, Address(str2, result));
      load_unsigned_byte(result, Address(str1, result));
    } else if (ae == StrIntrinsicNode::UU) {
      load_unsigned_short(cnt1, Address(str2, result, scale));
      load_unsigned_short(result, Address(str1, result, scale));
    } else {
      load_unsigned_short(cnt1, Address(str2, result, scale2));
      load_unsigned_byte(result, Address(str1, result, scale1));
    }
    subl(result, cnt1);
    jmpb(POP_LABEL);
  }//if (VM_Version::supports_avx512vlbw())

  // Discard the stored length difference
  bind(POP_LABEL);
  pop(cnt1);

  // That's it
  bind(DONE_LABEL);
  if(ae == StrIntrinsicNode::UL) {
    negl(result);   // UL swapped the operand roles; flip the sign back
  }

}
4062
4063 // Search for Non-ASCII character (Negative byte value) in a byte array,
4064 // return the index of the first such character, otherwise the length
4065 // of the array segment searched.
4066 // ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
4067 // @IntrinsicCandidate
4068 // public static int countPositives(byte[] ba, int off, int len) {
4069 // for (int i = off; i < off + len; i++) {
4070 // if (ba[i] < 0) {
4071 // return i - off;
4072 // }
4073 // }
4074 // return len;
4075 // }
// Emits code returning, in 'result', the index of the first negative
// byte in ary1[0..len), or len if all bytes are non-negative (see the
// countPositives header comment above). Clobbers len, tmp1, vec1, vec2
// and, on the AVX512 path, mask1/mask2; advances ary1.
void C2_MacroAssembler::count_positives(Register ary1, Register len,
                                        Register result, Register tmp1,
                                        XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
  // rsi: byte array
  // rcx: len
  // rax: result
  ShortBranchVerifier sbv(this);
  assert_different_registers(ary1, len, result, tmp1);
  assert_different_registers(vec1, vec2);
  Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;

  movl(result, len); // copy
  // len == 0
  testl(len, len);
  jcc(Assembler::zero, DONE);

  if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
    VM_Version::supports_avx512vlbw() &&
    VM_Version::supports_bmi2()) {

    Label test_64_loop, test_tail, BREAK_LOOP;
    movl(tmp1, len);
    vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);   // vec2 = all zeroes

    andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F
    andl(len,  0xffffffc0); // vector count (in chars)
    jccb(Assembler::zero, test_tail);

    lea(ary1, Address(ary1, len, Address::times_1));
    negptr(len);

    bind(test_64_loop);
    // Check whether our 64 elements of size byte contain negatives
    // (a byte is negative iff 0 > byte, signed compare against vec2 == 0)
    evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
    kortestql(mask1, mask1);
    jcc(Assembler::notZero, BREAK_LOOP);

    addptr(len, 64);
    jccb(Assembler::notZero, test_64_loop);

    bind(test_tail);
    // bail out when there is nothing to be done
    // (testl with -1 just sets ZF from tmp1 itself)
    testl(tmp1, -1);
    jcc(Assembler::zero, DONE);


    // check the tail for absence of negatives
    // ~(~0 << len) applied up to two times (for 32-bit scenario)
    {
      Register tmp3_aliased = len;
      mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
      shlxq(tmp3_aliased, tmp3_aliased, tmp1);   // shlxq requires BMI2 (guarded above)
      notq(tmp3_aliased);
      kmovql(mask2, tmp3_aliased);   // mask2 selects only the tail bytes
    }

    // Masked compare: only the tail bytes selected by mask2 participate.
    evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
    ktestq(mask1, mask2);
    jcc(Assembler::zero, DONE);

    // do a full check for negative registers in the tail
    movl(len, tmp1); // tmp1 holds low 6-bit from original len;
                     // ary1 already pointing to the right place
    jmpb(TAIL_START);

    bind(BREAK_LOOP);
    // At least one byte in the last 64 byte block was negative.
    // Set up to look at the last 64 bytes as if they were a tail
    lea(ary1, Address(ary1, len, Address::times_1));
    addptr(result, len);
    // Ignore the very last byte: if all others are positive,
    // it must be negative, so we can skip right to the 2+1 byte
    // end comparison at this point
    orl(result, 63);
    movl(len, 63);
    // Fallthru to tail compare
  } else {

    if (UseAVX >= 2) {
      // With AVX2, use 32-byte vector compare
      Label COMPARE_WIDE_VECTORS, BREAK_LOOP;

      // Compare 32-byte vectors
      testl(len, 0xffffffe0);   // vector count (in bytes)
      jccb(Assembler::zero, TAIL_START);

      andl(len, 0xffffffe0);
      lea(ary1, Address(ary1, len, Address::times_1));
      negptr(len);

      movl(tmp1, 0x80808080);   // create mask to test for Unicode chars in vector
      movdl(vec2, tmp1);
      vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);

      bind(COMPARE_WIDE_VECTORS);
      vmovdqu(vec1, Address(ary1, len, Address::times_1));
      vptest(vec1, vec2);   // any sign bit (0x80) set in the 32 bytes?
      jccb(Assembler::notZero, BREAK_LOOP);
      addptr(len, 32);
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);

      testl(result, 0x0000001f);   // any bytes remaining?
      jcc(Assembler::zero, DONE);

      // Quick test using the already prepared vector mask
      movl(len, result);
      andl(len, 0x0000001f);
      vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
      vptest(vec1, vec2);
      jcc(Assembler::zero, DONE);
      // There are zeros, jump to the tail to determine exactly where
      jmpb(TAIL_START);

      bind(BREAK_LOOP);
      // At least one byte in the last 32-byte vector is negative.
      // Set up to look at the last 32 bytes as if they were a tail
      lea(ary1, Address(ary1, len, Address::times_1));
      addptr(result, len);
      // Ignore the very last byte: if all others are positive,
      // it must be negative, so we can skip right to the 2+1 byte
      // end comparison at this point
      orl(result, 31);
      movl(len, 31);
      // Fallthru to tail compare
    } else if (UseSSE42Intrinsics) {
      // With SSE4.2, use double quad vector compare
      Label COMPARE_WIDE_VECTORS, BREAK_LOOP;

      // Compare 16-byte vectors
      testl(len, 0xfffffff0);   // vector count (in bytes)
      jcc(Assembler::zero, TAIL_START);

      andl(len, 0xfffffff0);
      lea(ary1, Address(ary1, len, Address::times_1));
      negptr(len);

      movl(tmp1, 0x80808080);   // sign-bit mask, one per byte lane
      movdl(vec2, tmp1);
      pshufd(vec2, vec2, 0);

      bind(COMPARE_WIDE_VECTORS);
      movdqu(vec1, Address(ary1, len, Address::times_1));
      ptest(vec1, vec2);
      jccb(Assembler::notZero, BREAK_LOOP);
      addptr(len, 16);
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);

      testl(result, 0x0000000f); // len is zero, any bytes remaining?
      jcc(Assembler::zero, DONE);

      // Quick test using the already prepared vector mask
      movl(len, result);
      andl(len, 0x0000000f);   // tail count (in bytes)
      movdqu(vec1, Address(ary1, len, Address::times_1, -16));
      ptest(vec1, vec2);
      jcc(Assembler::zero, DONE);
      jmpb(TAIL_START);

      bind(BREAK_LOOP);
      // At least one byte in the last 16-byte vector is negative.
      // Set up and look at the last 16 bytes as if they were a tail
      lea(ary1, Address(ary1, len, Address::times_1));
      addptr(result, len);
      // Ignore the very last byte: if all others are positive,
      // it must be negative, so we can skip right to the 2+1 byte
      // end comparison at this point
      orl(result, 15);
      movl(len, 15);
      // Fallthru to tail compare
    }
  }

  bind(TAIL_START);
  // Compare 4-byte vectors
  andl(len, 0xfffffffc); // vector count (in bytes)
  jccb(Assembler::zero, COMPARE_CHAR);

  lea(ary1, Address(ary1, len, Address::times_1));
  negptr(len);

  bind(COMPARE_VECTORS);
  movl(tmp1, Address(ary1, len, Address::times_1));
  andl(tmp1, 0x80808080);   // any sign bit set in these 4 bytes?
  jccb(Assembler::notZero, TAIL_ADJUST);
  addptr(len, 4);
  jccb(Assembler::notZero, COMPARE_VECTORS);

  // Compare trailing char (final 2-3 bytes), if any
  bind(COMPARE_CHAR);

  testl(result, 0x2);   // tail char
  jccb(Assembler::zero, COMPARE_BYTE);
  load_unsigned_short(tmp1, Address(ary1, 0));
  andl(tmp1, 0x00008080);   // sign bits of the two tail bytes
  jccb(Assembler::notZero, CHAR_ADJUST);
  lea(ary1, Address(ary1, 2));

  bind(COMPARE_BYTE);
  testl(result, 0x1);   // tail byte
  jccb(Assembler::zero, DONE);
  load_unsigned_byte(tmp1, Address(ary1, 0));
  testl(tmp1, 0x00000080);
  jccb(Assembler::zero, DONE);
  subptr(result, 1);   // the very last byte is negative: exclude it
  jmpb(DONE);

  bind(TAIL_ADJUST);
  // there are negative bits in the last 4 byte block.
  // Adjust result and check the next three bytes
  addptr(result, len);
  orl(result, 3);
  lea(ary1, Address(ary1, len, Address::times_1));
  jmpb(COMPARE_CHAR);

  bind(CHAR_ADJUST);
  // We are looking at a char + optional byte tail, and found that one
  // of the bytes in the char is negative. Adjust the result, check the
  // first byte and readjust if needed.
  andl(result, 0xfffffffc);
  testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
  jccb(Assembler::notZero, DONE);
  addptr(result, 1);

  // That's it
  bind(DONE);
  if (UseAVX >= 2) {
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    vpxor(vec2, vec2);
  }
}
4307
4308 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
4309 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
4310 Register limit, Register result, Register chr,
4311 XMMRegister vec1, XMMRegister vec2, bool is_char,
4312 KRegister mask, bool expand_ary2) {
4313 // for expand_ary2, limit is the (smaller) size of the second array.
4314 ShortBranchVerifier sbv(this);
4315 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
4316
4317 assert((!expand_ary2) || ((expand_ary2) && (UseAVX == 2)),
4318 "Expansion only implemented for AVX2");
4319
4320 int length_offset = arrayOopDesc::length_offset_in_bytes();
4321 int base_offset = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
4322
4323 Address::ScaleFactor scaleFactor = expand_ary2 ? Address::times_2 : Address::times_1;
4324 int scaleIncr = expand_ary2 ? 8 : 16;
4325
4326 if (is_array_equ) {
4327 // Check the input args
4328 cmpoop(ary1, ary2);
4329 jcc(Assembler::equal, TRUE_LABEL);
4330
4331 // Need additional checks for arrays_equals.
4332 testptr(ary1, ary1);
4333 jcc(Assembler::zero, FALSE_LABEL);
4334 testptr(ary2, ary2);
4335 jcc(Assembler::zero, FALSE_LABEL);
4336
4337 // Check the lengths
4338 movl(limit, Address(ary1, length_offset));
4339 cmpl(limit, Address(ary2, length_offset));
4340 jcc(Assembler::notEqual, FALSE_LABEL);
4341 }
4342
4343 // count == 0
4344 testl(limit, limit);
4345 jcc(Assembler::zero, TRUE_LABEL);
4346
4347 if (is_array_equ) {
4348 // Load array address
4349 lea(ary1, Address(ary1, base_offset));
4350 lea(ary2, Address(ary2, base_offset));
4351 }
4352
4353 if (is_array_equ && is_char) {
4354 // arrays_equals when used for char[].
4355 shll(limit, 1); // byte count != 0
4356 }
4357 movl(result, limit); // copy
4358
4359 if (UseAVX >= 2) {
4360 // With AVX2, use 32-byte vector compare
4361 Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16;
4362
4363 // Compare 32-byte vectors
4364 if (expand_ary2) {
4365 andl(result, 0x0000000f); // tail count (in bytes)
4366 andl(limit, 0xfffffff0); // vector count (in bytes)
4367 jcc(Assembler::zero, COMPARE_TAIL);
4368 } else {
4369 andl(result, 0x0000001f); // tail count (in bytes)
4370 andl(limit, 0xffffffe0); // vector count (in bytes)
4371 jcc(Assembler::zero, COMPARE_TAIL_16);
4372 }
4373
4374 lea(ary1, Address(ary1, limit, scaleFactor));
4375 lea(ary2, Address(ary2, limit, Address::times_1));
4376 negptr(limit);
4377
4378 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
4379 Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
4380
4381 cmpl(limit, -64);
4382 jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
4383
4384 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
4385
4386 evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
4387 evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
4388 kortestql(mask, mask);
4389 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare
4390 addptr(limit, 64); // update since we already compared at this addr
4391 cmpl(limit, -64);
4392 jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
4393
4394 // At this point we may still need to compare -limit+result bytes.
4395 // We could execute the next two instruction and just continue via non-wide path:
4396 // cmpl(limit, 0);
4397 // jcc(Assembler::equal, COMPARE_TAIL); // true
4398 // But since we stopped at the points ary{1,2}+limit which are
4399 // not farther than 64 bytes from the ends of arrays ary{1,2}+result
4400 // (|limit| <= 32 and result < 32),
4401 // we may just compare the last 64 bytes.
4402 //
4403 addptr(result, -64); // it is safe, bc we just came from this area
4404 evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
4405 evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
4406 kortestql(mask, mask);
4407 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare
4408
4409 jmp(TRUE_LABEL);
4410
4411 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
4412
4413 }//if (VM_Version::supports_avx512vlbw())
4414
4415 bind(COMPARE_WIDE_VECTORS);
4416 vmovdqu(vec1, Address(ary1, limit, scaleFactor));
4417 if (expand_ary2) {
4418 vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit);
4419 } else {
4420 vmovdqu(vec2, Address(ary2, limit, Address::times_1));
4421 }
4422 vpxor(vec1, vec2);
4423
4424 vptest(vec1, vec1);
4425 jcc(Assembler::notZero, FALSE_LABEL);
4426 addptr(limit, scaleIncr * 2);
4427 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4428
4429 testl(result, result);
4430 jcc(Assembler::zero, TRUE_LABEL);
4431
4432 vmovdqu(vec1, Address(ary1, result, scaleFactor, -32));
4433 if (expand_ary2) {
4434 vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit);
4435 } else {
4436 vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
4437 }
4438 vpxor(vec1, vec2);
4439
4440 vptest(vec1, vec1);
4441 jcc(Assembler::notZero, FALSE_LABEL);
4442 jmp(TRUE_LABEL);
4443
4444 bind(COMPARE_TAIL_16); // limit is zero
4445 movl(limit, result);
4446
4447 // Compare 16-byte chunks
4448 andl(result, 0x0000000f); // tail count (in bytes)
4449 andl(limit, 0xfffffff0); // vector count (in bytes)
4450 jcc(Assembler::zero, COMPARE_TAIL);
4451
4452 lea(ary1, Address(ary1, limit, scaleFactor));
4453 lea(ary2, Address(ary2, limit, Address::times_1));
4454 negptr(limit);
4455
4456 bind(COMPARE_WIDE_VECTORS_16);
4457 movdqu(vec1, Address(ary1, limit, scaleFactor));
4458 if (expand_ary2) {
4459 vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit);
4460 } else {
4461 movdqu(vec2, Address(ary2, limit, Address::times_1));
4462 }
4463 pxor(vec1, vec2);
4464
4465 ptest(vec1, vec1);
4466 jcc(Assembler::notZero, FALSE_LABEL);
4467 addptr(limit, scaleIncr);
4468 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16);
4469
4470 bind(COMPARE_TAIL); // limit is zero
4471 movl(limit, result);
4472 // Fallthru to tail compare
4473 } else if (UseSSE42Intrinsics) {
4474 // With SSE4.2, use double quad vector compare
4475 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4476
4477 // Compare 16-byte vectors
4478 andl(result, 0x0000000f); // tail count (in bytes)
4479 andl(limit, 0xfffffff0); // vector count (in bytes)
4480 jcc(Assembler::zero, COMPARE_TAIL);
4481
4482 lea(ary1, Address(ary1, limit, Address::times_1));
4483 lea(ary2, Address(ary2, limit, Address::times_1));
4484 negptr(limit);
4485
4486 bind(COMPARE_WIDE_VECTORS);
4487 movdqu(vec1, Address(ary1, limit, Address::times_1));
4488 movdqu(vec2, Address(ary2, limit, Address::times_1));
4489 pxor(vec1, vec2);
4490
4491 ptest(vec1, vec1);
4492 jcc(Assembler::notZero, FALSE_LABEL);
4493 addptr(limit, 16);
4494 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4495
4496 testl(result, result);
4497 jcc(Assembler::zero, TRUE_LABEL);
4498
4499 movdqu(vec1, Address(ary1, result, Address::times_1, -16));
4500 movdqu(vec2, Address(ary2, result, Address::times_1, -16));
4501 pxor(vec1, vec2);
4502
4503 ptest(vec1, vec1);
4504 jccb(Assembler::notZero, FALSE_LABEL);
4505 jmpb(TRUE_LABEL);
4506
4507 bind(COMPARE_TAIL); // limit is zero
4508 movl(limit, result);
4509 // Fallthru to tail compare
4510 }
4511
4512 // Compare 4-byte vectors
4513 if (expand_ary2) {
4514 testl(result, result);
4515 jccb(Assembler::zero, TRUE_LABEL);
4516 } else {
4517 andl(limit, 0xfffffffc); // vector count (in bytes)
4518 jccb(Assembler::zero, COMPARE_CHAR);
4519 }
4520
4521 lea(ary1, Address(ary1, limit, scaleFactor));
4522 lea(ary2, Address(ary2, limit, Address::times_1));
4523 negptr(limit);
4524
4525 bind(COMPARE_VECTORS);
4526 if (expand_ary2) {
4527 // There are no "vector" operations for bytes to shorts
4528 movzbl(chr, Address(ary2, limit, Address::times_1));
4529 cmpw(Address(ary1, limit, Address::times_2), chr);
4530 jccb(Assembler::notEqual, FALSE_LABEL);
4531 addptr(limit, 1);
4532 jcc(Assembler::notZero, COMPARE_VECTORS);
4533 jmp(TRUE_LABEL);
4534 } else {
4535 movl(chr, Address(ary1, limit, Address::times_1));
4536 cmpl(chr, Address(ary2, limit, Address::times_1));
4537 jccb(Assembler::notEqual, FALSE_LABEL);
4538 addptr(limit, 4);
4539 jcc(Assembler::notZero, COMPARE_VECTORS);
4540 }
4541
4542 // Compare trailing char (final 2 bytes), if any
4543 bind(COMPARE_CHAR);
4544 testl(result, 0x2); // tail char
4545 jccb(Assembler::zero, COMPARE_BYTE);
4546 load_unsigned_short(chr, Address(ary1, 0));
4547 load_unsigned_short(limit, Address(ary2, 0));
4548 cmpl(chr, limit);
4549 jccb(Assembler::notEqual, FALSE_LABEL);
4550
4551 if (is_array_equ && is_char) {
4552 bind(COMPARE_BYTE);
4553 } else {
4554 lea(ary1, Address(ary1, 2));
4555 lea(ary2, Address(ary2, 2));
4556
4557 bind(COMPARE_BYTE);
4558 testl(result, 0x1); // tail byte
4559 jccb(Assembler::zero, TRUE_LABEL);
4560 load_unsigned_byte(chr, Address(ary1, 0));
4561 load_unsigned_byte(limit, Address(ary2, 0));
4562 cmpl(chr, limit);
4563 jccb(Assembler::notEqual, FALSE_LABEL);
4564 }
4565 bind(TRUE_LABEL);
4566 movl(result, 1); // return true
4567 jmpb(DONE);
4568
4569 bind(FALSE_LABEL);
4570 xorl(result, result); // return false
4571
4572 // That's it
4573 bind(DONE);
4574 if (UseAVX >= 2) {
4575 // clean upper bits of YMM registers
4576 vpxor(vec1, vec1);
4577 vpxor(vec2, vec2);
4578 }
4579 }
4580
// Out-of-line slow path for convertF2I. Reached when the fast truncating
// conversion produced the sentinel value (possible NaN or out-of-range
// input). Spills the source XMM value to the stack, calls the matching
// fixup stub, and pops the corrected result into 'dst' (the stub is
// expected to leave its result in the spilled slot).
static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) {
#define __ masm.
  // Unpack the operands captured when the stub was created.
  Register dst = stub.data<0>();
  XMMRegister src = stub.data<1>();
  address target = stub.data<2>();
  __ bind(stub.entry());
  // Pass 'src' to the fixup routine in a fresh 8-byte stack slot.
  __ subptr(rsp, 8);
  __ movdbl(Address(rsp), src);
  __ call(RuntimeAddress(target));
  // APX REX2 encoding for pop(dst) increases the stub size by 1 byte.
  __ pop(dst);
  __ jmp(stub.continuation());
#undef __
}
4595
// Convert a float/double in 'src' to an int/long in 'dst' with Java
// semantics. The fast path is a single truncating conversion; on NaN or
// out-of-range input the hardware produces the "integer indefinite" value
// (0x80000000 / 0x8000000000000000), so when the result equals that
// sentinel we branch to an out-of-line stub that computes the
// Java-specified result.
void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) {
  assert(dst_bt == T_INT || dst_bt == T_LONG, "");
  assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, "");

  address slowpath_target;
  if (dst_bt == T_INT) {
    if (src_bt == T_FLOAT) {
      cvttss2sil(dst, src);
      cmpl(dst, 0x80000000);
      slowpath_target = StubRoutines::x86::f2i_fixup();
    } else {
      cvttsd2sil(dst, src);
      cmpl(dst, 0x80000000);
      slowpath_target = StubRoutines::x86::d2i_fixup();
    }
  } else {
    if (src_bt == T_FLOAT) {
      cvttss2siq(dst, src);
      // The 64-bit sentinel 0x8000000000000000 is compared via the
      // double_sign_flip stub constant, which holds that bit pattern.
      cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
      slowpath_target = StubRoutines::x86::f2l_fixup();
    } else {
      cvttsd2siq(dst, src);
      cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
      slowpath_target = StubRoutines::x86::d2l_fixup();
    }
  }

  // Using the APX extended general purpose registers increases the instruction encoding size by 1 byte.
  int max_size = 23 + (UseAPX ? 1 : 0);
  auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, max_size, convertF2I_slowpath);
  jcc(Assembler::equal, stub->entry());
  bind(stub->continuation());
}
4629
// Emit a masked (predicated) vector shift/rotate by an immediate count,
// dispatching on the ideal opcode. 'mask' selects the active lanes and
// 'merge' chooses merge- vs zero-masking for the inactive ones. 'eType' is
// consulted only by the rotate helpers, which derive the lane width from it.
void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
                                    XMMRegister src1, int imm8, bool merge, int vlen_enc) {
  switch(ideal_opc) {
    // Left shifts.
    case Op_LShiftVS:
      Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_LShiftVI:
      Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_LShiftVL:
      Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
    // Arithmetic (sign-propagating) right shifts.
    case Op_RShiftVS:
      Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_RShiftVI:
      Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_RShiftVL:
      Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
    // Logical (zero-filling) right shifts.
    case Op_URShiftVS:
      Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_URShiftVI:
      Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_URShiftVL:
      Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
    // Rotates: lane width is selected from eType inside the helper.
    case Op_RotateRightV:
      evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_RotateLeftV:
      evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
    default:
      fatal("Unsupported operation %s", NodeClassNames[ideal_opc]);
      break;
  }
}
4660
4661 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4662 XMMRegister src2, bool is_unsigned, bool merge, int vlen_enc) {
4663 if (is_unsigned) {
4664 evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4665 } else {
4666 evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4667 }
4668 }
4669
4670 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4671 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
4672 switch (elem_bt) {
4673 case T_BYTE:
4674 if (ideal_opc == Op_SaturatingAddV) {
4675 evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
4676 } else {
4677 assert(ideal_opc == Op_SaturatingSubV, "");
4678 evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
4679 }
4680 break;
4681 case T_SHORT:
4682 if (ideal_opc == Op_SaturatingAddV) {
4683 evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
4684 } else {
4685 assert(ideal_opc == Op_SaturatingSubV, "");
4686 evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
4687 }
4688 break;
4689 default:
4690 fatal("Unsupported type %s", type2name(elem_bt));
4691 break;
4692 }
4693 }
4694
4695 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4696 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
4697 switch (elem_bt) {
4698 case T_BYTE:
4699 if (ideal_opc == Op_SaturatingAddV) {
4700 evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
4701 } else {
4702 assert(ideal_opc == Op_SaturatingSubV, "");
4703 evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
4704 }
4705 break;
4706 case T_SHORT:
4707 if (ideal_opc == Op_SaturatingAddV) {
4708 evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
4709 } else {
4710 assert(ideal_opc == Op_SaturatingSubV, "");
4711 evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
4712 }
4713 break;
4714 default:
4715 fatal("Unsupported type %s", type2name(elem_bt));
4716 break;
4717 }
4718 }
4719
4720 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4721 Address src2, bool is_unsigned, bool merge, int vlen_enc) {
4722 if (is_unsigned) {
4723 evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4724 } else {
4725 evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4726 }
4727 }
4728
4729 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4730 XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4731 switch (elem_bt) {
4732 case T_BYTE:
4733 if (ideal_opc == Op_SaturatingAddV) {
4734 evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
4735 } else {
4736 assert(ideal_opc == Op_SaturatingSubV, "");
4737 evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
4738 }
4739 break;
4740 case T_SHORT:
4741 if (ideal_opc == Op_SaturatingAddV) {
4742 evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
4743 } else {
4744 assert(ideal_opc == Op_SaturatingSubV, "");
4745 evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
4746 }
4747 break;
4748 default:
4749 fatal("Unsupported type %s", type2name(elem_bt));
4750 break;
4751 }
4752 }
4753
4754 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4755 XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4756 switch (elem_bt) {
4757 case T_BYTE:
4758 if (ideal_opc == Op_SaturatingAddV) {
4759 evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
4760 } else {
4761 assert(ideal_opc == Op_SaturatingSubV, "");
4762 evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
4763 }
4764 break;
4765 case T_SHORT:
4766 if (ideal_opc == Op_SaturatingAddV) {
4767 evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
4768 } else {
4769 assert(ideal_opc == Op_SaturatingSubV, "");
4770 evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
4771 }
4772 break;
4773 default:
4774 fatal("Unsupported type %s", type2name(elem_bt));
4775 break;
4776 }
4777 }
4778
// Emit a masked (predicated) vector operation with register operands,
// dispatching on the ideal opcode. 'mask' selects the active lanes, 'merge'
// chooses merge- vs zero-masking for the inactive ones, and 'is_varshift'
// distinguishes per-lane variable shift counts from a single broadcast
// count. 'eType' is consulted only by the type-polymorphic helpers
// (rotates, min/max, logicals, rearrange).
void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
                                    XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc,
                                    bool is_varshift) {
  switch (ideal_opc) {
    case Op_AddVB:
      evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVS:
      evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVI:
      evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVL:
      evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVF:
      evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVD:
      evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVB:
      evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVS:
      evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVI:
      evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVL:
      evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVF:
      evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVD:
      evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVS:
      evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVI:
      evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVL:
      evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVF:
      evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVD:
      evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_DivVF:
      evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_DivVD:
      evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SqrtVF:
      evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SqrtVD:
      evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break;
    // Unary abs consumes only src2.
    case Op_AbsVB:
      evpabsb(dst, mask, src2, merge, vlen_enc); break;
    case Op_AbsVS:
      evpabsw(dst, mask, src2, merge, vlen_enc); break;
    case Op_AbsVI:
      evpabsd(dst, mask, src2, merge, vlen_enc); break;
    case Op_AbsVL:
      evpabsq(dst, mask, src2, merge, vlen_enc); break;
    case Op_FmaVF:
      evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_FmaVD:
      evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
    // Note: operand order (src2, src1) is intentionally swapped here
    // relative to the other two-input cases.
    case Op_VectorRearrange:
      evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break;
    case Op_LShiftVS:
      evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_LShiftVI:
      evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_LShiftVL:
      evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_RShiftVS:
      evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_RShiftVI:
      evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_RShiftVL:
      evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_URShiftVS:
      evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_URShiftVI:
      evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_URShiftVL:
      evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_RotateLeftV:
      evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_RotateRightV:
      evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MaxV:
      evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MinV:
      evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_UMinV:
      evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_UMaxV:
      evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_XorV:
      evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_OrV:
      evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AndV:
      evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    default:
      fatal("Unsupported operation %s", NodeClassNames[ideal_opc]);
      break;
  }
}
4880
// Memory-operand variant of the masked vector dispatch: src2 is an Address,
// so only opcodes that have a memory form here are handled (no shifts,
// rotates, abs or rearrange in this overload).
void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
                                    XMMRegister src1, Address src2, bool merge, int vlen_enc) {
  switch (ideal_opc) {
    case Op_AddVB:
      evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVS:
      evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVI:
      evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVL:
      evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVF:
      evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVD:
      evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVB:
      evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVS:
      evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVI:
      evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVL:
      evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVF:
      evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVD:
      evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVS:
      evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVI:
      evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVL:
      evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVF:
      evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVD:
      evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_DivVF:
      evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_DivVD:
      evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_FmaVF:
      evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_FmaVD:
      evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MaxV:
      evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MinV:
      evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_UMaxV:
      evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_UMinV:
      evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_XorV:
      evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_OrV:
      evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AndV:
      evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    default:
      fatal("Unsupported operation %s", NodeClassNames[ideal_opc]);
      break;
  }
}
4945
4946 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
4947 KRegister src1, KRegister src2) {
4948 BasicType etype = T_ILLEGAL;
4949 switch(mask_len) {
4950 case 2:
4951 case 4:
4952 case 8: etype = T_BYTE; break;
4953 case 16: etype = T_SHORT; break;
4954 case 32: etype = T_INT; break;
4955 case 64: etype = T_LONG; break;
4956 default: fatal("Unsupported type"); break;
4957 }
4958 assert(etype != T_ILLEGAL, "");
4959 switch(ideal_opc) {
4960 case Op_AndVMask:
4961 kand(etype, dst, src1, src2); break;
4962 case Op_OrVMask:
4963 kor(etype, dst, src1, src2); break;
4964 case Op_XorVMask:
4965 kxor(etype, dst, src1, src2); break;
4966 default:
4967 fatal("Unsupported masked operation"); break;
4968 }
4969 }
4970
4971 /*
4972 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation.
4973 * If src is NaN, the result is 0.
4974 * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE,
4975 * the result is equal to the value of Integer.MIN_VALUE.
4976 * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE,
4977 * the result is equal to the value of Integer.MAX_VALUE.
4978 */
// AVX (no opmask) implementation of the special-value fixup described in
// the comment above. 'dst' holds the raw truncating-conversion result and
// 'src' the original floats; lanes of 'dst' that equal the 0x80000000
// sentinel are patched to 0 (NaN), MIN_VALUE (negative specials, already
// the sentinel value) or MAX_VALUE (positive specials).
void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                   XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
                                                                   Register rscratch, AddressLiteral float_sign_flip,
                                                                   int vec_enc) {
  assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
  Label done;
  // Fast exit if no destination lane holds the sentinel.
  vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch);
  vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
  vptest(xtmp2, xtmp2, vec_enc);
  jccb(Assembler::equal, done);

  // Flip all bits of the sentinel: xtmp1 now holds 0x7fffffff (MAX_VALUE).
  vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc);
  vpxor(xtmp1, xtmp1, xtmp4, vec_enc);

  // Zero the destination lanes whose source is NaN (unordered self-compare).
  vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
  vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc);
  vblendvps(dst, dst, xtmp4, xtmp3, vec_enc);

  // Recompute the mask for remaining special value.
  vpxor(xtmp2, xtmp2, xtmp3, vec_enc);
  // Extract SRC values corresponding to TRUE mask lanes.
  vpand(xtmp4, xtmp2, src, vec_enc);
  // Flip mask bits so that MSB bit of MASK lanes corresponding to +ve special
  // values are set.
  vpxor(xtmp3, xtmp2, xtmp4, vec_enc);

  // Blend MAX_VALUE into the positive special lanes (vblendvps selects on
  // the mask's sign bit); negative specials keep the sentinel == MIN_VALUE.
  vblendvps(dst, dst, xtmp1, xtmp3, vec_enc);
  bind(done);
}
5008
// EVEX implementation of the float->int special-value fixup using opmask
// registers. ktmp1 marks lanes of 'dst' holding the 0x80000000 sentinel;
// NaN lanes are zeroed, then the remaining non-negative special lanes are
// overwritten with MAX_VALUE (bitwise NOT of the sentinel).
void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                    XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
                                                                    Register rscratch, AddressLiteral float_sign_flip,
                                                                    int vec_enc) {
  assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
  Label done;
  // Fast exit if no destination lane holds the sentinel.
  evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
  Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
  kortestwl(ktmp1, ktmp1);
  jccb(Assembler::equal, done);

  // ktmp2 = NaN lanes (unordered self-compare); merge zeros into them.
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);

  // Drop the NaN lanes from the special mask, then narrow it to lanes whose
  // source is not-less-than zero, i.e. the positive specials.
  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  // xtmp2 = NOT xtmp1 (imm 0x11 ignores the old xtmp2) = 0x7fffffff lanes.
  vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}
5030
// EVEX float->long special-value fixup: same scheme as the float->int
// version above but on quadword lanes, with the 0x8000000000000000 sentinel
// loaded from double_sign_flip. Source compares stay in single precision
// (evcmpps) because 'src' holds floats.
void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
                                                                     Register rscratch, AddressLiteral double_sign_flip,
                                                                     int vec_enc) {
  assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");

  Label done;
  // Fast exit if no destination lane holds the sentinel.
  evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
  Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc);
  kortestwl(ktmp1, ktmp1);
  jccb(Assembler::equal, done);

  // ktmp2 = NaN lanes (unordered self-compare); merge zeros into them.
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);

  // Drop the NaN lanes, keep only the positive specials, and merge in
  // Long.MAX_VALUE (NOT of the sentinel, built by vpternlogq).
  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}
5053
// EVEX double->int special-value fixup: 'dst' holds packed doubleword
// results of a double->int conversion while 'src' holds the original
// doubles, so source compares use double precision (evcmppd) and
// destination merges are doubleword-wide (evmovdqul).
void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
                                                                     Register rscratch, AddressLiteral float_sign_flip,
                                                                     int vec_enc) {
  assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
  Label done;
  // Fast exit if no destination lane holds the 0x80000000 sentinel.
  evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
  Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
  kortestwl(ktmp1, ktmp1);
  jccb(Assembler::equal, done);

  // ktmp2 = NaN lanes (unordered self-compare); merge zeros into them.
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);

  // Drop the NaN lanes, keep only the positive specials, and merge in
  // Integer.MAX_VALUE (NOT of the sentinel).
  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}
5075
5076 /*
5077 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation.
5078 * If src is NaN, the result is 0.
5079 * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE,
5080 * the result is equal to the value of Long.MIN_VALUE.
5081 * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
5082 * the result is equal to the value of Long.MAX_VALUE.
5083 */
// EVEX implementation of the double->long fixup described in the comment
// above: sentinel lanes of 'dst' become 0 (NaN), Long.MIN_VALUE (negative
// specials, already the sentinel) or Long.MAX_VALUE (positive specials).
void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
                                                                      Register rscratch, AddressLiteral double_sign_flip,
                                                                      int vec_enc) {
  assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");

  Label done;
  // Fast exit if no destination lane holds the 0x8000000000000000 sentinel.
  evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
  evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
  kortestwl(ktmp1, ktmp1);
  jccb(Assembler::equal, done);

  // ktmp2 = NaN lanes (unordered self-compare); merge zeros into them.
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);

  // Drop the NaN lanes, keep only the positive specials, and merge in
  // Long.MAX_VALUE (NOT of the sentinel, built by vpternlogq).
  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}
5106
5107 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
5108 XMMRegister xtmp, int index, int vec_enc) {
5109 assert(vec_enc < Assembler::AVX_512bit, "");
5110 if (vec_enc == Assembler::AVX_256bit) {
5111 vextractf128_high(xtmp, src);
5112 vshufps(dst, src, xtmp, index, vec_enc);
5113 } else {
5114 vshufps(dst, src, zero, index, vec_enc);
5115 }
5116 }
5117
// AVX (no opmask) double->int special-value fixup. The source vector is
// twice as wide as the int result, so every per-source-lane mask must be
// packed down to doubleword lanes before blending into 'dst'.
void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                                                    XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
                                                                    AddressLiteral float_sign_flip, int src_vec_enc) {
  assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");

  Label done;
  // Compare the destination lanes with float_sign_flip
  // value to get mask for all special values.
  movdqu(xtmp1, float_sign_flip, rscratch);
  vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
  ptest(xtmp2, xtmp2);
  jccb(Assembler::equal, done);

  // Flip float_sign_flip to get max integer value.
  vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
  pxor(xtmp1, xtmp4);

  // Set destination lanes corresponding to unordered source lanes as zero.
  vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
  vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);

  // Shuffle mask vector and pack lower doubleword from each quadword lane.
  vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
  vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);

  // Recompute the mask for remaining special value.
  pxor(xtmp2, xtmp3);
  // Extract mask corresponding to non-negative source lanes.
  vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);

  // Shuffle mask vector and pack lower doubleword from each quadword lane.
  vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
  pand(xtmp3, xtmp2);

  // Replace destination lanes holding special value(0x80000000) with max int
  // if corresponding source lane holds a +ve value.
  vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
  bind(done);
}
5157
5158
// Narrow packed ints in 'dst' to shorts or bytes. The vpand with the
// int_to_short/int_to_byte stub mask clears the upper bits of every lane so
// the unsigned-saturating packs (vpackusdw/vpackuswb) cannot clip them. On
// 256-bit vectors the packs operate within each 128-bit lane, so a
// cross-lane doubleword pack compacts the halves afterwards.
void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
                                                   XMMRegister xtmp, Register rscratch, int vec_enc) {
  switch(to_elem_bt) {
    case T_SHORT:
      assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
      vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
      vpackusdw(dst, dst, zero, vec_enc);
      if (vec_enc == Assembler::AVX_256bit) {
        vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
      }
      break;
    case T_BYTE:
      assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
      vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
      vpackusdw(dst, dst, zero, vec_enc);
      if (vec_enc == Assembler::AVX_256bit) {
        vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
      }
      // Second pack step: shorts down to bytes.
      vpackuswb(dst, dst, zero, vec_enc);
      break;
    default: assert(false, "Unexpected basic type for target of vector cast int to subword: %s", type2name(to_elem_bt));
  }
}
5182
5183 /*
5184 * Algorithm for vector D2L and F2I conversions (AVX 10.2 unsupported):-
5185 * a) Perform vector D2L/F2I cast.
5186 * b) Choose fast path if none of the result vector lane contains 0x80000000 value.
5187 * It signifies that source value could be any of the special floating point
5188 * values(NaN,-Inf,Inf,Max,-Min).
5189 * c) Set destination to zero if source is NaN value.
5190 * d) Replace 0x80000000 with MaxInt if source lane contains a +ve value.
5191 */
5192
// AVX vector float -> int/short/byte cast: truncating convert, patch the
// special-value lanes (see algorithm comment above), then narrow to the
// sub-word type if one was requested.
void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                           XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
                                           AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
  int to_elem_sz = type2aelembytes(to_elem_bt);
  assert(to_elem_sz <= 4, "");
  vcvttps2dq(dst, src, vec_enc);
  vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc);
  if (to_elem_sz < 4) {
    // Narrow the corrected ints down to short/byte lanes.
    vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
    vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc);
  }
}
5205
// Vector float -> int/short/byte cast for EVEX targets: truncating
// float->int conversion, special-case fixup (NaN -> 0, saturate to
// MaxInt/MinInt per the algorithm comment above), then an EVEX narrowing
// move for sub-int targets.
void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                            XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
                                            Register rscratch, int vec_enc) {
  int to_elem_sz = type2aelembytes(to_elem_bt);
  assert(to_elem_sz <= 4, "");
  vcvttps2dq(dst, src, vec_enc);
  vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc);
  switch(to_elem_bt) {
    case T_INT:
      break;
    case T_SHORT:
      // Truncate each int lane to 16 bits.
      evpmovdw(dst, dst, vec_enc);
      break;
    case T_BYTE:
      // Truncate each int lane to 8 bits.
      evpmovdb(dst, dst, vec_enc);
      break;
    default: assert(false, "Unexpected basic type for target of vector castF2X EVEX: %s", type2name(to_elem_bt));
  }
}
5225
// Vector float -> long cast using the EVEX direct truncating ps->qq
// conversion, followed by a fixup pass for special-case lanes
// (NaN / out-of-range inputs, identified via double_sign_flip).
void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                            KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
                                            Register rscratch, int vec_enc) {
  evcvttps2qq(dst, src, vec_enc);
  vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc);
}
5232
// Handling for downcasting from double to integer or sub-word types on AVX2.
// Converts double lanes to ints with truncation, patches special-case lanes,
// then (for short/byte targets) narrows the int result. The narrowing step
// runs at 128 bits because a double vector yields at most half as many int
// lanes as its own width.
void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                           XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5,
                                           AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
  int to_elem_sz = type2aelembytes(to_elem_bt);
  assert(to_elem_sz < 8, "");
  vcvttpd2dq(dst, src, vec_enc);
  vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch,
                                              float_sign_flip, vec_enc);
  if (to_elem_sz < 4) {
    // xtmp4 holds all zero lanes.
    vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit);
  }
}
5247
// Vector double -> long/int/short/byte cast for EVEX targets. With AVX512DQ
// the direct pd->qq truncating conversion is used and the long result is
// narrowed as needed; without DQ, the pd->dq path is used instead (so only
// int-or-smaller targets are possible). Both paths patch special-case lanes
// before narrowing.
void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src,
                                            XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1,
                                            KRegister ktmp2, AddressLiteral sign_flip,
                                            Register rscratch, int vec_enc) {
  if (VM_Version::supports_avx512dq()) {
    evcvttpd2qq(dst, src, vec_enc);
    vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
    switch(to_elem_bt) {
      case T_LONG:
        break;
      case T_INT:
        // Saturating long -> int narrowing.
        evpmovsqd(dst, dst, vec_enc);
        break;
      case T_SHORT:
        evpmovsqd(dst, dst, vec_enc);
        evpmovdw(dst, dst, vec_enc);
        break;
      case T_BYTE:
        evpmovsqd(dst, dst, vec_enc);
        evpmovdb(dst, dst, vec_enc);
        break;
      default: assert(false, "Unexpected basic type for target of vector castD2X AVX512DQ EVEX: %s", type2name(to_elem_bt));
    }
  } else {
    // No AVX512DQ: only int-or-narrower targets can be handled here.
    assert(type2aelembytes(to_elem_bt) <= 4, "");
    vcvttpd2dq(dst, src, vec_enc);
    vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
    switch(to_elem_bt) {
      case T_INT:
        break;
      case T_SHORT:
        evpmovdw(dst, dst, vec_enc);
        break;
      case T_BYTE:
        evpmovdb(dst, dst, vec_enc);
        break;
      default: assert(false, "Unexpected basic type for target of vector castD2X EVEX: %s", type2name(to_elem_bt));
    }
  }
}
5288
5289 void C2_MacroAssembler::vector_castF2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vec_enc) {
5290 switch(to_elem_bt) {
5291 case T_LONG:
5292 evcvttps2qqs(dst, src, vec_enc);
5293 break;
5294 case T_INT:
5295 evcvttps2dqs(dst, src, vec_enc);
5296 break;
5297 case T_SHORT:
5298 evcvttps2dqs(dst, src, vec_enc);
5299 evpmovdw(dst, dst, vec_enc);
5300 break;
5301 case T_BYTE:
5302 evcvttps2dqs(dst, src, vec_enc);
5303 evpmovdb(dst, dst, vec_enc);
5304 break;
5305 default: assert(false, "Unexpected basic type for target of vector castF2X AVX10 (reg src): %s", type2name(to_elem_bt));
5306 }
5307 }
5308
5309 void C2_MacroAssembler::vector_castF2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, Address src, int vec_enc) {
5310 switch(to_elem_bt) {
5311 case T_LONG:
5312 evcvttps2qqs(dst, src, vec_enc);
5313 break;
5314 case T_INT:
5315 evcvttps2dqs(dst, src, vec_enc);
5316 break;
5317 case T_SHORT:
5318 evcvttps2dqs(dst, src, vec_enc);
5319 evpmovdw(dst, dst, vec_enc);
5320 break;
5321 case T_BYTE:
5322 evcvttps2dqs(dst, src, vec_enc);
5323 evpmovdb(dst, dst, vec_enc);
5324 break;
5325 default: assert(false, "Unexpected basic type for target of vector castF2X AVX10 (mem src): %s", type2name(to_elem_bt));
5326 }
5327 }
5328
5329 void C2_MacroAssembler::vector_castD2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vec_enc) {
5330 switch(to_elem_bt) {
5331 case T_LONG:
5332 evcvttpd2qqs(dst, src, vec_enc);
5333 break;
5334 case T_INT:
5335 evcvttpd2dqs(dst, src, vec_enc);
5336 break;
5337 case T_SHORT:
5338 evcvttpd2dqs(dst, src, vec_enc);
5339 evpmovdw(dst, dst, vec_enc);
5340 break;
5341 case T_BYTE:
5342 evcvttpd2dqs(dst, src, vec_enc);
5343 evpmovdb(dst, dst, vec_enc);
5344 break;
5345 default: assert(false, "Unexpected basic type for target of vector castD2X AVX10 (reg src): %s", type2name(to_elem_bt));
5346 }
5347 }
5348
5349 void C2_MacroAssembler::vector_castD2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, Address src, int vec_enc) {
5350 switch(to_elem_bt) {
5351 case T_LONG:
5352 evcvttpd2qqs(dst, src, vec_enc);
5353 break;
5354 case T_INT:
5355 evcvttpd2dqs(dst, src, vec_enc);
5356 break;
5357 case T_SHORT:
5358 evcvttpd2dqs(dst, src, vec_enc);
5359 evpmovdw(dst, dst, vec_enc);
5360 break;
5361 case T_BYTE:
5362 evcvttpd2dqs(dst, src, vec_enc);
5363 evpmovdb(dst, dst, vec_enc);
5364 break;
5365 default: assert(false, "Unexpected basic type for target of vector castD2X AVX10 (mem src): %s", type2name(to_elem_bt));
5366 }
5367 }
5368
5369 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src,
5370 AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5371 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
5372 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf.
5373 // and re-instantiate original MXCSR.RC mode after that.
5374 ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5375
5376 mov64(tmp, julong_cast(0.5L));
5377 evpbroadcastq(xtmp1, tmp, vec_enc);
5378 vaddpd(xtmp1, src , xtmp1, vec_enc);
5379 evcvtpd2qq(dst, xtmp1, vec_enc);
5380 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5381 double_sign_flip, vec_enc);;
5382
5383 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5384 }
5385
// Rounds each float lane to an int by computing floor(val + 0.5) under a
// temporarily modified MXCSR rounding mode (round-towards -inf via new_mxcsr),
// then patches special-case lanes and restores the standard MXCSR state.
void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src,
                                                AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
                                                Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
  // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf.
  // and re-instantiate original MXCSR.RC mode after that.
  ldmxcsr(new_mxcsr, tmp /*rscratch*/);

  // Broadcast the 0.5f constant and add it to every source lane.
  movl(tmp, jint_cast(0.5));
  movq(xtmp1, tmp);
  vbroadcastss(xtmp1, xtmp1, vec_enc);
  vaddps(xtmp1, src , xtmp1, vec_enc);
  vcvtps2dq(dst, xtmp1, vec_enc);
  // Fix NaN / out-of-range lanes to match Java Math.round semantics.
  vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
                                              float_sign_flip, vec_enc);

  ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
}
5403
// AVX (non-EVEX) variant of vector float rounding: floor(val + 0.5) under a
// temporarily modified MXCSR rounding mode, followed by the AVX special-case
// fixup pass, then MXCSR restoration.
void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src,
                                               AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
                                               Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) {
  // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf.
  // and re-instantiate original MXCSR.RC mode after that.
  ldmxcsr(new_mxcsr, tmp /*rscratch*/);

  // Broadcast the 0.5f constant and add it to every source lane.
  movl(tmp, jint_cast(0.5));
  movq(xtmp1, tmp);
  vbroadcastss(xtmp1, xtmp1, vec_enc);
  vaddps(xtmp1, src , xtmp1, vec_enc);
  vcvtps2dq(dst, xtmp1, vec_enc);
  // Fix NaN / out-of-range lanes to match Java Math.round semantics.
  vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc);

  ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
}
5420
5421 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5422 BasicType from_elem_bt, BasicType to_elem_bt) {
5423 switch (from_elem_bt) {
5424 case T_BYTE:
5425 switch (to_elem_bt) {
5426 case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break;
5427 case T_INT: vpmovzxbd(dst, src, vlen_enc); break;
5428 case T_LONG: vpmovzxbq(dst, src, vlen_enc); break;
5429 default: ShouldNotReachHere();
5430 }
5431 break;
5432 case T_SHORT:
5433 switch (to_elem_bt) {
5434 case T_INT: vpmovzxwd(dst, src, vlen_enc); break;
5435 case T_LONG: vpmovzxwq(dst, src, vlen_enc); break;
5436 default: ShouldNotReachHere();
5437 }
5438 break;
5439 case T_INT:
5440 assert(to_elem_bt == T_LONG, "");
5441 vpmovzxdq(dst, src, vlen_enc);
5442 break;
5443 default:
5444 ShouldNotReachHere();
5445 }
5446 }
5447
5448 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5449 BasicType from_elem_bt, BasicType to_elem_bt) {
5450 switch (from_elem_bt) {
5451 case T_BYTE:
5452 switch (to_elem_bt) {
5453 case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break;
5454 case T_INT: vpmovsxbd(dst, src, vlen_enc); break;
5455 case T_LONG: vpmovsxbq(dst, src, vlen_enc); break;
5456 default: ShouldNotReachHere();
5457 }
5458 break;
5459 case T_SHORT:
5460 switch (to_elem_bt) {
5461 case T_INT: vpmovsxwd(dst, src, vlen_enc); break;
5462 case T_LONG: vpmovsxwq(dst, src, vlen_enc); break;
5463 default: ShouldNotReachHere();
5464 }
5465 break;
5466 case T_INT:
5467 assert(to_elem_bt == T_LONG, "");
5468 vpmovsxdq(dst, src, vlen_enc);
5469 break;
5470 default:
5471 ShouldNotReachHere();
5472 }
5473 }
5474
// Casts a boolean-vector mask between element sizes without AVX512 mask
// registers. Widening sign-extends (which preserves the 0 / -1 lane values
// used for masks, see the comment in vector_mask_operation); narrowing uses
// signed saturating packs, which also map 0 -> 0 and -1 -> -1. The pack
// instructions operate within 128-bit lanes, so on 256-bit vectors a vpermq
// compacts the valid halves before further narrowing.
void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src,
                                         BasicType dst_bt, BasicType src_bt, int vlen) {
  // Size the operation for the wider of the two mask layouts.
  int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen);
  assert(vlen_enc != AVX_512bit, "");

  int dst_bt_size = type2aelembytes(dst_bt);
  int src_bt_size = type2aelembytes(src_bt);
  if (dst_bt_size > src_bt_size) {
    // Widening: one sign-extending move covers any 2x/4x/8x expansion.
    switch (dst_bt_size / src_bt_size) {
      case 2: vpmovsxbw(dst, src, vlen_enc); break;
      case 4: vpmovsxbd(dst, src, vlen_enc); break;
      case 8: vpmovsxbq(dst, src, vlen_enc); break;
      default: ShouldNotReachHere();
    }
  } else {
    assert(dst_bt_size < src_bt_size, "");
    // Narrowing: repeated saturating packs, with cross-lane fixups for 256-bit.
    switch (src_bt_size / dst_bt_size) {
      case 2: {
        if (vlen_enc == AVX_128bit) {
          vpacksswb(dst, src, src, vlen_enc);
        } else {
          vpacksswb(dst, src, src, vlen_enc);
          // Gather the two valid 64-bit halves into the low 128 bits.
          vpermq(dst, dst, 0x08, vlen_enc);
        }
        break;
      }
      case 4: {
        if (vlen_enc == AVX_128bit) {
          vpackssdw(dst, src, src, vlen_enc);
          vpacksswb(dst, dst, dst, vlen_enc);
        } else {
          vpackssdw(dst, src, src, vlen_enc);
          vpermq(dst, dst, 0x08, vlen_enc);
          vpacksswb(dst, dst, dst, AVX_128bit);
        }
        break;
      }
      case 8: {
        if (vlen_enc == AVX_128bit) {
          // First compact the valid doublewords, then pack twice.
          vpshufd(dst, src, 0x08, vlen_enc);
          vpackssdw(dst, dst, dst, vlen_enc);
          vpacksswb(dst, dst, dst, vlen_enc);
        } else {
          vpshufd(dst, src, 0x08, vlen_enc);
          vpermq(dst, dst, 0x08, vlen_enc);
          vpackssdw(dst, dst, dst, AVX_128bit);
          vpacksswb(dst, dst, dst, AVX_128bit);
        }
        break;
      }
      default: ShouldNotReachHere();
    }
  }
}
5529
5530 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
5531 bool merge, BasicType bt, int vlen_enc) {
5532 if (bt == T_INT) {
5533 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5534 } else {
5535 assert(bt == T_LONG, "");
5536 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5537 }
5538 }
5539
5540 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
5541 bool merge, BasicType bt, int vlen_enc) {
5542 if (bt == T_INT) {
5543 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5544 } else {
5545 assert(bt == T_LONG, "");
5546 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5547 }
5548 }
5549
// Expands the low mask_len bits of scalar src into a byte-per-lane mask
// vector in dst (each byte is 0x00 or 0x01). Eight bits are expanded per
// iteration by PDEP-depositing them against the 0x0101... pattern; results
// are staged through xtmp in 64-bit halves and written into dst 128 bits at
// a time via vinsertf128.
void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
                                               Register rtmp2, XMMRegister xtmp, int mask_len,
                                               int vec_enc) {
  int index = 0;
  int vindex = 0;
  // Spread the lowest 8 mask bits, one bit per byte.
  mov64(rtmp1, 0x0101010101010101L);
  pdepq(rtmp1, src, rtmp1);
  if (mask_len > 8) {
    // Keep a shiftable copy of the mask for the remaining iterations.
    movq(rtmp2, src);
    vpxor(xtmp, xtmp, xtmp, vec_enc);
    movq(xtmp, rtmp1);
  }
  movq(dst, rtmp1);

  mask_len -= 8;
  while (mask_len > 0) {
    assert ((mask_len & 0x7) == 0, "mask must be multiple of 8");
    index++;
    if ((index % 2) == 0) {
      // Starting a new 128-bit staging chunk.
      pxor(xtmp, xtmp);
    }
    mov64(rtmp1, 0x0101010101010101L);
    shrq(rtmp2, 8);
    pdepq(rtmp1, rtmp2, rtmp1);
    pinsrq(xtmp, rtmp1, index % 2);
    vindex = index / 2;
    if (vindex) {
      // Write entire 16 byte vector when both 64 bit
      // lanes are update to save redundant instructions.
      if (index % 2) {
        vinsertf128(dst, dst, xtmp, vindex);
      }
    } else {
      vmovdqu(dst, xtmp);
    }
    mask_len -= 8;
  }
}
5588
// Computes the scalar result of a vector mask reduction (opc) from the mask
// bits already materialized in tmp; the answer is left in dst.
void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
  switch(opc) {
    case Op_VectorMaskTrueCount:
      // Number of set lanes is just the popcount of the bitmask.
      popcntq(dst, tmp);
      break;
    case Op_VectorMaskLastTrue:
      if (VM_Version::supports_lzcnt()) {
        // Index of highest set bit = 63 - leading zero count
        // (lzcnt yields 64 for an empty mask, giving -1 as required).
        lzcntq(tmp, tmp);
        movl(dst, 63);
        subl(dst, tmp);
      } else {
        // bsr leaves dst undefined for a zero input, so preload -1 and only
        // take the bsr result when the mask was non-empty.
        movl(dst, -1);
        bsrq(tmp, tmp);
        cmov32(Assembler::notZero, dst, tmp);
      }
      break;
    case Op_VectorMaskFirstTrue:
      if (VM_Version::supports_bmi1()) {
        if (masklen < 32) {
          // Plant a sentinel bit at position masklen so an empty mask
          // yields masklen instead of an undefined/32 result.
          orl(tmp, 1 << masklen);
          tzcntl(dst, tmp);
        } else if (masklen == 32) {
          // tzcnt returns 32 for a zero input, which is already masklen.
          tzcntl(dst, tmp);
        } else {
          assert(masklen == 64, "");
          tzcntq(dst, tmp);
        }
      } else {
        if (masklen < 32) {
          orl(tmp, 1 << masklen);
          bsfl(dst, tmp);
        } else {
          // bsf leaves dst undefined for a zero input; preload masklen and
          // conditionally overwrite when some bit was found.
          assert(masklen == 32 || masklen == 64, "");
          movl(dst, masklen);
          if (masklen == 32) {
            bsfl(tmp, tmp);
          } else {
            bsfq(tmp, tmp);
          }
          cmov32(Assembler::notZero, dst, tmp);
        }
      }
      break;
    case Op_VectorMaskToLong:
      // The bitmask itself is the result; caller must have aliased registers.
      assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
      break;
    default: assert(false, "Unhandled mask operation");
  }
}
5638
// Mask reduction for an opmask (KRegister) source: moves the mask bits into
// a GPR, clips stray high bits when the logical mask is shorter than the
// physical one, and delegates to vector_mask_operation_helper.
void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
                                              int masklen, int masksize, int vec_enc) {
  assert(VM_Version::supports_popcnt(), "");

  if(VM_Version::supports_avx512bw()) {
    kmovql(tmp, mask);
  } else {
    // Without AVX512BW only 16-bit opmask moves are available.
    assert(masklen <= 16, "");
    kmovwl(tmp, mask);
  }

  // Mask generated out of partial vector comparisons/replicate/mask manipulation
  // operations needs to be clipped.
  // (FirstTrue is exempt: its helper plants a sentinel bit at masklen itself.)
  if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
    andq(tmp, (1 << masklen) - 1);
  }

  vector_mask_operation_helper(opc, dst, tmp, masklen);
}
5658
// Mask reduction for an XMM-vector mask source (AVX/AVX2 targets without
// opmask registers): extracts one bit per lane into a GPR via the movmsk
// family, clips stray bits for partial masks, and delegates to
// vector_mask_operation_helper.
void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
                                              Register tmp, int masklen, BasicType bt, int vec_enc) {
  assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) ||
         (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), "");
  assert(VM_Version::supports_popcnt(), "");

  bool need_clip = false;
  switch(bt) {
    case T_BOOLEAN:
      // While masks of other types contain 0, -1; boolean masks contain lane values of 0, 1
      // Negate (0 - lane) to turn 1 into -1 so the sign bit is set for vpmovmskb.
      vpxor(xtmp, xtmp, xtmp, vec_enc);
      vpsubb(xtmp, xtmp, mask, vec_enc);
      vpmovmskb(tmp, xtmp, vec_enc);
      need_clip = masklen < 16;
      break;
    case T_BYTE:
      vpmovmskb(tmp, mask, vec_enc);
      need_clip = masklen < 16;
      break;
    case T_SHORT:
      // Pack word lanes to bytes first; no byte-granular movmsk for words.
      vpacksswb(xtmp, mask, mask, vec_enc);
      if (masklen >= 16) {
        // Compact the per-128-bit-lane pack results into the low half.
        vpermpd(xtmp, xtmp, 8, vec_enc);
      }
      vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
      need_clip = masklen < 16;
      break;
    case T_INT:
    case T_FLOAT:
      vmovmskps(tmp, mask, vec_enc);
      need_clip = masklen < 4;
      break;
    case T_LONG:
    case T_DOUBLE:
      vmovmskpd(tmp, mask, vec_enc);
      need_clip = masklen < 2;
      break;
    default: assert(false, "Unhandled type, %s", type2name(bt));
  }

  // Mask generated out of partial vector comparisons/replicate/mask manipulation
  // operations needs to be clipped.
  if (need_clip && opc != Op_VectorMaskFirstTrue) {
    // need_clip implies masklen < 32
    andq(tmp, (1 << masklen) - 1);
  }

  vector_mask_operation_helper(opc, dst, tmp, masklen);
}
5708
// Compresses the low mask_len bits of opmask src into a contiguous run of
// low-order set bits in dst: the clipped mask selects which bits of an
// all-ones pattern survive the BMI2 PEXT, so dst ends up with
// popcount(src & ((1 << mask_len) - 1)) low bits set.
void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
                                             Register rtmp2, int mask_len) {
  kmov(rtmp1, src);
  // Clip to mask_len bits (shift of an all-ones 64-bit pattern).
  andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));
  mov64(rtmp2, -1L);
  pextq(rtmp2, rtmp2, rtmp1);
  kmov(dst, rtmp2);
}
5717
// AVX2 fallback for vector compress/expand (no native VCOMPRESS/VEXPAND):
// the lane mask is turned into a movmsk bitmask which indexes a 32-byte row
// of a precomputed permutation table; a cross-lane permute then moves the
// selected lanes, and lanes flagged -1 in the permute row are blended to
// zero. Only 4- and 8-byte element types are supported.
void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src,
                                                    XMMRegister mask, Register rtmp, Register rscratch,
                                                    XMMRegister permv, XMMRegister xtmp, BasicType bt,
                                                    int vec_enc) {
  assert(type2aelembytes(bt) >= 4, "");
  assert(opcode == Op_CompressV || opcode == Op_ExpandV, "");
  address compress_perm_table = nullptr;
  address expand_perm_table = nullptr;
  if (type2aelembytes(bt) == 8) {
    compress_perm_table = StubRoutines::x86::compress_perm_table64();
    expand_perm_table = StubRoutines::x86::expand_perm_table64();
    vmovmskpd(rtmp, mask, vec_enc);
  } else {
    compress_perm_table = StubRoutines::x86::compress_perm_table32();
    expand_perm_table = StubRoutines::x86::expand_perm_table32();
    vmovmskps(rtmp, mask, vec_enc);
  }
  shlq(rtmp, 5); // for 32 byte permute row.
  if (opcode == Op_CompressV) {
    lea(rscratch, ExternalAddress(compress_perm_table));
  } else {
    lea(rscratch, ExternalAddress(expand_perm_table));
  }
  addptr(rtmp, rscratch);
  // Load the permute row selected by the mask bit pattern.
  vmovdqu(permv, Address(rtmp));
  vpermps(dst, permv, src, Assembler::AVX_256bit);
  vpxor(xtmp, xtmp, xtmp, vec_enc);
  // Blend the result with zero vector using permute mask, each column entry
  // in a permute table row contains either a valid permute index or a -1 (default)
  // value, this can potentially be used as a blending mask after
  // compressing/expanding the source vector lanes.
  vblendvps(dst, dst, xtmp, permv, vec_enc, true, permv);
}
5751
5752 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
5753 bool merge, BasicType bt, int vec_enc) {
5754 if (opcode == Op_CompressV) {
5755 switch(bt) {
5756 case T_BYTE:
5757 evpcompressb(dst, mask, src, merge, vec_enc);
5758 break;
5759 case T_CHAR:
5760 case T_SHORT:
5761 evpcompressw(dst, mask, src, merge, vec_enc);
5762 break;
5763 case T_INT:
5764 evpcompressd(dst, mask, src, merge, vec_enc);
5765 break;
5766 case T_FLOAT:
5767 evcompressps(dst, mask, src, merge, vec_enc);
5768 break;
5769 case T_LONG:
5770 evpcompressq(dst, mask, src, merge, vec_enc);
5771 break;
5772 case T_DOUBLE:
5773 evcompresspd(dst, mask, src, merge, vec_enc);
5774 break;
5775 default:
5776 fatal("Unsupported type %s", type2name(bt));
5777 break;
5778 }
5779 } else {
5780 assert(opcode == Op_ExpandV, "");
5781 switch(bt) {
5782 case T_BYTE:
5783 evpexpandb(dst, mask, src, merge, vec_enc);
5784 break;
5785 case T_CHAR:
5786 case T_SHORT:
5787 evpexpandw(dst, mask, src, merge, vec_enc);
5788 break;
5789 case T_INT:
5790 evpexpandd(dst, mask, src, merge, vec_enc);
5791 break;
5792 case T_FLOAT:
5793 evexpandps(dst, mask, src, merge, vec_enc);
5794 break;
5795 case T_LONG:
5796 evpexpandq(dst, mask, src, merge, vec_enc);
5797 break;
5798 case T_DOUBLE:
5799 evexpandpd(dst, mask, src, merge, vec_enc);
5800 break;
5801 default:
5802 fatal("Unsupported type %s", type2name(bt));
5803 break;
5804 }
5805 }
5806 }
5807
// Vector Math.signum for double/float lanes on EVEX targets: produces -1.0
// for negative inputs, +1.0 for positives, and passes through NaN, -0.0 and
// 0.0 unchanged. zero and one hold broadcast constants supplied by the
// caller.
void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
                                           KRegister ktmp1, int vec_enc) {
  if (opcode == Op_SignumVD) {
    // dst = 0.0 - 1.0 == -1.0 in every lane.
    vsubpd(dst, zero, one, vec_enc);
    // if src < 0 ? -1 : 1
    evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
    evblendmpd(dst, ktmp1, one, dst, true, vec_enc);
    // if src == NaN, -0.0 or 0.0 return src.
    evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
    evblendmpd(dst, ktmp1, dst, src, true, vec_enc);
  } else {
    assert(opcode == Op_SignumVF, "");
    vsubps(dst, zero, one, vec_enc);
    // if src < 0 ? -1 : 1
    evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
    evblendmps(dst, ktmp1, one, dst, true, vec_enc);
    // if src == NaN, -0.0 or 0.0 return src.
    evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
    evblendmps(dst, ktmp1, dst, src, true, vec_enc);
  }
}
5829
// AVX (non-EVEX) variant of vector Math.signum: -1.0 for negatives, +1.0 for
// positives, and src passed through for NaN, -0.0 and 0.0. Blends use the
// variable-blend forms with src / the compare result as the selector.
void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
                                          XMMRegister xtmp1, int vec_enc) {
  if (opcode == Op_SignumVD) {
    // dst = 0.0 - 1.0 == -1.0 in every lane.
    vsubpd(dst, zero, one, vec_enc);
    // if src < 0 ? -1 : 1
    vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1);
    // if src == NaN, -0.0 or 0.0 return src.
    vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
    vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
  } else {
    assert(opcode == Op_SignumVF, "");
    vsubps(dst, zero, one, vec_enc);
    // if src < 0 ? -1 : 1
    vblendvps(dst, one, dst, src, vec_enc, true, xtmp1);
    // if src == NaN, -0.0 or 0.0 return src.
    vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
    vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
  }
}
5849
// Builds an opmask with the low mask_len bits taken from src. For partial
// masks the value is loaded into a wider opmask and the excess high bits are
// shifted out; without AVX512BW only 16-bit opmask moves exist.
void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
  if (VM_Version::supports_avx512bw()) {
    if (mask_len > 32) {
      kmovql(dst, src);
    } else {
      kmovdl(dst, src);
      if (mask_len != 32) {
        // Drop the (32 - mask_len) high bits beyond the logical mask length.
        kshiftrdl(dst, dst, 32 - mask_len);
      }
    }
  } else {
    assert(mask_len <= 16, "");
    kmovwl(dst, src);
    if (mask_len != 16) {
      kshiftrwl(dst, dst, 16 - mask_len);
    }
  }
}
5868
5869 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
5870 int lane_size = type2aelembytes(bt);
5871 if ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
5872 (is_subword_type(bt) && VM_Version::supports_avx512vlbw())) {
5873 movptr(rtmp, imm32);
5874 switch(lane_size) {
5875 case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
5876 case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
5877 case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
5878 case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
5879 fatal("Unsupported lane size %d", lane_size);
5880 break;
5881 }
5882 } else {
5883 movptr(rtmp, imm32);
5884 movq(dst, rtmp);
5885 switch(lane_size) {
5886 case 1 : vpbroadcastb(dst, dst, vec_enc); break;
5887 case 2 : vpbroadcastw(dst, dst, vec_enc); break;
5888 case 4 : vpbroadcastd(dst, dst, vec_enc); break;
5889 case 8 : vpbroadcastq(dst, dst, vec_enc); break;
5890 fatal("Unsupported lane size %d", lane_size);
5891 break;
5892 }
5893 }
5894 }
5895
5896 //
5897 // Following is lookup table based popcount computation algorithm:-
5898 // Index Bit set count
5899 // [ 0000 -> 0,
5900 // 0001 -> 1,
5901 // 0010 -> 1,
5902 // 0011 -> 2,
5903 // 0100 -> 1,
5904 // 0101 -> 2,
5905 // 0110 -> 2,
5906 // 0111 -> 3,
5907 // 1000 -> 1,
5908 // 1001 -> 2,
5909 // 1010 -> 3,
5910 // 1011 -> 3,
5911 // 1100 -> 2,
5912 // 1101 -> 3,
5913 // 1111 -> 4 ]
5914 // a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as
5915 // shuffle indices for lookup table access.
5916 // b. Right shift each byte of vector lane by 4 positions.
5917 // c. Count the number of 1s in 4 MSB bits each byte. These bits are used as
5918 // shuffle indices for lookup table access.
5919 // d. Add the bitset count of upper and lower 4 bits of each byte.
5920 // e. Unpack double words to quad words and compute sum of absolute difference of bitset
5921 // count of all the bytes of a quadword.
5922 // f. Perform step e. for upper 128bit vector lane.
5923 // g. Pack the bitset count of quadwords back to double word.
5924 // h. Unpacking and packing operations are not needed for 64bit vector lane.
5925
// Per-byte popcount via the 4-bit lookup table described in the comment
// above (steps a-d): count the set bits of each nibble through vpshufb
// table lookups and add the two nibble counts per byte.
void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
  assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
  vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);
  // Split each byte into its high nibble (dst) and low nibble (xtmp1).
  vpsrlw(dst, src, 4, vec_enc);
  vpand(dst, dst, xtmp1, vec_enc);
  vpand(xtmp1, src, xtmp1, vec_enc);
  // Lookup-table shuffle yields the popcount of each nibble.
  vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg);
  vpshufb(xtmp1, xtmp2, xtmp1, vec_enc);
  vpshufb(dst, xtmp2, dst, vec_enc);
  vpaddb(dst, dst, xtmp1, vec_enc);
}
5938
// Per-int popcount: byte-level popcount first, then sum the four byte counts
// of each doubleword using vpsadbw on unpacked quadwords and repack.
void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                            XMMRegister xtmp2, Register rtmp, int vec_enc) {
  vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
  // Following code is as per steps e,f,g and h of above algorithm.
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  vpunpckhdq(dst, xtmp1, xtmp2, vec_enc);
  vpsadbw(dst, dst, xtmp2, vec_enc);
  vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc);
  vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc);
  vpackuswb(dst, xtmp1, dst, vec_enc);
}
5950
// Per-short popcount: byte-level popcount first, then sum the counts of the
// two bytes in each word lane.
void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
  vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
  // Add the popcount of upper and lower bytes of word.
  vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc);
  vpsrlw(dst, xtmp1, 8, vec_enc);
  vpand(xtmp1, xtmp1, xtmp2, vec_enc);
  vpaddw(dst, dst, xtmp1, vec_enc);
}
5960
// Per-long popcount: byte-level popcount, then vpsadbw against zero sums the
// eight byte counts of each quadword in one instruction.
void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
  vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  vpsadbw(dst, xtmp1, xtmp2, vec_enc);
}
5967
5968 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
5969 XMMRegister xtmp2, Register rtmp, int vec_enc) {
5970 switch(bt) {
5971 case T_LONG:
5972 vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5973 break;
5974 case T_INT:
5975 vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5976 break;
5977 case T_CHAR:
5978 case T_SHORT:
5979 vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5980 break;
5981 case T_BYTE:
5982 case T_BOOLEAN:
5983 vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
5984 break;
5985 default:
5986 fatal("Unsupported type %s", type2name(bt));
5987 break;
5988 }
5989 }
5990
5991 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
5992 KRegister mask, bool merge, int vec_enc) {
5993 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
5994 switch(bt) {
5995 case T_LONG:
5996 assert(VM_Version::supports_avx512_vpopcntdq(), "");
5997 evpopcntq(dst, mask, src, merge, vec_enc);
5998 break;
5999 case T_INT:
6000 assert(VM_Version::supports_avx512_vpopcntdq(), "");
6001 evpopcntd(dst, mask, src, merge, vec_enc);
6002 break;
6003 case T_CHAR:
6004 case T_SHORT:
6005 assert(VM_Version::supports_avx512_bitalg(), "");
6006 evpopcntw(dst, mask, src, merge, vec_enc);
6007 break;
6008 case T_BYTE:
6009 case T_BOOLEAN:
6010 assert(VM_Version::supports_avx512_bitalg(), "");
6011 evpopcntb(dst, mask, src, merge, vec_enc);
6012 break;
6013 default:
6014 fatal("Unsupported type %s", type2name(bt));
6015 break;
6016 }
6017 }
6018
// Bit reversal algorithm first reverses the bits of each byte followed by
// a byte level reversal for multi-byte primitive types (short/int/long).
// Algorithm performs a lookup table access to get reverse bit sequence
// corresponding to a 4 bit value. Thus a reverse bit sequence for a byte
// is obtained by swapping the reverse bit sequences of upper and lower
// nibble of a byte.
void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                           XMMRegister xtmp2, Register rtmp, int vec_enc) {
  if (VM_Version::supports_avx512vlbw()) {

    // Get the reverse bit sequence of lower nibble of each byte.
    vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
    vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
    evpandq(dst, xtmp2, src, vec_enc);
    vpshufb(dst, xtmp1, dst, vec_enc);
    vpsllq(dst, dst, 4, vec_enc);

    // Get the reverse bit sequence of upper nibble of each byte.
    vpandn(xtmp2, xtmp2, src, vec_enc);
    vpsrlq(xtmp2, xtmp2, 4, vec_enc);
    vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);

    // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
    // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
    evporq(xtmp2, dst, xtmp2, vec_enc);
    vector_reverse_byte(bt, dst, xtmp2, vec_enc);

  } else if(vec_enc == Assembler::AVX_512bit) {
    // Shift based bit reversal.
    // 512-bit without AVX512BW: no vpshufb at this width, so swap
    // progressively smaller bit groups instead of using the LUT.
    assert(bt == T_LONG || bt == T_INT, "");

    // Swap lower and upper nibble of each byte.
    vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc);

    // Swap two least and most significant bits of each nibble.
    vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc);

    // Swap adjacent pair of bits.
    evmovdqul(xtmp1, k0, dst, true, vec_enc);
    vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc);

    evmovdqul(xtmp1, k0, dst, true, vec_enc);
    vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);
  } else {
    // AVX/AVX2 path: same LUT approach as above with non-EVEX logical ops.
    vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp);
    vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);

    // Get the reverse bit sequence of lower nibble of each byte.
    vpand(dst, xtmp2, src, vec_enc);
    vpshufb(dst, xtmp1, dst, vec_enc);
    vpsllq(dst, dst, 4, vec_enc);

    // Get the reverse bit sequence of upper nibble of each byte.
    vpandn(xtmp2, xtmp2, src, vec_enc);
    vpsrlq(xtmp2, xtmp2, 4, vec_enc);
    vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);

    // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
    // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
    vpor(xtmp2, dst, xtmp2, vec_enc);
    vector_reverse_byte(bt, dst, xtmp2, vec_enc);
  }
}
6082
// Reverses the bit order of each element of an integral vector using the
// GF2P8AFFINEQB (Galois field affine transform) instruction, which performs
// an in-byte bit reversal, followed by a byte reversal to extend it to the
// whole element.
// mask: memory operand holding the 64-bit affine matrix for bit mirroring
//       (presumably 0x8040201008040201 — supplied by callers; verify there).
// xtmp is clobbered; rscratch may be noreg if mask is always reachable.
void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc,
                                                XMMRegister xtmp, Register rscratch) {
  assert(VM_Version::supports_gfni(), "");
  assert(rscratch != noreg || always_reachable(mask), "missing");

  // Galois field instruction based bit reversal based on following algorithm.
  // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
  // Broadcast the affine matrix to every quad word lane.
  vpbroadcastq(xtmp, mask, vec_enc, rscratch);
  // Reverse the bits within each byte of src.
  vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
  // Reverse the byte order within each element to complete the reversal.
  vector_reverse_byte(bt, dst, xtmp, vec_enc);
}
6094
// Exchanges adjacent nbits-wide bit groups within each quad word:
//   dst = ((src & bitmask) << nbits) | ((src & ~bitmask) >> nbits)
// bitmask is a 32-bit pattern broadcast across the vector and must select the
// lower group of each adjacent pair. xtmp1 and rtmp are clobbered.
void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
                                          XMMRegister xtmp1, Register rtmp, int vec_enc) {
  vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
  // dst = (src & bitmask) << nbits
  evpandq(dst, xtmp1, src, vec_enc);
  vpsllq(dst, dst, nbits, vec_enc);
  // xtmp1 = (src & ~bitmask) >> nbits
  vpandn(xtmp1, xtmp1, src, vec_enc);
  vpsrlq(xtmp1, xtmp1, nbits, vec_enc);
  // Combine the two shifted group sets.
  evporq(dst, dst, xtmp1, vec_enc);
}
6104
// Reverses the byte order of each element of an integral vector using only
// EVEX rotate/shift instructions (no shuffle lookup table).
// xtmp1, xtmp2 and rtmp are clobbered.
void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
  // Shift based bit reversal.
  assert(VM_Version::supports_evex(), "");
  switch(bt) {
    case T_LONG:
      // Swap upper and lower double word of each quad word.
      evprorq(xtmp1, k0, src, 32, true, vec_enc);
      // Then swap upper and lower word of each double word.
      evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);
      // Finally swap upper and lower byte of each word.
      vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
      break;
    case T_INT:
      // Swap upper and lower word of each double word.
      evprord(xtmp1, k0, src, 16, true, vec_enc);
      vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
      break;
    case T_CHAR:
    case T_SHORT:
      // Swap upper and lower byte of each word.
      vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc);
      break;
    case T_BYTE:
      // Byte reversal of a byte vector is the identity; just copy.
      evmovdquq(dst, k0, src, true, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
}
6134
// Reverses the byte order of each element of an integral vector. T_BYTE is a
// plain copy; wider element types shuffle bytes through a pre-computed,
// type-specific permutation table (dst temporarily holds the shuffle mask).
void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) {
  if (bt == T_BYTE) {
    // Byte reversal of a byte vector is the identity; just copy src to dst.
    if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
      evmovdquq(dst, k0, src, true, vec_enc);
    } else {
      vmovdqu(dst, src);
    }
    return;
  }
  // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
  // pre-computed shuffle indices.
  switch(bt) {
    case T_LONG:
      vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg);
      break;
    case T_INT:
      vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg);
      break;
    case T_CHAR:
    case T_SHORT:
      vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
  vpshufb(dst, src, dst, vec_enc);
}
6163
// Computes the per-element leading zero count of an integral vector on
// AVX-512 (AVX512CD) targets. T_LONG/T_INT map directly to VPLZCNTQ/D;
// sub-word types are emulated. All xtmp registers, ktmp and rtmp may be
// clobbered.
void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
                                                        KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
  assert(is_integral_type(bt), "");
  assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
  assert(VM_Version::supports_avx512cd(), "");
  switch(bt) {
    case T_LONG:
      evplzcntq(dst, ktmp, src, merge, vec_enc);
      break;
    case T_INT:
      evplzcntd(dst, ktmp, src, merge, vec_enc);
      break;
    case T_SHORT:
      // Widen each word to a double word whose low half is all-ones, so the
      // dword lzcnt equals the word lzcnt (clamped to 16 for a zero word),
      // then narrow back with unsigned saturating pack.
      // xtmp1 = all-ones.
      vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);
      vpunpcklwd(xtmp2, xtmp1, src, vec_enc);
      evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
      vpunpckhwd(dst, xtmp1, src, vec_enc);
      evplzcntd(dst, ktmp, dst, merge, vec_enc);
      vpackusdw(dst, xtmp2, dst, vec_enc);
      break;
    case T_BYTE:
      // T1 = Compute leading zero counts of 4 LSB bits of each byte by
      // accessing the lookup table.
      // T2 = Compute leading zero counts of 4 MSB bits of each byte by
      // accessing the lookup table.
      // Add T1 to T2 if 4 MSB bits of byte are all zeros.
      assert(VM_Version::supports_avx512bw(), "");
      evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
      // dst = 0x0F nibble mask.
      vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
      vpand(xtmp2, dst, src, vec_enc);
      vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
      vpsrlw(xtmp3, src, 4, vec_enc);
      vpand(xtmp3, dst, xtmp3, vec_enc);
      vpshufb(dst, xtmp1, xtmp3, vec_enc);
      vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
      // ktmp marks bytes whose upper nibble is zero.
      evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);
      evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
}
6208
// Computes the per-byte leading zero count on AVX (pre-AVX512) targets via a
// 16-entry nibble lookup table. On return xtmp1 is all-zeros (relied upon by
// vector_count_leading_zeros_short_avx); xtmp2, xtmp3 and rtmp are clobbered.
void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                            XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
  vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
  // T1 = Compute leading zero counts of 4 LSB bits of each byte by
  // accessing the lookup table.
  vpand(dst, xtmp2, src, vec_enc);
  vpshufb(dst, xtmp1, dst, vec_enc);
  // T2 = Compute leading zero counts of 4 MSB bits of each byte by
  // accessing the lookup table.
  vpsrlw(xtmp3, src, 4, vec_enc);
  vpand(xtmp3, xtmp2, xtmp3, vec_enc);
  vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
  // Add T1 to T2 if 4 MSB bits of byte are all zeros.
  vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
  // xtmp3 = byte-mask of lanes whose upper nibble is zero.
  vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
  vpaddb(dst, dst, xtmp2, vec_enc);
  // Select T1+T2 where the upper nibble was zero, otherwise T2 alone.
  vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
}
6228
// Computes the per-word (16-bit) leading zero count on AVX targets by first
// computing per-byte counts and then combining byte pairs.
// xtmp1-3 and rtmp are clobbered.
void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
  // Add zero counts of lower byte and upper byte of a word if
  // upper byte holds a zero value.
  vpsrlw(xtmp3, src, 8, vec_enc);
  // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
  vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
  // Place the upper byte's count in the high byte and add the lower byte's count to it.
  vpsllw(xtmp2, dst, 8, vec_enc);
  vpaddw(xtmp2, xtmp2, dst, vec_enc);
  // Use the summed value only where the upper byte was zero.
  vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
  // Move the selected count down to the low byte of each word.
  vpsrlw(dst, dst, 8, vec_enc);
}
6242
// Computes the per-dword leading zero count on AVX targets by converting each
// lane to float and reading the count off the biased exponent.
// xtmp1-3 are clobbered.
void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                           XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
  // By converting the integer to a float, we can obtain the number of leading zeros based on the exponent of the float.
  // As the float exponent contains a bias of 127 for nonzero values, the bias must be removed before interpreting the
  // exponent as the leading zero count.

  // Remove the bit to the right of the highest set bit ensuring that the conversion to float cannot round up to a higher
  // power of 2, which has a higher exponent than the input. This transformation is valid as only the highest set bit
  // contributes to the leading number of zeros.
  vpsrld(dst, src, 1, vec_enc);
  vpandn(dst, dst, src, vec_enc);

  vcvtdq2ps(dst, dst, vec_enc);

  // By comparing the register to itself, all the bits in the destination are set.
  vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);

  // Move the biased exponent to the low end of the lane and mask with 0xFF to discard the sign bit.
  // xtmp2 = 0x000000FF per lane (all-ones >> 24).
  vpsrld(xtmp2, xtmp1, 24, vec_enc);
  vpsrld(dst, dst, 23, vec_enc);
  vpand(dst, xtmp2, dst, vec_enc);

  // Subtract 127 from the exponent, which removes the bias from the exponent.
  // xtmp2 = 127 per lane (all-ones >> 25).
  vpsrld(xtmp2, xtmp1, 25, vec_enc);
  vpsubd(dst, dst, xtmp2, vec_enc);

  // xtmp2 = 31 per lane (all-ones >> 27).
  vpsrld(xtmp2, xtmp1, 27, vec_enc);

  // If the original value is 0 the exponent would not have bias, so the subtraction creates a negative number. If this
  // is found in any of the lanes, replace the lane with -1 from xtmp1.
  vblendvps(dst, dst, xtmp1, dst, vec_enc, true, xtmp3);

  // If the original value is negative, replace the lane with 31.
  vblendvps(dst, dst, xtmp2, src, vec_enc, true, xtmp3);

  // Subtract the exponent from 31, giving the final result. For 0, the result is 32 as the exponent was replaced with -1,
  // and for negative numbers the result is 0 as the exponent was replaced with 31.
  vpsubd(dst, xtmp2, dst, vec_enc);
}
6282
// Computes the per-qword leading zero count on AVX targets by computing dword
// counts and combining the two halves of each quad word.
// xtmp1-3 are clobbered. rtmp is unused here — presumably kept for signature
// symmetry with the other helpers; verify before removing.
void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                            XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  // Find the leading zeros of the top and bottom halves of the long individually.
  vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);

  // Move the top half result to the bottom half of xtmp1, setting the top half to 0.
  vpsrlq(xtmp1, dst, 32, vec_enc);
  // By moving the top half result to the right by 6 bits, if the top half was empty (i.e. 32 is returned) the result bit will
  // be in the most significant position of the bottom half.
  vpsrlq(xtmp2, dst, 6, vec_enc);

  // In the bottom half, add the top half and bottom half results.
  vpaddq(dst, xtmp1, dst, vec_enc);

  // For the bottom half, choose between the values using the most significant bit of xtmp2.
  // If the MSB is set, then bottom+top in dst is the resulting value. If the top half is less than 32 xtmp1 is chosen,
  // which contains only the top half result.
  // In the top half the MSB is always zero, so the value in xtmp1 is always chosen. This value is always 0, which clears
  // the lane as required.
  vblendvps(dst, xtmp1, dst, xtmp2, vec_enc, true, xtmp3);
}
6304
6305 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
6306 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6307 Register rtmp, int vec_enc) {
6308 assert(is_integral_type(bt), "unexpected type");
6309 assert(vec_enc < Assembler::AVX_512bit, "");
6310 switch(bt) {
6311 case T_LONG:
6312 vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6313 break;
6314 case T_INT:
6315 vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
6316 break;
6317 case T_SHORT:
6318 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6319 break;
6320 case T_BYTE:
6321 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6322 break;
6323 default:
6324 fatal("Unsupported type %s", type2name(bt));
6325 break;
6326 }
6327 }
6328
6329 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
6330 switch(bt) {
6331 case T_BYTE:
6332 vpsubb(dst, src1, src2, vec_enc);
6333 break;
6334 case T_SHORT:
6335 vpsubw(dst, src1, src2, vec_enc);
6336 break;
6337 case T_INT:
6338 vpsubd(dst, src1, src2, vec_enc);
6339 break;
6340 case T_LONG:
6341 vpsubq(dst, src1, src2, vec_enc);
6342 break;
6343 default:
6344 fatal("Unsupported type %s", type2name(bt));
6345 break;
6346 }
6347 }
6348
// Trailing zero count computation is based on the leading zero count operation as per
// the following equation. All AVX3 targets support the AVX512CD feature, which offers a
// direct vector instruction to compute the leading zero count.
// CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
// Computes the per-element trailing zero count on AVX-512 targets by
// isolating the trailing-zero run as a mask and taking its leading zero
// count (see the identity in the comment above). xtmp1-4, ktmp and rtmp are
// clobbered.
void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
                                                         XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "");
  // xtmp = -1
  vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
  // xtmp = xtmp + src  (i.e. xtmp4 = src - 1)
  vpadd(bt, xtmp4, xtmp4, src, vec_enc);
  // xtmp = xtmp & ~src  (ternary imm 0x40 selects A & B & ~C; A == B == xtmp4 here)
  vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
  vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
  // dst = element-width-in-bits - clz((src - 1) & ~src)
  vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
  vpsub(bt, dst, xtmp4, dst, vec_enc);
}
6367
// Trailing zero count computation for AVX2 targets is based on the popcount operation as per the following equation
// CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
// Computes the per-element trailing zero count on AVX2 targets using the
// popcount identity in the comment above. xtmp1-3 and rtmp are clobbered.
void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                        XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "");
  // xtmp = 0
  vpxor(xtmp3 , xtmp3, xtmp3, vec_enc);
  // xtmp = 0 - src
  vpsub(bt, xtmp3, xtmp3, src, vec_enc);
  // xtmp = xtmp | src  (sets all bits from the lowest set bit of src upward)
  vpor(xtmp3, xtmp3, src, vec_enc);
  vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
  // dst = element-width-in-bits - popcount(src | -src)
  vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
  vpsub(bt, dst, xtmp1, dst, vec_enc);
}
6383
// 32-bit unsigned division: quotient of rax (dividend) / divisor left in rax.
// rdx is clobbered. Divisors with the sign bit set (>= 2^31 unsigned) can
// only produce a quotient of 0 or 1, which is computed branch-free.
void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // General case: hardware unsigned divide with zeroed high half.
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
  movl(rdx, rax);
  subl(rdx, divisor);
  if (VM_Version::supports_bmi1()) {
    // BMI1: rax = ~rdx & rax in a single instruction.
    andnl(rax, rdx, rax);
  } else {
    notl(rdx);
    andl(rax, rdx);
  }
  // Extract the result bit (0 or 1) from the sign position.
  shrl(rax, 31);
  bind(done);
}
6407
// 32-bit unsigned remainder: rax (dividend) % divisor left in rax... note the
// result actually ends up in rdx per the DIV convention on the slow path and
// is computed into rdx on the fastpath as well (callers read rdx).
// NOTE(review): result register is rdx on both paths — confirm against callers.
void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // General case: DIV leaves the remainder in rdx.
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath when divisor < 0:
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
  movl(rdx, rax);          // rdx = dividend (preserved for the final subtraction)
  subl(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rax, rdx);  // rax = ~(dividend - divisor) & dividend
  } else {
    notl(rax);
    andl(rax, rdx);
  }
  // Arithmetic shift replicates the sign bit: rax becomes 0 or -1.
  sarl(rax, 31);
  andl(rax, divisor);
  subl(rdx, rax);          // rdx = remainder
  bind(done);
}
6433
// Combined 32-bit unsigned divide+modulo: quotient in rax, remainder in rdx.
// tmp is clobbered.
void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
  Label done;
  Label neg_divisor_fastpath;

  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // General case: DIV produces quotient in rax and remainder in rdx.
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
  // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
  movl(rdx, rax);          // rdx = dividend
  subl(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rax, rdx);
  } else {
    notl(rax);
    andl(rax, rdx);
  }
  movl(tmp, rax);          // keep the masked value for the remainder computation
  shrl(rax, 31); // quotient
  sarl(tmp, 31);           // tmp = 0 or -1
  andl(tmp, divisor);
  subl(rdx, tmp); // remainder
  bind(done);
}
6464
// Reverses the bit order of a 32-bit integer (scalar). With GFNI the in-byte
// reversal is one affine transform; otherwise bit groups are swapped in three
// shift/mask steps. Both paths finish with bswapl to reverse the byte order.
// xtmp1, xtmp2 and rtmp are clobbered.
void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
                                 XMMRegister xtmp2, Register rtmp) {
  if(VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal based on following algorithm.
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
    // 0x8040201008040201 is the affine matrix that mirrors bits in each byte.
    mov64(rtmp, 0x8040201008040201L);
    movq(xtmp1, src);
    movq(xtmp2, rtmp);
    gf2p8affineqb(xtmp1, xtmp2, 0);
    movq(dst, xtmp1);
  } else {
    // Swap even and odd numbered bits.
    movl(rtmp, src);
    andl(rtmp, 0x55555555);
    shll(rtmp, 1);
    movl(dst, src);
    andl(dst, 0xAAAAAAAA);
    shrl(dst, 1);
    orl(dst, rtmp);

    // Swap LSB and MSB 2 bits of each nibble.
    movl(rtmp, dst);
    andl(rtmp, 0x33333333);
    shll(rtmp, 2);
    andl(dst, 0xCCCCCCCC);
    shrl(dst, 2);
    orl(dst, rtmp);

    // Swap LSB and MSB 4 bits of each byte.
    movl(rtmp, dst);
    andl(rtmp, 0x0F0F0F0F);
    shll(rtmp, 4);
    andl(dst, 0xF0F0F0F0);
    shrl(dst, 4);
    orl(dst, rtmp);
  }
  // Reverse the byte order to complete the full 32-bit bit reversal.
  bswapl(dst);
}
6503
// Reverses the bit order of a 64-bit integer (scalar); 64-bit analogue of
// reverseI. xtmp1, xtmp2, rtmp1 and rtmp2 are clobbered.
void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
                                 XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
  if(VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal based on following algorithm.
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
    // 0x8040201008040201 is the affine matrix that mirrors bits in each byte.
    mov64(rtmp1, 0x8040201008040201L);
    movq(xtmp1, src);
    movq(xtmp2, rtmp1);
    gf2p8affineqb(xtmp1, xtmp2, 0);
    movq(dst, xtmp1);
  } else {
    // Swap even and odd numbered bits.
    movq(rtmp1, src);
    mov64(rtmp2, 0x5555555555555555L);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 1);
    movq(dst, src);
    notq(rtmp2);                    // rtmp2 = 0xAAAA... (complement of the mask)
    andq(dst, rtmp2);
    shrq(dst, 1);
    orq(dst, rtmp1);

    // Swap LSB and MSB 2 bits of each nibble.
    movq(rtmp1, dst);
    mov64(rtmp2, 0x3333333333333333L);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 2);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 2);
    orq(dst, rtmp1);

    // Swap LSB and MSB 4 bits of each byte.
    movq(rtmp1, dst);
    mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 4);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 4);
    orq(dst, rtmp1);
  }
  // Reverse the byte order to complete the full 64-bit bit reversal.
  bswapq(dst);
}
6548
// 64-bit unsigned division: quotient of rax (dividend) / divisor left in rax.
// rdx is clobbered. Divisors with the sign bit set (>= 2^63 unsigned) can
// only produce a quotient of 0 or 1, computed branch-free on the fastpath.
void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // General case: zero rdx (32-bit xor zero-extends to 64 bits) and divide.
  xorl(rdx, rdx);
  divq(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
  movq(rdx, rax);
  subq(rdx, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rdx, rax);
  } else {
    notq(rdx);
    andq(rax, rdx);
  }
  // Extract the result bit (0 or 1) from the sign position.
  shrq(rax, 63);
  bind(done);
}
6572
6573 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
6574 Label done;
6575 Label neg_divisor_fastpath;
6576 cmpq(divisor, 0);
6577 jccb(Assembler::less, neg_divisor_fastpath);
6578 xorq(rdx, rdx);
6579 divq(divisor);
6580 jmp(done);
6581 bind(neg_divisor_fastpath);
6582 // Fastpath when divisor < 0:
6583 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6584 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
6585 movq(rdx, rax);
6586 subq(rax, divisor);
6587 if (VM_Version::supports_bmi1()) {
6588 andnq(rax, rax, rdx);
6589 } else {
6590 notq(rax);
6591 andq(rax, rdx);
6592 }
6593 sarq(rax, 63);
6594 andq(rax, divisor);
6595 subq(rdx, rax);
6596 bind(done);
6597 }
6598
6599 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
6600 Label done;
6601 Label neg_divisor_fastpath;
6602 cmpq(divisor, 0);
6603 jccb(Assembler::less, neg_divisor_fastpath);
6604 xorq(rdx, rdx);
6605 divq(divisor);
6606 jmp(done);
6607 bind(neg_divisor_fastpath);
6608 // Fastpath for divisor < 0:
6609 // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
6610 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
6611 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
6612 // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
6613 movq(rdx, rax);
6614 subq(rax, divisor);
6615 if (VM_Version::supports_bmi1()) {
6616 andnq(rax, rax, rdx);
6617 } else {
6618 notq(rax);
6619 andq(rax, rdx);
6620 }
6621 movq(tmp, rax);
6622 shrq(rax, 63); // quotient
6623 sarq(tmp, 63);
6624 andq(tmp, divisor);
6625 subq(rdx, tmp); // remainder
6626 bind(done);
6627 }
6628
// Rearranges the byte elements of src according to the shuffle index vector.
// VPSHUFB only shuffles within 128-bit lanes, so the 512-bit source is
// broadcast one 128-bit lane at a time and merged into dst under a mask
// selecting the shuffle indices that address that lane.
// xtmp1-3, rtmp and ktmp are clobbered.
void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
                                        XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
                                        int vlen_enc) {
  assert(VM_Version::supports_avx512bw(), "");
  // Byte shuffles are inlane operations and indices are determined using
  // lower 4 bit of each shuffle lane, thus all shuffle indices are
  // normalized to index range 0-15. This makes sure that all the multiples
  // of an index value are placed at same relative position in 128 bit
  // lane i.e. elements corresponding to shuffle indices 16, 32 and 64
  // will be 16th element in their respective 128 bit lanes.
  movl(rtmp, 16);
  evpbroadcastb(xtmp1, rtmp, vlen_enc);   // xtmp1 = {16, 16, ...}

  // Compute a mask for shuffle vector by comparing indices with expression INDEX < 16,
  // Broadcast first 128 bit lane across entire vector, shuffle the vector lanes using
  // original shuffle indices and move the shuffled lanes corresponding to true
  // mask to destination vector.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
  evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);

  // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
  // and broadcasting second 128 bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
  vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);    // xtmp2 = {32, 32, ...}
  evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
  // and broadcasting third 128 bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp2, Assembler::nlt, true, vlen_enc);
  vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);  // xtmp1 = {48, 48, ...}
  evpcmpb(ktmp, ktmp, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
  // and broadcasting fourth 128 bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
  vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);    // xtmp2 = {64, 64, ...}
  evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
}
6674
6675 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
6676 XMMRegister shuffle, XMMRegister src, int vlen_enc) {
6677 if (vlen_enc == AVX_128bit) {
6678 vpermilps(dst, src, shuffle, vlen_enc);
6679 } else if (bt == T_INT) {
6680 vpermd(dst, shuffle, src, vlen_enc);
6681 } else {
6682 assert(bt == T_FLOAT, "");
6683 vpermps(dst, shuffle, src, vlen_enc);
6684 }
6685 }
6686
6687 void C2_MacroAssembler::efp16sh(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
6688 switch(opcode) {
6689 case Op_AddHF: vaddsh(dst, src1, src2); break;
6690 case Op_SubHF: vsubsh(dst, src1, src2); break;
6691 case Op_MulHF: vmulsh(dst, src1, src2); break;
6692 case Op_DivHF: vdivsh(dst, src1, src2); break;
6693 default: assert(false, "%s", NodeClassNames[opcode]); break;
6694 }
6695 }
6696
6697 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6698 switch(elem_bt) {
6699 case T_BYTE:
6700 if (ideal_opc == Op_SaturatingAddV) {
6701 vpaddsb(dst, src1, src2, vlen_enc);
6702 } else {
6703 assert(ideal_opc == Op_SaturatingSubV, "");
6704 vpsubsb(dst, src1, src2, vlen_enc);
6705 }
6706 break;
6707 case T_SHORT:
6708 if (ideal_opc == Op_SaturatingAddV) {
6709 vpaddsw(dst, src1, src2, vlen_enc);
6710 } else {
6711 assert(ideal_opc == Op_SaturatingSubV, "");
6712 vpsubsw(dst, src1, src2, vlen_enc);
6713 }
6714 break;
6715 default:
6716 fatal("Unsupported type %s", type2name(elem_bt));
6717 break;
6718 }
6719 }
6720
6721 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6722 switch(elem_bt) {
6723 case T_BYTE:
6724 if (ideal_opc == Op_SaturatingAddV) {
6725 vpaddusb(dst, src1, src2, vlen_enc);
6726 } else {
6727 assert(ideal_opc == Op_SaturatingSubV, "");
6728 vpsubusb(dst, src1, src2, vlen_enc);
6729 }
6730 break;
6731 case T_SHORT:
6732 if (ideal_opc == Op_SaturatingAddV) {
6733 vpaddusw(dst, src1, src2, vlen_enc);
6734 } else {
6735 assert(ideal_opc == Op_SaturatingSubV, "");
6736 vpsubusw(dst, src1, src2, vlen_enc);
6737 }
6738 break;
6739 default:
6740 fatal("Unsupported type %s", type2name(elem_bt));
6741 break;
6742 }
6743 }
6744
// Unsigned saturating subtraction for int/long elements on EVEX targets:
// lanes that would underflow (src1 <u src2) are clamped to zero via masked
// subtraction with zeroing. ktmp is clobbered.
void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                               XMMRegister src2, KRegister ktmp, int vlen_enc) {
  // For unsigned subtraction, overflow happens when magnitude of second input is greater than first input.
  // overflow_mask = Inp1 <u Inp2
  evpcmpu(elem_bt, ktmp, src2, src1, Assembler::lt, vlen_enc);
  // Res = overflow_mask ? Zero : INP1 - INP2 (non-commutative and non-associative)
  evmasked_op(elem_bt == T_INT ? Op_SubVI : Op_SubVL, elem_bt, ktmp, dst, src1, src2, false, vlen_enc, false);
}
6753
// Unsigned saturating subtraction for int/long elements on AVX targets,
// emulating the unsigned compare by biasing both inputs with MIN_VALUE and
// using a signed compare. xtmp1 and xtmp2 are clobbered.
void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                                              XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  // Emulate unsigned comparison using signed comparison
  // Mask = Inp1 <u Inp2 => Inp1 + MIN_VALUE < Inp2 + MIN_VALUE
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc, true);
  vpadd(elem_bt, xtmp2, src1, xtmp1, vlen_enc);
  vpadd(elem_bt, xtmp1, src2, xtmp1, vlen_enc);

  // xtmp2 = mask of lanes where src1 <u src2 (underflow).
  vpcmpgt(elem_bt, xtmp2, xtmp1, xtmp2, vlen_enc);

  // Res = INP1 - INP2 (non-commutative and non-associative)
  vpsub(elem_bt, dst, src1, src2, vlen_enc);
  // Res = Mask ? Zero : Res
  vpxor(xtmp1, xtmp1, xtmp1, vlen_enc);
  vpblendvb(dst, dst, xtmp1, xtmp2, vlen_enc);
}
6770
// Unsigned saturating addition for int/long elements on EVEX targets: lanes
// that overflow are clamped to the unsigned maximum (all-ones). See the
// overflow-detection derivation in the comment block below this function.
// xtmp1, xtmp2 and ktmp are clobbered.
void C2_MacroAssembler::vector_add_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                                               XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp, int vlen_enc) {
  // Unsigned values ranges comprise of only +ve numbers, thus there exist only an upper bound saturation.
  // overflow_mask = (SRC1 + SRC2) <u (SRC1 | SRC2)
  // Res = Signed Add INP1, INP2
  vpadd(elem_bt, dst, src1, src2, vlen_enc);
  // T1 = SRC1 | SRC2
  vpor(xtmp1, src1, src2, vlen_enc);
  // Max_Unsigned = -1
  vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
  // Unsigned compare: Mask = Res <u T1
  evpcmpu(elem_bt, ktmp, dst, xtmp1, Assembler::lt, vlen_enc);
  // res = Mask ? Max_Unsigned : Res
  evpblend(elem_bt, dst, ktmp, dst, xtmp2, true, vlen_enc);
}
6786
6787 //
6788 // Section 2-13 Hacker's Delight list following overflow detection check for saturating
6789 // unsigned addition operation.
6790 // overflow_mask = ((a & b) | ((a | b) & ~( a + b))) >>> 31 == 1
6791 //
6792 // We empirically determined its semantic equivalence to following reduced expression
6793 // overflow_mask = (a + b) <u (a | b)
6794 //
6795 // and also verified it though Alive2 solver.
6796 // (https://alive2.llvm.org/ce/z/XDQ7dY)
6797 //
6798
// Unsigned saturating addition for int/long elements on AVX targets; the
// unsigned overflow compare (see comment block above) is emulated by biasing
// both operands with MIN_VALUE and comparing signed. Overflowing lanes are
// clamped to all-ones (unsigned max). xtmp1-3 are clobbered.
void C2_MacroAssembler::vector_add_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                                              XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, int vlen_enc) {
  // Res = Signed Add INP1, INP2
  vpadd(elem_bt, dst, src1, src2, vlen_enc);
  // Compute T1 = INP1 | INP2
  vpor(xtmp3, src1, src2, vlen_enc);
  // T1 = Minimum signed value.
  // Side effect: xtmp1 is left holding all-ones (= unsigned max), used below.
  vpgenmin_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
  // Convert T1 to signed value, T1 = T1 + MIN_VALUE
  vpadd(elem_bt, xtmp3, xtmp3, xtmp2, vlen_enc);
  // Convert Res to signed value, Res<s> = Res + MIN_VALUE
  vpadd(elem_bt, xtmp2, xtmp2, dst, vlen_enc);
  // Compute overflow detection mask = Res<1> <s T1
  if (elem_bt == T_INT) {
    vpcmpgtd(xtmp3, xtmp3, xtmp2, vlen_enc);
  } else {
    assert(elem_bt == T_LONG, "");
    vpcmpgtq(xtmp3, xtmp3, xtmp2, vlen_enc);
  }
  // Clamp overflowing lanes to unsigned max from xtmp1.
  vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
}
6820
// Sets ktmp from the sign bit of each quad word of src. Uses VPMOVQ2M when
// AVX512DQ is available, otherwise emulates it with an arithmetic shift plus
// compare against -1. xtmp1 is clobbered; xtmp2 is clobbered (set to -1)
// unless xtmp2_hold_M1 indicates the caller pre-loaded it with all-ones.
void C2_MacroAssembler::evpmovq2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                      int vlen_enc, bool xtmp2_hold_M1) {
  if (VM_Version::supports_avx512dq()) {
    evpmovq2m(ktmp, src, vlen_enc);
  } else {
    assert(VM_Version::supports_evex(), "");
    if (!xtmp2_hold_M1) {
      // xtmp2 = -1 in every lane.
      vpternlogq(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
    }
    // xtmp1 = sign-broadcast of each quad word (0 or -1).
    evpsraq(xtmp1, src, 63, vlen_enc);
    evpcmpeqq(ktmp, k0, xtmp1, xtmp2, vlen_enc);
  }
}
6834
// Sets ktmp from the sign bit of each double word of src; double word
// analogue of evpmovq2m_emu. xtmp1 is clobbered; xtmp2 is clobbered (set to
// -1) unless xtmp2_hold_M1 indicates the caller pre-loaded it with all-ones.
void C2_MacroAssembler::evpmovd2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                      int vlen_enc, bool xtmp2_hold_M1) {
  if (VM_Version::supports_avx512dq()) {
    evpmovd2m(ktmp, src, vlen_enc);
  } else {
    assert(VM_Version::supports_evex(), "");
    if (!xtmp2_hold_M1) {
      // xtmp2 = -1 in every lane.
      vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
    }
    // xtmp1 = sign-broadcast of each double word (0 or -1).
    vpsrad(xtmp1, src, 31, vlen_enc);
    Assembler::evpcmpeqd(ktmp, k0, xtmp1, xtmp2, vlen_enc);
  }
}
6848
6849
// Broadcasts the sign bit across each int/long element (dst lanes become 0
// or -1). Without EVEX, the 64-bit arithmetic shift is emulated by a 32-bit
// shift followed by replicating the odd dwords.
void C2_MacroAssembler::vpsign_extend_dq(BasicType elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
  if (elem_bt == T_LONG) {
    if (VM_Version::supports_evex()) {
      evpsraq(dst, src, 63, vlen_enc);
    } else {
      // No vpsraq before AVX-512: shift dwords, then copy each high dword
      // (holding the sign) over its low dword (0xF5 = lanes 1,1,3,3).
      vpsrad(dst, src, 31, vlen_enc);
      vpshufd(dst, dst, 0xF5, vlen_enc);
    }
  } else {
    assert(elem_bt == T_INT, "");
    vpsrad(dst, src, 31, vlen_enc);
  }
}
6863
// Materializes the maximum signed int/long value (0x7FFF...) in every lane of
// dst by logically right-shifting an all-ones vector by one. If
// compute_allones is false, the caller guarantees allones already holds -1;
// otherwise allones is clobbered (set to -1, which it still holds on return).
void C2_MacroAssembler::vpgenmax_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
  if (compute_allones) {
    if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
      vpternlogd(allones, 0xff, allones, allones, vlen_enc);
    } else {
      vpcmpeqq(allones, allones, allones, vlen_enc);
    }
  }
  if (elem_bt == T_LONG) {
    vpsrlq(dst, allones, 1, vlen_enc);
  } else {
    assert(elem_bt == T_INT, "");
    vpsrld(dst, allones, 1, vlen_enc);
  }
}
6879
// Materializes the minimum signed int/long value (0x8000...) in every lane of
// dst by left-shifting an all-ones vector to leave only the sign bit. If
// compute_allones is false, the caller guarantees allones already holds -1;
// otherwise allones is clobbered (set to -1, which it still holds on return).
void C2_MacroAssembler::vpgenmin_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
  if (compute_allones) {
    if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
      vpternlogd(allones, 0xff, allones, allones, vlen_enc);
    } else {
      vpcmpeqq(allones, allones, allones, vlen_enc);
    }
  }
  if (elem_bt == T_LONG) {
    vpsllq(dst, allones, 63, vlen_enc);
  } else {
    assert(elem_bt == T_INT, "");
    vpslld(dst, allones, 31, vlen_enc);
  }
}
6895
6896 void C2_MacroAssembler::evpcmpu(BasicType elem_bt, KRegister kmask, XMMRegister src1, XMMRegister src2,
6897 Assembler::ComparisonPredicate cond, int vlen_enc) {
6898 switch(elem_bt) {
6899 case T_LONG: evpcmpuq(kmask, src1, src2, cond, vlen_enc); break;
6900 case T_INT: evpcmpud(kmask, src1, src2, cond, vlen_enc); break;
6901 case T_SHORT: evpcmpuw(kmask, src1, src2, cond, vlen_enc); break;
6902 case T_BYTE: evpcmpub(kmask, src1, src2, cond, vlen_enc); break;
6903 default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6904 }
6905 }
6906
6907 void C2_MacroAssembler::vpcmpgt(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6908 switch(elem_bt) {
6909 case T_LONG: vpcmpgtq(dst, src1, src2, vlen_enc); break;
6910 case T_INT: vpcmpgtd(dst, src1, src2, vlen_enc); break;
6911 case T_SHORT: vpcmpgtw(dst, src1, src2, vlen_enc); break;
6912 case T_BYTE: vpcmpgtb(dst, src1, src2, vlen_enc); break;
6913 default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6914 }
6915 }
6916
6917 void C2_MacroAssembler::evpmov_vec_to_mask(BasicType elem_bt, KRegister ktmp, XMMRegister src, XMMRegister xtmp1,
6918 XMMRegister xtmp2, int vlen_enc, bool xtmp2_hold_M1) {
6919 if (elem_bt == T_LONG) {
6920 evpmovq2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6921 } else {
6922 assert(elem_bt == T_INT, "");
6923 evpmovd2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6924 }
6925 }
6926
// Signed saturating add/sub for int/long lanes on EVEX-capable targets.
// On overflow a lane is clamped to MAX_VALUE (positive overflow) or
// MIN_VALUE (negative overflow) instead of wrapping.
void C2_MacroAssembler::vector_addsub_dq_saturating_evex(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                         XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
                                                         KRegister ktmp1, KRegister ktmp2, int vlen_enc) {
  assert(elem_bt == T_INT || elem_bt == T_LONG, "");
  // Addition/Subtraction happens over two's complement representation of numbers and is agnostic to signed'ness.
  // Overflow detection based on Hacker's Delight section 2-13.
  if (ideal_opc == Op_SaturatingAddV) {
    // res = src1 + src2
    vpadd(elem_bt, dst, src1, src2, vlen_enc);
    // Addition overflows only if both inputs have the same sign and the
    // result's sign differs from it.
    // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
    vpxor(xtmp1, dst, src1, vlen_enc);
    vpxor(xtmp2, dst, src2, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  } else {
    assert(ideal_opc == Op_SaturatingSubV, "");
    // res = src1 - src2
    vpsub(elem_bt, dst, src1, src2, vlen_enc);
    // Subtraction overflows only when the inputs have opposite signs and
    // the result's sign differs from the first input's sign.
    // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1;
    vpxor(xtmp1, src1, src2, vlen_enc);
    vpxor(xtmp2, dst, src1, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  }

  // Compute overflow detection mask (sign bits of xtmp2) into ktmp1.
  evpmov_vec_to_mask(elem_bt, ktmp1, xtmp2, xtmp2, xtmp1, vlen_enc);
  // Note: xtmp1 hold -1 in all its lanes after above call.

  // Compute mask based on first input polarity; pass xtmp2_hold_M1 = true
  // to reuse the -1 vector left in xtmp1 by the previous call.
  evpmov_vec_to_mask(elem_bt, ktmp2, src1, xtmp2, xtmp1, vlen_enc, true);

  vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);

  // Compose a vector of saturating (MAX/MIN) values, where lanes corresponding to
  // set bits in first input polarity mask holds a min value.
  evpblend(elem_bt, xtmp2, ktmp2, xtmp2, xtmp1, true, vlen_enc);
  // Blend destination lanes with saturated values using overflow detection mask.
  evpblend(elem_bt, dst, ktmp1, dst, xtmp2, true, vlen_enc);
}
6969
6970
// Signed saturating add/sub for int/long lanes on AVX targets without
// opmask registers: the overflow and polarity masks are kept as vectors
// and applied with VPBLENDVB instead of masked blends.
void C2_MacroAssembler::vector_addsub_dq_saturating_avx(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                        XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
                                                        XMMRegister xtmp3, XMMRegister xtmp4, int vlen_enc) {
  assert(elem_bt == T_INT || elem_bt == T_LONG, "");
  // Addition/Subtraction happens over two's complement representation of numbers and is agnostic to signed'ness.
  // Overflow detection based on Hacker's Delight section 2-13.
  if (ideal_opc == Op_SaturatingAddV) {
    // res = src1 + src2
    vpadd(elem_bt, dst, src1, src2, vlen_enc);
    // Addition overflows only if both inputs have the same sign and the
    // result's sign differs from it.
    // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
    vpxor(xtmp1, dst, src1, vlen_enc);
    vpxor(xtmp2, dst, src2, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  } else {
    assert(ideal_opc == Op_SaturatingSubV, "");
    // res = src1 - src2
    vpsub(elem_bt, dst, src1, src2, vlen_enc);
    // Subtraction overflows only when the inputs have opposite signs and
    // the result's sign differs from the first input's sign.
    // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1;
    vpxor(xtmp1, src1, src2, vlen_enc);
    vpxor(xtmp2, dst, src1, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);
  }

  // Sign-extend to compute overflow detection mask (vector form, in xtmp3).
  vpsign_extend_dq(elem_bt, xtmp3, xtmp2, vlen_enc);

  // xtmp1 = all-ones; used as input to generate the MAX/MIN lane constants.
  vpcmpeqd(xtmp1, xtmp1, xtmp1, vlen_enc);
  vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc);
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);

  // Compose saturating min/max vector using first input polarity mask:
  // lanes where src1 is negative take MIN_VALUE, others take MAX_VALUE.
  vpsign_extend_dq(elem_bt, xtmp4, src1, vlen_enc);
  vpblendvb(xtmp1, xtmp2, xtmp1, xtmp4, vlen_enc);

  // Blend result with saturating vector using overflow detection mask.
  vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
}
7011
7012 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
7013 switch(elem_bt) {
7014 case T_BYTE:
7015 if (ideal_opc == Op_SaturatingAddV) {
7016 vpaddsb(dst, src1, src2, vlen_enc);
7017 } else {
7018 assert(ideal_opc == Op_SaturatingSubV, "");
7019 vpsubsb(dst, src1, src2, vlen_enc);
7020 }
7021 break;
7022 case T_SHORT:
7023 if (ideal_opc == Op_SaturatingAddV) {
7024 vpaddsw(dst, src1, src2, vlen_enc);
7025 } else {
7026 assert(ideal_opc == Op_SaturatingSubV, "");
7027 vpsubsw(dst, src1, src2, vlen_enc);
7028 }
7029 break;
7030 default:
7031 fatal("Unsupported type %s", type2name(elem_bt));
7032 break;
7033 }
7034 }
7035
7036 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
7037 switch(elem_bt) {
7038 case T_BYTE:
7039 if (ideal_opc == Op_SaturatingAddV) {
7040 vpaddusb(dst, src1, src2, vlen_enc);
7041 } else {
7042 assert(ideal_opc == Op_SaturatingSubV, "");
7043 vpsubusb(dst, src1, src2, vlen_enc);
7044 }
7045 break;
7046 case T_SHORT:
7047 if (ideal_opc == Op_SaturatingAddV) {
7048 vpaddusw(dst, src1, src2, vlen_enc);
7049 } else {
7050 assert(ideal_opc == Op_SaturatingSubV, "");
7051 vpsubusw(dst, src1, src2, vlen_enc);
7052 }
7053 break;
7054 default:
7055 fatal("Unsupported type %s", type2name(elem_bt));
7056 break;
7057 }
7058 }
7059
7060 void C2_MacroAssembler::select_from_two_vectors_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
7061 XMMRegister src2, int vlen_enc) {
7062 switch(elem_bt) {
7063 case T_BYTE:
7064 evpermi2b(dst, src1, src2, vlen_enc);
7065 break;
7066 case T_SHORT:
7067 evpermi2w(dst, src1, src2, vlen_enc);
7068 break;
7069 case T_INT:
7070 evpermi2d(dst, src1, src2, vlen_enc);
7071 break;
7072 case T_LONG:
7073 evpermi2q(dst, src1, src2, vlen_enc);
7074 break;
7075 case T_FLOAT:
7076 evpermi2ps(dst, src1, src2, vlen_enc);
7077 break;
7078 case T_DOUBLE:
7079 evpermi2pd(dst, src1, src2, vlen_enc);
7080 break;
7081 default:
7082 fatal("Unsupported type %s", type2name(elem_bt));
7083 break;
7084 }
7085 }
7086
7087 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, bool is_unsigned, int vlen_enc) {
7088 if (is_unsigned) {
7089 vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7090 } else {
7091 vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7092 }
7093 }
7094
7095 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, bool is_unsigned, int vlen_enc) {
7096 if (is_unsigned) {
7097 vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7098 } else {
7099 vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7100 }
7101 }
7102
7103 void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
7104 switch(opcode) {
7105 case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
7106 case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
7107 case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
7108 case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
7109 default: assert(false, "%s", NodeClassNames[opcode]); break;
7110 }
7111 }
7112
7113 void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
7114 switch(opcode) {
7115 case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
7116 case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
7117 case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
7118 case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
7119 default: assert(false, "%s", NodeClassNames[opcode]); break;
7120 }
7121 }
7122
// Scalar FP16 min/max: delegates to the vector sequence at 128-bit width,
// which applies to the lowest lane.
void C2_MacroAssembler::sminmax_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                     KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2) {
  vminmax_fp16(opcode, dst, src1, src2, ktmp, xtmp1, xtmp2, Assembler::AVX_128bit);
}
7127
7128 void C2_MacroAssembler::sminmax_fp16_avx10_2(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
7129 KRegister ktmp) {
7130 if (opcode == Op_MaxHF) {
7131 // dst = max(src1, src2)
7132 evminmaxsh(dst, ktmp, src1, src2, true, AVX10_2_MINMAX_MAX_COMPARE_SIGN);
7133 } else {
7134 assert(opcode == Op_MinHF, "");
7135 // dst = min(src1, src2)
7136 evminmaxsh(dst, ktmp, src1, src2, true, AVX10_2_MINMAX_MIN_COMPARE_SIGN);
7137 }
7138 }
7139
// FP16 vector min/max with Java semantics: NaN inputs propagate and
// -0.0 is treated as smaller than +0.0. The operand swap below works
// around VMINPH/VMAXPH returning the second source on ties and NaNs.
void C2_MacroAssembler::vminmax_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                     KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  if (opcode == Op_MaxVHF || opcode == Op_MaxHF) {
    // Move sign bits of src2 to mask register.
    evpmovw2m(ktmp, src2, vlen_enc);
    // xtmp1 = src2 < 0 ? src2 : src1
    evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
    // xtmp2 = src2 < 0 ? src1 : src2
    evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
    // Idea behind above swapping is to make the second source operand a +ve value.
    // As per instruction semantic, if the values being compared are both 0.0s (of either sign), the value in
    // the second source operand is returned. If only one value is a NaN (SNaN or QNaN) for this instruction,
    // the second source operand, either a NaN or a valid floating-point value, is returned
    // dst = max(xtmp1, xtmp2)
    evmaxph(dst, xtmp1, xtmp2, vlen_enc);
    // isNaN = is_unordered_quiet(xtmp1)
    evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
    // Final result is same as first source if it is a NaN value;
    // in case second operand holds a NaN value then as per above semantics
    // result is same as second operand.
    Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
  } else {
    assert(opcode == Op_MinVHF || opcode == Op_MinHF, "");
    // Move sign bits of src1 to mask register.
    evpmovw2m(ktmp, src1, vlen_enc);
    // xtmp1 = src1 < 0 ? src2 : src1
    evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
    // xtmp2 = src1 < 0 ? src1 : src2
    evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
    // Idea behind above swapping is to make the second source operand a -ve value.
    // As per instruction semantics, if the values being compared are both 0.0s (of either sign), the value in
    // the second source operand is returned.
    // If only one value is a NaN (SNaN or QNaN) for this instruction, the second source operand, either a NaN
    // or a valid floating-point value, is written to the result.
    // dst = min(xtmp1, xtmp2)
    evminph(dst, xtmp1, xtmp2, vlen_enc);
    // isNaN = is_unordered_quiet(xtmp1)
    evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
    // Final result is same as first source if it is a NaN value;
    // in case second operand holds a NaN value then as per above semantics
    // result is same as second operand.
    Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
  }
}
7184
7185 void C2_MacroAssembler::vminmax_fp16_avx10_2(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
7186 KRegister ktmp, int vlen_enc) {
7187 if (opcode == Op_MaxVHF) {
7188 // dst = max(src1, src2)
7189 evminmaxph(dst, ktmp, src1, src2, true, AVX10_2_MINMAX_MAX_COMPARE_SIGN, vlen_enc);
7190 } else {
7191 assert(opcode == Op_MinVHF, "");
7192 // dst = min(src1, src2)
7193 evminmaxph(dst, ktmp, src1, src2, true, AVX10_2_MINMAX_MIN_COMPARE_SIGN, vlen_enc);
7194 }
7195 }
7196
7197 void C2_MacroAssembler::vminmax_fp16_avx10_2(int opcode, XMMRegister dst, XMMRegister src1, Address src2,
7198 KRegister ktmp, int vlen_enc) {
7199 if (opcode == Op_MaxVHF) {
7200 // dst = max(src1, src2)
7201 evminmaxph(dst, ktmp, src1, src2, true, AVX10_2_MINMAX_MAX_COMPARE_SIGN, vlen_enc);
7202 } else {
7203 assert(opcode == Op_MinVHF, "");
7204 // dst = min(src1, src2)
7205 evminmaxph(dst, ktmp, src1, src2, true, AVX10_2_MINMAX_MIN_COMPARE_SIGN, vlen_enc);
7206 }
7207 }