1 /*
2 * Copyright (c) 2020, 2026, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
24
25 #include "asm/assembler.hpp"
26 #include "asm/assembler.inline.hpp"
27 #include "gc/shared/barrierSet.hpp"
28 #include "gc/shared/barrierSetAssembler.hpp"
29 #include "oops/methodData.hpp"
30 #include "opto/c2_MacroAssembler.hpp"
31 #include "opto/intrinsicnode.hpp"
32 #include "opto/output.hpp"
33 #include "opto/opcodes.hpp"
34 #include "opto/subnode.hpp"
35 #include "runtime/globals.hpp"
36 #include "runtime/objectMonitor.hpp"
37 #include "runtime/objectMonitorTable.hpp"
38 #include "runtime/stubRoutines.hpp"
39 #include "runtime/synchronizer.hpp"
40 #include "utilities/checkedCast.hpp"
41 #include "utilities/globalDefinitions.hpp"
42 #include "utilities/powerOfTwo.hpp"
43 #include "utilities/sizes.hpp"
44
45 #ifdef PRODUCT
46 #define BLOCK_COMMENT(str) /* nothing */
47 #define STOP(error) stop(error)
48 #else
49 #define BLOCK_COMMENT(str) block_comment(str)
50 #define STOP(error) block_comment(error); stop(error)
51 #endif
52
53 // C2 compiled method's prolog code.
54 // Beware! This sp_inc is NOT the same as the one mentioned in MacroAssembler::remove_frame but only the size
55 // of the extension space + the additional copy of the return address. That means, it doesn't contain the
56 // frame size (where the local and sp_inc are) and the saved RBP.
57 void C2_MacroAssembler::verified_entry(Compile* C, int sp_inc) {
58 if (C->clinit_barrier_on_entry()) {
59 assert(VM_Version::supports_fast_class_init_checks(), "sanity");
60 assert(!C->method()->holder()->is_not_initialized(), "initialization should have been started");
61
62 Label L_skip_barrier;
63 Register klass = rscratch1;
64
65 mov_metadata(klass, C->method()->holder()->constant_encoding());
66 clinit_barrier(klass, &L_skip_barrier /*L_fast_path*/);
67
68 jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path
69
70 bind(L_skip_barrier);
71 }
72
73 int framesize = C->output()->frame_size_in_bytes();
74 int bangsize = C->output()->bang_size_in_bytes();
75 bool fp_mode_24b = false;
76 int stack_bang_size = C->output()->need_stack_bang(bangsize) ? bangsize : 0;
77
78 assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
79
80 assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
81 // Remove word for return addr
82 framesize -= wordSize;
83 stack_bang_size -= wordSize;
84
85 // Calls to C2R adapters often do not accept exceptional returns.
86 // We require that their callers must bang for them. But be careful, because
87 // some VM calls (such as call site linkage) can use several kilobytes of
88 // stack. But the stack safety zone should account for that.
89 // See bugs 4446381, 4468289, 4497237.
90 if (stack_bang_size > 0) {
91 generate_stack_overflow_check(stack_bang_size);
92
93 // We always push rbp, so that on return to interpreter rbp, will be
94 // restored correctly and we can correct the stack.
95 push(rbp);
96 #ifdef ASSERT
97 if (sp_inc > 0) {
98 movl(Address(rsp, 0), badRegWordVal);
99 movl(Address(rsp, VMRegImpl::stack_slot_size), badRegWordVal);
100 }
101 #endif
102 // Save caller's stack pointer into RBP if the frame pointer is preserved.
103 if (PreserveFramePointer) {
104 mov(rbp, rsp);
105 }
106 // Remove word for ebp
107 framesize -= wordSize;
108
109 // Create frame
110 if (framesize) {
111 subptr(rsp, framesize);
112 }
113 } else {
114 subptr(rsp, framesize);
115
116 // Save RBP register now.
117 framesize -= wordSize;
118 movptr(Address(rsp, framesize), rbp);
119 #ifdef ASSERT
120 if (sp_inc > 0) {
121 movl(Address(rsp, framesize), badRegWordVal);
122 movl(Address(rsp, framesize + VMRegImpl::stack_slot_size), badRegWordVal);
123 }
124 #endif
125 // Save caller's stack pointer into RBP if the frame pointer is preserved.
126 if (PreserveFramePointer) {
127 movptr(rbp, rsp);
128 if (framesize > 0) {
129 addptr(rbp, framesize);
130 }
131 }
132 }
133
134 if (C->needs_stack_repair()) {
135 // Save stack increment just below the saved rbp (also account for fixed framesize and rbp)
136 assert((sp_inc & (StackAlignmentInBytes-1)) == 0, "stack increment not aligned");
137 movptr(Address(rsp, framesize - wordSize), sp_inc + framesize);
138 }
139
140 if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
141 framesize -= wordSize;
142 movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
143 }
144
145 #ifdef ASSERT
146 if (VerifyStackAtCalls) {
147 Label L;
148 push(rax);
149 mov(rax, rsp);
150 andptr(rax, StackAlignmentInBytes-1);
151 cmpptr(rax, StackAlignmentInBytes-wordSize);
152 pop(rax);
153 jcc(Assembler::equal, L);
154 STOP("Stack is not properly aligned!");
155 bind(L);
156 }
157 #endif
158 }
159
160 void C2_MacroAssembler::entry_barrier() {
161 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
162 // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
163 Label dummy_slow_path;
164 Label dummy_continuation;
165 Label* slow_path = &dummy_slow_path;
166 Label* continuation = &dummy_continuation;
167 if (!Compile::current()->output()->in_scratch_emit_size()) {
168 // Use real labels from actual stub when not emitting code for the purpose of measuring its size
169 C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
170 Compile::current()->output()->add_stub(stub);
171 slow_path = &stub->entry();
172 continuation = &stub->continuation();
173 }
174 bs->nmethod_entry_barrier(this, slow_path, continuation);
175 }
176
177 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
178 switch (vlen_in_bytes) {
179 case 4: // fall-through
180 case 8: // fall-through
181 case 16: return Assembler::AVX_128bit;
182 case 32: return Assembler::AVX_256bit;
183 case 64: return Assembler::AVX_512bit;
184
185 default: {
186 ShouldNotReachHere();
187 return Assembler::AVX_NoVec;
188 }
189 }
190 }
191
192 // fast_lock and fast_unlock used by C2
193
194 // Because the transitions from emitted code to the runtime
195 // monitorenter/exit helper stubs are so slow it's critical that
196 // we inline both the stack-locking fast path and the inflated fast path.
197 //
198 // See also: cmpFastLock and cmpFastUnlock.
199 //
200 // What follows is a specialized inline transliteration of the code
201 // in enter() and exit(). If we're concerned about I$ bloat another
202 // option would be to emit TrySlowEnter and TrySlowExit methods
203 // at startup-time. These methods would accept arguments as
204 // (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
205 // indications in the icc.ZFlag. fast_lock and fast_unlock would simply
206 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
207 // In practice, however, the # of lock sites is bounded and is usually small.
208 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
209 // if the processor uses simple bimodal branch predictors keyed by EIP
210 // Since the helper routines would be called from multiple synchronization
211 // sites.
212 //
213 // An even better approach would be write "MonitorEnter()" and "MonitorExit()"
214 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
215 // to those specialized methods. That'd give us a mostly platform-independent
216 // implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
218 // to park() or unpark() threads. We'd also need a few more unsafe operators
219 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
220 // (b) explicit barriers or fence operations.
221 //
222 // TODO:
223 //
224 // * Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
225 // This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
226 // Given TLAB allocation, Self is usually manifested in a register, so passing it into
227 // the lock operators would typically be faster than reifying Self.
228 //
229 // * Ideally I'd define the primitives as:
230 // fast_lock (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
231 // fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
232 // Unfortunately ADLC bugs prevent us from expressing the ideal form.
233 // Instead, we're stuck with a rather awkward and brittle register assignments below.
234 // Furthermore the register assignments are overconstrained, possibly resulting in
235 // sub-optimal code near the synchronization site.
236 //
237 // * Eliminate the sp-proximity tests and just use "== Self" tests instead.
238 // Alternately, use a better sp-proximity test.
239 //
240 // * Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
241 // Either one is sufficient to uniquely identify a thread.
242 // TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
243 //
244 // * Intrinsify notify() and notifyAll() for the common cases where the
245 // object is locked by the calling thread but the waitlist is empty.
246 // avoid the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
247 //
248 // * use jccb and jmpb instead of jcc and jmp to improve code density.
249 // But beware of excessive branch density on AMD Opterons.
250 //
251 // * Both fast_lock and fast_unlock set the ICC.ZF to indicate success
252 // or failure of the fast path. If the fast path fails then we pass
253 // control to the slow path, typically in C. In fast_lock and
254 // fast_unlock we often branch to DONE_LABEL, just to find that C2
255 // will emit a conditional branch immediately after the node.
256 // So we have branches to branches and lots of ICC.ZF games.
257 // Instead, it might be better to have C2 pass a "FailureLabel"
258 // into fast_lock and fast_unlock. In the case of success, control
259 // will drop through the node. ICC.ZF is undefined at exit.
260 // In the case of failure, the node will branch directly to the
261 // FailureLabel
262
// Emit the C2 fast-lock sequence: lightweight (lock-stack) locking plus the
// inflated-monitor fast path. On exit, ZF == 1 means the lock was acquired;
// ZF == 0 means the caller must branch to the runtime slow path.
//
// obj: object to lock
// box: on-stack box address -- KILLED
// rax: tmp -- KILLED
// t : tmp -- KILLED
void C2_MacroAssembler::fast_lock(Register obj, Register box, Register rax_reg,
                                  Register t, Register thread) {
  assert(rax_reg == rax, "Used for CAS");
  assert_different_registers(obj, box, rax_reg, t, thread);

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. ZF value is irrelevant.
  Label locked;
  // Finish fast lock unsuccessfully. MUST jump with ZF == 0
  Label slow_path;

  if (UseObjectMonitorTable) {
    // Clear cache in case fast locking succeeds or we need to take the slow-path.
    movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), 0);
  }

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    // Value-based classes should not be synchronized on; send them to the
    // slow path so the runtime can diagnose per the flag's setting.
    load_klass(rax_reg, obj, t);
    testb(Address(rax_reg, Klass::misc_flags_offset()), KlassFlags::_misc_is_value_based_class);
    jcc(Assembler::notZero, slow_path);
  }

  const Register mark = t;

  { // Fast Lock

    Label push;

    // With the monitor table, box must stay live for the cache store below,
    // so top uses rax instead.
    const Register top = UseObjectMonitorTable ? rax_reg : box;

    // Load the mark.
    movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));

    // Prefetch top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    // Check for monitor (0b10).
    testptr(mark, markWord::monitor_value);
    jcc(Assembler::notZero, inflated);

    // Check if lock-stack is full.
    cmpl(top, LockStack::end_offset() - 1);
    jcc(Assembler::greater, slow_path);

    // Check if recursive: obj is already on top of this thread's lock-stack.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    jccb(Assembler::equal, push);

    // Try to lock. Transition lock bits 0b01 => 0b00:
    // expected value (rax) has the unlocked bit set, new value (mark) clears it.
    movptr(rax_reg, mark);
    orptr(rax_reg, markWord::unlocked_value);
    andptr(mark, ~(int32_t)markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    jcc(Assembler::notEqual, slow_path);

    if (UseObjectMonitorTable) {
      // Need to reload top, clobbered by CAS (top aliases rax in this mode).
      movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
    }
    bind(push);
    // After successful lock, push object on lock-stack.
    movptr(Address(thread, top), obj);
    addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
    jmp(locked);
  }

  { // Handle inflated monitor.
    bind(inflated);

    const Register monitor = t;

    if (!UseObjectMonitorTable) {
      // Without the table, the mark word itself is the (tagged) monitor pointer.
      assert(mark == monitor, "should be the same here");
    } else {
      // Uses ObjectMonitorTable: the monitor must be located via the
      // per-thread om_cache or the global table.
      const Register hash = t;
      Label monitor_found;

      // Look for the monitor in the om_cache.

      ByteSize cache_offset = JavaThread::om_cache_oops_offset();
      ByteSize monitor_offset = OMCache::oop_to_monitor_difference();
      const int num_unrolled = OMCache::CAPACITY;
      // Unrolled linear scan: load the candidate monitor first, then compare
      // the cached oop against obj.
      for (int i = 0; i < num_unrolled; i++) {
        movptr(monitor, Address(thread, cache_offset + monitor_offset));
        cmpptr(obj, Address(thread, cache_offset));
        jccb(Assembler::equal, monitor_found);
        cache_offset = cache_offset + OMCache::oop_to_oop_difference();
      }

      // Look for the monitor in the table.

      // Get the hash code.
      movptr(hash, Address(obj, oopDesc::mark_offset_in_bytes()));
      shrq(hash, markWord::hash_shift);
      andq(hash, markWord::hash_mask);

      // Get the table and calculate the bucket's address.
      lea(rax_reg, ExternalAddress(ObjectMonitorTable::current_table_address()));
      movptr(rax_reg, Address(rax_reg));
      andq(hash, Address(rax_reg, ObjectMonitorTable::table_capacity_mask_offset()));
      movptr(rax_reg, Address(rax_reg, ObjectMonitorTable::table_buckets_offset()));

      // Read the monitor from the bucket.
      movptr(monitor, Address(rax_reg, hash, Address::times_ptr));

      // Check if the monitor in the bucket is special (empty, tombstone or removed)
      cmpptr(monitor, ObjectMonitorTable::SpecialPointerValues::below_is_special);
      jcc(Assembler::below, slow_path);

      // Check if object matches: resolve the monitor's object (weak handle)
      // and compare against obj; mismatches (e.g. hash collisions) go slow.
      movptr(rax_reg, Address(monitor, ObjectMonitor::object_offset()));
      BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
      bs_asm->try_resolve_weak_handle_in_c2(this, rax_reg, slow_path);
      cmpptr(rax_reg, obj);
      jcc(Assembler::notEqual, slow_path);

      bind(monitor_found);
    }
    // When the monitor pointer comes straight from the mark word, compensate
    // for its tag bits; table-provided pointers are untagged (tag == 0).
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address recursions_address(monitor, ObjectMonitor::recursions_offset() - monitor_tag);
    const Address owner_address(monitor, ObjectMonitor::owner_offset() - monitor_tag);

    Label monitor_locked;
    // Lock the monitor.

    if (UseObjectMonitorTable) {
      // Cache the monitor for unlock before trashing box. On failure to acquire
      // the lock, the slow path will reset the entry accordingly (see CacheSetter).
      movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), monitor);
    }

    // Try to CAS owner (no owner => current thread's _monitor_owner_id).
    xorptr(rax_reg, rax_reg);
    movptr(box, Address(thread, JavaThread::monitor_owner_id_offset()));
    lock(); cmpxchgptr(box, owner_address);
    jccb(Assembler::equal, monitor_locked);

    // Check if recursive: on CAS failure rax holds the observed owner, which
    // must equal this thread's owner id for a recursive enter.
    cmpptr(box, rax_reg);
    jccb(Assembler::notEqual, slow_path);

    // Recursive.
    increment(recursions_address);

    bind(monitor_locked);
  }

  bind(locked);
  // Set ZF = 1
  xorl(rax_reg, rax_reg);

#ifdef ASSERT
  // Check that locked label is reached with ZF set.
  Label zf_correct;
  Label zf_bad_zero;
  jcc(Assembler::zero, zf_correct);
  jmp(zf_bad_zero);
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with ZF not set.
  jcc(Assembler::notZero, zf_correct);
  stop("Fast Lock ZF != 0");
  bind(zf_bad_zero);
  stop("Fast Lock ZF != 1");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}
438
// obj: object to unlock
440 // rax: tmp -- KILLED
441 // t : tmp - cannot be obj nor rax -- KILLED
442 //
443 // Some commentary on balanced locking:
444 //
445 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
446 // Methods that don't have provably balanced locking are forced to run in the
447 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
448 // The interpreter provides two properties:
449 // I1: At return-time the interpreter automatically and quietly unlocks any
450 // objects acquired in the current activation (frame). Recall that the
451 // interpreter maintains an on-stack list of locks currently held by
452 // a frame.
453 // I2: If a method attempts to unlock an object that is not held by the
454 // frame the interpreter throws IMSX.
455 //
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
457 // B() doesn't have provably balanced locking so it runs in the interpreter.
458 // Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O
459 // is still locked by A().
460 //
461 // The only other source of unbalanced locking would be JNI. The "Java Native Interface
462 // Specification" states that an object locked by JNI's MonitorEnter should not be
463 // unlocked by "normal" java-level locking and vice-versa. The specification doesn't
464 // specify what will occur if a program engages in such mixed-mode locking, however.
465 // Arguably given that the spec legislates the JNI case as undefined our implementation
466 // could reasonably *avoid* checking owner in fast_unlock().
467 // In the interest of performance we elide m->Owner==Self check in unlock.
468 // A perfectly viable alternative is to elide the owner check except when
469 // Xcheck:jni is enabled.
470
// Emit the C2 fast-unlock sequence. On exit, ZF == 1 means the lock was
// released; ZF == 0 means the caller must branch to the runtime slow path.
void C2_MacroAssembler::fast_unlock(Register obj, Register reg_rax, Register t, Register thread) {
  assert(reg_rax == rax, "Used for CAS");
  assert_different_registers(obj, reg_rax, t);

  // Handle inflated monitor.
  Label inflated, inflated_check_lock_stack;
  // Finish fast unlock successfully. MUST jump with ZF == 1
  Label unlocked, slow_path;

  // Register aliases; which name is live depends on UseObjectMonitorTable.
  const Register mark = t;
  const Register monitor = t;
  const Register top = UseObjectMonitorTable ? t : reg_rax;
  const Register box = reg_rax;

  Label dummy;
  C2FastUnlockStub* stub = nullptr;

  if (!Compile::current()->output()->in_scratch_emit_size()) {
    // Out-of-line stub used when the lightweight-unlock CAS fails; only
    // created during real emission (not while measuring code size).
    stub = new (Compile::current()->comp_arena()) C2FastUnlockStub(obj, mark, reg_rax, thread);
    Compile::current()->output()->add_stub(stub);
  }

  Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();

  { // Fast Unlock

    // Load top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    if (!UseObjectMonitorTable) {
      // Prefetch mark.
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }

    // Check if obj is top of lock-stack.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    // Top of lock stack was not obj. Must be monitor.
    jcc(Assembler::notEqual, inflated_check_lock_stack);

    // Pop lock-stack.
    DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
    subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);

    // Check if recursive: obj occurs again in the next lock-stack slot down.
    cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
    jcc(Assembler::equal, unlocked);

    // We elide the monitor check, let the CAS fail instead.

    if (UseObjectMonitorTable) {
      // Load mark (could not be prefetched above: top aliases t == mark here).
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }

    // Try to unlock. Transition lock bits 0b00 => 0b01:
    // expected value (rax) has the lock bits cleared, new value sets unlocked.
    movptr(reg_rax, mark);
    andptr(reg_rax, ~(int32_t)markWord::lock_mask);
    orptr(mark, markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    // CAS failed (mark changed, e.g. inflated): re-push obj and go slow.
    jcc(Assembler::notEqual, push_and_slow_path);
    jmp(unlocked);
  }


  { // Handle inflated monitor.
    bind(inflated_check_lock_stack);
#ifdef ASSERT
    // Debug-only: walk down and verify obj is nowhere on this thread's
    // lock-stack, and that the mark word really has the monitor bit set.
    Label check_done;
    subl(top, oopSize);
    cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
    jcc(Assembler::below, check_done);
    cmpptr(obj, Address(thread, top));
    jcc(Assembler::notEqual, inflated_check_lock_stack);
    stop("Fast Unlock lock on stack");
    bind(check_done);
    if (UseObjectMonitorTable) {
      movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    }
    testptr(mark, markWord::monitor_value);
    jcc(Assembler::notZero, inflated);
    stop("Fast Unlock not monitor");
#endif

    bind(inflated);

    if (!UseObjectMonitorTable) {
      // mark (== monitor) holds the tagged monitor pointer.
      assert(mark == monitor, "should be the same here");
    } else {
      // Uses ObjectMonitorTable. Look for the monitor in our BasicLock on the stack.
      movptr(monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
      // null check with ZF == 0, no valid pointer below alignof(ObjectMonitor*)
      cmpptr(monitor, alignof(ObjectMonitor*));
      jcc(Assembler::below, slow_path);
    }
    // Compensate for the monitor tag when reading fields off the mark word;
    // table-provided pointers are untagged (tag == 0).
    const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
    const Address recursions_address{monitor, ObjectMonitor::recursions_offset() - monitor_tag};
    const Address succ_address{monitor, ObjectMonitor::succ_offset() - monitor_tag};
    const Address entry_list_address{monitor, ObjectMonitor::entry_list_offset() - monitor_tag};
    const Address owner_address{monitor, ObjectMonitor::owner_offset() - monitor_tag};

    Label recursive;

    // Check if recursive.
    cmpptr(recursions_address, 0);
    jcc(Assembler::notZero, recursive);

    // Set owner to null.
    // Release to satisfy the JMM
    movptr(owner_address, NULL_WORD);
    // We need a full fence after clearing owner to avoid stranding.
    // StoreLoad achieves this.
    membar(StoreLoad);

    // Check if the entry_list is empty.
    cmpptr(entry_list_address, NULL_WORD);
    jcc(Assembler::zero, unlocked); // If so we are done.

    // Check if there is a successor.
    cmpptr(succ_address, NULL_WORD);
    jcc(Assembler::notZero, unlocked); // If so we are done.

    // Save the monitor pointer in the current thread, so we can try to
    // reacquire the lock in SharedRuntime::monitor_exit_helper().
    if (!UseObjectMonitorTable) {
      andptr(monitor, ~(int32_t)markWord::monitor_value);
    }
    movptr(Address(thread, JavaThread::unlocked_inflated_monitor_offset()), monitor);

    orl(t, 1); // Fast Unlock ZF = 0
    jmpb(slow_path);

    // Recursive unlock.
    bind(recursive);
    decrement(recursions_address);
  }

  bind(unlocked);
  xorl(t, t); // Fast Unlock ZF = 1

#ifdef ASSERT
  // Check that unlocked label is reached with ZF set.
  Label zf_correct;
  Label zf_bad_zero;
  jcc(Assembler::zero, zf_correct);
  jmp(zf_bad_zero);
#endif

  bind(slow_path);
  if (stub != nullptr) {
    bind(stub->slow_path_continuation());
  }
#ifdef ASSERT
  // Check that stub->continuation() label is reached with ZF not set.
  jcc(Assembler::notZero, zf_correct);
  stop("Fast Unlock ZF != 0");
  bind(zf_bad_zero);
  stop("Fast Unlock ZF != 1");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}
632
// Out-of-line failure handler for verify_int_in_range(): reports a CastII
// value that escaped its declared range and aborts the VM; never returns.
static void abort_verify_int_in_range(uint idx, jint val, jint lo, jint hi) {
  fatal("Invalid CastII, idx: %u, val: %d, lo: %d, hi: %d", idx, val, lo, hi);
}
636
637 static void reconstruct_frame_pointer_helper(MacroAssembler* masm, Register dst) {
638 const int framesize = Compile::current()->output()->frame_size_in_bytes();
639 masm->movptr(dst, rsp);
640 if (framesize > 2 * wordSize) {
641 masm->addptr(dst, framesize - 2 * wordSize);
642 }
643 }
644
645 void C2_MacroAssembler::reconstruct_frame_pointer(Register rtmp) {
646 if (PreserveFramePointer) {
647 // frame pointer is valid
648 #ifdef ASSERT
649 // Verify frame pointer value in rbp.
650 reconstruct_frame_pointer_helper(this, rtmp);
651 Label L_success;
652 cmpq(rbp, rtmp);
653 jccb(Assembler::equal, L_success);
654 STOP("frame pointer mismatch");
655 bind(L_success);
656 #endif // ASSERT
657 } else {
658 reconstruct_frame_pointer_helper(this, rbp);
659 }
660 }
661
662 void C2_MacroAssembler::verify_int_in_range(uint idx, const TypeInt* t, Register val) {
663 jint lo = t->_lo;
664 jint hi = t->_hi;
665 assert(lo < hi, "type should not be empty or constant, idx: %u, lo: %d, hi: %d", idx, lo, hi);
666 if (t == TypeInt::INT) {
667 return;
668 }
669
670 BLOCK_COMMENT("CastII {");
671 Label fail;
672 Label succeed;
673
674 if (lo != min_jint) {
675 cmpl(val, lo);
676 jccb(Assembler::less, fail);
677 }
678 if (hi != max_jint) {
679 cmpl(val, hi);
680 jccb(Assembler::greater, fail);
681 }
682 jmpb(succeed);
683
684 bind(fail);
685 movl(c_rarg0, idx);
686 movl(c_rarg1, val);
687 movl(c_rarg2, lo);
688 movl(c_rarg3, hi);
689 reconstruct_frame_pointer(rscratch1);
690 call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_int_in_range)));
691 hlt();
692 bind(succeed);
693 BLOCK_COMMENT("} // CastII");
694 }
695
// Out-of-line failure handler for verify_long_in_range(): reports a CastLL
// value that escaped its declared range and aborts the VM; never returns.
static void abort_verify_long_in_range(uint idx, jlong val, jlong lo, jlong hi) {
  fatal("Invalid CastLL, idx: %u, val: " JLONG_FORMAT ", lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, val, lo, hi);
}
699
700 void C2_MacroAssembler::verify_long_in_range(uint idx, const TypeLong* t, Register val, Register tmp) {
701 jlong lo = t->_lo;
702 jlong hi = t->_hi;
703 assert(lo < hi, "type should not be empty or constant, idx: %u, lo: " JLONG_FORMAT ", hi: " JLONG_FORMAT, idx, lo, hi);
704 if (t == TypeLong::LONG) {
705 return;
706 }
707
708 BLOCK_COMMENT("CastLL {");
709 Label fail;
710 Label succeed;
711
712 auto cmp_val = [&](jlong bound) {
713 if (is_simm32(bound)) {
714 cmpq(val, checked_cast<int>(bound));
715 } else {
716 mov64(tmp, bound);
717 cmpq(val, tmp);
718 }
719 };
720
721 if (lo != min_jlong) {
722 cmp_val(lo);
723 jccb(Assembler::less, fail);
724 }
725 if (hi != max_jlong) {
726 cmp_val(hi);
727 jccb(Assembler::greater, fail);
728 }
729 jmpb(succeed);
730
731 bind(fail);
732 movl(c_rarg0, idx);
733 movq(c_rarg1, val);
734 mov64(c_rarg2, lo);
735 mov64(c_rarg3, hi);
736 reconstruct_frame_pointer(rscratch1);
737 call(RuntimeAddress(CAST_FROM_FN_PTR(address, abort_verify_long_in_range)));
738 hlt();
739 bind(succeed);
740 BLOCK_COMMENT("} // CastLL");
741 }
742
743 //-------------------------------------------------------------------------------------------
744 // Generic instructions support for use in .ad files C2 code generation
745
746 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
747 if (dst != src) {
748 movdqu(dst, src);
749 }
750 if (opcode == Op_AbsVD) {
751 andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
752 } else {
753 assert((opcode == Op_NegVD),"opcode should be Op_NegD");
754 xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
755 }
756 }
757
758 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
759 if (opcode == Op_AbsVD) {
760 vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
761 } else {
762 assert((opcode == Op_NegVD),"opcode should be Op_NegD");
763 vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
764 }
765 }
766
767 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
768 if (dst != src) {
769 movdqu(dst, src);
770 }
771 if (opcode == Op_AbsVF) {
772 andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
773 } else {
774 assert((opcode == Op_NegVF),"opcode should be Op_NegF");
775 xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
776 }
777 }
778
779 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
780 if (opcode == Op_AbsVF) {
781 vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
782 } else {
783 assert((opcode == Op_NegVF),"opcode should be Op_NegF");
784 vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
785 }
786 }
787
788 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
789 assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
790 assert(tmp == xnoreg || elem_bt == T_LONG, "unused");
791
792 if (opcode == Op_MinV) {
793 if (elem_bt == T_BYTE) {
794 pminsb(dst, src);
795 } else if (elem_bt == T_SHORT) {
796 pminsw(dst, src);
797 } else if (elem_bt == T_INT) {
798 pminsd(dst, src);
799 } else {
800 assert(elem_bt == T_LONG, "required");
801 assert(tmp == xmm0, "required");
802 assert_different_registers(dst, src, tmp);
803 movdqu(xmm0, dst);
804 pcmpgtq(xmm0, src);
805 blendvpd(dst, src); // xmm0 as mask
806 }
807 } else { // opcode == Op_MaxV
808 if (elem_bt == T_BYTE) {
809 pmaxsb(dst, src);
810 } else if (elem_bt == T_SHORT) {
811 pmaxsw(dst, src);
812 } else if (elem_bt == T_INT) {
813 pmaxsd(dst, src);
814 } else {
815 assert(elem_bt == T_LONG, "required");
816 assert(tmp == xmm0, "required");
817 assert_different_registers(dst, src, tmp);
818 movdqu(xmm0, src);
819 pcmpgtq(xmm0, dst);
820 blendvpd(dst, src); // xmm0 as mask
821 }
822 }
823 }
824
825 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
826 XMMRegister src1, Address src2, int vlen_enc) {
827 assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
828 if (opcode == Op_UMinV) {
829 switch(elem_bt) {
830 case T_BYTE: vpminub(dst, src1, src2, vlen_enc); break;
831 case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
832 case T_INT: vpminud(dst, src1, src2, vlen_enc); break;
833 case T_LONG: evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
834 default: fatal("Unsupported type %s", type2name(elem_bt)); break;
835 }
836 } else {
837 assert(opcode == Op_UMaxV, "required");
838 switch(elem_bt) {
839 case T_BYTE: vpmaxub(dst, src1, src2, vlen_enc); break;
840 case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
841 case T_INT: vpmaxud(dst, src1, src2, vlen_enc); break;
842 case T_LONG: evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
843 default: fatal("Unsupported type %s", type2name(elem_bt)); break;
844 }
845 }
846 }
847
// Emit unsigned 64-bit vector min/max into dst. With EVEX (but no AVX512VL)
// the native EVEX instructions are used at full 512-bit width; otherwise the
// unsigned comparison is emulated by adding 2^63 to both operands (flipping
// their sign bits), doing a signed compare, and blending on the result.
// xtmp1 and xtmp2 are clobbered on the emulated path.
void C2_MacroAssembler::vpuminmaxq(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  // For optimality, leverage a full vector width of 512 bits
  // for operations over smaller vector sizes on AVX512 targets.
  if (VM_Version::supports_evex() && !VM_Version::supports_avx512vl()) {
    if (opcode == Op_UMaxV) {
      evpmaxuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
    } else {
      assert(opcode == Op_UMinV, "required");
      evpminuq(dst, k0, src1, src2, false, Assembler::AVX_512bit);
    }
  } else {
    // T1 = -1 (all bits set)
    vpcmpeqq(xtmp1, xtmp1, xtmp1, vlen_enc);
    // T1 = -1 << 63 (the per-lane sign-bit mask 0x8000_0000_0000_0000)
    vpsllq(xtmp1, xtmp1, 63, vlen_enc);
    // Convert SRC2 to signed value i.e. T2 = T1 + SRC2
    vpaddq(xtmp2, xtmp1, src2, vlen_enc);
    // Convert SRC1 to signed value i.e. T1 = T1 + SRC1
    vpaddq(xtmp1, xtmp1, src1, vlen_enc);
    // Mask = T2 > T1 (signed compare on biased values == unsigned SRC2 > SRC1)
    vpcmpgtq(xtmp1, xtmp2, xtmp1, vlen_enc);
    if (opcode == Op_UMaxV) {
      // Res = Mask ? Src2 : Src1
      vpblendvb(dst, src1, src2, xtmp1, vlen_enc);
    } else {
      // Res = Mask ? Src1 : Src2
      vpblendvb(dst, src2, src1, xtmp1, vlen_enc);
    }
  }
}
878
879 void C2_MacroAssembler::vpuminmax(int opcode, BasicType elem_bt, XMMRegister dst,
880 XMMRegister src1, XMMRegister src2, int vlen_enc) {
881 assert(opcode == Op_UMinV || opcode == Op_UMaxV, "sanity");
882 if (opcode == Op_UMinV) {
883 switch(elem_bt) {
884 case T_BYTE: vpminub(dst, src1, src2, vlen_enc); break;
885 case T_SHORT: vpminuw(dst, src1, src2, vlen_enc); break;
886 case T_INT: vpminud(dst, src1, src2, vlen_enc); break;
887 case T_LONG: evpminuq(dst, k0, src1, src2, false, vlen_enc); break;
888 default: fatal("Unsupported type %s", type2name(elem_bt)); break;
889 }
890 } else {
891 assert(opcode == Op_UMaxV, "required");
892 switch(elem_bt) {
893 case T_BYTE: vpmaxub(dst, src1, src2, vlen_enc); break;
894 case T_SHORT: vpmaxuw(dst, src1, src2, vlen_enc); break;
895 case T_INT: vpmaxud(dst, src1, src2, vlen_enc); break;
896 case T_LONG: evpmaxuq(dst, k0, src1, src2, false, vlen_enc); break;
897 default: fatal("Unsupported type %s", type2name(elem_bt)); break;
898 }
899 }
900 }
901
902 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
903 XMMRegister dst, XMMRegister src1, XMMRegister src2,
904 int vlen_enc) {
905 assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
906
907 if (opcode == Op_MinV) {
908 if (elem_bt == T_BYTE) {
909 vpminsb(dst, src1, src2, vlen_enc);
910 } else if (elem_bt == T_SHORT) {
911 vpminsw(dst, src1, src2, vlen_enc);
912 } else if (elem_bt == T_INT) {
913 vpminsd(dst, src1, src2, vlen_enc);
914 } else {
915 assert(elem_bt == T_LONG, "required");
916 if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
917 vpminsq(dst, src1, src2, vlen_enc);
918 } else {
919 assert_different_registers(dst, src1, src2);
920 vpcmpgtq(dst, src1, src2, vlen_enc);
921 vblendvpd(dst, src1, src2, dst, vlen_enc);
922 }
923 }
924 } else { // opcode == Op_MaxV
925 if (elem_bt == T_BYTE) {
926 vpmaxsb(dst, src1, src2, vlen_enc);
927 } else if (elem_bt == T_SHORT) {
928 vpmaxsw(dst, src1, src2, vlen_enc);
929 } else if (elem_bt == T_INT) {
930 vpmaxsd(dst, src1, src2, vlen_enc);
931 } else {
932 assert(elem_bt == T_LONG, "required");
933 if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
934 vpmaxsq(dst, src1, src2, vlen_enc);
935 } else {
936 assert_different_registers(dst, src1, src2);
937 vpcmpgtq(dst, src1, src2, vlen_enc);
938 vblendvpd(dst, src2, src1, dst, vlen_enc);
939 }
940 }
941 }
942 }
943
944 // Float/Double min max
945
void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
                                   XMMRegister dst, XMMRegister a, XMMRegister b,
                                   XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                   int vlen_enc) {
  assert(UseAVX > 0, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(a, tmp, atmp, btmp);
  assert_different_registers(b, tmp, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);

  /* Note on 'non-obvious' assembly sequence:
   *
   * While there are vminps/vmaxps instructions, there are two important differences between hardware
   * and Java on how they handle floats:
   *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
   *  b. NaN is not necessarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
   *
   * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
   *  a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
   *                (only useful when signs differ, noop otherwise)
   *  b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])

   * Following pseudo code describes the algorithm for max[FD] (Min algorithm is on similar lines):
   *   btmp = (b < +0.0) ? a : b
   *   atmp = (b < +0.0) ? b : a
   *   Tmp  = Max_Float(atmp , btmp)
   *   Res  = (atmp == NaN) ? atmp : Tmp
   */

  // Select the blend/minmax/compare emitters and the register whose sign
  // bits drive the pre-blend ('a' for min, 'b' for max, see pseudo code).
  void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
  void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
  void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
  XMMRegister mask;

  if (!is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vminps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (!is_double_word && !is_min) {
    mask = b;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vmaxps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vminpd;
    vcmp = &MacroAssembler::vcmppd;
  } else {
    assert(is_double_word && !is_min, "sanity");
    mask = b;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vmaxpd;
    vcmp = &MacroAssembler::vcmppd;
  }

  // Make sure EnableX86ECoreOpts isn't disabled on register overlaps
  XMMRegister maxmin, scratch;
  if (dst == btmp) {
    maxmin = btmp;
    scratch = tmp;
  } else {
    maxmin = tmp;
    scratch = btmp;
  }

  bool precompute_mask = EnableX86ECoreOpts && UseAVX>1;
  if (precompute_mask && !is_double_word) {
    // Arithmetic right shift by >= 31 replicates the sign bit across each
    // 32-bit lane, materializing an explicit all-0/all-1 blend mask.
    vpsrad(tmp, mask, 32, vlen_enc);
    mask = tmp;
  } else if (precompute_mask && is_double_word) {
    // 64-bit lanes: build the sign mask via 0 > mask (true iff sign bit set).
    vpxor(tmp, tmp, tmp, vlen_enc);
    vpcmpgtq(tmp, tmp, mask, vlen_enc);
    mask = tmp;
  }

  // atmp/btmp = sign-ordered operands; maxmin = hardware min/max of them;
  // scratch = unordered(self)-mask of atmp, used to propagate a NaN in atmp.
  (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);
  (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);
  (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
  (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
  (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch);
}
1033
void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
                                    XMMRegister dst, XMMRegister a, XMMRegister b,
                                    KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
                                    int vlen_enc) {
  // AVX-512 variant of vminmax_fp above: the same sign-ordering pre-blend and
  // NaN fixup, but driven by an opmask built from the sign bits
  // (evpmovd2m/evpmovq2m) instead of a vector mask. Per branch:
  //   ktmp = sign bits of ('a' for min, 'b' for max)
  //   atmp/btmp = operands blended so the hardware min/max picks the Java
  //               answer when signs differ
  //   dst = min/max(atmp, btmp), then lanes where atmp is NaN (unordered
  //         with itself) are overwritten with atmp to propagate the NaN.
  assert(UseAVX > 2, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(dst, a, atmp, btmp);
  assert_different_registers(dst, b, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);
  bool merge = true;

  if (!is_double_word && is_min) {
    evpmovd2m(ktmp, a, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vminps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (!is_double_word && !is_min) {
    evpmovd2m(ktmp, b, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (is_double_word && is_min) {
    evpmovq2m(ktmp, a, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vminpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  } else {
    assert(is_double_word && !is_min, "sanity");
    evpmovq2m(ktmp, b, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  }
}
1080
1081 void C2_MacroAssembler::vminmax_fp(int opc, BasicType elem_bt, XMMRegister dst, KRegister mask,
1082 XMMRegister src1, XMMRegister src2, int vlen_enc) {
1083 assert(opc == Op_MinV || opc == Op_MinReductionV ||
1084 opc == Op_MaxV || opc == Op_MaxReductionV, "sanity");
1085
1086 int imm8 = (opc == Op_MinV || opc == Op_MinReductionV) ? AVX10_2_MINMAX_MIN_COMPARE_SIGN
1087 : AVX10_2_MINMAX_MAX_COMPARE_SIGN;
1088 if (elem_bt == T_FLOAT) {
1089 evminmaxps(dst, mask, src1, src2, true, imm8, vlen_enc);
1090 } else {
1091 assert(elem_bt == T_DOUBLE, "");
1092 evminmaxpd(dst, mask, src1, src2, true, imm8, vlen_enc);
1093 }
1094 }
1095
1096 // Float/Double signum
void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
  // Math.signum for float/double: returns 1.0 for positive input, -1.0 for
  // negative input, and the argument itself for +0.0/-0.0/NaN.
  assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");

  Label DONE_LABEL;

  // Handle special cases +0.0/-0.0 and NaN, if argument is +0.0/-0.0 or NaN, return argument
  // If AVX10.2 (or newer) floating point comparison instructions used, SF=1 for equal and unordered cases
  // If other floating point comparison instructions used, ZF=1 for equal and unordered cases
  if (opcode == Op_SignumF) {
    if (VM_Version::supports_avx10_2()) {
      vucomxss(dst, zero);
      jcc(Assembler::negative, DONE_LABEL);
    } else {
      ucomiss(dst, zero);
      jcc(Assembler::equal, DONE_LABEL);
    }
    // The mov does not modify EFLAGS, so the following jcc still tests the
    // result of the compare above: dst > 0.0 means the answer is 1.0.
    movflt(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    // dst < 0.0: flip the sign of the loaded 1.0 to produce -1.0.
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
  } else if (opcode == Op_SignumD) {
    if (VM_Version::supports_avx10_2()) {
      vucomxsd(dst, zero);
      jcc(Assembler::negative, DONE_LABEL);
    } else {
      ucomisd(dst, zero);
      jcc(Assembler::equal, DONE_LABEL);
    }
    // Same pattern as the float case, on the double-precision flags/constants.
    movdbl(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
  }

  bind(DONE_LABEL);
}
1131
1132 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1133 if (sign) {
1134 pmovsxbw(dst, src);
1135 } else {
1136 pmovzxbw(dst, src);
1137 }
1138 }
1139
1140 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1141 if (sign) {
1142 vpmovsxbw(dst, src, vector_len);
1143 } else {
1144 vpmovzxbw(dst, src, vector_len);
1145 }
1146 }
1147
1148 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1149 if (sign) {
1150 vpmovsxbd(dst, src, vector_len);
1151 } else {
1152 vpmovzxbd(dst, src, vector_len);
1153 }
1154 }
1155
1156 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1157 if (sign) {
1158 vpmovsxwd(dst, src, vector_len);
1159 } else {
1160 vpmovzxwd(dst, src, vector_len);
1161 }
1162 }
1163
1164 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1165 int shift, int vector_len) {
1166 if (opcode == Op_RotateLeftV) {
1167 if (etype == T_INT) {
1168 evprold(dst, src, shift, vector_len);
1169 } else {
1170 assert(etype == T_LONG, "expected type T_LONG");
1171 evprolq(dst, src, shift, vector_len);
1172 }
1173 } else {
1174 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1175 if (etype == T_INT) {
1176 evprord(dst, src, shift, vector_len);
1177 } else {
1178 assert(etype == T_LONG, "expected type T_LONG");
1179 evprorq(dst, src, shift, vector_len);
1180 }
1181 }
1182 }
1183
1184 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1185 XMMRegister shift, int vector_len) {
1186 if (opcode == Op_RotateLeftV) {
1187 if (etype == T_INT) {
1188 evprolvd(dst, src, shift, vector_len);
1189 } else {
1190 assert(etype == T_LONG, "expected type T_LONG");
1191 evprolvq(dst, src, shift, vector_len);
1192 }
1193 } else {
1194 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1195 if (etype == T_INT) {
1196 evprorvd(dst, src, shift, vector_len);
1197 } else {
1198 assert(etype == T_LONG, "expected type T_LONG");
1199 evprorvq(dst, src, shift, vector_len);
1200 }
1201 }
1202 }
1203
1204 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1205 if (opcode == Op_RShiftVI) {
1206 psrad(dst, shift);
1207 } else if (opcode == Op_LShiftVI) {
1208 pslld(dst, shift);
1209 } else {
1210 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1211 psrld(dst, shift);
1212 }
1213 }
1214
1215 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1216 switch (opcode) {
1217 case Op_RShiftVI: psrad(dst, shift); break;
1218 case Op_LShiftVI: pslld(dst, shift); break;
1219 case Op_URShiftVI: psrld(dst, shift); break;
1220
1221 default: assert(false, "%s", NodeClassNames[opcode]);
1222 }
1223 }
1224
1225 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1226 if (opcode == Op_RShiftVI) {
1227 vpsrad(dst, nds, shift, vector_len);
1228 } else if (opcode == Op_LShiftVI) {
1229 vpslld(dst, nds, shift, vector_len);
1230 } else {
1231 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1232 vpsrld(dst, nds, shift, vector_len);
1233 }
1234 }
1235
1236 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1237 switch (opcode) {
1238 case Op_RShiftVI: vpsrad(dst, src, shift, vlen_enc); break;
1239 case Op_LShiftVI: vpslld(dst, src, shift, vlen_enc); break;
1240 case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1241
1242 default: assert(false, "%s", NodeClassNames[opcode]);
1243 }
1244 }
1245
1246 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1247 switch (opcode) {
1248 case Op_RShiftVB: // fall-through
1249 case Op_RShiftVS: psraw(dst, shift); break;
1250
1251 case Op_LShiftVB: // fall-through
1252 case Op_LShiftVS: psllw(dst, shift); break;
1253
1254 case Op_URShiftVS: // fall-through
1255 case Op_URShiftVB: psrlw(dst, shift); break;
1256
1257 default: assert(false, "%s", NodeClassNames[opcode]);
1258 }
1259 }
1260
1261 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1262 switch (opcode) {
1263 case Op_RShiftVB: // fall-through
1264 case Op_RShiftVS: vpsraw(dst, src, shift, vlen_enc); break;
1265
1266 case Op_LShiftVB: // fall-through
1267 case Op_LShiftVS: vpsllw(dst, src, shift, vlen_enc); break;
1268
1269 case Op_URShiftVS: // fall-through
1270 case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1271
1272 default: assert(false, "%s", NodeClassNames[opcode]);
1273 }
1274 }
1275
1276 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1277 switch (opcode) {
1278 case Op_RShiftVL: psrlq(dst, shift); break; // using srl to implement sra on pre-avs512 systems
1279 case Op_LShiftVL: psllq(dst, shift); break;
1280 case Op_URShiftVL: psrlq(dst, shift); break;
1281
1282 default: assert(false, "%s", NodeClassNames[opcode]);
1283 }
1284 }
1285
1286 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1287 if (opcode == Op_RShiftVL) {
1288 psrlq(dst, shift); // using srl to implement sra on pre-avs512 systems
1289 } else if (opcode == Op_LShiftVL) {
1290 psllq(dst, shift);
1291 } else {
1292 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1293 psrlq(dst, shift);
1294 }
1295 }
1296
1297 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1298 switch (opcode) {
1299 case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1300 case Op_LShiftVL: vpsllq(dst, src, shift, vlen_enc); break;
1301 case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1302
1303 default: assert(false, "%s", NodeClassNames[opcode]);
1304 }
1305 }
1306
1307 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1308 if (opcode == Op_RShiftVL) {
1309 evpsraq(dst, nds, shift, vector_len);
1310 } else if (opcode == Op_LShiftVL) {
1311 vpsllq(dst, nds, shift, vector_len);
1312 } else {
1313 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1314 vpsrlq(dst, nds, shift, vector_len);
1315 }
1316 }
1317
1318 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1319 switch (opcode) {
1320 case Op_RShiftVB: // fall-through
1321 case Op_RShiftVS: // fall-through
1322 case Op_RShiftVI: vpsravd(dst, src, shift, vlen_enc); break;
1323
1324 case Op_LShiftVB: // fall-through
1325 case Op_LShiftVS: // fall-through
1326 case Op_LShiftVI: vpsllvd(dst, src, shift, vlen_enc); break;
1327
1328 case Op_URShiftVB: // fall-through
1329 case Op_URShiftVS: // fall-through
1330 case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1331
1332 default: assert(false, "%s", NodeClassNames[opcode]);
1333 }
1334 }
1335
1336 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1337 switch (opcode) {
1338 case Op_RShiftVB: // fall-through
1339 case Op_RShiftVS: evpsravw(dst, src, shift, vlen_enc); break;
1340
1341 case Op_LShiftVB: // fall-through
1342 case Op_LShiftVS: evpsllvw(dst, src, shift, vlen_enc); break;
1343
1344 case Op_URShiftVB: // fall-through
1345 case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1346
1347 default: assert(false, "%s", NodeClassNames[opcode]);
1348 }
1349 }
1350
void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
  // Per-lane variable shift of 64-bit lanes. AVX2 provides logical variable
  // shifts (vpsllvq/vpsrlvq) but no arithmetic one, so Op_RShiftVL is
  // emulated below AVX-512.
  assert(UseAVX >= 2, "required");
  switch (opcode) {
    case Op_RShiftVL: {
      if (UseAVX > 2) {
        assert(tmp == xnoreg, "not used");
        if (!VM_Version::supports_avx512vl()) {
          // Without AVX512VL only the 512-bit encoding of evpsravq exists.
          vlen_enc = Assembler::AVX_512bit;
        }
        evpsravq(dst, src, shift, vlen_enc);
      } else {
        // Emulate arithmetic right shift via logical shift + sign fixup:
        //   m   = sign_mask >>(logical) shift   (0x8000... per lane)
        //   dst = ((src >>(logical) shift) ^ m) - m
        // The xor/sub pair propagates the original sign bit into the
        // vacated high bits, matching an arithmetic shift per lane.
        vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
        vpsrlvq(dst, src, shift, vlen_enc);
        vpsrlvq(tmp, tmp, shift, vlen_enc);
        vpxor(dst, dst, tmp, vlen_enc);
        vpsubq(dst, dst, tmp, vlen_enc);
      }
      break;
    }
    case Op_LShiftVL: {
      assert(tmp == xnoreg, "not used");
      vpsllvq(dst, src, shift, vlen_enc);
      break;
    }
    case Op_URShiftVL: {
      assert(tmp == xnoreg, "not used");
      vpsrlvq(dst, src, shift, vlen_enc);
      break;
    }
    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}
1383
// Variable shift of byte lanes: widen the 128-bit byte vector to dwords in a
// 256-bit register, do the per-lane dword shift, mask each dword down to its
// low byte, then pack back to words in dst. vtmp is clobbered as a TEMP.
void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
  assert(opcode == Op_LShiftVB ||
         opcode == Op_RShiftVB ||
         opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
  bool sign = (opcode != Op_URShiftVB);  // arithmetic vs logical right shift
  assert(vector_len == 0, "required");   // 128-bit input only
  vextendbd(sign, dst, src, 1);          // bytes -> dwords (256-bit)
  vpmovzxbd(vtmp, shift, 1);             // shift counts are always zero-extended
  varshiftd(opcode, dst, dst, vtmp, 1);
  // Keep only the low byte of each shifted dword.
  vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
  // Pack the two 128-bit halves of the dword vector into word lanes.
  vextracti128_high(vtmp, dst);
  vpackusdw(dst, dst, vtmp, 0);
}
1398
// Variable shift of byte lanes with a byte result: widen bytes to words at
// the next larger vector size, do the per-lane word shift, mask each word to
// its low byte, then pack back down to bytes. vtmp is clobbered as a TEMP.
void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
  assert(opcode == Op_LShiftVB ||
         opcode == Op_RShiftVB ||
         opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
  bool sign = (opcode != Op_URShiftVB);  // arithmetic vs logical right shift
  int ext_vector_len = vector_len + 1;   // widened vector is one size up
  vextendbw(sign, dst, src, ext_vector_len);
  vpmovzxbw(vtmp, shift, ext_vector_len);  // shift counts are always zero-extended
  varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
  // Keep only the low byte of each shifted word before packing.
  vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
  if (vector_len == 0) {
    // 128-bit result: pack the two halves of the 256-bit widened vector.
    vextracti128_high(vtmp, dst);
    vpackuswb(dst, dst, vtmp, vector_len);
  } else {
    // 256-bit result: vpackuswb packs within 128-bit lanes, so fix the
    // qword order afterwards (0xD8 selects qwords 0,2,1,3).
    vextracti64x4_high(vtmp, dst);
    vpackuswb(dst, dst, vtmp, vector_len);
    vpermq(dst, dst, 0xD8, vector_len);
  }
}
1419
1420 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1421 switch(typ) {
1422 case T_BYTE:
1423 pinsrb(dst, val, idx);
1424 break;
1425 case T_SHORT:
1426 pinsrw(dst, val, idx);
1427 break;
1428 case T_INT:
1429 pinsrd(dst, val, idx);
1430 break;
1431 case T_LONG:
1432 pinsrq(dst, val, idx);
1433 break;
1434 default:
1435 assert(false,"Should not reach here.");
1436 break;
1437 }
1438 }
1439
1440 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1441 switch(typ) {
1442 case T_BYTE:
1443 vpinsrb(dst, src, val, idx);
1444 break;
1445 case T_SHORT:
1446 vpinsrw(dst, src, val, idx);
1447 break;
1448 case T_INT:
1449 vpinsrd(dst, src, val, idx);
1450 break;
1451 case T_LONG:
1452 vpinsrq(dst, src, val, idx);
1453 break;
1454 default:
1455 assert(false,"Should not reach here.");
1456 break;
1457 }
1458 }
1459
void C2_MacroAssembler::vgather8b_masked(BasicType elem_bt, XMMRegister dst,
                                         Register base, Register idx_base,
                                         Register mask, Register mask_idx,
                                         Register rtmp, int vlen_enc) {
  // Gather one 64-bit slice (4 shorts or 8 bytes) from base[idx_base[i]]
  // into successive low lanes of dst, under a bit mask held in a GPR.
  // Lanes whose mask bit is clear remain zero. mask_idx is advanced for
  // every lane, taken or skipped, so a caller can keep a running bit
  // position across consecutive slices.
  vpxor(dst, dst, dst, vlen_enc);  // start from an all-zero slice
  if (elem_bt == T_SHORT) {
    for (int i = 0; i < 4; i++) {
      // dst[i] = mask[i] ? src[idx_base[i]] : 0
      Label skip_load;
      btq(mask, mask_idx);                   // CF = mask bit at mask_idx
      jccb(Assembler::carryClear, skip_load);
      movl(rtmp, Address(idx_base, i * 4));  // indices are 32-bit ints
      pinsrw(dst, Address(base, rtmp, Address::times_2), i);
      bind(skip_load);
      incq(mask_idx);                        // consume the bit either way
    }
  } else {
    assert(elem_bt == T_BYTE, "");
    for (int i = 0; i < 8; i++) {
      // dst[i] = mask[i] ? src[idx_base[i]] : 0
      Label skip_load;
      btq(mask, mask_idx);
      jccb(Assembler::carryClear, skip_load);
      movl(rtmp, Address(idx_base, i * 4));
      pinsrb(dst, Address(base, rtmp), i);
      bind(skip_load);
      incq(mask_idx);
    }
  }
}
1490
1491 void C2_MacroAssembler::vgather8b(BasicType elem_bt, XMMRegister dst,
1492 Register base, Register idx_base,
1493 Register rtmp, int vlen_enc) {
1494 vpxor(dst, dst, dst, vlen_enc);
1495 if (elem_bt == T_SHORT) {
1496 for (int i = 0; i < 4; i++) {
1497 // dst[i] = src[idx_base[i]]
1498 movl(rtmp, Address(idx_base, i * 4));
1499 pinsrw(dst, Address(base, rtmp, Address::times_2), i);
1500 }
1501 } else {
1502 assert(elem_bt == T_BYTE, "");
1503 for (int i = 0; i < 8; i++) {
1504 // dst[i] = src[idx_base[i]]
1505 movl(rtmp, Address(idx_base, i * 4));
1506 pinsrb(dst, Address(base, rtmp), i);
1507 }
1508 }
1509 }
1510
1511 /*
1512 * Gather using hybrid algorithm, first partially unroll scalar loop
1513 * to accumulate values from gather indices into a quad-word(64bit) slice.
1514 * A slice may hold 8 bytes or 4 short values. This is followed by a vector
1515 * permutation to place the slice into appropriate vector lane
1516 * locations in destination vector. Following pseudo code describes the
1517 * algorithm in detail:
1518 *
1519 * DST_VEC = ZERO_VEC
1520 * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..}
1521 * TWO_VEC = {2, 2, 2, 2, 2, 2, 2, 2, 2..}
1522 * FOREACH_ITER:
1523 * TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES
1524 * TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX
1525 * DST_VEC = DST_VEC OR TEMP_PERM_VEC
1526 * PERM_INDEX = PERM_INDEX - TWO_VEC
1527 *
1528 * With each iteration, doubleword permute indices (0,1) corresponding
1529 * to gathered quadword gets right shifted by two lane positions.
1530 *
1531 */
void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst,
                                        Register base, Register idx_base,
                                        Register mask, XMMRegister xtmp1,
                                        XMMRegister xtmp2, XMMRegister temp_dst,
                                        Register rtmp, Register mask_idx,
                                        Register length, int vector_len, int vlen_enc) {
  // See the block comment above for the overall algorithm: gather 64-bit
  // slices with vgather8b[_masked], then OR each slice into its final lane
  // position via a descending permute index vector.
  Label GATHER8_LOOP;
  assert(is_subword_type(elem_ty), "");
  movl(length, vector_len);              // remaining element count
  vpxor(xtmp1, xtmp1, xtmp1, vlen_enc);  // xtmp1 = {0, ...}
  vpxor(dst, dst, dst, vlen_enc);        // dst = {0, ...}
  // Build TWO_VEC = {2, 2, ...}: 0 - (-1) = 1 per lane, then shift left by 1.
  vallones(xtmp2, vlen_enc);
  vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc);
  vpslld(xtmp2, xtmp2, 1, vlen_enc);     // xtmp2 = {2, 2, ...}
  load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...}

  bind(GATHER8_LOOP);
    // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES
    if (mask == noreg) {
      vgather8b(elem_ty, temp_dst, base, idx_base, rtmp, vlen_enc);
    } else {
      vgather8b_masked(elem_ty, temp_dst, base, idx_base, mask, mask_idx, rtmp, vlen_enc);
    }
    // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1)
    vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? vlen_enc : Assembler::AVX_256bit);
    // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2)
    vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc);
    // DST_VEC = DST_VEC OR TEMP_PERM_VEC
    vpor(dst, dst, temp_dst, vlen_enc);
    // Advance past the 8 bytes / 4 shorts consumed this iteration
    // (each gather index is a 4-byte int).
    addptr(idx_base,  32 >> (type2aelembytes(elem_ty) - 1));
    subl(length, 8 >> (type2aelembytes(elem_ty) - 1));
    jcc(Assembler::notEqual, GATHER8_LOOP);
}
1565
1566 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1567 switch(typ) {
1568 case T_INT:
1569 vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1570 break;
1571 case T_FLOAT:
1572 vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1573 break;
1574 case T_LONG:
1575 vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1576 break;
1577 case T_DOUBLE:
1578 vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1579 break;
1580 default:
1581 assert(false,"Should not reach here.");
1582 break;
1583 }
1584 }
1585
1586 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1587 switch(typ) {
1588 case T_INT:
1589 evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1590 break;
1591 case T_FLOAT:
1592 evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1593 break;
1594 case T_LONG:
1595 evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1596 break;
1597 case T_DOUBLE:
1598 evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1599 break;
1600 default:
1601 assert(false,"Should not reach here.");
1602 break;
1603 }
1604 }
1605
1606 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1607 switch(typ) {
1608 case T_INT:
1609 evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1610 break;
1611 case T_FLOAT:
1612 evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1613 break;
1614 case T_LONG:
1615 evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1616 break;
1617 case T_DOUBLE:
1618 evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1619 break;
1620 default:
1621 assert(false,"Should not reach here.");
1622 break;
1623 }
1624 }
1625
void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
  // Expand a per-byte boolean vector in src (presumably 0/1 per lane -- the
  // negation below turns 1 into -1) into a full element-wide 0 / all-ones
  // mask in dst: negate each byte, then sign-extend to the element width.
  if (vlen_in_bytes <= 16) {
    pxor (dst, dst);
    psubb(dst, src);  // dst = 0 - src: byte 1 becomes 0xFF
    switch (elem_bt) {
      case T_BYTE:   /* nothing to do */ break;
      case T_SHORT:  pmovsxbw(dst, dst); break;
      case T_INT:    pmovsxbd(dst, dst); break;
      case T_FLOAT:  pmovsxbd(dst, dst); break;
      case T_LONG:   pmovsxbq(dst, dst); break;
      case T_DOUBLE: pmovsxbq(dst, dst); break;

      default: assert(false, "%s", type2name(elem_bt));
    }
  } else {
    assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
    int vlen_enc = vector_length_encoding(vlen_in_bytes);

    vpxor (dst, dst, dst, vlen_enc);
    // Legacy (non-EVEX) encodings cap the byte subtract at 256 bits; the
    // subsequent sign-extension widens into the larger destination.
    vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);

    switch (elem_bt) {
      case T_BYTE:   /* nothing to do */ break;
      case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
      case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
      case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
      case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
      case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;

      default: assert(false, "%s", type2name(elem_bt));
    }
  }
}
1659
1660 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) {
1661 if (novlbwdq) {
1662 vpmovsxbd(xtmp, src, vlen_enc);
1663 evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()),
1664 Assembler::eq, true, vlen_enc, noreg);
1665 } else {
1666 vpxor(xtmp, xtmp, xtmp, vlen_enc);
1667 vpsubb(xtmp, xtmp, src, vlen_enc);
1668 evpmovb2m(dst, xtmp, vlen_enc);
1669 }
1670 }
1671
1672 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, Address src, int vlen_in_bytes) {
1673 if (is_integral_type(bt)) {
1674 switch (vlen_in_bytes) {
1675 case 4: movdl(dst, src); break;
1676 case 8: movq(dst, src); break;
1677 case 16: movdqu(dst, src); break;
1678 case 32: vmovdqu(dst, src); break;
1679 case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break;
1680 default: ShouldNotReachHere();
1681 }
1682 } else {
1683 switch (vlen_in_bytes) {
1684 case 4: movflt(dst, src); break;
1685 case 8: movdbl(dst, src); break;
1686 case 16: movups(dst, src); break;
1687 case 32: vmovups(dst, src, Assembler::AVX_256bit); break;
1688 case 64: vmovups(dst, src, Assembler::AVX_512bit); break;
1689 default: ShouldNotReachHere();
1690 }
1691 }
1692 }
1693
1694 void C2_MacroAssembler::load_vector(BasicType bt, XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) {
1695 assert(rscratch != noreg || always_reachable(src), "missing");
1696
1697 if (reachable(src)) {
1698 load_vector(bt, dst, as_Address(src), vlen_in_bytes);
1699 } else {
1700 lea(rscratch, src);
1701 load_vector(bt, dst, Address(rscratch, 0), vlen_in_bytes);
1702 }
1703 }
1704
void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) {
  // Broadcast the single constant element at 'src' across all lanes of dst,
  // picking the cheapest broadcast form available on this CPU.
  int vlen_enc = vector_length_encoding(vlen);
  if (VM_Version::supports_avx()) {
    if (bt == T_LONG) {
      // 64-bit element: the integer broadcast needs AVX2; otherwise fall
      // back to the FP duplicate (only the bit pattern matters).
      if (VM_Version::supports_avx2()) {
        vpbroadcastq(dst, src, vlen_enc);
      } else {
        vmovddup(dst, src, vlen_enc);
      }
    } else if (bt == T_DOUBLE) {
      // vbroadcastsd has no 128-bit form, so use movddup for XMM width.
      if (vlen_enc != Assembler::AVX_128bit) {
        vbroadcastsd(dst, src, vlen_enc, noreg);
      } else {
        vmovddup(dst, src, vlen_enc);
      }
    } else {
      // 32-bit element (int/float): prefer the integer broadcast on AVX2
      // for integral types, otherwise the FP broadcast.
      if (VM_Version::supports_avx2() && is_integral_type(bt)) {
        vpbroadcastd(dst, src, vlen_enc);
      } else {
        vbroadcastss(dst, src, vlen_enc);
      }
    }
  } else if (VM_Version::supports_sse3()) {
    // Pre-AVX with SSE3: duplicate the low 64 bits of the constant.
    movddup(dst, src);
  } else {
    // No broadcast support at all: plain vector load of the constant area.
    load_vector(bt, dst, src, vlen);
  }
}
1733
// Load the first vlen_in_bytes bytes of the iota (identity index) stub table
// for element type bt into dst.
void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) {
  // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64.
  int offset = exact_log2(type2aelembytes(bt)) << 6;
  if (is_floating_point_type(bt)) {
    offset += 128;
  }
  ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset);
  load_vector(T_BYTE, dst, addr, vlen_in_bytes);
}
1743
1744 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
1745
// Emit one 128-bit combining step of a reduction: dst = dst <op> src,
// elementwise, where <op> is selected by the reduction node opcode and the
// element type typ. The SSE forms update dst destructively.
void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
  int vector_len = Assembler::AVX_128bit;

  switch (opcode) {
    case Op_AndReductionV:  pand(dst, src); break;
    case Op_OrReductionV:   por (dst, src); break;
    case Op_XorReductionV:  pxor(dst, src); break;
    case Op_MinReductionV:
      switch (typ) {
        case T_BYTE:        pminsb(dst, src); break;
        case T_SHORT:       pminsw(dst, src); break;
        case T_INT:         pminsd(dst, src); break;
        // No SSE packed min/max for 64-bit elements; needs the EVEX form.
        case T_LONG:        assert(UseAVX > 2, "required");
                            vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_MaxReductionV:
      switch (typ) {
        case T_BYTE:        pmaxsb(dst, src); break;
        case T_SHORT:       pmaxsw(dst, src); break;
        case T_INT:         pmaxsd(dst, src); break;
        case T_LONG:        assert(UseAVX > 2, "required");
                            vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_UMinReductionV:
      switch (typ) {
        case T_BYTE:        vpminub(dst, dst, src, Assembler::AVX_128bit); break;
        case T_SHORT:       vpminuw(dst, dst, src, Assembler::AVX_128bit); break;
        case T_INT:         vpminud(dst, dst, src, Assembler::AVX_128bit); break;
        // Unsigned 64-bit min/max only exist as EVEX-masked instructions.
        case T_LONG:        evpminuq(dst, k0, dst, src, true, Assembler::AVX_128bit); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_UMaxReductionV:
      switch (typ) {
        case T_BYTE:        vpmaxub(dst, dst, src, Assembler::AVX_128bit); break;
        case T_SHORT:       vpmaxuw(dst, dst, src, Assembler::AVX_128bit); break;
        case T_INT:         vpmaxud(dst, dst, src, Assembler::AVX_128bit); break;
        case T_LONG:        evpmaxuq(dst, k0, dst, src, true, Assembler::AVX_128bit); break;
        default:            assert(false, "wrong type");
      }
      break;
    // FP adds/muls use the scalar (lane 0) instructions: callers combine
    // one element at a time to preserve strict evaluation order.
    case Op_AddReductionVF: addss(dst, src); break;
    case Op_AddReductionVD: addsd(dst, src); break;
    case Op_AddReductionVI:
      switch (typ) {
        case T_BYTE:        paddb(dst, src); break;
        case T_SHORT:       paddw(dst, src); break;
        case T_INT:         paddd(dst, src); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_AddReductionVL: paddq(dst, src); break;
    case Op_MulReductionVF: mulss(dst, src); break;
    case Op_MulReductionVD: mulsd(dst, src); break;
    case Op_MulReductionVI:
      switch (typ) {
        case T_SHORT:       pmullw(dst, src); break;
        case T_INT:         pmulld(dst, src); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_MulReductionVL: assert(UseAVX > 2, "required");
                            evpmullq(dst, dst, src, vector_len); break;
    default:                assert(false, "wrong opcode");
  }
}
1816
// Emit one 128-bit combining step of an unordered FP reduction. Packed
// (all-lane) adds/muls are allowed here because the caller does not need
// strict left-to-right evaluation order.
void C2_MacroAssembler::unordered_reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
  switch (opcode) {
    case Op_AddReductionVF: addps(dst, src); break;
    case Op_AddReductionVD: addpd(dst, src); break;
    case Op_MulReductionVF: mulps(dst, src); break;
    case Op_MulReductionVD: mulpd(dst, src); break;
    default:                assert(false, "%s", NodeClassNames[opcode]);
  }
}
1826
// Emit one 256-bit combining step of a reduction: dst = src1 <op> src2,
// elementwise, selected by reduction opcode and element type.
void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
  int vector_len = Assembler::AVX_256bit;

  switch (opcode) {
    case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
    case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
    case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
    case Op_MinReductionV:
      switch (typ) {
        case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
        case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
        case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
        // 64-bit packed min/max requires the AVX-512 (EVEX) encoding.
        case T_LONG:        assert(UseAVX > 2, "required");
                            vpminsq(dst, src1, src2, vector_len); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_MaxReductionV:
      switch (typ) {
        case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
        case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
        case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
        case T_LONG:        assert(UseAVX > 2, "required");
                            vpmaxsq(dst, src1, src2, vector_len); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_UMinReductionV:
      switch (typ) {
        case T_BYTE:        vpminub(dst, src1, src2, vector_len); break;
        case T_SHORT:       vpminuw(dst, src1, src2, vector_len); break;
        case T_INT:         vpminud(dst, src1, src2, vector_len); break;
        // Unsigned 64-bit min/max only exist as EVEX-masked instructions.
        case T_LONG:        evpminuq(dst, k0, src1, src2, true, vector_len); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_UMaxReductionV:
      switch (typ) {
        case T_BYTE:        vpmaxub(dst, src1, src2, vector_len); break;
        case T_SHORT:       vpmaxuw(dst, src1, src2, vector_len); break;
        case T_INT:         vpmaxud(dst, src1, src2, vector_len); break;
        case T_LONG:        evpmaxuq(dst, k0, src1, src2, true, vector_len); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_AddReductionVI:
      switch (typ) {
        case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
        case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
        case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
    case Op_MulReductionVI:
      switch (typ) {
        case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
        case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break;
    default:                assert(false, "wrong opcode");
  }
}
1892
// Emit one 256-bit combining step of an unordered FP reduction using packed
// (all-lane) adds/muls; evaluation order across lanes is unspecified.
void C2_MacroAssembler::unordered_reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
  int vector_len = Assembler::AVX_256bit;

  switch (opcode) {
    case Op_AddReductionVF: vaddps(dst, src1, src2, vector_len); break;
    case Op_AddReductionVD: vaddpd(dst, src1, src2, vector_len); break;
    case Op_MulReductionVF: vmulps(dst, src1, src2, vector_len); break;
    case Op_MulReductionVD: vmulpd(dst, src1, src2, vector_len); break;
    default:                assert(false, "%s", NodeClassNames[opcode]);
  }
}
1904
1905 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1906 XMMRegister dst, XMMRegister src,
1907 XMMRegister vtmp1, XMMRegister vtmp2) {
1908 switch (opcode) {
1909 case Op_AddReductionVF:
1910 case Op_MulReductionVF:
1911 reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1912 break;
1913
1914 case Op_AddReductionVD:
1915 case Op_MulReductionVD:
1916 reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1917 break;
1918
1919 default: assert(false, "wrong opcode");
1920 }
1921 }
1922
1923 void C2_MacroAssembler::unordered_reduce_fp(int opcode, int vlen,
1924 XMMRegister dst, XMMRegister src,
1925 XMMRegister vtmp1, XMMRegister vtmp2) {
1926 switch (opcode) {
1927 case Op_AddReductionVF:
1928 case Op_MulReductionVF:
1929 unorderedReduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1930 break;
1931
1932 case Op_AddReductionVD:
1933 case Op_MulReductionVD:
1934 unorderedReduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1935 break;
1936
1937 default: assert(false, "%s", NodeClassNames[opcode]);
1938 }
1939 }
1940
1941 void C2_MacroAssembler::reduceB(int opcode, int vlen,
1942 Register dst, Register src1, XMMRegister src2,
1943 XMMRegister vtmp1, XMMRegister vtmp2) {
1944 switch (vlen) {
1945 case 8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1946 case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1947 case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1948 case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1949
1950 default: assert(false, "wrong vector length");
1951 }
1952 }
1953
// Byte-vector multiply-reduction entry point: dispatch on lane count.
// Kept separate from reduceB because byte multiplies require widening.
void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
                                   Register dst, Register src1, XMMRegister src2,
                                   XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (vlen) {
    case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;

    default: assert(false, "wrong vector length");
  }
}
1966
// Short-vector reduction entry point: dispatch on the lane count to the
// width-specific implementation.
void C2_MacroAssembler::reduceS(int opcode, int vlen,
                                Register dst, Register src1, XMMRegister src2,
                                XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (vlen) {
    case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;

    default: assert(false, "wrong vector length");
  }
}
1979
1980 void C2_MacroAssembler::reduceI(int opcode, int vlen,
1981 Register dst, Register src1, XMMRegister src2,
1982 XMMRegister vtmp1, XMMRegister vtmp2) {
1983 switch (vlen) {
1984 case 2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1985 case 4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1986 case 8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1987 case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1988
1989 default: assert(false, "wrong vector length");
1990 }
1991 }
1992
// Long-vector reduction entry point: dispatch on the lane count to the
// width-specific implementation.
void C2_MacroAssembler::reduceL(int opcode, int vlen,
                                Register dst, Register src1, XMMRegister src2,
                                XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (vlen) {
    case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;

    default: assert(false, "wrong vector length");
  }
}
2004
// Ordered float reduction: dispatch on lane count. The 2- and 4-lane forms
// need only one temporary, so vtmp2 must not be supplied there.
void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (vlen) {
    case 2:
      assert(vtmp2 == xnoreg, "");
      reduce2F(opcode, dst, src, vtmp1);
      break;
    case 4:
      assert(vtmp2 == xnoreg, "");
      reduce4F(opcode, dst, src, vtmp1);
      break;
    case 8:
      reduce8F(opcode, dst, src, vtmp1, vtmp2);
      break;
    case 16:
      reduce16F(opcode, dst, src, vtmp1, vtmp2);
      break;
    default: assert(false, "wrong vector length");
  }
}
2024
// Ordered double reduction: dispatch on lane count. The 2-lane form needs
// only one temporary, so vtmp2 must not be supplied there.
void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (vlen) {
    case 2:
      assert(vtmp2 == xnoreg, "");
      reduce2D(opcode, dst, src, vtmp1);
      break;
    case 4:
      reduce4D(opcode, dst, src, vtmp1, vtmp2);
      break;
    case 8:
      reduce8D(opcode, dst, src, vtmp1, vtmp2);
      break;
    default: assert(false, "wrong vector length");
  }
}
2040
// Unordered float reduction: dispatch on lane count. Narrower forms need
// fewer temporaries; unused ones must be passed as xnoreg.
void C2_MacroAssembler::unorderedReduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (vlen) {
    case 2:
      assert(vtmp1 == xnoreg, "");
      assert(vtmp2 == xnoreg, "");
      unorderedReduce2F(opcode, dst, src);
      break;
    case 4:
      assert(vtmp2 == xnoreg, "");
      unorderedReduce4F(opcode, dst, src, vtmp1);
      break;
    case 8:
      unorderedReduce8F(opcode, dst, src, vtmp1, vtmp2);
      break;
    case 16:
      unorderedReduce16F(opcode, dst, src, vtmp1, vtmp2);
      break;
    default: assert(false, "wrong vector length");
  }
}
2061
// Unordered double reduction: dispatch on lane count. Narrower forms need
// fewer temporaries; unused ones must be passed as xnoreg.
void C2_MacroAssembler::unorderedReduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (vlen) {
    case 2:
      assert(vtmp1 == xnoreg, "");
      assert(vtmp2 == xnoreg, "");
      unorderedReduce2D(opcode, dst, src);
      break;
    case 4:
      assert(vtmp2 == xnoreg, "");
      unorderedReduce4D(opcode, dst, src, vtmp1);
      break;
    case 8:
      unorderedReduce8D(opcode, dst, src, vtmp1, vtmp2);
      break;
    default: assert(false, "wrong vector length");
  }
}
2079
// Reduce 2 ints held in src2, fold in the scalar input src1, and place the
// 32-bit result in dst.
void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    phaddd(vtmp1, vtmp1); // horizontal add of the adjacent int pair
  } else {
    pshufd(vtmp1, src2, 0x1); // bring element 1 down to lane 0
    reduce_operation_128(T_INT, opcode, vtmp1, src2);
  }
  movdl(vtmp2, src1); // fold in the scalar input
  reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
  movdl(dst, vtmp1);
}
2094
// Reduce 4 ints: combine the two 64-bit halves, then delegate to reduce2I.
void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    phaddd(vtmp1, src2); // pairwise adds leave 2 partial sums in the low half
    reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  } else {
    pshufd(vtmp2, src2, 0xE); // move the high 64 bits down to the low half
    reduce_operation_128(T_INT, opcode, vtmp2, src2);
    reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
  }
}
2108
// Reduce 8 ints (256-bit): fold the high 128 bits into the low 128 bits,
// then finish with the narrower reductions.
void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit); // in-lane pairwise adds
    vextracti128_high(vtmp2, vtmp1);
    vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
    reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  } else {
    vextracti128_high(vtmp1, src2);
    reduce_operation_128(T_INT, opcode, vtmp1, src2);
    reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  }
}
2121
// Reduce 16 ints (512-bit): fold the high 256 bits into the low 256 bits,
// then delegate to reduce8I.
void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti64x4_high(vtmp2, src2);
  reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
  reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
2127
// Reduce 8 bytes held in src2, fold in the scalar input src1, and place the
// sign-extended byte result in dst.
void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  pshufd(vtmp2, src2, 0x1); // combine bytes 4..7 with bytes 0..3
  reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
  movdqu(vtmp1, vtmp2);
  psrldq(vtmp1, 2);         // combine the remaining 4 bytes pairwise
  reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
  movdqu(vtmp2, vtmp1);
  psrldq(vtmp2, 1);         // final byte pair
  reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
  movdl(vtmp2, src1);       // fold in the scalar input, combined as ints
  if (opcode == Op_UMinReductionV || opcode == Op_UMaxReductionV) {
    // Unsigned compares must zero-extend the byte, not sign-extend it.
    pmovzxbd(vtmp1, vtmp1);
  } else {
    pmovsxbd(vtmp1, vtmp1);
  }
  reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
  pextrb(dst, vtmp1, 0x0);
  movsbl(dst, dst);         // sign-extend the byte result to 32 bits
}
2147
// Reduce 16 bytes: combine the two 64-bit halves, then reduce the 8 bytes.
void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  pshufd(vtmp1, src2, 0xE); // high 64 bits down to the low half
  reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
  reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}
2153
// Reduce 32 bytes (256-bit): fold the high 128 bits into the low, then
// delegate to reduce16B.
void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti128_high(vtmp2, src2);
  reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
  reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
2159
// Reduce 64 bytes (512-bit): fold the high 256 bits into the low, then
// delegate to reduce32B.
void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti64x4_high(vtmp1, src2);
  reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
  reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}
2165
// Multiply-reduce 8 bytes: sign-extend them to 8 shorts (there is no packed
// byte multiply) and reduce as shorts.
void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  pmovsxbw(vtmp2, src2);
  reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
2170
2171 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2172 if (UseAVX > 1) {
2173 int vector_len = Assembler::AVX_256bit;
2174 vpmovsxbw(vtmp1, src2, vector_len);
2175 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2176 } else {
2177 pmovsxbw(vtmp2, src2);
2178 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2179 pshufd(vtmp2, src2, 0x1);
2180 pmovsxbw(vtmp2, src2);
2181 reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2182 }
2183 }
2184
// Multiply-reduce 32 bytes. With AVX-512BW all 32 bytes are widened to
// shorts in one 512-bit register; otherwise the two 128-bit halves are
// reduced separately and chained through dst.
void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
    int vector_len = Assembler::AVX_512bit;
    vpmovsxbw(vtmp1, src2, vector_len);
    reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  } else {
    assert(UseAVX >= 2,"Should not reach here.");
    mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
    vextracti128_high(vtmp2, src2);
    mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); // fold in prior result via dst
  }
}
2197
// Multiply-reduce 64 bytes: reduce the low 256 bits, then the high 256 bits,
// chaining the partial result through dst.
void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
  vextracti64x4_high(vtmp2, src2);
  mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
}
2203
// Reduce 4 shorts held in src2, fold in the scalar input src1, and place
// the sign-extended short result in dst.
void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    phaddw(vtmp1, vtmp1); // two rounds of pairwise adds collapse 4 shorts
    phaddw(vtmp1, vtmp1);
  } else {
    pshufd(vtmp2, src2, 0x1); // combine shorts 2..3 with shorts 0..1
    reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
    movdqu(vtmp1, vtmp2);
    psrldq(vtmp1, 2);         // final short pair
    reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
  }
  movdl(vtmp2, src1); // fold in the scalar input, combined as ints
  if (opcode == Op_UMinReductionV || opcode == Op_UMaxReductionV) {
    // Unsigned compares must zero-extend the short, not sign-extend it.
    pmovzxwd(vtmp1, vtmp1);
  } else {
    pmovsxwd(vtmp1, vtmp1);
  }
  reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
  pextrw(dst, vtmp1, 0x0);
  movswl(dst, dst); // sign-extend the short result to 32 bits
}
2228
// Reduce 8 shorts: combine the two 64-bit halves, then delegate to reduce4S.
void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    phaddw(vtmp1, src2); // pairwise adds leave 4 partial sums in the low half
  } else {
    pshufd(vtmp1, src2, 0xE); // high 64 bits down to the low half
    reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
  }
  reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}
2241
// Reduce 16 shorts (256-bit): fold into the low 128 bits, then reduce8S.
void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    int vector_len = Assembler::AVX_256bit;
    vphaddw(vtmp2, src2, src2, vector_len); // pairwise adds within each 128-bit lane
    vpermq(vtmp2, vtmp2, 0xD8, vector_len); // gather both lanes' partial sums into the low half
  } else {
    vextracti128_high(vtmp2, src2);
    reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
  }
  reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
2253
2254 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2255 int vector_len = Assembler::AVX_256bit;
2256 vextracti64x4_high(vtmp1, src2);
2257 reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
2258 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2259 }
2260
// Reduce 2 longs held in src2, fold in the scalar input src1, and place the
// 64-bit result in dst.
void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  pshufd(vtmp2, src2, 0xE); // bring the high quadword down to lane 0
  reduce_operation_128(T_LONG, opcode, vtmp2, src2);
  movdq(vtmp1, src1);       // fold in the scalar input
  reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
  movdq(dst, vtmp1);
}
2268
// Reduce 4 longs (256-bit): fold the high 128 bits into the low, then
// delegate to reduce2L.
void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti128_high(vtmp1, src2);
  reduce_operation_128(T_LONG, opcode, vtmp1, src2);
  reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}
2274
// Reduce 8 longs (512-bit): fold the high 256 bits into the low, then
// delegate to reduce4L.
void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti64x4_high(vtmp2, src2);
  reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
  reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
2280
// Build an opmask with the low 'len' bits set: start from all-ones and let
// bzhi zero every bit at index >= len, then move the result into dst.
void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
  mov64(temp, -1L);
  bzhiq(temp, temp, len);
  kmovql(dst, temp);
}
2286
// Ordered reduction of 2 floats: accumulate src lane 0 into dst, then shuffle
// lane 1 down and accumulate it. Scalar ops keep strict evaluation order.
void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  reduce_operation_128(T_FLOAT, opcode, dst, src);
  pshufd(vtmp, src, 0x1);
  reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
}
2292
// Ordered reduction of 4 floats: lanes 0..3 are accumulated into dst one at
// a time, in index order.
void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  reduce2F(opcode, dst, src, vtmp);
  pshufd(vtmp, src, 0x2); // lane 2
  reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
  pshufd(vtmp, src, 0x3); // lane 3
  reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
}
2300
// Ordered reduction of 8 floats: low 128-bit half first, then the high half.
void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce4F(opcode, dst, src, vtmp2);
  vextractf128_high(vtmp2, src);
  reduce4F(opcode, dst, vtmp2, vtmp1);
}
2306
// Ordered reduction of 16 floats: low 256-bit half first, then the high half.
void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce8F(opcode, dst, src, vtmp1, vtmp2);
  vextracti64x4_high(vtmp1, src);
  reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
}
2312
// Unordered reduction of 2 floats: combine lanes 0 and 1 into dst lane 0.
void C2_MacroAssembler::unorderedReduce2F(int opcode, XMMRegister dst, XMMRegister src) {
  pshufd(dst, src, 0x1);
  reduce_operation_128(T_FLOAT, opcode, dst, src);
}
2317
// Unordered reduction of 4 floats: pairwise tree — fold the high 64 bits
// into the low, then combine the remaining two lanes.
void C2_MacroAssembler::unorderedReduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  pshufd(vtmp, src, 0xE);
  unordered_reduce_operation_128(T_FLOAT, opcode, vtmp, src);
  unorderedReduce2F(opcode, dst, vtmp);
}
2323
// Unordered reduction of 8 floats: fold the high 128 bits into the low half,
// then continue with the 4-lane tree.
void C2_MacroAssembler::unorderedReduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextractf128_high(vtmp1, src);
  unordered_reduce_operation_128(T_FLOAT, opcode, vtmp1, src);
  unorderedReduce4F(opcode, dst, vtmp1, vtmp2);
}
2329
// Unordered reduction of 16 floats: fold the high 256 bits into the low
// half, then continue with the 8-lane tree.
void C2_MacroAssembler::unorderedReduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextractf64x4_high(vtmp2, src);
  unordered_reduce_operation_256(T_FLOAT, opcode, vtmp2, vtmp2, src);
  unorderedReduce8F(opcode, dst, vtmp2, vtmp1, vtmp2);
}
2335
// Ordered reduction of 2 doubles: accumulate lane 0 into dst, then shuffle
// lane 1 down and accumulate it.
void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  reduce_operation_128(T_DOUBLE, opcode, dst, src);
  pshufd(vtmp, src, 0xE); // high quadword down to lane 0
  reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
}
2341
// Ordered reduction of 4 doubles: low 128-bit half first, then the high half.
void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce2D(opcode, dst, src, vtmp2);
  vextractf128_high(vtmp2, src);
  reduce2D(opcode, dst, vtmp2, vtmp1);
}
2347
// Ordered reduction of 8 doubles: low 256-bit half first, then the high half.
void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce4D(opcode, dst, src, vtmp1, vtmp2);
  vextracti64x4_high(vtmp1, src);
  reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
}
2353
// Unordered reduction of 2 doubles: combine the two lanes into dst lane 0.
void C2_MacroAssembler::unorderedReduce2D(int opcode, XMMRegister dst, XMMRegister src) {
  pshufd(dst, src, 0xE);
  reduce_operation_128(T_DOUBLE, opcode, dst, src);
}
2358
// Unordered reduction of 4 doubles: fold the high 128 bits into the low
// half, then combine the remaining two lanes.
void C2_MacroAssembler::unorderedReduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  vextractf128_high(vtmp, src);
  unordered_reduce_operation_128(T_DOUBLE, opcode, vtmp, src);
  unorderedReduce2D(opcode, dst, vtmp);
}
2364
// Unordered reduction of 8 doubles: fold the high 256 bits into the low
// half, then continue with the 4-lane tree.
void C2_MacroAssembler::unorderedReduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextractf64x4_high(vtmp2, src);
  unordered_reduce_operation_256(T_DOUBLE, opcode, vtmp2, vtmp2, src);
  unorderedReduce4D(opcode, dst, vtmp2, vtmp1);
}
2370
// Masked vector load (memory -> register): forward to the shared
// MacroAssembler implementation.
void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
  MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
}
2374
// Masked vector store (register -> memory): forward to the shared
// MacroAssembler implementation.
void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) {
  MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
}
2378
// Masked register-to-register vector move: forward to the shared
// MacroAssembler implementation.
void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, XMMRegister src, bool merge, int vector_len) {
  MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
}
2382
2383 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask,
2384 int vec_enc) {
2385 switch(elem_bt) {
2386 case T_INT:
2387 case T_FLOAT:
2388 vmaskmovps(dst, src, mask, vec_enc);
2389 break;
2390 case T_LONG:
2391 case T_DOUBLE:
2392 vmaskmovpd(dst, src, mask, vec_enc);
2393 break;
2394 default:
2395 fatal("Unsupported type %s", type2name(elem_bt));
2396 break;
2397 }
2398 }
2399
// Masked vector store using the AVX vmaskmov family; element width selects
// the 32-bit (ps) or 64-bit (pd) form.
void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask,
                                 int vec_enc) {
  switch(elem_bt) {
    case T_INT:
    case T_FLOAT:
      vmaskmovps(dst, src, mask, vec_enc);
      break;
    case T_LONG:
    case T_DOUBLE:
      vmaskmovpd(dst, src, mask, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(elem_bt));
      break;
  }
}
2416
// Float min/max reduction over vlen lanes: log2(vlen) folding steps, each
// combining the upper half of the working vector with its lower half via
// vminmax_fp. If is_dst_valid, dst holds an accumulator that is combined
// with the reduced value at the end; otherwise the last step writes dst.
void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
                                          XMMRegister dst, XMMRegister src,
                                          XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                          XMMRegister xmm_0, XMMRegister xmm_1) {
  // vpermilps controls for the two in-lane steps: 1 swaps adjacent floats,
  // 14 (0b1110) brings the upper 64 bits of the lane down.
  const int permconst[] = {1, 14};
  XMMRegister wsrc = src;    // working source, replaced by wdst after each step
  XMMRegister wdst = xmm_0;
  XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;

  int vlen_enc = Assembler::AVX_128bit;
  if (vlen == 16) {
    vlen_enc = Assembler::AVX_256bit;
  }

  for (int i = log2(vlen) - 1; i >=0; i--) {
    if (i == 0 && !is_dst_valid) {
      wdst = dst; // final step can write dst directly
    }
    if (i == 3) {
      vextracti64x4_high(wtmp, wsrc);  // fold 512 -> 256
    } else if (i == 2) {
      vextracti128_high(wtmp, wsrc);   // fold 256 -> 128
    } else { // i = [0,1]
      vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
    }

    if (VM_Version::supports_avx10_2()) {
      // AVX10.2 form handles NaN/-0.0 in hardware; no temporaries needed.
      vminmax_fp(opcode, T_FLOAT, wdst, k0, wtmp, wsrc, vlen_enc);
    } else {
      vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
    }
    wsrc = wdst;
    vlen_enc = Assembler::AVX_128bit; // after the first fold everything fits in 128 bits
  }
  if (is_dst_valid) {
    // Combine the reduced value with the incoming accumulator in dst.
    if (VM_Version::supports_avx10_2()) {
      vminmax_fp(opcode, T_FLOAT, dst, k0, wdst, dst, Assembler::AVX_128bit);
    } else {
      vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
    }
  }
}
2459
// Double min/max reduction over vlen lanes: log2(vlen) folding steps, each
// combining the upper half of the working vector with its lower half via
// vminmax_fp. If is_dst_valid, dst holds an accumulator that is combined
// with the reduced value at the end; otherwise the last step writes dst.
void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                           XMMRegister xmm_0, XMMRegister xmm_1) {
  XMMRegister wsrc = src;    // working source, replaced by wdst after each step
  XMMRegister wdst = xmm_0;
  XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
  int vlen_enc = Assembler::AVX_128bit;
  if (vlen == 8) {
    vlen_enc = Assembler::AVX_256bit;
  }
  for (int i = log2(vlen) - 1; i >=0; i--) {
    if (i == 0 && !is_dst_valid) {
      wdst = dst; // final step can write dst directly
    }
    if (i == 1) {
      vextracti128_high(wtmp, wsrc);   // fold 256 -> 128
    } else if (i == 2) {
      vextracti64x4_high(wtmp, wsrc);  // fold 512 -> 256
    } else {
      assert(i == 0, "%d", i);
      vpermilpd(wtmp, wsrc, 1, vlen_enc); // swap the two doubles in the lane
    }

    if (VM_Version::supports_avx10_2()) {
      // AVX10.2 form handles NaN/-0.0 in hardware; no temporaries needed.
      vminmax_fp(opcode, T_DOUBLE, wdst, k0, wtmp, wsrc, vlen_enc);
    } else {
      vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
    }

    wsrc = wdst;
    vlen_enc = Assembler::AVX_128bit; // after the first fold everything fits in 128 bits
  }

  if (is_dst_valid) {
    // Combine the reduced value with the incoming accumulator in dst.
    if (VM_Version::supports_avx10_2()) {
      vminmax_fp(opcode, T_DOUBLE, dst, k0, wdst, dst, Assembler::AVX_128bit);
    } else {
      vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
    }
  }
}
2501
2502 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2503 switch (bt) {
2504 case T_BYTE: pextrb(dst, src, idx); break;
2505 case T_SHORT: pextrw(dst, src, idx); break;
2506 case T_INT: pextrd(dst, src, idx); break;
2507 case T_LONG: pextrq(dst, src, idx); break;
2508
2509 default:
2510 assert(false,"Should not reach here.");
2511 break;
2512 }
2513 }
2514
// Return the register holding the 128-bit lane that contains element
// 'elemindex'. Lane 0 is src itself (no copy); higher lanes are extracted
// into dst, which is then returned.
XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
  int esize =  type2aelembytes(typ);
  int elem_per_lane = 16/esize;
  int lane = elemindex / elem_per_lane;
  int eindex = elemindex % elem_per_lane;

  if (lane >= 2) {
    // Lanes 2 and 3 exist only in 512-bit registers (AVX-512).
    assert(UseAVX > 2, "required");
    vextractf32x4(dst, src, lane & 3);
    return dst;
  } else if (lane > 0) {
    assert(UseAVX > 0, "required");
    vextractf128(dst, src, lane);
    return dst;
  } else {
    return src;
  }
}
2533
2534 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) {
2535 if (typ == T_BYTE) {
2536 movsbl(dst, dst);
2537 } else if (typ == T_SHORT) {
2538 movswl(dst, dst);
2539 }
2540 }
2541
// Move the integral element at lane-relative index 'elemindex' of src into
// the general-purpose register dst, sign-extending sub-int values. The
// caller is expected to have already selected the right 128-bit lane
// (only elemindex % elem_per_lane is used here).
void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
  int esize =  type2aelembytes(typ);
  int elem_per_lane = 16/esize;
  int eindex = elemindex % elem_per_lane;
  assert(is_integral_type(typ),"required");

  if (eindex == 0) {
    // Element 0 can be moved directly without a pextr.
    if (typ == T_LONG) {
      movq(dst, src);
    } else {
      movdl(dst, src);
      movsxl(typ, dst);
    }
  } else {
    extract(typ, dst, src, eindex);
    movsxl(typ, dst);
  }
}
2560
// Move the FP element at lane-relative index 'elemindex' of src into dst,
// with the upper bits of dst zeroed. vtmp is only needed on pre-AVX
// hardware for the float masking step.
void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) {
  int esize =  type2aelembytes(typ);
  int elem_per_lane = 16/esize;
  int eindex = elemindex % elem_per_lane;
  assert((typ == T_FLOAT || typ == T_DOUBLE),"required");

  if (eindex == 0) {
    movq(dst, src); // element 0: movq also zeroes the upper bits
  } else {
    if (typ == T_FLOAT) {
      // Shuffle the requested float into lane 0.
      if (UseAVX == 0) {
        movdqu(dst, src);
        shufps(dst, dst, eindex);
      } else {
        vshufps(dst, src, src, eindex, Assembler::AVX_128bit);
      }
    } else {
      // Shift the requested double down to lane 0, then movq zeroes the rest.
      if (UseAVX == 0) {
        movdqu(dst, src);
        psrldq(dst, eindex*esize);
      } else {
        vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
      }
      movq(dst, dst);
    }
  }
  // Zero upper bits
  if (typ == T_FLOAT) {
    if (UseAVX == 0) {
      assert(vtmp != xnoreg, "required.");
      movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg);
      pand(dst, vtmp);
    } else {
      vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg);
    }
  }
}
2598
2599 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2600 switch(typ) {
2601 case T_BYTE:
2602 case T_BOOLEAN:
2603 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2604 break;
2605 case T_SHORT:
2606 case T_CHAR:
2607 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2608 break;
2609 case T_INT:
2610 case T_FLOAT:
2611 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2612 break;
2613 case T_LONG:
2614 case T_DOUBLE:
2615 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2616 break;
2617 default:
2618 assert(false,"Should not reach here.");
2619 break;
2620 }
2621 }
2622
2623 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) {
2624 assert(rscratch != noreg || always_reachable(src2), "missing");
2625
2626 switch(typ) {
2627 case T_BOOLEAN:
2628 case T_BYTE:
2629 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2630 break;
2631 case T_CHAR:
2632 case T_SHORT:
2633 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2634 break;
2635 case T_INT:
2636 case T_FLOAT:
2637 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2638 break;
2639 case T_LONG:
2640 case T_DOUBLE:
2641 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch);
2642 break;
2643 default:
2644 assert(false,"Should not reach here.");
2645 break;
2646 }
2647 }
2648
2649 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2650 switch(typ) {
2651 case T_BYTE:
2652 evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2653 break;
2654 case T_SHORT:
2655 evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2656 break;
2657 case T_INT:
2658 case T_FLOAT:
2659 evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2660 break;
2661 case T_LONG:
2662 case T_DOUBLE:
2663 evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2664 break;
2665 default:
2666 assert(false,"Should not reach here.");
2667 break;
2668 }
2669 }
2670
// Set the CPU condition flags from a whole-vector test of src1 against src2
// ((v)ptest / vtestps), so the caller can branch on "all zero" / "all set"
// style conditions. For vectors shorter than 16 bytes the low part of src1
// is duplicated into 'vtmp' so the full 128-bit test sees only valid data;
// src2 is expected to already cover the full width tested.
void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) {
  assert(vlen_in_bytes <= 32, "");
  int esize = type2aelembytes(bt);
  if (vlen_in_bytes == 32) {
    // Full 256-bit test - no duplication needed.
    assert(vtmp == xnoreg, "required.");
    if (esize >= 4) {
      // 4/8-byte elements: test sign bits only (vtestps).
      vtestps(src1, src2, AVX_256bit);
    } else {
      vptest(src1, src2, AVX_256bit);
    }
    return;
  }
  if (vlen_in_bytes < 16) {
    // Duplicate the lower part to fill the whole register,
    // Don't need to do so for src2
    assert(vtmp != xnoreg, "required");
    // 0x00 replicates dword 0 four times (4-byte vector); 0x04 replicates
    // the low qword (8-byte vector).
    int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04;
    pshufd(vtmp, src1, shuffle_imm);
  } else {
    // Exactly 16 bytes: test src1 directly.
    assert(vtmp == xnoreg, "required");
    vtmp = src1;
  }
  if (esize >= 4 && VM_Version::supports_avx()) {
    vtestps(vtmp, src2, AVX_128bit);
  } else {
    ptest(vtmp, src2);
  }
}
2699
2700 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
2701 #ifdef ASSERT
2702 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
2703 bool is_bw_supported = VM_Version::supports_avx512bw();
2704 if (is_bw && !is_bw_supported) {
2705 assert(vlen_enc != Assembler::AVX_512bit, "required");
2706 assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16),
2707 "XMM register should be 0-15");
2708 }
2709 #endif // ASSERT
2710 switch (elem_bt) {
2711 case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return;
2712 case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return;
2713 case T_INT: vpaddd(dst, src1, src2, vlen_enc); return;
2714 case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return;
2715 case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return;
2716 case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return;
2717 default: fatal("Unsupported type %s", type2name(elem_bt)); return;
2718 }
2719 }
2720
// Broadcast the scalar in GPR 'src' into every lane of vector 'dst'.
// Uses the EVEX GPR-source broadcast forms when the CPU supports them for
// this element type and vector length; otherwise falls back to moving the
// value into 'dst' first and broadcasting register-to-register (AVX2 path,
// restricted to XMM registers 0-15).
void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) {
  assert(UseAVX >= 2, "required");
  bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT));
  bool is_vl = vlen_enc != Assembler::AVX_512bit;
  if ((UseAVX > 2) &&
      // Byte/short broadcasts need AVX512BW; sub-512-bit needs AVX512VL.
      (!is_bw || VM_Version::supports_avx512bw()) &&
      (!is_vl || VM_Version::supports_avx512vl())) {
    // EVEX forms broadcast directly from a GPR.
    switch (elem_bt) {
      case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return;
      case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return;
      case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return;
      case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return;
      default: fatal("Unsupported type %s", type2name(elem_bt)); return;
    }
  } else {
    // AVX2 fallback: move GPR -> XMM, then broadcast within the vector unit.
    assert(vlen_enc != Assembler::AVX_512bit, "required");
    assert((dst->encoding() < 16),"XMM register should be 0-15");
    switch (elem_bt) {
      case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return;
      case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return;
      case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return;
      case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return;
      case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return;
      case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return;
      default: fatal("Unsupported type %s", type2name(elem_bt)); return;
    }
  }
}
2749
2750 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
2751 switch (to_elem_bt) {
2752 case T_SHORT:
2753 vpmovsxbw(dst, src, vlen_enc);
2754 break;
2755 case T_INT:
2756 vpmovsxbd(dst, src, vlen_enc);
2757 break;
2758 case T_FLOAT:
2759 vpmovsxbd(dst, src, vlen_enc);
2760 vcvtdq2ps(dst, dst, vlen_enc);
2761 break;
2762 case T_LONG:
2763 vpmovsxbq(dst, src, vlen_enc);
2764 break;
2765 case T_DOUBLE: {
2766 int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
2767 vpmovsxbd(dst, src, mid_vlen_enc);
2768 vcvtdq2pd(dst, dst, vlen_enc);
2769 break;
2770 }
2771 default:
2772 fatal("Unsupported type %s", type2name(to_elem_bt));
2773 break;
2774 }
2775 }
2776
2777 //-------------------------------------------------------------------------------------------
2778
// IndexOf for constant substrings with size >= 8 chars
// which don't need to be loaded through stack.
//
// Inputs:
//   str1/cnt1 - address / length (in elements) of the string to scan
//   str2      - address of the constant substring
//   int_cnt2  - compile-time substring length, >= stride elements
//   ae        - argument encoding (StrIntrinsicNode::LL, UU or UL)
// Output:
//   result    - element index of the first occurrence, or -1 if none.
// cnt1, cnt2 and tmp must be rdx, rax and rcx because pcmpestri binds those
// registers; they and 'vec' are clobbered.
void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
                                         Register cnt1, Register cnt2,
                                         int int_cnt2, Register result,
                                         XMMRegister vec, Register tmp,
                                         int ae) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");

  // This method uses the pcmpestri instruction with bound registers
  // inputs:
  //   xmm - substring
  //   rax - substring length (elements count)
  //   mem - scanned string
  //   rdx - string length (elements count)
  //   0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
  //   0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
  // outputs:
  //   rcx - matched index in string
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
  int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
  int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
  // scale1 indexes the scanned string, scale2 the substring; they differ
  // only for UL (UTF-16 string, latin1 substring).
  Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
  Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;

  Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
        RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
        MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;
  assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");

  // Load substring.
  if (ae == StrIntrinsicNode::UL) {
    // UL: zero-extend latin1 substring bytes to chars while loading.
    pmovzxbw(vec, Address(str2, 0));
  } else {
    movdqu(vec, Address(str2, 0));
  }
  movl(cnt2, int_cnt2);
  movptr(result, str1); // string addr

  if (int_cnt2 > stride) {
    jmpb(SCAN_TO_SUBSTR);

    // Reload substr for rescan, this code
    // is executed only for large substrings (> 8 chars)
    bind(RELOAD_SUBSTR);
    if (ae == StrIntrinsicNode::UL) {
      pmovzxbw(vec, Address(str2, 0));
    } else {
      movdqu(vec, Address(str2, 0));
    }
    negptr(cnt2); // Jumped here with negative cnt2, convert to positive

    bind(RELOAD_STR);
    // We came here after the beginning of the substring was
    // matched but the rest of it was not so we need to search
    // again. Start from the next element after the previous match.

    // cnt2 is number of substring reminding elements and
    // cnt1 is number of string reminding elements when cmp failed.
    // Restored cnt1 = cnt1 - cnt2 + int_cnt2
    subl(cnt1, cnt2);
    addl(cnt1, int_cnt2);
    movl(cnt2, int_cnt2); // Now restore cnt2

    decrementl(cnt1);     // Shift to next element
    cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND); // Left less then substring

    addptr(result, (1<<scale1));

  } // (int_cnt2 > 8)

  // Scan string for start of substr in 16-byte vectors
  bind(SCAN_TO_SUBSTR);
  // pcmpestri: CF set when any ordered match starts in this chunk; OF set
  // when the match starts at offset 0; rcx receives the match offset.
  pcmpestri(vec, Address(result, 0), mode);
  jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
  subl(cnt1, stride);
  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
  cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less then substring
  addptr(result, 16);
  jmpb(SCAN_TO_SUBSTR);

  // Found a potential substr
  bind(FOUND_CANDIDATE);
  // Matched whole vector if first element matched (tmp(rcx) == 0).
  if (int_cnt2 == stride) {
    jccb(Assembler::overflow, RET_FOUND);    // OF == 1
  } else { // int_cnt2 > 8
    jccb(Assembler::overflow, FOUND_SUBSTR);
  }
  // After pcmpestri tmp(rcx) contains matched element index
  // Compute start addr of substr
  lea(result, Address(result, tmp, scale1));

  // Make sure string is still long enough
  subl(cnt1, tmp);
  cmpl(cnt1, cnt2);
  if (int_cnt2 == stride) {
    jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
  } else { // int_cnt2 > 8
    jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
  }
  // Left less then substring.

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmp(EXIT);

  if (int_cnt2 > stride) {
    // This code is optimized for the case when whole substring
    // is matched if its head is matched.
    bind(MATCH_SUBSTR_HEAD);
    pcmpestri(vec, Address(result, 0), mode);
    // Reload only string if does not match
    jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0

    Label CONT_SCAN_SUBSTR;
    // Compare the rest of substring (> 8 chars).
    bind(FOUND_SUBSTR);
    // First 8 chars are already matched.
    negptr(cnt2);
    addptr(cnt2, stride);

    bind(SCAN_SUBSTR);
    subl(cnt1, stride);
    cmpl(cnt2, -stride); // Do not read beyond substring
    jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
    // Back-up strings to avoid reading beyond substring:
    // cnt1 = cnt1 - cnt2 + 8
    addl(cnt1, cnt2); // cnt2 is negative
    addl(cnt1, stride);
    movl(cnt2, stride); negptr(cnt2);
    bind(CONT_SCAN_SUBSTR);
    if (int_cnt2 < (int)G) {
      // Small enough substring: offsets fit in an immediate displacement.
      int tail_off1 = int_cnt2<<scale1;
      int tail_off2 = int_cnt2<<scale2;
      if (ae == StrIntrinsicNode::UL) {
        pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
      } else {
        movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
      }
      pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
    } else {
      // calculate index in register to avoid integer overflow (int_cnt2*2)
      movl(tmp, int_cnt2);
      addptr(tmp, cnt2);
      if (ae == StrIntrinsicNode::UL) {
        pmovzxbw(vec, Address(str2, tmp, scale2, 0));
      } else {
        movdqu(vec, Address(str2, tmp, scale2, 0));
      }
      pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
    }
    // Need to reload strings pointers if not matched whole vector
    jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
    addptr(cnt2, stride);
    jcc(Assembler::negative, SCAN_SUBSTR);
    // Fall through if found full substring

  } // (int_cnt2 > 8)

  bind(RET_FOUND);
  // Found result if we matched full small substring.
  // Compute substr offset
  subptr(result, str1);
  if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
    shrl(result, 1); // index
  }
  bind(EXIT);

} // string_indexofC8
2957
// Small strings are loaded through stack if they cross page boundary.
//
// General indexOf: handles either a small (< stride elements) constant
// substring whose length is passed in int_cnt2, or a variable-length
// substring (int_cnt2 == -1) whose length is in cnt2. Returns the element
// index of the first match in 'result', or -1. cnt1/cnt2/tmp must be
// rdx/rax/rcx (pcmpestri register binding); str1, str2, cnt1, cnt2, tmp
// and vec are clobbered.
void C2_MacroAssembler::string_indexof(Register str1, Register str2,
                                       Register cnt1, Register cnt2,
                                       int int_cnt2, Register result,
                                       XMMRegister vec, Register tmp,
                                       int ae) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");

  //
  // int_cnt2 is length of small (< 8 chars) constant substring
  // or (-1) for non constant substring in which case its length
  // is in cnt2 register.
  //
  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;
  //
  int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
  assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
  // This method uses the pcmpestri instruction with bound registers
  // inputs:
  //   xmm - substring
  //   rax - substring length (elements count)
  //   mem - scanned string
  //   rdx - string length (elements count)
  //   0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
  //   0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
  // outputs:
  //   rcx - matched index in string
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
  int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
  Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
  Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;

  Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
        RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
        FOUND_CANDIDATE;

  { //========================================================
    // We don't know where these strings are located
    // and we can't read beyond them. Load them through stack.
    Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;

    movptr(tmp, rsp); // save old SP

    if (int_cnt2 > 0) { // small (< 8 chars) constant substring
      // Pick the narrowest safe load for the known substring length so we
      // never read past its end.
      if (int_cnt2 == (1>>scale2)) { // One byte
        assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
        load_unsigned_byte(result, Address(str2, 0));
        movdl(vec, result); // move 32 bits
      } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) { // Three bytes
        // Not enough header space in 32-bit VM: 12+3 = 15.
        movl(result, Address(str2, -1));
        shrl(result, 8);
        movdl(vec, result); // move 32 bits
      } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) { // One char
        load_unsigned_short(result, Address(str2, 0));
        movdl(vec, result); // move 32 bits
      } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
        movdl(vec, Address(str2, 0)); // move 32 bits
      } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
        movq(vec, Address(str2, 0)); // move 64 bits
      } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7})
        // Array header size is 12 bytes in 32-bit VM
        // + 6 bytes for 3 chars == 18 bytes,
        // enough space to load vec and shift.
        assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
        if (ae == StrIntrinsicNode::UL) {
          // Load ending at the substring tail, then shift the garbage
          // prefix out of the low end.
          int tail_off = int_cnt2-8;
          pmovzxbw(vec, Address(str2, tail_off));
          psrldq(vec, -2*tail_off);
        }
        else {
          int tail_off = int_cnt2*(1<<scale2);
          movdqu(vec, Address(str2, tail_off-16));
          psrldq(vec, 16-tail_off);
        }
      }
    } else { // not constant substring
      cmpl(cnt2, stride);
      jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough

      // We can read beyond string if srt+16 does not cross page boundary
      // since heaps are aligned and mapped by pages.
      assert(os::vm_page_size() < (int)G, "default page should be small");
      movl(result, str2); // We need only low 32 bits
      andl(result, ((int)os::vm_page_size()-1));
      cmpl(result, ((int)os::vm_page_size()-16));
      jccb(Assembler::belowEqual, CHECK_STR);

      // Move small strings to stack to allow load 16 bytes into vec.
      subptr(rsp, 16);
      int stk_offset = wordSize-(1<<scale2);
      push(cnt2);

      // Copy the substring element-by-element onto the stack, back to front.
      bind(COPY_SUBSTR);
      if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
        load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
        movb(Address(rsp, cnt2, scale2, stk_offset), result);
      } else if (ae == StrIntrinsicNode::UU) {
        load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
        movw(Address(rsp, cnt2, scale2, stk_offset), result);
      }
      decrement(cnt2);
      jccb(Assembler::notZero, COPY_SUBSTR);

      pop(cnt2);
      movptr(str2, rsp); // New substring address
    } // non constant

    bind(CHECK_STR);
    cmpl(cnt1, stride);
    jccb(Assembler::aboveEqual, BIG_STRINGS);

    // Check cross page boundary.
    movl(result, str1); // We need only low 32 bits
    andl(result, ((int)os::vm_page_size()-1));
    cmpl(result, ((int)os::vm_page_size()-16));
    jccb(Assembler::belowEqual, BIG_STRINGS);

    // Copy the (short) scanned string onto the stack as well.
    subptr(rsp, 16);
    int stk_offset = -(1<<scale1);
    if (int_cnt2 < 0) { // not constant
      push(cnt2);
      stk_offset += wordSize;
    }
    movl(cnt2, cnt1);

    bind(COPY_STR);
    if (ae == StrIntrinsicNode::LL) {
      load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
      movb(Address(rsp, cnt2, scale1, stk_offset), result);
    } else {
      load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
      movw(Address(rsp, cnt2, scale1, stk_offset), result);
    }
    decrement(cnt2);
    jccb(Assembler::notZero, COPY_STR);

    if (int_cnt2 < 0) { // not constant
      pop(cnt2);
    }
    movptr(str1, rsp); // New string address

    bind(BIG_STRINGS);
    // Load substring.
    if (int_cnt2 < 0) { // -1
      if (ae == StrIntrinsicNode::UL) {
        pmovzxbw(vec, Address(str2, 0));
      } else {
        movdqu(vec, Address(str2, 0));
      }
      // Save state needed to reload/rescan a long non-constant substring.
      push(cnt2); // substr count
      push(str2); // substr addr
      push(str1); // string addr
    } else {
      // Small (< 8 chars) constant substrings are loaded already.
      movl(cnt2, int_cnt2);
    }
    push(tmp); // original SP

  } // Finished loading

  //========================================================
  // Start search
  //

  movptr(result, str1); // string addr

  if (int_cnt2 < 0) { // Only for non constant substring
    jmpb(SCAN_TO_SUBSTR);

    // SP saved at sp+0
    // String saved at sp+1*wordSize
    // Substr saved at sp+2*wordSize
    // Substr count saved at sp+3*wordSize

    // Reload substr for rescan, this code
    // is executed only for large substrings (> 8 chars)
    bind(RELOAD_SUBSTR);
    movptr(str2, Address(rsp, 2*wordSize));
    movl(cnt2, Address(rsp, 3*wordSize));
    if (ae == StrIntrinsicNode::UL) {
      pmovzxbw(vec, Address(str2, 0));
    } else {
      movdqu(vec, Address(str2, 0));
    }
    // We came here after the beginning of the substring was
    // matched but the rest of it was not so we need to search
    // again. Start from the next element after the previous match.
    subptr(str1, result); // Restore counter
    if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
      shrl(str1, 1);
    }
    addl(cnt1, str1);
    decrementl(cnt1);   // Shift to next element
    cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND);  // Left less then substring

    addptr(result, (1<<scale1));
  } // non constant

  // Scan string for start of substr in 16-byte vectors
  bind(SCAN_TO_SUBSTR);
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
  // pcmpestri: CF set when a match starts in this chunk (offset in rcx);
  // OF set when the match starts at offset 0.
  pcmpestri(vec, Address(result, 0), mode);
  jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
  subl(cnt1, stride);
  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
  cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less then substring
  addptr(result, 16);

  bind(ADJUST_STR);
  cmpl(cnt1, stride); // Do not read beyond string
  jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
  // Back-up string to avoid reading beyond string.
  lea(result, Address(result, cnt1, scale1, -16));
  movl(cnt1, stride);
  jmpb(SCAN_TO_SUBSTR);

  // Found a potential substr
  bind(FOUND_CANDIDATE);
  // After pcmpestri tmp(rcx) contains matched element index

  // Make sure string is still long enough
  subl(cnt1, tmp);
  cmpl(cnt1, cnt2);
  jccb(Assembler::greaterEqual, FOUND_SUBSTR);
  // Left less then substring.

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmp(CLEANUP);

  bind(FOUND_SUBSTR);
  // Compute start addr of substr
  lea(result, Address(result, tmp, scale1));
  if (int_cnt2 > 0) { // Constant substring
    // Repeat search for small substring (< 8 chars)
    // from new point without reloading substring.
    // Have to check that we don't read beyond string.
    cmpl(tmp, stride-int_cnt2);
    jccb(Assembler::greater, ADJUST_STR);
    // Fall through if matched whole substring.
  } else { // non constant
    assert(int_cnt2 == -1, "should be != 0");

    addl(tmp, cnt2);
    // Found result if we matched whole substring.
    cmpl(tmp, stride);
    jcc(Assembler::lessEqual, RET_FOUND);

    // Repeat search for small substring (<= 8 chars)
    // from new point 'str1' without reloading substring.
    cmpl(cnt2, stride);
    // Have to check that we don't read beyond string.
    jccb(Assembler::lessEqual, ADJUST_STR);

    Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
    // Compare the rest of substring (> 8 chars).
    movptr(str1, result);

    cmpl(tmp, cnt2);
    // First 8 chars are already matched.
    jccb(Assembler::equal, CHECK_NEXT);

    bind(SCAN_SUBSTR);
    pcmpestri(vec, Address(str1, 0), mode);
    // Need to reload strings pointers if not matched whole vector
    jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0

    bind(CHECK_NEXT);
    subl(cnt2, stride);
    jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
    addptr(str1, 16);
    if (ae == StrIntrinsicNode::UL) {
      // Latin1 substring advances by 8 bytes per 8-char chunk.
      addptr(str2, 8);
    } else {
      addptr(str2, 16);
    }
    subl(cnt1, stride);
    cmpl(cnt2, stride); // Do not read beyond substring
    jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
    // Back-up strings to avoid reading beyond substring.

    if (ae == StrIntrinsicNode::UL) {
      lea(str2, Address(str2, cnt2, scale2, -8));
      lea(str1, Address(str1, cnt2, scale1, -16));
    } else {
      lea(str2, Address(str2, cnt2, scale2, -16));
      lea(str1, Address(str1, cnt2, scale1, -16));
    }
    subl(cnt1, cnt2);
    movl(cnt2, stride);
    addl(cnt1, stride);
    bind(CONT_SCAN_SUBSTR);
    if (ae == StrIntrinsicNode::UL) {
      pmovzxbw(vec, Address(str2, 0));
    } else {
      movdqu(vec, Address(str2, 0));
    }
    jmp(SCAN_SUBSTR);

    bind(RET_FOUND_LONG);
    // Restore the (possibly stack-copied) string base for offset math.
    movptr(str1, Address(rsp, wordSize));
  } // non constant

  bind(RET_FOUND);
  // Compute substr offset
  subptr(result, str1);
  if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
    shrl(result, 1); // index
  }
  bind(CLEANUP);
  pop(rsp); // restore SP

} // string_indexof
3278
// Find the first occurrence of the 16-bit char 'ch' in the UTF-16 array at
// str1 of length cnt1 (in chars). Sets 'result' to the char index, or -1.
// Bulk of the search uses 256-bit (AVX2) and/or 128-bit vector compares;
// the remaining tail is handled by a scalar loop. Clobbers cnt1, ch, tmp
// and vec1-vec3.
void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
                                            XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");

  int stride = 8;

  Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
        SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
        RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
        FOUND_SEQ_CHAR, DONE_LABEL;

  movptr(result, str1);
  if (UseAVX >= 2) {
    cmpl(cnt1, stride);
    jcc(Assembler::less, SCAN_TO_CHAR);
    cmpl(cnt1, 2*stride);
    jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
    // Broadcast 'ch' to all 16 char lanes; vec2 stays zero for the tests.
    movdl(vec1, ch);
    vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
    vpxor(vec2, vec2);
    movl(tmp, cnt1);
    andl(tmp, 0xFFFFFFF0); //vector count (in chars)
    andl(cnt1,0x0000000F); //tail count (in chars)

    // 16-chars-at-a-time loop.
    bind(SCAN_TO_16_CHAR_LOOP);
    vmovdqu(vec3, Address(result, 0));
    vpcmpeqw(vec3, vec3, vec1, 1);
    // With vec2 == 0, vptest clears CF only when vec3 is non-zero, i.e.
    // when some lane compared equal.
    vptest(vec2, vec3);
    jcc(Assembler::carryClear, FOUND_CHAR);
    addptr(result, 32);
    subl(tmp, 2*stride);
    jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
    jmp(SCAN_TO_8_CHAR);
    bind(SCAN_TO_8_CHAR_INIT);
    // Broadcast 'ch' to all 8 char lanes of a 128-bit register.
    movdl(vec1, ch);
    pshuflw(vec1, vec1, 0x00);
    pshufd(vec1, vec1, 0);
    pxor(vec2, vec2);
  }
  bind(SCAN_TO_8_CHAR);
  cmpl(cnt1, stride);
  jcc(Assembler::less, SCAN_TO_CHAR);
  if (UseAVX < 2) {
    // No AVX2: the 128-bit broadcast was not set up above - do it here.
    movdl(vec1, ch);
    pshuflw(vec1, vec1, 0x00);
    pshufd(vec1, vec1, 0);
    pxor(vec2, vec2);
  }
  movl(tmp, cnt1);
  andl(tmp, 0xFFFFFFF8); //vector count (in chars)
  andl(cnt1,0x00000007); //tail count (in chars)

  // 8-chars-at-a-time loop.
  bind(SCAN_TO_8_CHAR_LOOP);
  movdqu(vec3, Address(result, 0));
  pcmpeqw(vec3, vec1);
  ptest(vec2, vec3);
  jcc(Assembler::carryClear, FOUND_CHAR);
  addptr(result, 16);
  subl(tmp, stride);
  jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
  // Scalar tail: compare one char at a time.
  bind(SCAN_TO_CHAR);
  testl(cnt1, cnt1);
  jcc(Assembler::zero, RET_NOT_FOUND);
  bind(SCAN_TO_CHAR_LOOP);
  load_unsigned_short(tmp, Address(result, 0));
  cmpl(ch, tmp);
  jccb(Assembler::equal, FOUND_SEQ_CHAR);
  addptr(result, 2);
  subl(cnt1, 1);
  jccb(Assembler::zero, RET_NOT_FOUND);
  jmp(SCAN_TO_CHAR_LOOP);

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmpb(DONE_LABEL);

  bind(FOUND_CHAR);
  // Locate the first matching lane via the compare-result byte mask.
  if (UseAVX >= 2) {
    vpmovmskb(tmp, vec3);
  } else {
    pmovmskb(tmp, vec3);
  }
  bsfl(ch, tmp);
  addptr(result, ch);

  bind(FOUND_SEQ_CHAR);
  // Convert byte offset into char index.
  subptr(result, str1);
  shrl(result, 1);

  bind(DONE_LABEL);
} // string_indexof_char
3371
// Latin1 variant of string_indexof_char: find the first occurrence of the
// byte 'ch' in the byte array at str1 of length cnt1 (in bytes). Sets
// 'result' to the byte index, or -1. Clobbers cnt1, ch, tmp and vec1-vec3.
void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");

  int stride = 16;

  Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
        SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
        RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
        FOUND_SEQ_CHAR, DONE_LABEL;

  movptr(result, str1);
  if (UseAVX >= 2) {
    cmpl(cnt1, stride);
    jcc(Assembler::less, SCAN_TO_CHAR_INIT);
    cmpl(cnt1, stride*2);
    jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
    // Broadcast 'ch' to all 32 byte lanes; vec2 stays zero for the tests.
    movdl(vec1, ch);
    vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
    vpxor(vec2, vec2);
    movl(tmp, cnt1);
    andl(tmp, 0xFFFFFFE0); //vector count (in chars)
    andl(cnt1,0x0000001F); //tail count (in chars)

    // 32-bytes-at-a-time loop.
    bind(SCAN_TO_32_CHAR_LOOP);
    vmovdqu(vec3, Address(result, 0));
    vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
    // With vec2 == 0, vptest clears CF only when vec3 is non-zero, i.e.
    // when some lane compared equal.
    vptest(vec2, vec3);
    jcc(Assembler::carryClear, FOUND_CHAR);
    addptr(result, 32);
    subl(tmp, stride*2);
    jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
    jmp(SCAN_TO_16_CHAR);

    bind(SCAN_TO_16_CHAR_INIT);
    // Broadcast 'ch' to all 16 byte lanes (pshufb with a zero mask).
    movdl(vec1, ch);
    pxor(vec2, vec2);
    pshufb(vec1, vec2);
  }

  bind(SCAN_TO_16_CHAR);
  cmpl(cnt1, stride);
  jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left
  if (UseAVX < 2) {
    // No AVX2: the 128-bit broadcast was not set up above - do it here.
    movdl(vec1, ch);
    pxor(vec2, vec2);
    pshufb(vec1, vec2);
  }
  movl(tmp, cnt1);
  andl(tmp, 0xFFFFFFF0); //vector count (in bytes)
  andl(cnt1,0x0000000F); //tail count (in bytes)

  // 16-bytes-at-a-time loop.
  bind(SCAN_TO_16_CHAR_LOOP);
  movdqu(vec3, Address(result, 0));
  pcmpeqb(vec3, vec1);
  ptest(vec2, vec3);
  jcc(Assembler::carryClear, FOUND_CHAR);
  addptr(result, 16);
  subl(tmp, stride);
  jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items...

  // Scalar tail: compare one byte at a time.
  bind(SCAN_TO_CHAR_INIT);
  testl(cnt1, cnt1);
  jcc(Assembler::zero, RET_NOT_FOUND);
  bind(SCAN_TO_CHAR_LOOP);
  load_unsigned_byte(tmp, Address(result, 0));
  cmpl(ch, tmp);
  jccb(Assembler::equal, FOUND_SEQ_CHAR);
  addptr(result, 1);
  subl(cnt1, 1);
  jccb(Assembler::zero, RET_NOT_FOUND);
  jmp(SCAN_TO_CHAR_LOOP);

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmpb(DONE_LABEL);

  bind(FOUND_CHAR);
  // Locate the first matching lane via the compare-result byte mask.
  if (UseAVX >= 2) {
    vpmovmskb(tmp, vec3);
  } else {
    pmovmskb(tmp, vec3);
  }
  bsfl(ch, tmp);
  addptr(result, ch);

  bind(FOUND_SEQ_CHAR);
  // Byte offset is the index for latin1 - no shift needed.
  subptr(result, str1);

  bind(DONE_LABEL);
} // stringL_indexof_char
3464
3465 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
3466 switch (eltype) {
3467 case T_BOOLEAN: return sizeof(jboolean);
3468 case T_BYTE: return sizeof(jbyte);
3469 case T_SHORT: return sizeof(jshort);
3470 case T_CHAR: return sizeof(jchar);
3471 case T_INT: return sizeof(jint);
3472 default:
3473 ShouldNotReachHere();
3474 return -1;
3475 }
3476 }
3477
3478 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
3479 switch (eltype) {
3480 // T_BOOLEAN used as surrogate for unsigned byte
3481 case T_BOOLEAN: movzbl(dst, src); break;
3482 case T_BYTE: movsbl(dst, src); break;
3483 case T_SHORT: movswl(dst, src); break;
3484 case T_CHAR: movzwl(dst, src); break;
3485 case T_INT: movl(dst, src); break;
3486 default:
3487 ShouldNotReachHere();
3488 }
3489 }
3490
3491 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) {
3492 load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
3493 }
3494
3495 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) {
3496 load_vector(eltype, dst, src, arrays_hashcode_elsize(eltype) * 8);
3497 }
3498
// Widens the packed elements in 'dst' (as loaded by arrays_hashcode_elvload)
// to packed 32-bit ints in place, using the sign- or zero-extending cast
// appropriate for 'eltype'. T_INT elements are already 32-bit, so nothing
// is emitted for them.
void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) {
  const int vlen = Assembler::AVX_256bit;
  switch (eltype) {
  case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT); break;  // zero-extend bytes
  case T_BYTE: vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT); break;       // sign-extend bytes
  case T_SHORT: vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break;     // sign-extend shorts
  case T_CHAR: vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break;    // zero-extend shorts (char is unsigned)
  case T_INT:
    // do nothing
    break;
  default:
    ShouldNotReachHere();
  }
}
3513
// Intrinsic for the 31-based polynomial hash over 'cnt1' elements of 'ary1'
// with element type 'eltype', folding into the incoming value of 'result'
// (Arrays.hashCode-style). Requires AVX2. Structure: a vectorized main loop
// consuming 32 elements per iteration (powers of 31 taken from the
// arrays_hashcode_powers_of_31 stub table), then a 2-elements-per-iteration
// scalar loop, then a final single-element step. All of ary1, cnt1, index,
// tmp2, tmp3 and the vector temporaries are clobbered.
void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result,
                                        Register index, Register tmp2, Register tmp3, XMMRegister vnext,
                                        XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3,
                                        XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3,
                                        XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3,
                                        BasicType eltype) {
  ShortBranchVerifier sbv(this);
  assert(UseAVX >= 2, "AVX2 intrinsics are required");
  assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3);
  assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3);

  Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN,
        SHORT_UNROLLED_LOOP_EXIT,
        UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME,
        UNROLLED_VECTOR_LOOP_BEGIN,
        END;
  switch (eltype) {
  case T_BOOLEAN: BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
  case T_CHAR:    BLOCK_COMMENT("arrays_hashcode(char) {");          break;
  case T_BYTE:    BLOCK_COMMENT("arrays_hashcode(byte) {");          break;
  case T_SHORT:   BLOCK_COMMENT("arrays_hashcode(short) {");         break;
  case T_INT:     BLOCK_COMMENT("arrays_hashcode(int) {");           break;
  default:        BLOCK_COMMENT("arrays_hashcode {");                break;
  }

  // For "renaming" for readability of the code
  const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 },
                    vresult[] = { vresult0, vresult1, vresult2, vresult3 },
                    vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 };

  const int elsize = arrays_hashcode_elsize(eltype);

  /*
    if (cnt1 >= 2) {
      if (cnt1 >= 32) {
        UNROLLED VECTOR LOOP
      }
      UNROLLED SCALAR LOOP
    }
    SINGLE SCALAR
   */

  cmpl(cnt1, 32);
  jcc(Assembler::less, SHORT_UNROLLED_BEGIN);

  // cnt1 >= 32 && generate_vectorized_loop
  xorl(index, index);

  // vresult = IntVector.zero(I256);
  for (int idx = 0; idx < 4; idx++) {
    vpxor(vresult[idx], vresult[idx]);
  }
  // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]);
  Register bound = tmp2;
  Register next = tmp3;
  lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint))));
  movl(next, Address(tmp2, 0));
  movdl(vnext, next);
  vpbroadcastd(vnext, vnext, Assembler::AVX_256bit);

  // index = 0;
  // bound = cnt1 & ~(32 - 1);
  movl(bound, cnt1);
  andl(bound, ~(32 - 1));
  // for (; index < bound; index += 32) {
  bind(UNROLLED_VECTOR_LOOP_BEGIN);
  // result *= next;
  imull(result, next);
  // loop fission to upfront the cost of fetching from memory, OOO execution
  // can then hopefully do a better job of prefetching
  for (int idx = 0; idx < 4; idx++) {
    arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype);
  }
  // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7];
  for (int idx = 0; idx < 4; idx++) {
    vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit);
    arrays_hashcode_elvcast(vtmp[idx], eltype);
    vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit);
  }
  // index += 32;
  addl(index, 32);
  // index < bound;
  cmpl(index, bound);
  jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN);
  // }

  lea(ary1, Address(ary1, bound, Address::times(elsize)));
  subl(cnt1, bound);
  // release bound

  // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1);
  for (int idx = 0; idx < 4; idx++) {
    lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint))));
    arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT);
    vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit);
  }
  // result += vresult.reduceLanes(ADD);
  for (int idx = 0; idx < 4; idx++) {
    reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]);
  }

  // } else if (cnt1 < 32) {

  bind(SHORT_UNROLLED_BEGIN);
  // int i = 1;
  movl(index, 1);
  cmpl(index, cnt1);
  jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT);

  // for (; i < cnt1 ; i += 2) {
  bind(SHORT_UNROLLED_LOOP_BEGIN);
  movl(tmp3, 961);  // 961 == 31^2: advance the hash by two elements per iteration
  imull(result, tmp3);
  arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype);
  // tmp3 = tmp2 * 31, computed as (tmp2 << 5) - tmp2
  movl(tmp3, tmp2);
  shll(tmp3, 5);
  subl(tmp3, tmp2);
  addl(result, tmp3);
  arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype);
  addl(result, tmp3);
  addl(index, 2);
  cmpl(index, cnt1);
  jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN);

  // }
  // if (i >= cnt1) {
  bind(SHORT_UNROLLED_LOOP_EXIT);
  // NB: flags are still set by the last cmpl(index, cnt1) above; if
  // index > cnt1 the two-element loop consumed everything, otherwise
  // (index == cnt1) exactly one trailing element remains.
  jccb(Assembler::greater, END);
  // result = result * 31 + last element, with result * 31 computed as
  // (result << 5) - result
  movl(tmp2, result);
  shll(result, 5);
  subl(result, tmp2);
  arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype);
  addl(result, tmp3);
  // }
  bind(END);

  BLOCK_COMMENT("} // arrays_hashcode");

} // arrays_hashcode
3653
3654 // helper function for string_compare
3655 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3656 Address::ScaleFactor scale, Address::ScaleFactor scale1,
3657 Address::ScaleFactor scale2, Register index, int ae) {
3658 if (ae == StrIntrinsicNode::LL) {
3659 load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3660 load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3661 } else if (ae == StrIntrinsicNode::UU) {
3662 load_unsigned_short(elem1, Address(str1, index, scale, 0));
3663 load_unsigned_short(elem2, Address(str2, index, scale, 0));
3664 } else {
3665 load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3666 load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3667 }
3668 }
3669
// Compare strings, used for char[] and byte[].
// Lexicographically compares the two strings and returns in 'result' the
// difference of the first mismatching elements, or, when one string is a
// prefix of the other, the difference of the lengths. 'ae' encodes the
// argument encodings (StrIntrinsicNode::LL/UU/LU/UL); for UL the result is
// negated at the end (see DONE_LABEL). On the SSE4.2/AVX2 paths
// result==rax, cnt2==rdx and cnt1==rcx are required by pcmpestri.
// Clobbers str1, str2, cnt1, cnt2 and vec1; 'mask' is used only on
// AVX-512 (avx512vlbw) capable hardware.
void C2_MacroAssembler::string_compare(Register str1, Register str2,
                                       Register cnt1, Register cnt2, Register result,
                                       XMMRegister vec1, int ae, KRegister mask) {
  ShortBranchVerifier sbv(this);
  Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
  Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only AVX3
  int stride, stride2, adr_stride, adr_stride1, adr_stride2;
  int stride2x2 = 0x40;
  Address::ScaleFactor scale = Address::no_scale;
  Address::ScaleFactor scale1 = Address::no_scale;
  Address::ScaleFactor scale2 = Address::no_scale;

  if (ae != StrIntrinsicNode::LL) {
    stride2x2 = 0x20;
  }

  if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
    // Divide length by 2 to get number of chars
    shrl(cnt2, 1);
  }
  // Compute the minimum of the string lengths and the
  // difference of the string lengths (stack).
  // Do the conditional move stuff
  movl(result, cnt1);
  subl(cnt1, cnt2);
  push(cnt1);  // length difference, popped at LENGTH_DIFF_LABEL/POP_LABEL
  cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2)

  // Is the minimum length zero?
  testl(cnt2, cnt2);
  jcc(Assembler::zero, LENGTH_DIFF_LABEL);
  if (ae == StrIntrinsicNode::LL) {
    // Load first bytes
    load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
    load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1 = str2[0]
  } else if (ae == StrIntrinsicNode::UU) {
    // Load first characters
    load_unsigned_short(result, Address(str1, 0));
    load_unsigned_short(cnt1, Address(str2, 0));
  } else {
    load_unsigned_byte(result, Address(str1, 0));
    load_unsigned_short(cnt1, Address(str2, 0));
  }
  subl(result, cnt1);
  jcc(Assembler::notZero, POP_LABEL);

  if (ae == StrIntrinsicNode::UU) {
    // Divide length by 2 to get number of chars
    shrl(cnt2, 1);
  }
  cmpl(cnt2, 1);
  jcc(Assembler::equal, LENGTH_DIFF_LABEL);

  // Check if the strings start at the same location and setup scale and stride
  if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
    cmpptr(str1, str2);
    jcc(Assembler::equal, LENGTH_DIFF_LABEL);
    if (ae == StrIntrinsicNode::LL) {
      scale = Address::times_1;
      stride = 16;
    } else {
      scale = Address::times_2;
      stride = 8;
    }
  } else {
    scale1 = Address::times_1;
    scale2 = Address::times_2;
    // scale not used
    stride = 8;
  }

  if (UseAVX >= 2 && UseSSE42Intrinsics) {
    Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
    Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
    Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
    Label COMPARE_TAIL_LONG;
    Label COMPARE_WIDE_VECTORS_LOOP_AVX3; // used only AVX3

    // pcmpestri imm8 0x19: equal-each aggregation, negated result,
    // unsigned word elements; bit 0 is cleared for unsigned bytes (LL).
    int pcmpmask = 0x19;
    if (ae == StrIntrinsicNode::LL) {
      pcmpmask &= ~0x01;
    }

    // Setup to compare 16-chars (32-bytes) vectors,
    // start from first character again because it has aligned address.
    if (ae == StrIntrinsicNode::LL) {
      stride2 = 32;
    } else {
      stride2 = 16;
    }
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      adr_stride = stride << scale;
    } else {
      adr_stride1 = 8;  //stride << scale1;
      adr_stride2 = 16; //stride << scale2;
    }

    assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
    // rax and rdx are used by pcmpestri as elements counters
    movl(result, cnt2);
    andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
    jcc(Assembler::zero, COMPARE_TAIL_LONG);

    // fast path : compare first 2 8-char vectors.
    bind(COMPARE_16_CHARS);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, 0));
    } else {
      pmovzxbw(vec1, Address(str1, 0));
    }
    pcmpestri(vec1, Address(str2, 0), pcmpmask);
    jccb(Assembler::below, COMPARE_INDEX_CHAR);

    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, adr_stride));
      pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
    } else {
      pmovzxbw(vec1, Address(str1, adr_stride1));
      pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
    }
    jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
    addl(cnt1, stride);

    // Compare the characters at index in cnt1
    bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
    load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
    subl(result, cnt2);
    jmp(POP_LABEL);

    // Setup the registers to start vector comparison loop
    bind(COMPARE_WIDE_VECTORS);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    subl(result, stride2);
    subl(cnt2, stride2);
    jcc(Assembler::zero, COMPARE_WIDE_TAIL);
    negptr(result);

    // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
    bind(COMPARE_WIDE_VECTORS_LOOP);

    if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
      cmpl(cnt2, stride2x2);
      jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
      testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40

      bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
      if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
        evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
        evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
      } else {
        vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
        evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
      }
      kortestql(mask, mask);
      jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
      addptr(result, stride2x2);  // update since we already compared at this addr
      subl(cnt2, stride2x2);      // and sub the size too
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);

      vpxor(vec1, vec1);
      jmpb(COMPARE_WIDE_TAIL);
    }//if (VM_Version::supports_avx512vlbw())

    bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      vmovdqu(vec1, Address(str1, result, scale));
      vpxor(vec1, Address(str2, result, scale));
    } else {
      vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
      vpxor(vec1, Address(str2, result, scale2));
    }
    vptest(vec1, vec1);
    jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
    addptr(result, stride2);
    subl(cnt2, stride2);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);

    // compare wide vectors tail
    bind(COMPARE_WIDE_TAIL);
    testptr(result, result);
    jcc(Assembler::zero, LENGTH_DIFF_LABEL);

    movl(result, stride2);
    movl(cnt2, result);
    negptr(result);
    jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);

    // Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors.
    bind(VECTOR_NOT_EQUAL);
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    jmp(COMPARE_16_CHARS);

    // Compare tail chars, length between 1 to 15 chars
    bind(COMPARE_TAIL_LONG);
    movl(cnt2, result);
    cmpl(cnt2, stride);
    jcc(Assembler::less, COMPARE_SMALL_STR);

    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, 0));
    } else {
      pmovzxbw(vec1, Address(str1, 0));
    }
    pcmpestri(vec1, Address(str2, 0), pcmpmask);
    jcc(Assembler::below, COMPARE_INDEX_CHAR);
    subptr(cnt2, stride);
    jcc(Assembler::zero, LENGTH_DIFF_LABEL);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    negptr(cnt2);
    jmpb(WHILE_HEAD_LABEL);

    bind(COMPARE_SMALL_STR);
  } else if (UseSSE42Intrinsics) {
    Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
    int pcmpmask = 0x19;
    // Setup to compare 8-char (16-byte) vectors,
    // start from first character again because it has aligned address.
    movl(result, cnt2);
    andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
    if (ae == StrIntrinsicNode::LL) {
      pcmpmask &= ~0x01;
    }
    jcc(Assembler::zero, COMPARE_TAIL);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    negptr(result);

    // pcmpestri
    //   inputs:
    //     vec1- substring
    //     rax - negative string length (elements count)
    //     mem - scanned string
    //     rdx - string length (elements count)
    //     pcmpmask - cmp mode: 11000 (string compare with negated result)
    //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
    //   outputs:
    //     rcx - first mismatched element index
    assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");

    bind(COMPARE_WIDE_VECTORS);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, result, scale));
      pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
    } else {
      pmovzxbw(vec1, Address(str1, result, scale1));
      pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
    }
    // After pcmpestri cnt1(rcx) contains mismatched element index

    jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
    addptr(result, stride);
    subptr(cnt2, stride);
    jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);

    // compare wide vectors tail
    testptr(result, result);
    jcc(Assembler::zero, LENGTH_DIFF_LABEL);

    movl(cnt2, stride);
    movl(result, stride);
    negptr(result);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, result, scale));
      pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
    } else {
      pmovzxbw(vec1, Address(str1, result, scale1));
      pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
    }
    jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);

    // Mismatched characters in the vectors
    bind(VECTOR_NOT_EQUAL);
    addptr(cnt1, result);
    load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
    subl(result, cnt2);
    jmpb(POP_LABEL);

    bind(COMPARE_TAIL); // limit is zero
    movl(cnt2, result);
    // Fallthru to tail compare
  }
  // Shift str2 and str1 to the end of the arrays, negate min
  if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
    lea(str1, Address(str1, cnt2, scale));
    lea(str2, Address(str2, cnt2, scale));
  } else {
    lea(str1, Address(str1, cnt2, scale1));
    lea(str2, Address(str2, cnt2, scale2));
  }
  decrementl(cnt2);  // first character was compared already
  negptr(cnt2);

  // Compare the rest of the elements
  bind(WHILE_HEAD_LABEL);
  load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
  subl(result, cnt1);
  jccb(Assembler::notZero, POP_LABEL);
  increment(cnt2);
  jccb(Assembler::notZero, WHILE_HEAD_LABEL);

  // Strings are equal up to min length.  Return the length difference.
  bind(LENGTH_DIFF_LABEL);
  pop(result);
  if (ae == StrIntrinsicNode::UU) {
    // Divide diff by 2 to get number of chars
    sarl(result, 1);
  }
  jmpb(DONE_LABEL);

  if (VM_Version::supports_avx512vlbw()) {

    bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);

    kmovql(cnt1, mask);
    notq(cnt1);
    bsfq(cnt2, cnt1);  // index of the first mismatching byte in the 64-byte block
    if (ae != StrIntrinsicNode::LL) {
      // Divide diff by 2 to get number of chars
      sarl(cnt2, 1);
    }
    addq(result, cnt2);
    if (ae == StrIntrinsicNode::LL) {
      load_unsigned_byte(cnt1, Address(str2, result));
      load_unsigned_byte(result, Address(str1, result));
    } else if (ae == StrIntrinsicNode::UU) {
      load_unsigned_short(cnt1, Address(str2, result, scale));
      load_unsigned_short(result, Address(str1, result, scale));
    } else {
      load_unsigned_short(cnt1, Address(str2, result, scale2));
      load_unsigned_byte(result, Address(str1, result, scale1));
    }
    subl(result, cnt1);
    jmpb(POP_LABEL);
  }//if (VM_Version::supports_avx512vlbw())

  // Discard the stored length difference
  bind(POP_LABEL);
  pop(cnt1);

  // That's it
  bind(DONE_LABEL);
  if(ae == StrIntrinsicNode::UL) {
    negl(result);
  }

}
4044
// Search for Non-ASCII character (Negative byte value) in a byte array,
// return the index of the first such character, otherwise the length
// of the array segment searched.
//   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
//   @IntrinsicCandidate
//   public static int countPositives(byte[] ba, int off, int len) {
//     for (int i = off; i < off + len; i++) {
//       if (ba[i] < 0) {
//         return i - off;
//       }
//     }
//     return len;
//   }
// Uses the widest available vector path (AVX-512 / AVX2 / SSE4.2), then
// narrows to 4-byte, 2-byte and single-byte tail comparisons. Clobbers
// ary1, len, tmp1 and the vector temporaries; masks are used only on the
// AVX-512 path.
void C2_MacroAssembler::count_positives(Register ary1, Register len,
                                        Register result, Register tmp1,
                                        XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
  // rsi: byte array
  // rcx: len
  // rax: result
  ShortBranchVerifier sbv(this);
  assert_different_registers(ary1, len, result, tmp1);
  assert_different_registers(vec1, vec2);
  Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;

  movl(result, len); // copy
  // len == 0
  testl(len, len);
  jcc(Assembler::zero, DONE);

  if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
    VM_Version::supports_avx512vlbw() &&
    VM_Version::supports_bmi2()) {

    Label test_64_loop, test_tail, BREAK_LOOP;
    movl(tmp1, len);
    vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);  // all-zero comparand

    andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F
    andl(len,  0xffffffc0); // vector count (in chars)
    jccb(Assembler::zero, test_tail);

    lea(ary1, Address(ary1, len, Address::times_1));
    negptr(len);

    bind(test_64_loop);
    // Check whether our 64 elements of size byte contain negatives
    evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
    kortestql(mask1, mask1);
    jcc(Assembler::notZero, BREAK_LOOP);

    addptr(len, 64);
    jccb(Assembler::notZero, test_64_loop);

    bind(test_tail);
    // bail out when there is nothing to be done
    testl(tmp1, -1);  // sets ZF iff tmp1 == 0
    jcc(Assembler::zero, DONE);


    // check the tail for absence of negatives
    // ~(~0 << len) applied up to two times (for 32-bit scenario)
    {
      Register tmp3_aliased = len;
      mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
      shlxq(tmp3_aliased, tmp3_aliased, tmp1);
      notq(tmp3_aliased);
      kmovql(mask2, tmp3_aliased);  // mask2 selects the tmp1 tail bytes
    }

    evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
    ktestq(mask1, mask2);
    jcc(Assembler::zero, DONE);

    // do a full check for negative registers in the tail
    movl(len, tmp1); // tmp1 holds low 6-bit from original len;
                     // ary1 already pointing to the right place
    jmpb(TAIL_START);

    bind(BREAK_LOOP);
    // At least one byte in the last 64 byte block was negative.
    // Set up to look at the last 64 bytes as if they were a tail
    lea(ary1, Address(ary1, len, Address::times_1));
    addptr(result, len);
    // Ignore the very last byte: if all others are positive,
    // it must be negative, so we can skip right to the 2+1 byte
    // end comparison at this point
    orl(result, 63);
    movl(len, 63);
    // Fallthru to tail compare
  } else {

    if (UseAVX >= 2) {
      // With AVX2, use 32-byte vector compare
      Label COMPARE_WIDE_VECTORS, BREAK_LOOP;

      // Compare 32-byte vectors
      testl(len, 0xffffffe0);   // vector count (in bytes)
      jccb(Assembler::zero, TAIL_START);

      andl(len, 0xffffffe0);
      lea(ary1, Address(ary1, len, Address::times_1));
      negptr(len);

      movl(tmp1, 0x80808080);   // create mask to test for Unicode chars in vector
      movdl(vec2, tmp1);
      vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);

      bind(COMPARE_WIDE_VECTORS);
      vmovdqu(vec1, Address(ary1, len, Address::times_1));
      vptest(vec1, vec2);  // tests the sign bit of every byte
      jccb(Assembler::notZero, BREAK_LOOP);
      addptr(len, 32);
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);

      testl(result, 0x0000001f);   // any bytes remaining?
      jcc(Assembler::zero, DONE);

      // Quick test using the already prepared vector mask
      movl(len, result);
      andl(len, 0x0000001f);
      vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
      vptest(vec1, vec2);
      jcc(Assembler::zero, DONE);
      // There are zeros, jump to the tail to determine exactly where
      jmpb(TAIL_START);

      bind(BREAK_LOOP);
      // At least one byte in the last 32-byte vector is negative.
      // Set up to look at the last 32 bytes as if they were a tail
      lea(ary1, Address(ary1, len, Address::times_1));
      addptr(result, len);
      // Ignore the very last byte: if all others are positive,
      // it must be negative, so we can skip right to the 2+1 byte
      // end comparison at this point
      orl(result, 31);
      movl(len, 31);
      // Fallthru to tail compare
    } else if (UseSSE42Intrinsics) {
      // With SSE4.2, use double quad vector compare
      Label COMPARE_WIDE_VECTORS, BREAK_LOOP;

      // Compare 16-byte vectors
      testl(len, 0xfffffff0);   // vector count (in bytes)
      jcc(Assembler::zero, TAIL_START);

      andl(len, 0xfffffff0);
      lea(ary1, Address(ary1, len, Address::times_1));
      negptr(len);

      movl(tmp1, 0x80808080);
      movdl(vec2, tmp1);
      pshufd(vec2, vec2, 0);

      bind(COMPARE_WIDE_VECTORS);
      movdqu(vec1, Address(ary1, len, Address::times_1));
      ptest(vec1, vec2);
      jccb(Assembler::notZero, BREAK_LOOP);
      addptr(len, 16);
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);

      testl(result, 0x0000000f); // len is zero, any bytes remaining?
      jcc(Assembler::zero, DONE);

      // Quick test using the already prepared vector mask
      movl(len, result);
      andl(len, 0x0000000f);   // tail count (in bytes)
      movdqu(vec1, Address(ary1, len, Address::times_1, -16));
      ptest(vec1, vec2);
      jcc(Assembler::zero, DONE);
      jmpb(TAIL_START);

      bind(BREAK_LOOP);
      // At least one byte in the last 16-byte vector is negative.
      // Set up and look at the last 16 bytes as if they were a tail
      lea(ary1, Address(ary1, len, Address::times_1));
      addptr(result, len);
      // Ignore the very last byte: if all others are positive,
      // it must be negative, so we can skip right to the 2+1 byte
      // end comparison at this point
      orl(result, 15);
      movl(len, 15);
      // Fallthru to tail compare
    }
  }

  bind(TAIL_START);
  // Compare 4-byte vectors
  andl(len, 0xfffffffc); // vector count (in bytes)
  jccb(Assembler::zero, COMPARE_CHAR);

  lea(ary1, Address(ary1, len, Address::times_1));
  negptr(len);

  bind(COMPARE_VECTORS);
  movl(tmp1, Address(ary1, len, Address::times_1));
  andl(tmp1, 0x80808080);  // any sign bits set in this 4-byte chunk?
  jccb(Assembler::notZero, TAIL_ADJUST);
  addptr(len, 4);
  jccb(Assembler::notZero, COMPARE_VECTORS);

  // Compare trailing char (final 2-3 bytes), if any
  bind(COMPARE_CHAR);

  testl(result, 0x2);   // tail  char
  jccb(Assembler::zero, COMPARE_BYTE);
  load_unsigned_short(tmp1, Address(ary1, 0));
  andl(tmp1, 0x00008080);  // sign bits of the two tail bytes
  jccb(Assembler::notZero, CHAR_ADJUST);
  lea(ary1, Address(ary1, 2));

  bind(COMPARE_BYTE);
  testl(result, 0x1);   // tail  byte
  jccb(Assembler::zero, DONE);
  load_unsigned_byte(tmp1, Address(ary1, 0));
  testl(tmp1, 0x00000080);
  jccb(Assembler::zero, DONE);
  subptr(result, 1);  // last byte is negative: exclude it from the count
  jmpb(DONE);

  bind(TAIL_ADJUST);
  // there are negative bits in the last 4 byte block.
  // Adjust result and check the next three bytes
  addptr(result, len);
  orl(result, 3);
  lea(ary1, Address(ary1, len, Address::times_1));
  jmpb(COMPARE_CHAR);

  bind(CHAR_ADJUST);
  // We are looking at a char + optional byte tail, and found that one
  // of the bytes in the char is negative. Adjust the result, check the
  // first byte and readjust if needed.
  andl(result, 0xfffffffc);
  testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
  jccb(Assembler::notZero, DONE);
  addptr(result, 1);

  // That's it
  bind(DONE);
  if (UseAVX >= 2) {
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    vpxor(vec2, vec2);
  }
}
4289
4290 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
4291 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
4292 Register limit, Register result, Register chr,
4293 XMMRegister vec1, XMMRegister vec2, bool is_char,
4294 KRegister mask, bool expand_ary2) {
4295 // for expand_ary2, limit is the (smaller) size of the second array.
4296 ShortBranchVerifier sbv(this);
4297 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
4298
4299 assert((!expand_ary2) || ((expand_ary2) && (UseAVX == 2)),
4300 "Expansion only implemented for AVX2");
4301
4302 int length_offset = arrayOopDesc::length_offset_in_bytes();
4303 int base_offset = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
4304
4305 Address::ScaleFactor scaleFactor = expand_ary2 ? Address::times_2 : Address::times_1;
4306 int scaleIncr = expand_ary2 ? 8 : 16;
4307
4308 if (is_array_equ) {
4309 // Check the input args
4310 cmpoop(ary1, ary2);
4311 jcc(Assembler::equal, TRUE_LABEL);
4312
4313 // Need additional checks for arrays_equals.
4314 testptr(ary1, ary1);
4315 jcc(Assembler::zero, FALSE_LABEL);
4316 testptr(ary2, ary2);
4317 jcc(Assembler::zero, FALSE_LABEL);
4318
4319 // Check the lengths
4320 movl(limit, Address(ary1, length_offset));
4321 cmpl(limit, Address(ary2, length_offset));
4322 jcc(Assembler::notEqual, FALSE_LABEL);
4323 }
4324
4325 // count == 0
4326 testl(limit, limit);
4327 jcc(Assembler::zero, TRUE_LABEL);
4328
4329 if (is_array_equ) {
4330 // Load array address
4331 lea(ary1, Address(ary1, base_offset));
4332 lea(ary2, Address(ary2, base_offset));
4333 }
4334
4335 if (is_array_equ && is_char) {
4336 // arrays_equals when used for char[].
4337 shll(limit, 1); // byte count != 0
4338 }
4339 movl(result, limit); // copy
4340
4341 if (UseAVX >= 2) {
4342 // With AVX2, use 32-byte vector compare
4343 Label COMPARE_WIDE_VECTORS, COMPARE_WIDE_VECTORS_16, COMPARE_TAIL, COMPARE_TAIL_16;
4344
4345 // Compare 32-byte vectors
4346 if (expand_ary2) {
4347 andl(result, 0x0000000f); // tail count (in bytes)
4348 andl(limit, 0xfffffff0); // vector count (in bytes)
4349 jcc(Assembler::zero, COMPARE_TAIL);
4350 } else {
4351 andl(result, 0x0000001f); // tail count (in bytes)
4352 andl(limit, 0xffffffe0); // vector count (in bytes)
4353 jcc(Assembler::zero, COMPARE_TAIL_16);
4354 }
4355
4356 lea(ary1, Address(ary1, limit, scaleFactor));
4357 lea(ary2, Address(ary2, limit, Address::times_1));
4358 negptr(limit);
4359
4360 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
4361 Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
4362
4363 cmpl(limit, -64);
4364 jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
4365
4366 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
4367
4368 evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
4369 evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
4370 kortestql(mask, mask);
4371 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare
4372 addptr(limit, 64); // update since we already compared at this addr
4373 cmpl(limit, -64);
4374 jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
4375
4376 // At this point we may still need to compare -limit+result bytes.
4377 // We could execute the next two instruction and just continue via non-wide path:
4378 // cmpl(limit, 0);
4379 // jcc(Assembler::equal, COMPARE_TAIL); // true
4380 // But since we stopped at the points ary{1,2}+limit which are
4381 // not farther than 64 bytes from the ends of arrays ary{1,2}+result
4382 // (|limit| <= 32 and result < 32),
4383 // we may just compare the last 64 bytes.
4384 //
4385 addptr(result, -64); // it is safe, bc we just came from this area
4386 evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
4387 evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
4388 kortestql(mask, mask);
4389 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare
4390
4391 jmp(TRUE_LABEL);
4392
4393 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
4394
4395 }//if (VM_Version::supports_avx512vlbw())
4396
4397 bind(COMPARE_WIDE_VECTORS);
4398 vmovdqu(vec1, Address(ary1, limit, scaleFactor));
4399 if (expand_ary2) {
4400 vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_256bit);
4401 } else {
4402 vmovdqu(vec2, Address(ary2, limit, Address::times_1));
4403 }
4404 vpxor(vec1, vec2);
4405
4406 vptest(vec1, vec1);
4407 jcc(Assembler::notZero, FALSE_LABEL);
4408 addptr(limit, scaleIncr * 2);
4409 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4410
4411 testl(result, result);
4412 jcc(Assembler::zero, TRUE_LABEL);
4413
4414 vmovdqu(vec1, Address(ary1, result, scaleFactor, -32));
4415 if (expand_ary2) {
4416 vpmovzxbw(vec2, Address(ary2, result, Address::times_1, -16), Assembler::AVX_256bit);
4417 } else {
4418 vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
4419 }
4420 vpxor(vec1, vec2);
4421
4422 vptest(vec1, vec1);
4423 jcc(Assembler::notZero, FALSE_LABEL);
4424 jmp(TRUE_LABEL);
4425
4426 bind(COMPARE_TAIL_16); // limit is zero
4427 movl(limit, result);
4428
4429 // Compare 16-byte chunks
4430 andl(result, 0x0000000f); // tail count (in bytes)
4431 andl(limit, 0xfffffff0); // vector count (in bytes)
4432 jcc(Assembler::zero, COMPARE_TAIL);
4433
4434 lea(ary1, Address(ary1, limit, scaleFactor));
4435 lea(ary2, Address(ary2, limit, Address::times_1));
4436 negptr(limit);
4437
4438 bind(COMPARE_WIDE_VECTORS_16);
4439 movdqu(vec1, Address(ary1, limit, scaleFactor));
4440 if (expand_ary2) {
4441 vpmovzxbw(vec2, Address(ary2, limit, Address::times_1), Assembler::AVX_128bit);
4442 } else {
4443 movdqu(vec2, Address(ary2, limit, Address::times_1));
4444 }
4445 pxor(vec1, vec2);
4446
4447 ptest(vec1, vec1);
4448 jcc(Assembler::notZero, FALSE_LABEL);
4449 addptr(limit, scaleIncr);
4450 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_16);
4451
4452 bind(COMPARE_TAIL); // limit is zero
4453 movl(limit, result);
4454 // Fallthru to tail compare
4455 } else if (UseSSE42Intrinsics) {
4456 // With SSE4.2, use double quad vector compare
4457 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4458
4459 // Compare 16-byte vectors
4460 andl(result, 0x0000000f); // tail count (in bytes)
4461 andl(limit, 0xfffffff0); // vector count (in bytes)
4462 jcc(Assembler::zero, COMPARE_TAIL);
4463
4464 lea(ary1, Address(ary1, limit, Address::times_1));
4465 lea(ary2, Address(ary2, limit, Address::times_1));
4466 negptr(limit);
4467
4468 bind(COMPARE_WIDE_VECTORS);
4469 movdqu(vec1, Address(ary1, limit, Address::times_1));
4470 movdqu(vec2, Address(ary2, limit, Address::times_1));
4471 pxor(vec1, vec2);
4472
4473 ptest(vec1, vec1);
4474 jcc(Assembler::notZero, FALSE_LABEL);
4475 addptr(limit, 16);
4476 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4477
4478 testl(result, result);
4479 jcc(Assembler::zero, TRUE_LABEL);
4480
4481 movdqu(vec1, Address(ary1, result, Address::times_1, -16));
4482 movdqu(vec2, Address(ary2, result, Address::times_1, -16));
4483 pxor(vec1, vec2);
4484
4485 ptest(vec1, vec1);
4486 jccb(Assembler::notZero, FALSE_LABEL);
4487 jmpb(TRUE_LABEL);
4488
4489 bind(COMPARE_TAIL); // limit is zero
4490 movl(limit, result);
4491 // Fallthru to tail compare
4492 }
4493
4494 // Compare 4-byte vectors
4495 if (expand_ary2) {
4496 testl(result, result);
4497 jccb(Assembler::zero, TRUE_LABEL);
4498 } else {
4499 andl(limit, 0xfffffffc); // vector count (in bytes)
4500 jccb(Assembler::zero, COMPARE_CHAR);
4501 }
4502
4503 lea(ary1, Address(ary1, limit, scaleFactor));
4504 lea(ary2, Address(ary2, limit, Address::times_1));
4505 negptr(limit);
4506
4507 bind(COMPARE_VECTORS);
4508 if (expand_ary2) {
4509 // There are no "vector" operations for bytes to shorts
4510 movzbl(chr, Address(ary2, limit, Address::times_1));
4511 cmpw(Address(ary1, limit, Address::times_2), chr);
4512 jccb(Assembler::notEqual, FALSE_LABEL);
4513 addptr(limit, 1);
4514 jcc(Assembler::notZero, COMPARE_VECTORS);
4515 jmp(TRUE_LABEL);
4516 } else {
4517 movl(chr, Address(ary1, limit, Address::times_1));
4518 cmpl(chr, Address(ary2, limit, Address::times_1));
4519 jccb(Assembler::notEqual, FALSE_LABEL);
4520 addptr(limit, 4);
4521 jcc(Assembler::notZero, COMPARE_VECTORS);
4522 }
4523
4524 // Compare trailing char (final 2 bytes), if any
4525 bind(COMPARE_CHAR);
4526 testl(result, 0x2); // tail char
4527 jccb(Assembler::zero, COMPARE_BYTE);
4528 load_unsigned_short(chr, Address(ary1, 0));
4529 load_unsigned_short(limit, Address(ary2, 0));
4530 cmpl(chr, limit);
4531 jccb(Assembler::notEqual, FALSE_LABEL);
4532
4533 if (is_array_equ && is_char) {
4534 bind(COMPARE_BYTE);
4535 } else {
4536 lea(ary1, Address(ary1, 2));
4537 lea(ary2, Address(ary2, 2));
4538
4539 bind(COMPARE_BYTE);
4540 testl(result, 0x1); // tail byte
4541 jccb(Assembler::zero, TRUE_LABEL);
4542 load_unsigned_byte(chr, Address(ary1, 0));
4543 load_unsigned_byte(limit, Address(ary2, 0));
4544 cmpl(chr, limit);
4545 jccb(Assembler::notEqual, FALSE_LABEL);
4546 }
4547 bind(TRUE_LABEL);
4548 movl(result, 1); // return true
4549 jmpb(DONE);
4550
4551 bind(FALSE_LABEL);
4552 xorl(result, result); // return false
4553
4554 // That's it
4555 bind(DONE);
4556 if (UseAVX >= 2) {
4557 // clean upper bits of YMM registers
4558 vpxor(vec1, vec1);
4559 vpxor(vec2, vec2);
4560 }
4561 }
4562
4563 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) {
4564 #define __ masm.
4565 Register dst = stub.data<0>();
4566 XMMRegister src = stub.data<1>();
4567 address target = stub.data<2>();
4568 __ bind(stub.entry());
4569 __ subptr(rsp, 8);
4570 __ movdbl(Address(rsp), src);
4571 __ call(RuntimeAddress(target));
4572 // APX REX2 encoding for pop(dst) increases the stub size by 1 byte.
4573 __ pop(dst);
4574 __ jmp(stub.continuation());
4575 #undef __
4576 }
4577
4578 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) {
4579 assert(dst_bt == T_INT || dst_bt == T_LONG, "");
4580 assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, "");
4581
4582 address slowpath_target;
4583 if (dst_bt == T_INT) {
4584 if (src_bt == T_FLOAT) {
4585 cvttss2sil(dst, src);
4586 cmpl(dst, 0x80000000);
4587 slowpath_target = StubRoutines::x86::f2i_fixup();
4588 } else {
4589 cvttsd2sil(dst, src);
4590 cmpl(dst, 0x80000000);
4591 slowpath_target = StubRoutines::x86::d2i_fixup();
4592 }
4593 } else {
4594 if (src_bt == T_FLOAT) {
4595 cvttss2siq(dst, src);
4596 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4597 slowpath_target = StubRoutines::x86::f2l_fixup();
4598 } else {
4599 cvttsd2siq(dst, src);
4600 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip()));
4601 slowpath_target = StubRoutines::x86::d2l_fixup();
4602 }
4603 }
4604
4605 // Using the APX extended general purpose registers increases the instruction encoding size by 1 byte.
4606 int max_size = 23 + (UseAPX ? 1 : 0);
4607 auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, max_size, convertF2I_slowpath);
4608 jcc(Assembler::equal, stub->entry());
4609 bind(stub->continuation());
4610 }
4611
// Emit a masked (AVX-512) vector shift or rotate with an immediate count.
// 'mask' selects the active lanes; 'merge' is forwarded to the assembler and
// chooses merge- vs zero-masking of inactive lanes (EVEX convention). 'eType'
// is consulted only by the rotate cases, which dispatch on element type
// inside the evrord/evrold helpers.
void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
                                    XMMRegister src1, int imm8, bool merge, int vlen_enc) {
  switch(ideal_opc) {
    // Left shifts (short/int/long lanes).
    case Op_LShiftVS:
      Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_LShiftVI:
      Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_LShiftVL:
      Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break;
    // Arithmetic (sign-propagating) right shifts.
    case Op_RShiftVS:
      Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_RShiftVI:
      Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_RShiftVL:
      Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break;
    // Logical (zero-filling) right shifts.
    case Op_URShiftVS:
      Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_URShiftVI:
      Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_URShiftVL:
      Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break;
    // Rotates dispatch on eType inside the helper.
    case Op_RotateRightV:
      evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
    case Op_RotateLeftV:
      evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break;
    default:
      fatal("Unsupported operation %s", NodeClassNames[ideal_opc]);
      break;
  }
}
4642
4643 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4644 XMMRegister src2, bool is_unsigned, bool merge, int vlen_enc) {
4645 if (is_unsigned) {
4646 evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4647 } else {
4648 evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4649 }
4650 }
4651
4652 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4653 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
4654 switch (elem_bt) {
4655 case T_BYTE:
4656 if (ideal_opc == Op_SaturatingAddV) {
4657 evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
4658 } else {
4659 assert(ideal_opc == Op_SaturatingSubV, "");
4660 evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
4661 }
4662 break;
4663 case T_SHORT:
4664 if (ideal_opc == Op_SaturatingAddV) {
4665 evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
4666 } else {
4667 assert(ideal_opc == Op_SaturatingSubV, "");
4668 evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
4669 }
4670 break;
4671 default:
4672 fatal("Unsupported type %s", type2name(elem_bt));
4673 break;
4674 }
4675 }
4676
4677 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4678 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) {
4679 switch (elem_bt) {
4680 case T_BYTE:
4681 if (ideal_opc == Op_SaturatingAddV) {
4682 evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
4683 } else {
4684 assert(ideal_opc == Op_SaturatingSubV, "");
4685 evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
4686 }
4687 break;
4688 case T_SHORT:
4689 if (ideal_opc == Op_SaturatingAddV) {
4690 evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
4691 } else {
4692 assert(ideal_opc == Op_SaturatingSubV, "");
4693 evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
4694 }
4695 break;
4696 default:
4697 fatal("Unsupported type %s", type2name(elem_bt));
4698 break;
4699 }
4700 }
4701
4702 void C2_MacroAssembler::evmasked_saturating_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst, XMMRegister src1,
4703 Address src2, bool is_unsigned, bool merge, int vlen_enc) {
4704 if (is_unsigned) {
4705 evmasked_saturating_unsigned_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4706 } else {
4707 evmasked_saturating_signed_op(ideal_opc, elem_bt, mask, dst, src1, src2, merge, vlen_enc);
4708 }
4709 }
4710
4711 void C2_MacroAssembler::evmasked_saturating_signed_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4712 XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4713 switch (elem_bt) {
4714 case T_BYTE:
4715 if (ideal_opc == Op_SaturatingAddV) {
4716 evpaddsb(dst, mask, src1, src2, merge, vlen_enc);
4717 } else {
4718 assert(ideal_opc == Op_SaturatingSubV, "");
4719 evpsubsb(dst, mask, src1, src2, merge, vlen_enc);
4720 }
4721 break;
4722 case T_SHORT:
4723 if (ideal_opc == Op_SaturatingAddV) {
4724 evpaddsw(dst, mask, src1, src2, merge, vlen_enc);
4725 } else {
4726 assert(ideal_opc == Op_SaturatingSubV, "");
4727 evpsubsw(dst, mask, src1, src2, merge, vlen_enc);
4728 }
4729 break;
4730 default:
4731 fatal("Unsupported type %s", type2name(elem_bt));
4732 break;
4733 }
4734 }
4735
4736 void C2_MacroAssembler::evmasked_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, KRegister mask, XMMRegister dst,
4737 XMMRegister src1, Address src2, bool merge, int vlen_enc) {
4738 switch (elem_bt) {
4739 case T_BYTE:
4740 if (ideal_opc == Op_SaturatingAddV) {
4741 evpaddusb(dst, mask, src1, src2, merge, vlen_enc);
4742 } else {
4743 assert(ideal_opc == Op_SaturatingSubV, "");
4744 evpsubusb(dst, mask, src1, src2, merge, vlen_enc);
4745 }
4746 break;
4747 case T_SHORT:
4748 if (ideal_opc == Op_SaturatingAddV) {
4749 evpaddusw(dst, mask, src1, src2, merge, vlen_enc);
4750 } else {
4751 assert(ideal_opc == Op_SaturatingSubV, "");
4752 evpsubusw(dst, mask, src1, src2, merge, vlen_enc);
4753 }
4754 break;
4755 default:
4756 fatal("Unsupported type %s", type2name(elem_bt));
4757 break;
4758 }
4759 }
4760
// Emit a masked (AVX-512) vector operation with two vector-register sources.
// 'mask' selects the active lanes; 'merge' is forwarded to the assembler and
// chooses merge- vs zero-masking of inactive lanes (EVEX convention).
// 'eType' is consulted only by the helpers that dispatch on element type
// (rearrange, rotate, min/max and the bitwise ops). 'is_varshift' tells the
// shift cases whether src2 holds per-lane variable shift counts.
// Note: the unary ops (abs) take their operand in src2, and VectorRearrange
// passes (src2, src1) — the shuffle indices come from src1.
void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
                                    XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc,
                                    bool is_varshift) {
  switch (ideal_opc) {
    // Lane-wise add: integral lanes, then float/double lanes.
    case Op_AddVB:
      evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVS:
      evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVI:
      evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVL:
      evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVF:
      evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVD:
      evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
    // Lane-wise subtract.
    case Op_SubVB:
      evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVS:
      evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVI:
      evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVL:
      evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVF:
      evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVD:
      evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
    // Lane-wise multiply.
    case Op_MulVS:
      evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVI:
      evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVL:
      evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVF:
      evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVD:
      evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
    // Floating point divide and square root.
    case Op_DivVF:
      evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_DivVD:
      evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SqrtVF:
      evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SqrtVD:
      evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break;
    // Unary absolute value: operand is src2.
    case Op_AbsVB:
      evpabsb(dst, mask, src2, merge, vlen_enc); break;
    case Op_AbsVS:
      evpabsw(dst, mask, src2, merge, vlen_enc); break;
    case Op_AbsVI:
      evpabsd(dst, mask, src2, merge, vlen_enc); break;
    case Op_AbsVL:
      evpabsq(dst, mask, src2, merge, vlen_enc); break;
    // Fused multiply-add (213 form).
    case Op_FmaVF:
      evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_FmaVD:
      evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
    // Rearrange: note the swapped operand order (data = src2, indices = src1).
    case Op_VectorRearrange:
      evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break;
    // Shifts; is_varshift selects the per-lane variable-count forms.
    case Op_LShiftVS:
      evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_LShiftVI:
      evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_LShiftVL:
      evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_RShiftVS:
      evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_RShiftVI:
      evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_RShiftVL:
      evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_URShiftVS:
      evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_URShiftVI:
      evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    case Op_URShiftVL:
      evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break;
    // Rotates and min/max dispatch on eType inside their helpers.
    case Op_RotateLeftV:
      evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_RotateRightV:
      evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MaxV:
      evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MinV:
      evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_UMinV:
      evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_UMaxV:
      evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    // Bitwise logic.
    case Op_XorV:
      evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_OrV:
      evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AndV:
      evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    default:
      fatal("Unsupported operation %s", NodeClassNames[ideal_opc]);
      break;
  }
}
4862
// Emit a masked (AVX-512) vector operation whose second source is a memory
// operand. Same conventions as the register-register overload: 'mask'
// selects active lanes, 'merge' chooses merge- vs zero-masking of inactive
// lanes (EVEX convention), and 'eType' is consulted only by the helpers
// that dispatch on element type (min/max and the bitwise ops).
void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
                                    XMMRegister src1, Address src2, bool merge, int vlen_enc) {
  switch (ideal_opc) {
    // Lane-wise add: integral lanes, then float/double lanes.
    case Op_AddVB:
      evpaddb(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVS:
      evpaddw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVI:
      evpaddd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVL:
      evpaddq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVF:
      evaddps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AddVD:
      evaddpd(dst, mask, src1, src2, merge, vlen_enc); break;
    // Lane-wise subtract.
    case Op_SubVB:
      evpsubb(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVS:
      evpsubw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVI:
      evpsubd(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVL:
      evpsubq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVF:
      evsubps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_SubVD:
      evsubpd(dst, mask, src1, src2, merge, vlen_enc); break;
    // Lane-wise multiply.
    case Op_MulVS:
      evpmullw(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVI:
      evpmulld(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVL:
      evpmullq(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVF:
      evmulps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MulVD:
      evmulpd(dst, mask, src1, src2, merge, vlen_enc); break;
    // Floating point divide.
    case Op_DivVF:
      evdivps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_DivVD:
      evdivpd(dst, mask, src1, src2, merge, vlen_enc); break;
    // Fused multiply-add (213 form).
    case Op_FmaVF:
      evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_FmaVD:
      evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break;
    // Min/max dispatch on eType inside their helpers.
    case Op_MaxV:
      evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_MinV:
      evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_UMaxV:
      evpmaxu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_UMinV:
      evpminu(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    // Bitwise logic.
    case Op_XorV:
      evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_OrV:
      evor(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    case Op_AndV:
      evand(eType, dst, mask, src1, src2, merge, vlen_enc); break;
    default:
      fatal("Unsupported operation %s", NodeClassNames[ideal_opc]);
      break;
  }
}
4927
4928 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
4929 KRegister src1, KRegister src2) {
4930 BasicType etype = T_ILLEGAL;
4931 switch(mask_len) {
4932 case 2:
4933 case 4:
4934 case 8: etype = T_BYTE; break;
4935 case 16: etype = T_SHORT; break;
4936 case 32: etype = T_INT; break;
4937 case 64: etype = T_LONG; break;
4938 default: fatal("Unsupported type"); break;
4939 }
4940 assert(etype != T_ILLEGAL, "");
4941 switch(ideal_opc) {
4942 case Op_AndVMask:
4943 kand(etype, dst, src1, src2); break;
4944 case Op_OrVMask:
4945 kor(etype, dst, src1, src2); break;
4946 case Op_XorVMask:
4947 kxor(etype, dst, src1, src2); break;
4948 default:
4949 fatal("Unsupported masked operation"); break;
4950 }
4951 }
4952
4953 /*
4954 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation.
4955 * If src is NaN, the result is 0.
4956 * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE,
4957 * the result is equal to the value of Integer.MIN_VALUE.
4958 * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE,
4959 * the result is equal to the value of Integer.MAX_VALUE.
4960 */
// AVX (non-EVEX) fix-up pass run after a vector float->int truncating cast
// (see vector_castF2X_avx). On entry 'dst' holds the raw conversion result;
// lanes equal to the float_sign_flip constant (0x80000000, the hardware's
// special-value sentinel) came from NaN, infinity or out-of-range sources.
// NaN lanes are set to 0 and positive out-of-range lanes to Integer.MAX_VALUE;
// negative out-of-range lanes keep 0x80000000 == Integer.MIN_VALUE.
void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                   XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
                                                                   Register rscratch, AddressLiteral float_sign_flip,
                                                                   int vec_enc) {
  assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
  Label done;
  // xtmp2 = per-lane mask of special (0x80000000) results; fast exit if none.
  vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch);
  vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
  vptest(xtmp2, xtmp2, vec_enc);
  jccb(Assembler::equal, done);

  // xtmp1 = ~0x80000000 = 0x7fffffff (Integer.MAX_VALUE) in every lane.
  vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc);
  vpxor(xtmp1, xtmp1, xtmp4, vec_enc);

  // Zero the destination lanes whose source is NaN (unordered self-compare).
  vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
  vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc);
  vblendvps(dst, dst, xtmp4, xtmp3, vec_enc);

  // Recompute the mask for remaining special value.
  vpxor(xtmp2, xtmp2, xtmp3, vec_enc);
  // Extract SRC values corresponding to TRUE mask lanes.
  vpand(xtmp4, xtmp2, src, vec_enc);
  // Flip mask bits so that MSB bit of MASK lanes corresponding to +ve special
  // values are set.
  vpxor(xtmp3, xtmp2, xtmp4, vec_enc);

  // Blend Integer.MAX_VALUE into the positive special lanes.
  vblendvps(dst, dst, xtmp1, xtmp3, vec_enc);
  bind(done);
}
4990
// EVEX (opmask-register) variant of the float->int cast fix-up: NaN source
// lanes become 0, positive out-of-range lanes become Integer.MAX_VALUE.
void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                    XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
                                                                    Register rscratch, AddressLiteral float_sign_flip,
                                                                    int vec_enc) {
  assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
  Label done;
  // ktmp1 = lanes holding the special value 0x80000000; fast exit if none.
  evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
  Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
  kortestwl(ktmp1, ktmp1);
  jccb(Assembler::equal, done);

  // ktmp2 = NaN source lanes (unordered self-compare); merge zeros into dst there.
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);

  // ktmp1 = special-but-not-NaN lanes; of those, keep the ones with src >= 0
  // (NLT_UQ against the zero vector). vpternlogd with imm 0x11 computes
  // ~xtmp1, i.e. ~0x80000000 = 0x7fffffff (Integer.MAX_VALUE), which is then
  // merged into those positive out-of-range lanes.
  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}
5012
// EVEX fix-up pass after a vector float->long truncating cast: lanes equal
// to 0x8000000000000000 (double_sign_flip) came from special sources. NaN
// lanes become 0 and non-negative out-of-range lanes become Long.MAX_VALUE.
// The compares use evcmpps because 'src' holds floats, while the data moves
// are quadword-sized to match the long results in 'dst'.
void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
                                                                     Register rscratch, AddressLiteral double_sign_flip,
                                                                     int vec_enc) {
  assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");

  Label done;
  // ktmp1 = lanes holding the special value; fast exit if none.
  evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
  Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc);
  kortestwl(ktmp1, ktmp1);
  jccb(Assembler::equal, done);

  // ktmp2 = NaN source lanes; merge zeros into dst there.
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);

  // ktmp1 = special-but-not-NaN lanes with src >= 0; vpternlogq(0x11) yields
  // ~xtmp1 = 0x7fff... (Long.MAX_VALUE), merged into those lanes.
  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}
5035
// EVEX fix-up pass after a vector double->int truncating cast: 'dst' holds
// int lanes (hence the dword moves/compares) while 'src' still holds doubles
// (hence evcmppd). NaN lanes become 0; non-negative out-of-range lanes
// become Integer.MAX_VALUE.
void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
                                                                     Register rscratch, AddressLiteral float_sign_flip,
                                                                     int vec_enc) {
  assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
  Label done;
  // ktmp1 = lanes holding the special value 0x80000000; fast exit if none.
  evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
  Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
  kortestwl(ktmp1, ktmp1);
  jccb(Assembler::equal, done);

  // ktmp2 = NaN source lanes; merge zeros into dst there.
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);

  // ktmp1 = special-but-not-NaN lanes with src >= 0; vpternlogq(0x11) is a
  // pure bitwise NOT of xtmp1 (element size is irrelevant for NOT), giving
  // 0x7fffffff per dword = Integer.MAX_VALUE, merged into those lanes.
  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}
5057
5058 /*
5059 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation.
5060 * If src is NaN, the result is 0.
5061 * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE,
5062 * the result is equal to the value of Long.MIN_VALUE.
5063 * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
5064 * the result is equal to the value of Long.MAX_VALUE.
5065 */
// EVEX fix-up pass after a vector double->long truncating cast: lanes equal
// to 0x8000000000000000 (double_sign_flip) came from special sources. NaN
// lanes become 0 and non-negative out-of-range lanes become Long.MAX_VALUE.
void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
                                                                      Register rscratch, AddressLiteral double_sign_flip,
                                                                      int vec_enc) {
  assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");

  Label done;
  // ktmp1 = lanes holding the special value; fast exit if none. The dword
  // load form is fine here: k0 with merge=false is a full-register load.
  evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
  evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
  kortestwl(ktmp1, ktmp1);
  jccb(Assembler::equal, done);

  // ktmp2 = NaN source lanes; merge zeros into dst there.
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);

  // ktmp1 = special-but-not-NaN lanes with src >= 0; vpternlogq(0x11) yields
  // ~xtmp1 = 0x7fff... (Long.MAX_VALUE), merged into those lanes.
  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}
5088
5089 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
5090 XMMRegister xtmp, int index, int vec_enc) {
5091 assert(vec_enc < Assembler::AVX_512bit, "");
5092 if (vec_enc == Assembler::AVX_256bit) {
5093 vextractf128_high(xtmp, src);
5094 vshufps(dst, src, xtmp, index, vec_enc);
5095 } else {
5096 vshufps(dst, src, zero, index, vec_enc);
5097 }
5098 }
5099
// AVX (non-EVEX) fix-up pass after a vector double->int truncating cast.
// The packed int results occupy the low 128 bits of 'dst' while 'src' still
// holds the full-width doubles, so each quadword-sized compare mask must be
// packed down to doublewords (vector_crosslane_doubleword_pack_avx) before
// blending with 'dst'. NaN sources become 0; positive out-of-range sources
// become Integer.MAX_VALUE.
void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                                                    XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
                                                                    AddressLiteral float_sign_flip, int src_vec_enc) {
  assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");

  Label done;
  // Compare the destination lanes with float_sign_flip
  // value to get mask for all special values.
  movdqu(xtmp1, float_sign_flip, rscratch);
  vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
  ptest(xtmp2, xtmp2);
  jccb(Assembler::equal, done);

  // Flip float_sign_flip to get max integer value.
  vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
  pxor(xtmp1, xtmp4);

  // Set destination lanes corresponding to unordered source lanes as zero.
  vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
  vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);

  // Shuffle mask vector and pack the lower doubleword from each quadword lane.
  vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
  vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);

  // Recompute the mask for remaining special value.
  pxor(xtmp2, xtmp3);
  // Extract mask corresponding to non-negative source lanes.
  vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);

  // Shuffle mask vector and pack the lower doubleword from each quadword lane.
  vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
  pand(xtmp3, xtmp2);

  // Replace destination lanes holding special value(0x80000000) with max int
  // if corresponding source lane holds a +ve value.
  vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
  bind(done);
}
5139
5140
// Narrow the int lanes of 'dst' in place to 'to_elem_bt' (short or byte).
// Each lane is first masked down to the target width so the unsigned pack
// instructions cannot saturate; on AVX2 the pack instructions operate within
// each 128-bit lane, so a cross-lane shuffle re-compacts the packed result.
// 'zero' must hold an all-zero vector (used as the pack's second source).
void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
                                                   XMMRegister xtmp, Register rscratch, int vec_enc) {
  switch(to_elem_bt) {
    case T_SHORT:
      assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
      // Keep only the low 16 bits of each int, then pack ints -> shorts.
      vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
      vpackusdw(dst, dst, zero, vec_enc);
      if (vec_enc == Assembler::AVX_256bit) {
        vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
      }
      break;
    case T_BYTE:
      assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
      // Keep only the low 8 bits of each int, pack ints -> shorts, fix up
      // lane order on AVX2, then pack shorts -> bytes.
      vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
      vpackusdw(dst, dst, zero, vec_enc);
      if (vec_enc == Assembler::AVX_256bit) {
        vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
      }
      vpackuswb(dst, dst, zero, vec_enc);
      break;
    default: assert(false, "Unexpected basic type for target of vector cast int to subword: %s", type2name(to_elem_bt));
  }
}
5164
5165 /*
5166 * Algorithm for vector D2L and F2I conversions (AVX 10.2 unsupported):-
5167 * a) Perform vector D2L/F2I cast.
5168 * b) Choose fast path if none of the result vector lane contains 0x80000000 value.
5169 * It signifies that source value could be any of the special floating point
5170 * values(NaN,-Inf,Inf,Max,-Min).
5171 * c) Set destination to zero if source is NaN value.
5172 * d) Replace 0x80000000 with MaxInt if source lane contains a +ve value.
5173 */
5174
// Vector cast float -> int/short/byte on AVX (no AVX-512): truncating cast,
// special-value fix-up (NaN -> 0, out-of-range -> Integer.MIN/MAX_VALUE),
// then a narrowing pack when the target is a sub-word type.
void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                           XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
                                           AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
  int to_elem_sz = type2aelembytes(to_elem_bt);
  assert(to_elem_sz <= 4, "");
  vcvttps2dq(dst, src, vec_enc);
  vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc);
  if (to_elem_sz < 4) {
    // Sub-word target: zero xtmp4 for use as the pack's second source.
    vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
    vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc);
  }
}
5187
// Vector cast float -> int/short/byte on EVEX targets: truncating cast,
// special-value fix-up using opmask registers, then an AVX-512 down-convert
// (evpmovdw/evpmovdb) when the target is a sub-word type.
void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                            XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
                                            Register rscratch, int vec_enc) {
  int to_elem_sz = type2aelembytes(to_elem_bt);
  assert(to_elem_sz <= 4, "");
  vcvttps2dq(dst, src, vec_enc);
  vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc);
  switch(to_elem_bt) {
    case T_INT:
      // Already int-sized; nothing further to do.
      break;
    case T_SHORT:
      evpmovdw(dst, dst, vec_enc);
      break;
    case T_BYTE:
      evpmovdb(dst, dst, vec_enc);
      break;
    default: assert(false, "Unexpected basic type for target of vector castF2X EVEX: %s", type2name(to_elem_bt));
  }
}
5207
// Casts a vector of floats to long lanes on AVX512DQ, then patches up
// special-case lanes (NaN/overflow) via the shared EVEX helper.
void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                            KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
                                            Register rscratch, int vec_enc) {
  evcvttps2qq(dst, src, vec_enc);
  vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc);
}
5214
// Handling for downcasting from double to integer or sub-word types on AVX2.
// Converts with vcvttpd2dq (which halves the lane count), fixes up
// special-case lanes, then narrows further when a short/byte target is
// requested. xtmp1..xtmp5 are clobbered.
void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                           XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5,
                                           AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
  int to_elem_sz = type2aelembytes(to_elem_bt);
  assert(to_elem_sz < 8, "");
  vcvttpd2dq(dst, src, vec_enc);
  vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch,
                                              float_sign_flip, vec_enc);
  if (to_elem_sz < 4) {
    // The int result fits in a 128-bit register (double-to-int halves width).
    // xtmp4 holds all zero lanes.
    vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit);
  }
}
5229
// Casts a vector of doubles to long/int/short/byte lanes on AVX512.
// With AVX512DQ the convert goes via 64-bit lanes (evcvttpd2qq) and is then
// narrowed; without it, the convert goes directly to 32-bit lanes and only
// targets up to int width are supported.
void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src,
                                            XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1,
                                            KRegister ktmp2, AddressLiteral sign_flip,
                                            Register rscratch, int vec_enc) {
  if (VM_Version::supports_avx512dq()) {
    evcvttpd2qq(dst, src, vec_enc);
    vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
    switch(to_elem_bt) {
      case T_LONG:
        break;
      case T_INT:
        evpmovsqd(dst, dst, vec_enc);
        break;
      case T_SHORT:
        // Narrow in two steps: long -> int, then int -> short.
        evpmovsqd(dst, dst, vec_enc);
        evpmovdw(dst, dst, vec_enc);
        break;
      case T_BYTE:
        evpmovsqd(dst, dst, vec_enc);
        evpmovdb(dst, dst, vec_enc);
        break;
      default: assert(false, "Unexpected basic type for target of vector castD2X AVX512DQ EVEX: %s", type2name(to_elem_bt));
    }
  } else {
    // No AVX512DQ: convert straight to int lanes; long targets are unreachable.
    assert(type2aelembytes(to_elem_bt) <= 4, "");
    vcvttpd2dq(dst, src, vec_enc);
    vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
    switch(to_elem_bt) {
      case T_INT:
        break;
      case T_SHORT:
        evpmovdw(dst, dst, vec_enc);
        break;
      case T_BYTE:
        evpmovdb(dst, dst, vec_enc);
        break;
      default: assert(false, "Unexpected basic type for target of vector castD2X EVEX: %s", type2name(to_elem_bt));
    }
  }
}
5270
5271 void C2_MacroAssembler::vector_castF2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vec_enc) {
5272 switch(to_elem_bt) {
5273 case T_LONG:
5274 evcvttps2qqs(dst, src, vec_enc);
5275 break;
5276 case T_INT:
5277 evcvttps2dqs(dst, src, vec_enc);
5278 break;
5279 case T_SHORT:
5280 evcvttps2dqs(dst, src, vec_enc);
5281 evpmovdw(dst, dst, vec_enc);
5282 break;
5283 case T_BYTE:
5284 evcvttps2dqs(dst, src, vec_enc);
5285 evpmovdb(dst, dst, vec_enc);
5286 break;
5287 default: assert(false, "Unexpected basic type for target of vector castF2X AVX10 (reg src): %s", type2name(to_elem_bt));
5288 }
5289 }
5290
5291 void C2_MacroAssembler::vector_castF2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, Address src, int vec_enc) {
5292 switch(to_elem_bt) {
5293 case T_LONG:
5294 evcvttps2qqs(dst, src, vec_enc);
5295 break;
5296 case T_INT:
5297 evcvttps2dqs(dst, src, vec_enc);
5298 break;
5299 case T_SHORT:
5300 evcvttps2dqs(dst, src, vec_enc);
5301 evpmovdw(dst, dst, vec_enc);
5302 break;
5303 case T_BYTE:
5304 evcvttps2dqs(dst, src, vec_enc);
5305 evpmovdb(dst, dst, vec_enc);
5306 break;
5307 default: assert(false, "Unexpected basic type for target of vector castF2X AVX10 (mem src): %s", type2name(to_elem_bt));
5308 }
5309 }
5310
5311 void C2_MacroAssembler::vector_castD2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vec_enc) {
5312 switch(to_elem_bt) {
5313 case T_LONG:
5314 evcvttpd2qqs(dst, src, vec_enc);
5315 break;
5316 case T_INT:
5317 evcvttpd2dqs(dst, src, vec_enc);
5318 break;
5319 case T_SHORT:
5320 evcvttpd2dqs(dst, src, vec_enc);
5321 evpmovdw(dst, dst, vec_enc);
5322 break;
5323 case T_BYTE:
5324 evcvttpd2dqs(dst, src, vec_enc);
5325 evpmovdb(dst, dst, vec_enc);
5326 break;
5327 default: assert(false, "Unexpected basic type for target of vector castD2X AVX10 (reg src): %s", type2name(to_elem_bt));
5328 }
5329 }
5330
5331 void C2_MacroAssembler::vector_castD2X_avx10_2(BasicType to_elem_bt, XMMRegister dst, Address src, int vec_enc) {
5332 switch(to_elem_bt) {
5333 case T_LONG:
5334 evcvttpd2qqs(dst, src, vec_enc);
5335 break;
5336 case T_INT:
5337 evcvttpd2dqs(dst, src, vec_enc);
5338 break;
5339 case T_SHORT:
5340 evcvttpd2dqs(dst, src, vec_enc);
5341 evpmovdw(dst, dst, vec_enc);
5342 break;
5343 case T_BYTE:
5344 evcvttpd2dqs(dst, src, vec_enc);
5345 evpmovdb(dst, dst, vec_enc);
5346 break;
5347 default: assert(false, "Unexpected basic type for target of vector castD2X AVX10 (mem src): %s", type2name(to_elem_bt));
5348 }
5349 }
5350
5351 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src,
5352 AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
5353 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
5354 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf.
5355 // and re-instantiate original MXCSR.RC mode after that.
5356 ldmxcsr(new_mxcsr, tmp /*rscratch*/);
5357
5358 mov64(tmp, julong_cast(0.5L));
5359 evpbroadcastq(xtmp1, tmp, vec_enc);
5360 vaddpd(xtmp1, src , xtmp1, vec_enc);
5361 evcvtpd2qq(dst, xtmp1, vec_enc);
5362 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
5363 double_sign_flip, vec_enc);;
5364
5365 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
5366 }
5367
// Rounds each float lane to an int as floor(val + 0.5) on AVX512, by
// temporarily switching MXCSR.RC to round-towards -inf and restoring the
// standard MXCSR state afterwards. Clobbers tmp, xtmp1, xtmp2, ktmp1, ktmp2.
void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src,
                                                AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
                                                Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
  // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf.
  // and re-instantiate original MXCSR.RC mode after that.
  ldmxcsr(new_mxcsr, tmp /*rscratch*/);

  // Broadcast 0.5f and add it to every source lane before the convert.
  movl(tmp, jint_cast(0.5));
  movq(xtmp1, tmp);
  vbroadcastss(xtmp1, xtmp1, vec_enc);
  vaddps(xtmp1, src , xtmp1, vec_enc);
  vcvtps2dq(dst, xtmp1, vec_enc);
  // Handle special-case lanes (e.g. NaN/out-of-range sources) via the shared helper.
  vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
                                              float_sign_flip, vec_enc);

  ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
}
5385
// Rounds each float lane to an int as floor(val + 0.5) on AVX/AVX2 (no
// opmask registers available), using the AVX special-case fixup helper.
// Clobbers tmp and xtmp1..xtmp4.
void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src,
                                               AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
                                               Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) {
  // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf.
  // and re-instantiate original MXCSR.RC mode after that.
  ldmxcsr(new_mxcsr, tmp /*rscratch*/);

  // Broadcast 0.5f and add it to every source lane before the convert.
  movl(tmp, jint_cast(0.5));
  movq(xtmp1, tmp);
  vbroadcastss(xtmp1, xtmp1, vec_enc);
  vaddps(xtmp1, src , xtmp1, vec_enc);
  vcvtps2dq(dst, xtmp1, vec_enc);
  // Handle special-case lanes (e.g. NaN/out-of-range sources) via the shared helper.
  vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc);

  ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
}
5402
5403 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5404 BasicType from_elem_bt, BasicType to_elem_bt) {
5405 switch (from_elem_bt) {
5406 case T_BYTE:
5407 switch (to_elem_bt) {
5408 case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break;
5409 case T_INT: vpmovzxbd(dst, src, vlen_enc); break;
5410 case T_LONG: vpmovzxbq(dst, src, vlen_enc); break;
5411 default: ShouldNotReachHere();
5412 }
5413 break;
5414 case T_SHORT:
5415 switch (to_elem_bt) {
5416 case T_INT: vpmovzxwd(dst, src, vlen_enc); break;
5417 case T_LONG: vpmovzxwq(dst, src, vlen_enc); break;
5418 default: ShouldNotReachHere();
5419 }
5420 break;
5421 case T_INT:
5422 assert(to_elem_bt == T_LONG, "");
5423 vpmovzxdq(dst, src, vlen_enc);
5424 break;
5425 default:
5426 ShouldNotReachHere();
5427 }
5428 }
5429
5430 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
5431 BasicType from_elem_bt, BasicType to_elem_bt) {
5432 switch (from_elem_bt) {
5433 case T_BYTE:
5434 switch (to_elem_bt) {
5435 case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break;
5436 case T_INT: vpmovsxbd(dst, src, vlen_enc); break;
5437 case T_LONG: vpmovsxbq(dst, src, vlen_enc); break;
5438 default: ShouldNotReachHere();
5439 }
5440 break;
5441 case T_SHORT:
5442 switch (to_elem_bt) {
5443 case T_INT: vpmovsxwd(dst, src, vlen_enc); break;
5444 case T_LONG: vpmovsxwq(dst, src, vlen_enc); break;
5445 default: ShouldNotReachHere();
5446 }
5447 break;
5448 case T_INT:
5449 assert(to_elem_bt == T_LONG, "");
5450 vpmovsxdq(dst, src, vlen_enc);
5451 break;
5452 default:
5453 ShouldNotReachHere();
5454 }
5455 }
5456
// Re-types a boolean vector mask between element widths (AVX/AVX2 path only;
// AVX512 masks live in opmask registers). Mask lanes are assumed to hold
// 0 or -1, so widening can use sign extension and narrowing can use
// saturating packs -- TODO confirm against callers.
void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src,
                                         BasicType dst_bt, BasicType src_bt, int vlen) {
  int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen);
  assert(vlen_enc != AVX_512bit, "");

  int dst_bt_size = type2aelembytes(dst_bt);
  int src_bt_size = type2aelembytes(src_bt);
  if (dst_bt_size > src_bt_size) {
    // Widening: sign-extend so 0 stays 0 and -1 stays -1 at the wider width.
    switch (dst_bt_size / src_bt_size) {
      case 2: vpmovsxbw(dst, src, vlen_enc); break;
      case 4: vpmovsxbd(dst, src, vlen_enc); break;
      case 8: vpmovsxbq(dst, src, vlen_enc); break;
      default: ShouldNotReachHere();
    }
  } else {
    // Narrowing: pack with signed saturation (0 -> 0, -1 -> -1). For 256-bit
    // vectors the pack instructions operate per 128-bit lane, so a vpermq is
    // needed to gather the meaningful quadwords into the low half.
    assert(dst_bt_size < src_bt_size, "");
    switch (src_bt_size / dst_bt_size) {
      case 2: {
        if (vlen_enc == AVX_128bit) {
          vpacksswb(dst, src, src, vlen_enc);
        } else {
          vpacksswb(dst, src, src, vlen_enc);
          vpermq(dst, dst, 0x08, vlen_enc);
        }
        break;
      }
      case 4: {
        // Two packing steps: int -> short, then short -> byte.
        if (vlen_enc == AVX_128bit) {
          vpackssdw(dst, src, src, vlen_enc);
          vpacksswb(dst, dst, dst, vlen_enc);
        } else {
          vpackssdw(dst, src, src, vlen_enc);
          vpermq(dst, dst, 0x08, vlen_enc);
          vpacksswb(dst, dst, dst, AVX_128bit);
        }
        break;
      }
      case 8: {
        // long -> byte: first compress each quadword to its low doubleword
        // via shuffle, then pack int -> short -> byte.
        if (vlen_enc == AVX_128bit) {
          vpshufd(dst, src, 0x08, vlen_enc);
          vpackssdw(dst, dst, dst, vlen_enc);
          vpacksswb(dst, dst, dst, vlen_enc);
        } else {
          vpshufd(dst, src, 0x08, vlen_enc);
          vpermq(dst, dst, 0x08, vlen_enc);
          vpackssdw(dst, dst, dst, AVX_128bit);
          vpacksswb(dst, dst, dst, AVX_128bit);
        }
        break;
      }
      default: ShouldNotReachHere();
    }
  }
}
5511
5512 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
5513 bool merge, BasicType bt, int vlen_enc) {
5514 if (bt == T_INT) {
5515 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5516 } else {
5517 assert(bt == T_LONG, "");
5518 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5519 }
5520 }
5521
5522 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
5523 bool merge, BasicType bt, int vlen_enc) {
5524 if (bt == T_INT) {
5525 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
5526 } else {
5527 assert(bt == T_LONG, "");
5528 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
5529 }
5530 }
5531
// Expands up to 64 mask bits held in the GPR src into a byte-per-lane boolean
// vector: each destination byte becomes 0 or 1 according to its mask bit.
// Processes 8 bits per step by depositing them into byte LSBs with PDEP
// (requires BMI2). Clobbers rtmp1, rtmp2 and xtmp.
void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
                                               Register rtmp2, XMMRegister xtmp, int mask_len,
                                               int vec_enc) {
  int index = 0;
  int vindex = 0;
  // Deposit the low 8 mask bits into the least significant bit of each byte.
  mov64(rtmp1, 0x0101010101010101L);
  pdepq(rtmp1, src, rtmp1);
  if (mask_len > 8) {
    // Keep a shiftable copy of the mask bits for the remaining iterations.
    movq(rtmp2, src);
    vpxor(xtmp, xtmp, xtmp, vec_enc);
    movq(xtmp, rtmp1);
  }
  movq(dst, rtmp1);

  mask_len -= 8;
  while (mask_len > 0) {
    assert ((mask_len & 0x7) == 0, "mask must be multiple of 8");
    index++;
    if ((index % 2) == 0) {
      // Starting a fresh 16-byte chunk.
      pxor(xtmp, xtmp);
    }
    // Deposit the next 8 mask bits into byte lanes.
    mov64(rtmp1, 0x0101010101010101L);
    shrq(rtmp2, 8);
    pdepq(rtmp1, rtmp2, rtmp1);
    pinsrq(xtmp, rtmp1, index % 2);
    vindex = index / 2;
    if (vindex) {
      // Write entire 16 byte vector when both 64 bit
      // lanes are updated to save redundant instructions.
      if (index % 2) {
        vinsertf128(dst, dst, xtmp, vindex);
      }
    } else {
      vmovdqu(dst, xtmp);
    }
    mask_len -= 8;
  }
}
5570
// Reduces a vector mask -- already materialized as one bit per lane in tmp --
// to a scalar result in dst for the given VectorMask* operation. tmp is
// clobbered. masklen is the number of valid mask bits.
void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
  switch(opc) {
    case Op_VectorMaskTrueCount:
      // Count of set lanes == population count of the mask bits.
      popcntq(dst, tmp);
      break;
    case Op_VectorMaskLastTrue:
      if (VM_Version::supports_lzcnt()) {
        // Index of highest set bit = 63 - lzcnt(mask); lzcnt of zero is 64,
        // so an empty mask yields -1.
        lzcntq(tmp, tmp);
        movl(dst, 63);
        subl(dst, tmp);
      } else {
        // bsr leaves the destination unmodified for a zero input, so preload
        // -1 and conditionally move the found index in.
        movl(dst, -1);
        bsrq(tmp, tmp);
        cmov32(Assembler::notZero, dst, tmp);
      }
      break;
    case Op_VectorMaskFirstTrue:
      if (VM_Version::supports_bmi1()) {
        if (masklen < 32) {
          // Plant a sentinel bit at position masklen so an empty mask
          // returns masklen rather than the full register width.
          orl(tmp, 1 << masklen);
          tzcntl(dst, tmp);
        } else if (masklen == 32) {
          // tzcnt of a zero 32-bit input returns 32, which is already masklen.
          tzcntl(dst, tmp);
        } else {
          assert(masklen == 64, "");
          tzcntq(dst, tmp);
        }
      } else {
        if (masklen < 32) {
          orl(tmp, 1 << masklen);
          bsfl(dst, tmp);
        } else {
          // bsf leaves the destination unmodified for a zero input, so
          // preload masklen and conditionally overwrite with the found index.
          assert(masklen == 32 || masklen == 64, "");
          movl(dst, masklen);
          if (masklen == 32) {
            bsfl(tmp, tmp);
          } else {
            bsfq(tmp, tmp);
          }
          cmov32(Assembler::notZero, dst, tmp);
        }
      }
      break;
    case Op_VectorMaskToLong:
      // The gathered mask bits are the result; caller arranged dst == tmp.
      assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
      break;
    default: assert(false, "Unhandled mask operation");
  }
}
5620
// Computes a VectorMask* operation from an AVX512 opmask register: the mask
// bits are copied into the scalar register tmp, clipped to masklen where
// required, and then reduced by vector_mask_operation_helper.
void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
                                              int masklen, int masksize, int vec_enc) {
  assert(VM_Version::supports_popcnt(), "");

  if(VM_Version::supports_avx512bw()) {
    kmovql(tmp, mask);
  } else {
    // Without AVX512BW opmask registers hold at most 16 bits.
    assert(masklen <= 16, "");
    kmovwl(tmp, mask);
  }

  // Mask generated out of partial vector comparisons/replicate/mask manipulation
  // operations needs to be clipped. (FirstTrue is exempt: the helper plants its
  // own sentinel bit at position masklen.)
  if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
    andq(tmp, (1 << masklen) - 1);
  }

  vector_mask_operation_helper(opc, dst, tmp, masklen);
}
5640
// Computes a VectorMask* operation from a vector-register mask on AVX/AVX2:
// extracts one bit per lane into tmp with a movmsk variant chosen by element
// type, clips partial masks, then reduces via vector_mask_operation_helper.
void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
                                              Register tmp, int masklen, BasicType bt, int vec_enc) {
  assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) ||
         (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), "");
  assert(VM_Version::supports_popcnt(), "");

  bool need_clip = false;
  switch(bt) {
    case T_BOOLEAN:
      // While masks of other types contain 0, -1; boolean masks contain lane values of 0, 1
      // so negate first (0 - 1 == -1) to set the byte sign bits for vpmovmskb.
      vpxor(xtmp, xtmp, xtmp, vec_enc);
      vpsubb(xtmp, xtmp, mask, vec_enc);
      vpmovmskb(tmp, xtmp, vec_enc);
      need_clip = masklen < 16;
      break;
    case T_BYTE:
      vpmovmskb(tmp, mask, vec_enc);
      need_clip = masklen < 16;
      break;
    case T_SHORT:
      // Pack 16-bit lanes to bytes first, then take the byte sign bits.
      vpacksswb(xtmp, mask, mask, vec_enc);
      if (masklen >= 16) {
        // 256-bit pack works per 128-bit lane; gather meaningful quadwords.
        vpermpd(xtmp, xtmp, 8, vec_enc);
      }
      vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
      need_clip = masklen < 16;
      break;
    case T_INT:
    case T_FLOAT:
      vmovmskps(tmp, mask, vec_enc);
      need_clip = masklen < 4;
      break;
    case T_LONG:
    case T_DOUBLE:
      vmovmskpd(tmp, mask, vec_enc);
      need_clip = masklen < 2;
      break;
    default: assert(false, "Unhandled type, %s", type2name(bt));
  }

  // Mask generated out of partial vector comparisons/replicate/mask manipulation
  // operations needs to be clipped. (FirstTrue is exempt: the helper plants its
  // own sentinel bit at position masklen.)
  if (need_clip && opc != Op_VectorMaskFirstTrue) {
    // need_clip implies masklen < 32
    andq(tmp, (1 << masklen) - 1);
  }

  vector_mask_operation_helper(opc, dst, tmp, masklen);
}
5690
// Compresses the set bits of src towards the least significant positions:
// dst receives popcount(src & ((1 << mask_len) - 1)) contiguous low one-bits.
// Requires BMI2 (PEXT). Clobbers rtmp1 and rtmp2.
void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
                                             Register rtmp2, int mask_len) {
  kmov(rtmp1, src);
  // Discard any mask bits beyond mask_len.
  andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));
  mov64(rtmp2, -1L);
  // PEXT picks one 1-bit from the all-ones source per set mask bit and packs
  // them at the bottom of rtmp2.
  pextq(rtmp2, rtmp2, rtmp1);
  kmov(dst, rtmp2);
}
5699
// AVX2 fallback for CompressV/ExpandV over int/long/float/double lanes.
// The vector mask is reduced to a scalar bitmask which indexes a 32-byte row
// of a precomputed permutation table; the row both permutes the source lanes
// and (via -1 entries) marks lanes that must be zeroed. Clobbers rtmp,
// rscratch, permv and xtmp.
void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src,
                                                    XMMRegister mask, Register rtmp, Register rscratch,
                                                    XMMRegister permv, XMMRegister xtmp, BasicType bt,
                                                    int vec_enc) {
  assert(type2aelembytes(bt) >= 4, "");
  assert(opcode == Op_CompressV || opcode == Op_ExpandV, "");
  address compress_perm_table = nullptr;
  address expand_perm_table = nullptr;
  // Pick the table and movmsk variant matching the lane width.
  if (type2aelembytes(bt) == 8) {
    compress_perm_table = StubRoutines::x86::compress_perm_table64();
    expand_perm_table = StubRoutines::x86::expand_perm_table64();
    vmovmskpd(rtmp, mask, vec_enc);
  } else {
    compress_perm_table = StubRoutines::x86::compress_perm_table32();
    expand_perm_table = StubRoutines::x86::expand_perm_table32();
    vmovmskps(rtmp, mask, vec_enc);
  }
  shlq(rtmp, 5); // for 32 byte permute row.
  if (opcode == Op_CompressV) {
    lea(rscratch, ExternalAddress(compress_perm_table));
  } else {
    lea(rscratch, ExternalAddress(expand_perm_table));
  }
  addptr(rtmp, rscratch);
  vmovdqu(permv, Address(rtmp));
  vpermps(dst, permv, src, Assembler::AVX_256bit);
  vpxor(xtmp, xtmp, xtmp, vec_enc);
  // Blend the result with zero vector using permute mask, each column entry
  // in a permute table row contains either a valid permute index or a -1 (default)
  // value, this can potentially be used as a blending mask after
  // compressing/expanding the source vector lanes.
  vblendvps(dst, dst, xtmp, permv, vec_enc, true, permv);
}
5733
// AVX512 CompressV/ExpandV: dispatches to the native compress/expand
// instruction matching the element type. Byte/short forms require
// AVX512_VBMI2-class support at the call sites that select this path.
void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
                                               bool merge, BasicType bt, int vec_enc) {
  if (opcode == Op_CompressV) {
    switch(bt) {
      case T_BYTE:
        evpcompressb(dst, mask, src, merge, vec_enc);
        break;
      case T_CHAR:
      case T_SHORT:
        evpcompressw(dst, mask, src, merge, vec_enc);
        break;
      case T_INT:
        evpcompressd(dst, mask, src, merge, vec_enc);
        break;
      case T_FLOAT:
        evcompressps(dst, mask, src, merge, vec_enc);
        break;
      case T_LONG:
        evpcompressq(dst, mask, src, merge, vec_enc);
        break;
      case T_DOUBLE:
        evcompresspd(dst, mask, src, merge, vec_enc);
        break;
      default:
        fatal("Unsupported type %s", type2name(bt));
        break;
    }
  } else {
    assert(opcode == Op_ExpandV, "");
    switch(bt) {
      case T_BYTE:
        evpexpandb(dst, mask, src, merge, vec_enc);
        break;
      case T_CHAR:
      case T_SHORT:
        evpexpandw(dst, mask, src, merge, vec_enc);
        break;
      case T_INT:
        evpexpandd(dst, mask, src, merge, vec_enc);
        break;
      case T_FLOAT:
        evexpandps(dst, mask, src, merge, vec_enc);
        break;
      case T_LONG:
        evpexpandq(dst, mask, src, merge, vec_enc);
        break;
      case T_DOUBLE:
        evexpandpd(dst, mask, src, merge, vec_enc);
        break;
      default:
        fatal("Unsupported type %s", type2name(bt));
        break;
    }
  }
}
5789
// Lane-wise Math.signum on AVX512: produces -1.0 for negative lanes, 1.0 for
// positive lanes, and passes NaN, -0.0 and 0.0 through unchanged.
// zero and one hold broadcast 0.0 / 1.0 constants; ktmp1 is clobbered.
void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
                                           KRegister ktmp1, int vec_enc) {
  if (opcode == Op_SignumVD) {
    // dst = 0.0 - 1.0 == -1.0 in every lane.
    vsubpd(dst, zero, one, vec_enc);
    // if src < 0 ? -1 : 1
    evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
    evblendmpd(dst, ktmp1, one, dst, true, vec_enc);
    // if src == NaN, -0.0 or 0.0 return src.
    evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
    evblendmpd(dst, ktmp1, dst, src, true, vec_enc);
  } else {
    assert(opcode == Op_SignumVF, "");
    vsubps(dst, zero, one, vec_enc);
    // if src < 0 ? -1 : 1
    evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
    evblendmps(dst, ktmp1, one, dst, true, vec_enc);
    // if src == NaN, -0.0 or 0.0 return src.
    evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
    evblendmps(dst, ktmp1, dst, src, true, vec_enc);
  }
}
5811
// Lane-wise Math.signum on AVX/AVX2 (no opmask registers): same contract as
// vector_signum_evex but implemented with variable blends. xtmp1 is clobbered.
void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
                                          XMMRegister xtmp1, int vec_enc) {
  if (opcode == Op_SignumVD) {
    // dst = 0.0 - 1.0 == -1.0 in every lane.
    vsubpd(dst, zero, one, vec_enc);
    // if src < 0 ? -1 : 1 (blend selects by the sign bit of src)
    vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1);
    // if src == NaN, -0.0 or 0.0 return src.
    vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
    vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
  } else {
    assert(opcode == Op_SignumVF, "");
    vsubps(dst, zero, one, vec_enc);
    // if src < 0 ? -1 : 1
    vblendvps(dst, one, dst, src, vec_enc, true, xtmp1);
    // if src == NaN, -0.0 or 0.0 return src.
    vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
    vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
  }
}
5831
// Materializes a "maskAll" opmask from the scalar src: the opmask is loaded
// from src and, for mask_len smaller than the loaded width, shifted right so
// that exactly mask_len bits remain significant.
// NOTE(review): assumes callers pass an all-ones/all-zeros pattern in src --
// confirm against the maskAll rule users.
void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
  if (VM_Version::supports_avx512bw()) {
    if (mask_len > 32) {
      kmovql(dst, src);
    } else {
      kmovdl(dst, src);
      if (mask_len != 32) {
        // Drop the (32 - mask_len) excess high bits.
        kshiftrdl(dst, dst, 32 - mask_len);
      }
    }
  } else {
    // Without AVX512BW opmask registers hold at most 16 bits.
    assert(mask_len <= 16, "");
    kmovwl(dst, src);
    if (mask_len != 16) {
      // Drop the (16 - mask_len) excess high bits.
      kshiftrwl(dst, dst, 16 - mask_len);
    }
  }
}
5850
5851 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
5852 int lane_size = type2aelembytes(bt);
5853 if ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
5854 (is_subword_type(bt) && VM_Version::supports_avx512vlbw())) {
5855 movptr(rtmp, imm32);
5856 switch(lane_size) {
5857 case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
5858 case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
5859 case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
5860 case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
5861 fatal("Unsupported lane size %d", lane_size);
5862 break;
5863 }
5864 } else {
5865 movptr(rtmp, imm32);
5866 movq(dst, rtmp);
5867 switch(lane_size) {
5868 case 1 : vpbroadcastb(dst, dst, vec_enc); break;
5869 case 2 : vpbroadcastw(dst, dst, vec_enc); break;
5870 case 4 : vpbroadcastd(dst, dst, vec_enc); break;
5871 case 8 : vpbroadcastq(dst, dst, vec_enc); break;
5872 fatal("Unsupported lane size %d", lane_size);
5873 break;
5874 }
5875 }
5876 }
5877
5878 //
5879 // Following is lookup table based popcount computation algorithm:-
5880 // Index Bit set count
5881 // [ 0000 -> 0,
5882 // 0001 -> 1,
5883 // 0010 -> 1,
5884 // 0011 -> 2,
5885 // 0100 -> 1,
5886 // 0101 -> 2,
5887 // 0110 -> 2,
5888 // 0111 -> 3,
5889 // 1000 -> 1,
5890 // 1001 -> 2,
5891 // 1010 -> 3,
5892 // 1011 -> 3,
5893 // 1100 -> 2,
5894 // 1101 -> 3,
5895 // 1111 -> 4 ]
5896 // a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as
5897 // shuffle indices for lookup table access.
5898 // b. Right shift each byte of vector lane by 4 positions.
5899 // c. Count the number of 1s in 4 MSB bits each byte. These bits are used as
5900 // shuffle indices for lookup table access.
5901 // d. Add the bitset count of upper and lower 4 bits of each byte.
5902 // e. Unpack double words to quad words and compute sum of absolute difference of bitset
5903 // count of all the bytes of a quadword.
5904 // f. Perform step e. for upper 128bit vector lane.
5905 // g. Pack the bitset count of quadwords back to double word.
5906 // h. Unpacking and packing operations are not needed for 64bit vector lane.
5907
// Byte-lane popcount via the 4-bit lookup table (steps a-d of the algorithm
// described above): shuffle-based table lookups for the low and high nibbles
// of each byte, then add the two partial counts. Clobbers xtmp1 and xtmp2.
void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
  assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
  // 0x0F masks select one nibble per lookup.
  vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);
  vpsrlw(dst, src, 4, vec_enc);
  vpand(dst, dst, xtmp1, vec_enc);          // high nibbles
  vpand(xtmp1, src, xtmp1, vec_enc);        // low nibbles
  vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg);
  vpshufb(xtmp1, xtmp2, xtmp1, vec_enc);    // LUT[low nibble]
  vpshufb(dst, xtmp2, dst, vec_enc);        // LUT[high nibble]
  vpaddb(dst, dst, xtmp1, vec_enc);         // per-byte popcount
}
5920
// Int-lane popcount: byte popcounts are summed per doubleword using the
// unpack + vpsadbw + repack sequence (steps e-h of the algorithm above).
void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                            XMMRegister xtmp2, Register rtmp, int vec_enc) {
  vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
  // Following code is as per steps e,f,g and h of above algorithm.
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  vpunpckhdq(dst, xtmp1, xtmp2, vec_enc);
  vpsadbw(dst, dst, xtmp2, vec_enc);
  vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc);
  vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc);
  vpackuswb(dst, xtmp1, dst, vec_enc);
}
5932
// Short-lane popcount: per-byte popcounts are combined by adding the high
// byte of each word (shifted down) to its masked low byte.
void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
  vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
  // Add the popcount of upper and lower bytes of word.
  vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc);
  vpsrlw(dst, xtmp1, 8, vec_enc);
  vpand(xtmp1, xtmp1, xtmp2, vec_enc);
  vpaddw(dst, dst, xtmp1, vec_enc);
}
5942
// Long-lane popcount: vpsadbw against zero sums the eight per-byte popcounts
// of each quadword in a single instruction.
void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
  vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  vpsadbw(dst, xtmp1, xtmp2, vec_enc);
}
5949
// Dispatches the LUT-based vector popcount to the per-element-width helper.
// Used when the native AVX512 popcount instructions are unavailable.
void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                 XMMRegister xtmp2, Register rtmp, int vec_enc) {
  switch(bt) {
    case T_LONG:
      vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
      break;
    case T_INT:
      vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
      break;
    case T_CHAR:
    case T_SHORT:
      vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
      break;
    case T_BYTE:
    case T_BOOLEAN:
      vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
}
5972
// Masked vector popcount using the native AVX512 instructions:
// VPOPCNTD/Q require AVX512_VPOPCNTDQ, VPOPCNTB/W require AVX512_BITALG.
void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
                                                      KRegister mask, bool merge, int vec_enc) {
  assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
  switch(bt) {
    case T_LONG:
      assert(VM_Version::supports_avx512_vpopcntdq(), "");
      evpopcntq(dst, mask, src, merge, vec_enc);
      break;
    case T_INT:
      assert(VM_Version::supports_avx512_vpopcntdq(), "");
      evpopcntd(dst, mask, src, merge, vec_enc);
      break;
    case T_CHAR:
    case T_SHORT:
      assert(VM_Version::supports_avx512_bitalg(), "");
      evpopcntw(dst, mask, src, merge, vec_enc);
      break;
    case T_BYTE:
    case T_BOOLEAN:
      assert(VM_Version::supports_avx512_bitalg(), "");
      evpopcntb(dst, mask, src, merge, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
}
6000
6001 // Bit reversal algorithm first reverses the bits of each byte followed by
6002 // a byte level reversal for multi-byte primitive types (short/int/long).
6003 // Algorithm performs a lookup table access to get reverse bit sequence
6004 // corresponding to a 4 bit value. Thus a reverse bit sequence for a byte
6005 // is obtained by swapping the reverse bit sequences of upper and lower
6006 // nibble of a byte.
// Reverses the bit order within each element of type bt (see the algorithm
// comment above). Three code shapes: AVX512VL+BW lookup-table path, a
// shift-based path for plain 512-bit vectors, and an AVX/AVX2 lookup-table
// path. Clobbers xtmp1, xtmp2 and rtmp.
void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                           XMMRegister xtmp2, Register rtmp, int vec_enc) {
  if (VM_Version::supports_avx512vlbw()) {

    // Get the reverse bit sequence of lower nibble of each byte.
    vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
    vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
    evpandq(dst, xtmp2, src, vec_enc);
    vpshufb(dst, xtmp1, dst, vec_enc);
    vpsllq(dst, dst, 4, vec_enc);

    // Get the reverse bit sequence of upper nibble of each byte.
    vpandn(xtmp2, xtmp2, src, vec_enc);
    vpsrlq(xtmp2, xtmp2, 4, vec_enc);
    vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);

    // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
    // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
    evporq(xtmp2, dst, xtmp2, vec_enc);
    vector_reverse_byte(bt, dst, xtmp2, vec_enc);

  } else if(vec_enc == Assembler::AVX_512bit) {
    // Shift based bit reversal.
    assert(bt == T_LONG || bt == T_INT, "");

    // Swap lower and upper nibble of each byte.
    vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc);

    // Swap two least and most significant bits of each nibble.
    vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc);

    // Swap adjacent pair of bits.
    evmovdqul(xtmp1, k0, dst, true, vec_enc);
    vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc);

    // Finally reverse the byte order within each element.
    evmovdqul(xtmp1, k0, dst, true, vec_enc);
    vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);
  } else {
    // AVX/AVX2 lookup-table path (rtmp needed to reach the LUT).
    vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp);
    vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);

    // Get the reverse bit sequence of lower nibble of each byte.
    vpand(dst, xtmp2, src, vec_enc);
    vpshufb(dst, xtmp1, dst, vec_enc);
    vpsllq(dst, dst, 4, vec_enc);

    // Get the reverse bit sequence of upper nibble of each byte.
    vpandn(xtmp2, xtmp2, src, vec_enc);
    vpsrlq(xtmp2, xtmp2, 4, vec_enc);
    vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);

    // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
    // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
    vpor(xtmp2, dst, xtmp2, vec_enc);
    vector_reverse_byte(bt, dst, xtmp2, vec_enc);
  }
}
6064
// Bit reversal within each byte using a single GF2P8AFFINEQB with the
// bit-reversal matrix (broadcast from `mask`), followed by a byte-order
// reversal for multi-byte element types. Clobbers xtmp.
void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc,
                                                XMMRegister xtmp, Register rscratch) {
  assert(VM_Version::supports_gfni(), "");
  assert(rscratch != noreg || always_reachable(mask), "missing");

  // Galois field instruction based bit reversal based on following algorithm.
  // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
  vpbroadcastq(xtmp, mask, vec_enc, rscratch);
  vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
  vector_reverse_byte(bt, dst, xtmp, vec_enc);
}
6076
// Swaps adjacent 'nbits'-wide bit groups within each qword:
//   dst = ((src & bitmask) << nbits) | ((src & ~bitmask) >> nbits)
// 'bitmask' is a 32-bit pattern broadcast across the vector and selects the
// low group of each adjacent pair. xtmp1 and rtmp are clobbered as scratch.
void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
                                          XMMRegister xtmp1, Register rtmp, int vec_enc) {
  // Broadcast the group-selection mask.
  vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
  // Selected (low) groups shifted up into the high position.
  evpandq(dst, xtmp1, src, vec_enc);
  vpsllq(dst, dst, nbits, vec_enc);
  // Remaining (high) groups shifted down into the low position.
  vpandn(xtmp1, xtmp1, src, vec_enc);
  vpsrlq(xtmp1, xtmp1, nbits, vec_enc);
  // Combine both halves.
  evporq(dst, dst, xtmp1, vec_enc);
}
6086
// Reverses the byte order of every element of type 'bt' (EVEX targets).
// The reversal is decomposed into swaps of decreasing granularity:
// dwords within qwords, words within dwords, bytes within words.
// xtmp1, xtmp2 and rtmp are clobbered as scratch.
void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
  // Shift based bit reversal.
  assert(VM_Version::supports_evex(), "");
  switch(bt) {
    case T_LONG:
      // Swap upper and lower double word of each quad word.
      evprorq(xtmp1, k0, src, 32, true, vec_enc);
      // Swap upper and lower word of each double word.
      evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);
      // Swap upper and lower byte of each word.
      vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
      break;
    case T_INT:
      // Swap upper and lower word of each double word.
      evprord(xtmp1, k0, src, 16, true, vec_enc);
      // Swap upper and lower byte of each word.
      vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
      break;
    case T_CHAR:
    case T_SHORT:
      // Swap upper and lower byte of each word.
      vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc);
      break;
    case T_BYTE:
      // Byte reversal of single-byte lanes is the identity; just copy.
      evmovdquq(dst, k0, src, true, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
}
6116
// Reverses the byte order within each element of type 'bt'. Multi-byte
// elements are reversed with a single vpshufb driven by a pre-computed,
// per-type shuffle table from the stub routines (which must be reachable
// without a scratch register — noreg is passed below).
void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) {
  if (bt == T_BYTE) {
    // Byte reversal of single-byte lanes is the identity; just copy src to dst.
    if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
      evmovdquq(dst, k0, src, true, vec_enc);
    } else {
      vmovdqu(dst, src);
    }
    return;
  }
  // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
  // pre-computed shuffle indices.
  switch(bt) {
    case T_LONG:
      vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg);
      break;
    case T_INT:
      vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg);
      break;
    case T_CHAR:
    case T_SHORT:
      vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
  // dst currently holds the shuffle indices; permute src's bytes with them.
  vpshufb(dst, src, dst, vec_enc);
}
6145
// Per-element leading zero count for EVEX targets, built on the AVX512CD
// vplzcnt{d,q} instructions. Sub-dword types have no direct lzcnt instruction,
// so shorts are widened to dwords (padded with ones below) and bytes use a
// nibble lookup table. ktmp and 'merge' are forwarded to the masked forms.
void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
                                                        KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
  assert(is_integral_type(bt), "");
  assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
  assert(VM_Version::supports_avx512cd(), "");
  switch(bt) {
    case T_LONG:
      evplzcntq(dst, ktmp, src, merge, vec_enc);
      break;
    case T_INT:
      evplzcntd(dst, ktmp, src, merge, vec_enc);
      break;
    case T_SHORT:
      // xtmp1 = all ones.
      vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);
      // Pair each low-half short with 0xFFFF in the low word of a dword so
      // that lzcnt32(dword) == lzcnt16(short), capped at 16 for zero shorts.
      vpunpcklwd(xtmp2, xtmp1, src, vec_enc);
      evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
      // Same treatment for the high-half shorts.
      vpunpckhwd(dst, xtmp1, src, vec_enc);
      evplzcntd(dst, ktmp, dst, merge, vec_enc);
      // Counts are at most 16, so unsigned dword->word packing is lossless.
      vpackusdw(dst, xtmp2, dst, vec_enc);
      break;
    case T_BYTE:
      // T1 = Compute leading zero counts of 4 LSB bits of each byte by
      // accessing the lookup table.
      // T2 = Compute leading zero counts of 4 MSB bits of each byte by
      // accessing the lookup table.
      // Add T1 to T2 if 4 MSB bits of byte are all zeros.
      assert(VM_Version::supports_avx512bw(), "");
      evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
      vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
      // T1 (in xtmp2): look up the low nibble of each byte.
      vpand(xtmp2, dst, src, vec_enc);
      vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
      // T2 (in dst): look up the high nibble of each byte.
      vpsrlw(xtmp3, src, 4, vec_enc);
      vpand(xtmp3, dst, xtmp3, vec_enc);
      vpshufb(dst, xtmp1, xtmp3, vec_enc);
      // Where the high nibble is zero, add in the low-nibble count.
      vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
      evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);
      evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
}
6190
// Per-byte leading zero count for AVX (non-EVEX) targets, using a 16-entry
// nibble lookup table. On return xtmp1 holds all zeros — the short flavor
// (vector_count_leading_zeros_short_avx) relies on that.
void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                            XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
  vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
  // T1 = Compute leading zero counts of 4 LSB bits of each byte by
  // accessing the lookup table.
  vpand(dst, xtmp2, src, vec_enc);
  vpshufb(dst, xtmp1, dst, vec_enc);
  // T2 = Compute leading zero counts of 4 MSB bits of each byte by
  // accessing the lookup table.
  vpsrlw(xtmp3, src, 4, vec_enc);
  vpand(xtmp3, xtmp2, xtmp3, vec_enc);
  vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
  // Add T1 to T2 if 4 MSB bits of byte are all zeros.
  vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
  // xtmp3 = mask of bytes whose high nibble is zero.
  vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
  vpaddb(dst, dst, xtmp2, vec_enc);
  // Select T1+T2 where the high nibble is zero, otherwise just T2.
  vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
}
6210
// Per-word (16-bit) leading zero count for AVX targets, layered on the byte
// flavor: lzcnt16 = (hi byte == 0) ? lzcnt8(hi) + lzcnt8(lo) : lzcnt8(hi).
// xtmp1-xtmp3 and rtmp are clobbered as scratch.
void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  // dst = per-byte leading zero counts.
  vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
  // Add zero counts of lower byte and upper byte of a word if
  // upper byte holds a zero value.
  vpsrlw(xtmp3, src, 8, vec_enc);
  // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
  // xtmp3 = mask of words whose upper byte is zero.
  vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
  // High byte of xtmp2 = lo-byte count + hi-byte count.
  vpsllw(xtmp2, dst, 8, vec_enc);
  vpaddw(xtmp2, xtmp2, dst, vec_enc);
  // Take the summed counts for zero-upper-byte words, then extract the
  // final count from the high byte of each word.
  vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
  vpsrlw(dst, dst, 8, vec_enc);
}
6224
// Per-dword leading zero count for AVX targets without vplzcntd: the int is
// converted to float and the biased exponent encodes the position of the
// highest set bit; zero and negative inputs are patched up with blends.
// xtmp1-xtmp3 are clobbered as scratch.
void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                           XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
  // By converting the integer to a float, we can obtain the number of leading zeros based on the exponent of the float.
  // As the float exponent contains a bias of 127 for nonzero values, the bias must be removed before interpreting the
  // exponent as the leading zero count.

  // Remove the bit to the right of the highest set bit ensuring that the conversion to float cannot round up to a higher
  // power of 2, which has a higher exponent than the input. This transformation is valid as only the highest set bit
  // contributes to the leading number of zeros.
  vpsrld(dst, src, 1, vec_enc);
  vpandn(dst, dst, src, vec_enc);

  vcvtdq2ps(dst, dst, vec_enc);

  // By comparing the register to itself, all the bits in the destination are set.
  vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);

  // Move the biased exponent to the low end of the lane and mask with 0xFF to discard the sign bit.
  // xtmp2 = 0xFF in every lane (all-ones >>> 24).
  vpsrld(xtmp2, xtmp1, 24, vec_enc);
  vpsrld(dst, dst, 23, vec_enc);
  vpand(dst, xtmp2, dst, vec_enc);

  // Subtract 127 from the exponent, which removes the bias from the exponent.
  // xtmp2 = 127 in every lane (all-ones >>> 25).
  vpsrld(xtmp2, xtmp1, 25, vec_enc);
  vpsubd(dst, dst, xtmp2, vec_enc);

  // xtmp2 = 31 in every lane (all-ones >>> 27).
  vpsrld(xtmp2, xtmp1, 27, vec_enc);

  // If the original value is 0 the exponent would not have bias, so the subtraction creates a negative number. If this
  // is found in any of the lanes, replace the lane with -1 from xtmp1.
  vblendvps(dst, dst, xtmp1, dst, vec_enc, true, xtmp3);

  // If the original value is negative, replace the lane with 31.
  vblendvps(dst, dst, xtmp2, src, vec_enc, true, xtmp3);

  // Subtract the exponent from 31, giving the final result. For 0, the result is 32 as the exponent was replaced with -1,
  // and for negative numbers the result is 0 as the exponent was replaced with 31.
  vpsubd(dst, xtmp2, dst, vec_enc);
}
6264
// Per-qword leading zero count for AVX targets, composed from the dword
// flavor: lzcnt64 = (lzcnt32(hi) == 32) ? 32 + lzcnt32(lo) : lzcnt32(hi).
// NOTE(review): rtmp is unused here; presumably kept for signature symmetry
// with the other count_leading_zeros helpers — confirm.
void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                            XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  // Find the leading zeros of the top and bottom halves of the long individually.
  vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);

  // Move the top half result to the bottom half of xtmp1, setting the top half to 0.
  vpsrlq(xtmp1, dst, 32, vec_enc);
  // By moving the top half result to the right by 6 bits, if the top half was empty (i.e. 32 is returned) the result bit will
  // be in the most significant position of the bottom half.
  vpsrlq(xtmp2, dst, 6, vec_enc);

  // In the bottom half, add the top half and bottom half results.
  vpaddq(dst, xtmp1, dst, vec_enc);

  // For the bottom half, choose between the values using the most significant bit of xtmp2.
  // If the MSB is set, then bottom+top in dst is the resulting value. If the top half is less than 32 xtmp1 is chosen,
  // which contains only the top half result.
  // In the top half the MSB is always zero, so the value in xtmp1 is always chosen. This value is always 0, which clears
  // the lane as required.
  vblendvps(dst, xtmp1, dst, xtmp2, vec_enc, true, xtmp3);
}
6286
6287 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
6288 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
6289 Register rtmp, int vec_enc) {
6290 assert(is_integral_type(bt), "unexpected type");
6291 assert(vec_enc < Assembler::AVX_512bit, "");
6292 switch(bt) {
6293 case T_LONG:
6294 vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6295 break;
6296 case T_INT:
6297 vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
6298 break;
6299 case T_SHORT:
6300 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6301 break;
6302 case T_BYTE:
6303 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
6304 break;
6305 default:
6306 fatal("Unsupported type %s", type2name(bt));
6307 break;
6308 }
6309 }
6310
6311 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
6312 switch(bt) {
6313 case T_BYTE:
6314 vpsubb(dst, src1, src2, vec_enc);
6315 break;
6316 case T_SHORT:
6317 vpsubw(dst, src1, src2, vec_enc);
6318 break;
6319 case T_INT:
6320 vpsubd(dst, src1, src2, vec_enc);
6321 break;
6322 case T_LONG:
6323 vpsubq(dst, src1, src2, vec_enc);
6324 break;
6325 default:
6326 fatal("Unsupported type %s", type2name(bt));
6327 break;
6328 }
6329 }
6330
// Trailing zero count computation is based on leading zero count operation as per
// following equation. All AVX3 targets support AVX512CD feature which offers
// direct vector instruction to compute leading zero count.
// CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
// ((x - 1) & ~x) turns the trailing zeros of x into a block of ones at the
// bottom of the lane (and nothing else), so WIDTH - CLZ of it is the count.
// xtmp1-xtmp4, ktmp and rtmp are clobbered as scratch.
void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
                                                         XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "");
  // xtmp = -1
  vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
  // xtmp = xtmp + src
  vpadd(bt, xtmp4, xtmp4, src, vec_enc);
  // xtmp = xtmp & ~src
  // (ternary truth table 0x40 is A & B & ~C; A and B are both xtmp4 here)
  vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
  vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
  // dst = element width in bits - CLZ
  vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
  vpsub(bt, dst, xtmp4, dst, vec_enc);
}
6349
6350 // Trailing zero count computation for AVX2 targets is based on popcount operation as per following equation
6351 // CTZ = PRIM_TYPE_WIDHT - POPC(x | -x)
6352 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
6353 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
6354 assert(is_integral_type(bt), "");
6355 // xtmp = 0
6356 vpxor(xtmp3 , xtmp3, xtmp3, vec_enc);
6357 // xtmp = 0 - src
6358 vpsub(bt, xtmp3, xtmp3, src, vec_enc);
6359 // xtmp = xtmp | src
6360 vpor(xtmp3, xtmp3, src, vec_enc);
6361 vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
6362 vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
6363 vpsub(bt, dst, xtmp1, dst, vec_enc);
6364 }
6365
// Emits an unsigned 32-bit division: rax = rax /u divisor; rdx is clobbered.
// divl raises #DE on a zero divisor — presumably guarded by the caller.
void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // Common case: top bit of divisor clear, use the hardware unsigned divide.
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
  // With the divisor's top bit set, the unsigned quotient can only be 0 or 1,
  // so it is computed branch-free without a divide instruction.
  movl(rdx, rax);
  subl(rdx, divisor);
  if (VM_Version::supports_bmi1()) {
    // rax = rax & ~rdx in a single instruction.
    andnl(rax, rdx, rax);
  } else {
    notl(rdx);
    andl(rax, rdx);
  }
  // Quotient is the sign bit of the intermediate value.
  shrl(rax, 31);
  bind(done);
}
6389
// Emits an unsigned 32-bit remainder: rdx = rax %u divisor; rax is clobbered.
// divl raises #DE on a zero divisor — presumably guarded by the caller.
void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // Common case: top bit of divisor clear, use the hardware unsigned divide.
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath when divisor < 0:
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
  // With the divisor's top bit set the quotient is 0 or 1, so the remainder
  // is either the dividend itself or dividend - divisor.
  movl(rdx, rax);
  subl(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    // rax = ~rax & rdx in a single instruction.
    andnl(rax, rax, rdx);
  } else {
    notl(rax);
    andl(rax, rdx);
  }
  // Broadcast the quotient bit, then conditionally subtract the divisor.
  sarl(rax, 31);
  andl(rax, divisor);
  subl(rdx, rax);
  bind(done);
}
6415
// Emits a combined unsigned 32-bit divide and remainder:
// quotient -> rax, remainder -> rdx; tmp is clobbered as scratch.
void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
  Label done;
  Label neg_divisor_fastpath;

  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // Common case: hardware divide leaves quotient in rax and remainder in rdx.
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
  // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
  movl(rdx, rax);
  subl(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rax, rdx);
  } else {
    notl(rax);
    andl(rax, rdx);
  }
  movl(tmp, rax);
  shrl(rax, 31); // quotient
  // tmp becomes all ones when the quotient is 1, selecting the divisor below.
  sarl(tmp, 31);
  andl(tmp, divisor);
  subl(rdx, tmp); // remainder
  bind(done);
}
6446
// Reverses the order of the 32 bits of src into dst. rtmp is clobbered;
// xtmp1/xtmp2 are clobbered only on GFNI-capable hardware.
void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
                                 XMMRegister xtmp2, Register rtmp) {
  if(VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal based on following algorithm.
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
    // The affine transform with the mirror matrix reverses the bits of each byte.
    mov64(rtmp, 0x8040201008040201L);
    movq(xtmp1, src);
    movq(xtmp2, rtmp);
    gf2p8affineqb(xtmp1, xtmp2, 0);
    movq(dst, xtmp1);
  } else {
    // Classic log-time reversal: swap bit groups of width 1, 2 and 4.
    // Swap even and odd numbered bits.
    movl(rtmp, src);
    andl(rtmp, 0x55555555);
    shll(rtmp, 1);
    movl(dst, src);
    andl(dst, 0xAAAAAAAA);
    shrl(dst, 1);
    orl(dst, rtmp);

    // Swap LSB and MSB 2 bits of each nibble.
    movl(rtmp, dst);
    andl(rtmp, 0x33333333);
    shll(rtmp, 2);
    andl(dst, 0xCCCCCCCC);
    shrl(dst, 2);
    orl(dst, rtmp);

    // Swap LSB and MSB 4 bits of each byte.
    movl(rtmp, dst);
    andl(rtmp, 0x0F0F0F0F);
    shll(rtmp, 4);
    andl(dst, 0xF0F0F0F0);
    shrl(dst, 4);
    orl(dst, rtmp);
  }
  // Both paths have reversed the bits within each byte; reversing the byte
  // order completes the full 32-bit bit reversal.
  bswapl(dst);
}
6485
// Reverses the order of the 64 bits of src into dst. rtmp1/rtmp2 are
// clobbered; xtmp1/xtmp2 are clobbered only on GFNI-capable hardware.
void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
                                 XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
  if(VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal based on following algorithm.
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
    // The affine transform with the mirror matrix reverses the bits of each byte.
    mov64(rtmp1, 0x8040201008040201L);
    movq(xtmp1, src);
    movq(xtmp2, rtmp1);
    gf2p8affineqb(xtmp1, xtmp2, 0);
    movq(dst, xtmp1);
  } else {
    // Classic log-time reversal: swap bit groups of width 1, 2 and 4.
    // Each complement mask is derived with notq to save a second mov64.
    // Swap even and odd numbered bits.
    movq(rtmp1, src);
    mov64(rtmp2, 0x5555555555555555L);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 1);
    movq(dst, src);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 1);
    orq(dst, rtmp1);

    // Swap LSB and MSB 2 bits of each nibble.
    movq(rtmp1, dst);
    mov64(rtmp2, 0x3333333333333333L);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 2);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 2);
    orq(dst, rtmp1);

    // Swap LSB and MSB 4 bits of each byte.
    movq(rtmp1, dst);
    mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 4);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 4);
    orq(dst, rtmp1);
  }
  // Both paths have reversed the bits within each byte; reversing the byte
  // order completes the full 64-bit bit reversal.
  bswapq(dst);
}
6530
// Emits an unsigned 64-bit division: rax = rax /u divisor; rdx is clobbered.
// divq raises #DE on a zero divisor — presumably guarded by the caller.
void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // Common case: top bit of divisor clear, use the hardware unsigned divide.
  xorl(rdx, rdx);
  divq(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
  // With the divisor's top bit set, the unsigned quotient can only be 0 or 1,
  // so it is computed branch-free without a divide instruction.
  movq(rdx, rax);
  subq(rdx, divisor);
  if (VM_Version::supports_bmi1()) {
    // rax = rax & ~rdx in a single instruction.
    andnq(rax, rdx, rax);
  } else {
    notq(rdx);
    andq(rax, rdx);
  }
  // Quotient is the sign bit of the intermediate value.
  shrq(rax, 63);
  bind(done);
}
6554
// Emits an unsigned 64-bit remainder: rdx = rax %u divisor; rax is clobbered.
// divq raises #DE on a zero divisor — presumably guarded by the caller.
void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // Common case: top bit of divisor clear, use the hardware unsigned divide.
  xorq(rdx, rdx);
  divq(divisor);
  jmp(done);
  bind(neg_divisor_fastpath);
  // Fastpath when divisor < 0:
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
  // With the divisor's top bit set the quotient is 0 or 1, so the remainder
  // is either the dividend itself or dividend - divisor.
  movq(rdx, rax);
  subq(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    // rax = ~rax & rdx in a single instruction.
    andnq(rax, rax, rdx);
  } else {
    notq(rax);
    andq(rax, rdx);
  }
  // Broadcast the quotient bit, then conditionally subtract the divisor.
  sarq(rax, 63);
  andq(rax, divisor);
  subq(rdx, rax);
  bind(done);
}
6580
// Emits a combined unsigned 64-bit divide and remainder:
// quotient -> rax, remainder -> rdx; tmp is clobbered as scratch.
void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  // Common case: hardware divide leaves quotient in rax and remainder in rdx.
  xorq(rdx, rdx);
  divq(divisor);
  jmp(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
  // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
  movq(rdx, rax);
  subq(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rax, rdx);
  } else {
    notq(rax);
    andq(rax, rdx);
  }
  movq(tmp, rax);
  shrq(rax, 63); // quotient
  // tmp becomes all ones when the quotient is 1, selecting the divisor below.
  sarq(tmp, 63);
  andq(tmp, divisor);
  subq(rdx, tmp); // remainder
  bind(done);
}
6610
// Emulates a cross-lane byte rearrange of a 512-bit vector (dst[i] =
// src[shuffle[i]] for indices 0..63). vpshufb only permutes within 128-bit
// lanes, so each of the four source lanes is broadcast across the vector in
// turn and merged into dst under an opmask selecting the shuffle indices that
// address that lane. xtmp1-xtmp3, rtmp and ktmp are clobbered as scratch.
void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
                                        XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
                                        int vlen_enc) {
  assert(VM_Version::supports_avx512bw(), "");
  // Byte shuffles are inlane operations and indices are determined using
  // lower 4 bit of each shuffle lane, thus all shuffle indices are
  // normalized to index range 0-15. This makes sure that all the multiples
  // of an index value are placed at same relative position in 128 bit
  // lane i.e. elements corresponding to shuffle indices 16, 32 and 64
  // will be 16th element in their respective 128 bit lanes.
  // xtmp1 = 16 in every byte.
  movl(rtmp, 16);
  evpbroadcastb(xtmp1, rtmp, vlen_enc);

  // Compute a mask for shuffle vector by comparing indices with expression INDEX < 16,
  // Broadcast first 128 bit lane across entire vector, shuffle the vector lanes using
  // original shuffle indices and move the shuffled lanes corresponding to true
  // mask to destination vector.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
  evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);

  // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
  // and broadcasting second 128 bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
  // xtmp2 = 32 in every byte (each 0x10 byte doubles with no cross-byte carry).
  vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
  // and broadcasting third 128 bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp2, Assembler::nlt, true, vlen_enc);
  // xtmp1 = 48 in every byte.
  vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
  // and broadcasting fourth 128 bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
  // xtmp2 = 64 in every byte.
  vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
}
6656
6657 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
6658 XMMRegister shuffle, XMMRegister src, int vlen_enc) {
6659 if (vlen_enc == AVX_128bit) {
6660 vpermilps(dst, src, shuffle, vlen_enc);
6661 } else if (bt == T_INT) {
6662 vpermd(dst, shuffle, src, vlen_enc);
6663 } else {
6664 assert(bt == T_FLOAT, "");
6665 vpermps(dst, shuffle, src, vlen_enc);
6666 }
6667 }
6668
6669 void C2_MacroAssembler::efp16sh(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
6670 switch(opcode) {
6671 case Op_AddHF: vaddsh(dst, src1, src2); break;
6672 case Op_SubHF: vsubsh(dst, src1, src2); break;
6673 case Op_MulHF: vmulsh(dst, src1, src2); break;
6674 case Op_DivHF: vdivsh(dst, src1, src2); break;
6675 default: assert(false, "%s", NodeClassNames[opcode]); break;
6676 }
6677 }
6678
6679 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6680 switch(elem_bt) {
6681 case T_BYTE:
6682 if (ideal_opc == Op_SaturatingAddV) {
6683 vpaddsb(dst, src1, src2, vlen_enc);
6684 } else {
6685 assert(ideal_opc == Op_SaturatingSubV, "");
6686 vpsubsb(dst, src1, src2, vlen_enc);
6687 }
6688 break;
6689 case T_SHORT:
6690 if (ideal_opc == Op_SaturatingAddV) {
6691 vpaddsw(dst, src1, src2, vlen_enc);
6692 } else {
6693 assert(ideal_opc == Op_SaturatingSubV, "");
6694 vpsubsw(dst, src1, src2, vlen_enc);
6695 }
6696 break;
6697 default:
6698 fatal("Unsupported type %s", type2name(elem_bt));
6699 break;
6700 }
6701 }
6702
6703 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6704 switch(elem_bt) {
6705 case T_BYTE:
6706 if (ideal_opc == Op_SaturatingAddV) {
6707 vpaddusb(dst, src1, src2, vlen_enc);
6708 } else {
6709 assert(ideal_opc == Op_SaturatingSubV, "");
6710 vpsubusb(dst, src1, src2, vlen_enc);
6711 }
6712 break;
6713 case T_SHORT:
6714 if (ideal_opc == Op_SaturatingAddV) {
6715 vpaddusw(dst, src1, src2, vlen_enc);
6716 } else {
6717 assert(ideal_opc == Op_SaturatingSubV, "");
6718 vpsubusw(dst, src1, src2, vlen_enc);
6719 }
6720 break;
6721 default:
6722 fatal("Unsupported type %s", type2name(elem_bt));
6723 break;
6724 }
6725 }
6726
// Unsigned saturating subtract for int/long lanes on EVEX targets:
// dst = (src1 <u src2) ? 0 : src1 - src2. ktmp is clobbered.
void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                               XMMRegister src2, KRegister ktmp, int vlen_enc) {
  // For unsigned subtraction, overflow happens when magnitude of second input is greater than first input.
  // overflow_mask = Inp1 <u Inp2
  // NOTE(review): with the operand order below, ktmp appears to hold the
  // strict no-underflow mask (src2 <u src1); the zero-masked subtract then
  // writes src1 - src2 only into those lanes and zeroes the rest. Equal
  // inputs also produce zero, matching the formula above — confirm.
  evpcmpu(elem_bt, ktmp, src2, src1, Assembler::lt, vlen_enc);
  // Res = overflow_mask ? Zero : INP1 - INP2 (non-commutative and non-associative)
  evmasked_op(elem_bt == T_INT ? Op_SubVI : Op_SubVL, elem_bt, ktmp, dst, src1, src2, false, vlen_enc, false);
}
6735
// Unsigned saturating subtract for int/long lanes on AVX targets (no opmask):
// dst = (src1 <u src2) ? 0 : src1 - src2. xtmp1/xtmp2 are clobbered.
void C2_MacroAssembler::vector_sub_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                                              XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  // Emulate unsigned comparison using signed comparison
  // Mask = Inp1 <u Inp2 => Inp1 + MIN_VALUE < Inp2 + MIN_VALUE
  // xtmp1 = MIN_VALUE broadcast to every lane.
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc, true);
  vpadd(elem_bt, xtmp2, src1, xtmp1, vlen_enc);
  vpadd(elem_bt, xtmp1, src2, xtmp1, vlen_enc);

  // xtmp2 = underflow mask: (src2 + MIN) >s (src1 + MIN), i.e. src1 <u src2.
  vpcmpgt(elem_bt, xtmp2, xtmp1, xtmp2, vlen_enc);

  // Res = INP1 - INP2 (non-commutative and non-associative)
  vpsub(elem_bt, dst, src1, src2, vlen_enc);
  // Res = Mask ? Zero : Res
  vpxor(xtmp1, xtmp1, xtmp1, vlen_enc);
  vpblendvb(dst, dst, xtmp1, xtmp2, vlen_enc);
}
6752
// Unsigned saturating add for int/long lanes on EVEX targets:
// dst = src1 + src2, with overflowing lanes clamped to all ones (unsigned
// max). xtmp1/xtmp2 and ktmp are clobbered.
void C2_MacroAssembler::vector_add_dq_saturating_unsigned_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                                               XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp, int vlen_enc) {
  // Unsigned values ranges comprise of only +ve numbers, thus there exist only an upper bound saturation.
  // overflow_mask = (SRC1 + SRC2) <u (SRC1 | SRC2)
  // Res = Signed Add INP1, INP2
  vpadd(elem_bt, dst, src1, src2, vlen_enc);
  // T1 = SRC1 | SRC2
  vpor(xtmp1, src1, src2, vlen_enc);
  // Max_Unsigned = -1
  vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
  // Unsigned compare: Mask = Res <u T1
  evpcmpu(elem_bt, ktmp, dst, xtmp1, Assembler::lt, vlen_enc);
  // res = Mask ? Max_Unsigned : Res
  evpblend(elem_bt, dst, ktmp, dst, xtmp2, true, vlen_enc);
}
6768
6769 //
6770 // Section 2-13 Hacker's Delight list following overflow detection check for saturating
6771 // unsigned addition operation.
6772 // overflow_mask = ((a & b) | ((a | b) & ~( a + b))) >>> 31 == 1
6773 //
6774 // We empirically determined its semantic equivalence to following reduced expression
6775 // overflow_mask = (a + b) <u (a | b)
6776 //
6777 // and also verified it though Alive2 solver.
6778 // (https://alive2.llvm.org/ce/z/XDQ7dY)
6779 //
6780
// Unsigned saturating add for int/long lanes on AVX targets (no opmask):
// dst = src1 + src2, with overflowing lanes clamped to all ones. The overflow
// check (a + b) <u (a | b) — see the note above — is evaluated as a signed
// compare after biasing both sides by MIN_VALUE. xtmp1-xtmp3 are clobbered.
void C2_MacroAssembler::vector_add_dq_saturating_unsigned_avx(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                                              XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, int vlen_enc) {
  // Res = Signed Add INP1, INP2
  vpadd(elem_bt, dst, src1, src2, vlen_enc);
  // Compute T1 = INP1 | INP2
  vpor(xtmp3, src1, src2, vlen_enc);
  // xtmp2 = MIN_VALUE; xtmp1 = all ones (the unsigned max, kept as the
  // saturation value for the final blend).
  vpgenmin_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
  // Convert T1 to signed value, T1 = T1 + MIN_VALUE
  vpadd(elem_bt, xtmp3, xtmp3, xtmp2, vlen_enc);
  // Convert Res to signed value, Res<s> = Res + MIN_VALUE
  vpadd(elem_bt, xtmp2, xtmp2, dst, vlen_enc);
  // Compute overflow detection mask = Res<s> <s T1
  if (elem_bt == T_INT) {
    vpcmpgtd(xtmp3, xtmp3, xtmp2, vlen_enc);
  } else {
    assert(elem_bt == T_LONG, "");
    vpcmpgtq(xtmp3, xtmp3, xtmp2, vlen_enc);
  }
  // Saturate overflowing lanes to all ones.
  vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
}
6802
// Sets ktmp to the per-qword sign bits of src, emulating evpmovq2m on targets
// without AVX512DQ. On the emulation path xtmp1 is clobbered, and xtmp2 is
// set to all ones unless the caller promises (xtmp2_hold_M1) that it already
// holds -1.
void C2_MacroAssembler::evpmovq2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                      int vlen_enc, bool xtmp2_hold_M1) {
  if (VM_Version::supports_avx512dq()) {
    evpmovq2m(ktmp, src, vlen_enc);
  } else {
    assert(VM_Version::supports_evex(), "");
    if (!xtmp2_hold_M1) {
      // xtmp2 = -1 (all ones) for the equality compare below.
      vpternlogq(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
    }
    // Arithmetic shift replicates each qword's sign bit across the lane ...
    evpsraq(xtmp1, src, 63, vlen_enc);
    // ... so the lanes equal to -1 are exactly the negative ones.
    evpcmpeqq(ktmp, k0, xtmp1, xtmp2, vlen_enc);
  }
}
6816
// Sets ktmp to the per-dword sign bits of src, emulating evpmovd2m on targets
// without AVX512DQ. On the emulation path xtmp1 is clobbered, and xtmp2 is
// set to all ones unless the caller promises (xtmp2_hold_M1) that it already
// holds -1.
void C2_MacroAssembler::evpmovd2m_emu(KRegister ktmp, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                      int vlen_enc, bool xtmp2_hold_M1) {
  if (VM_Version::supports_avx512dq()) {
    evpmovd2m(ktmp, src, vlen_enc);
  } else {
    assert(VM_Version::supports_evex(), "");
    if (!xtmp2_hold_M1) {
      // xtmp2 = -1 (all ones) for the equality compare below.
      vpternlogd(xtmp2, 0xff, xtmp2, xtmp2, vlen_enc);
    }
    // Arithmetic shift replicates each dword's sign bit across the lane ...
    vpsrad(xtmp1, src, 31, vlen_enc);
    // ... so the lanes equal to -1 are exactly the negative ones.
    Assembler::evpcmpeqd(ktmp, k0, xtmp1, xtmp2, vlen_enc);
  }
}
6830
6831
// Fills each int/long lane of dst with a copy of the corresponding src lane's
// sign bit: all ones for negative lanes, zero otherwise.
void C2_MacroAssembler::vpsign_extend_dq(BasicType elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) {
  if (elem_bt == T_LONG) {
    if (VM_Version::supports_evex()) {
      evpsraq(dst, src, 63, vlen_enc);
    } else {
      // No 64-bit arithmetic shift pre-EVEX: spread the sign within each
      // dword, then replicate the upper (sign-carrying) dword across the
      // qword (shuffle 0xF5 selects dwords {1,1,3,3}).
      vpsrad(dst, src, 31, vlen_enc);
      vpshufd(dst, dst, 0xF5, vlen_enc);
    }
  } else {
    assert(elem_bt == T_INT, "");
    vpsrad(dst, src, 31, vlen_enc);
  }
}
6845
// Broadcasts the maximum signed value of the element type (0x7FFF...) into
// dst, derived by logically shifting an all-ones vector right by one. When
// compute_allones is set, 'allones' is first filled with -1 (clobbering it);
// otherwise it must already contain all ones.
void C2_MacroAssembler::vpgenmax_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
  if (compute_allones) {
    if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
      vpternlogd(allones, 0xff, allones, allones, vlen_enc);
    } else {
      // Comparing a register with itself sets every bit.
      vpcmpeqq(allones, allones, allones, vlen_enc);
    }
  }
  if (elem_bt == T_LONG) {
    vpsrlq(dst, allones, 1, vlen_enc);
  } else {
    assert(elem_bt == T_INT, "");
    vpsrld(dst, allones, 1, vlen_enc);
  }
}
6861
// Broadcasts the minimum signed value of the element type (0x8000...) into
// dst, derived by shifting an all-ones vector left so only the sign bit
// survives. When compute_allones is set, 'allones' is first filled with -1
// (clobbering it); otherwise it must already contain all ones.
void C2_MacroAssembler::vpgenmin_value(BasicType elem_bt, XMMRegister dst, XMMRegister allones, int vlen_enc, bool compute_allones) {
  if (compute_allones) {
    if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
      vpternlogd(allones, 0xff, allones, allones, vlen_enc);
    } else {
      // Comparing a register with itself sets every bit.
      vpcmpeqq(allones, allones, allones, vlen_enc);
    }
  }
  if (elem_bt == T_LONG) {
    vpsllq(dst, allones, 63, vlen_enc);
  } else {
    assert(elem_bt == T_INT, "");
    vpslld(dst, allones, 31, vlen_enc);
  }
}
6877
6878 void C2_MacroAssembler::evpcmpu(BasicType elem_bt, KRegister kmask, XMMRegister src1, XMMRegister src2,
6879 Assembler::ComparisonPredicate cond, int vlen_enc) {
6880 switch(elem_bt) {
6881 case T_LONG: evpcmpuq(kmask, src1, src2, cond, vlen_enc); break;
6882 case T_INT: evpcmpud(kmask, src1, src2, cond, vlen_enc); break;
6883 case T_SHORT: evpcmpuw(kmask, src1, src2, cond, vlen_enc); break;
6884 case T_BYTE: evpcmpub(kmask, src1, src2, cond, vlen_enc); break;
6885 default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6886 }
6887 }
6888
6889 void C2_MacroAssembler::vpcmpgt(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
6890 switch(elem_bt) {
6891 case T_LONG: vpcmpgtq(dst, src1, src2, vlen_enc); break;
6892 case T_INT: vpcmpgtd(dst, src1, src2, vlen_enc); break;
6893 case T_SHORT: vpcmpgtw(dst, src1, src2, vlen_enc); break;
6894 case T_BYTE: vpcmpgtb(dst, src1, src2, vlen_enc); break;
6895 default: fatal("Unsupported type %s", type2name(elem_bt)); break;
6896 }
6897 }
6898
6899 void C2_MacroAssembler::evpmov_vec_to_mask(BasicType elem_bt, KRegister ktmp, XMMRegister src, XMMRegister xtmp1,
6900 XMMRegister xtmp2, int vlen_enc, bool xtmp2_hold_M1) {
6901 if (elem_bt == T_LONG) {
6902 evpmovq2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6903 } else {
6904 assert(elem_bt == T_INT, "");
6905 evpmovd2m_emu(ktmp, src, xtmp1, xtmp2, vlen_enc, xtmp2_hold_M1);
6906 }
6907 }
6908
// Saturating signed add/sub for T_INT/T_LONG vectors on EVEX-capable targets.
// Computes dst = src1 +/- src2, then replaces each overflowed lane with the
// appropriate saturation value: MIN for a negative first input, MAX otherwise.
// xtmp1/xtmp2 are scratch vectors; ktmp1/ktmp2 are scratch opmask registers.
void C2_MacroAssembler::vector_addsub_dq_saturating_evex(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                         XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
                                                         KRegister ktmp1, KRegister ktmp2, int vlen_enc) {
  assert(elem_bt == T_INT || elem_bt == T_LONG, "");
  // Addition/Subtraction happens over two's complement representation of numbers and is agnostic to signed'ness.
  // Overflow detection based on Hacker's Delight section 2-13.
  if (ideal_opc == Op_SaturatingAddV) {
    // res = src1 + src2
    vpadd(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs if result polarity does not comply with equivalent polarity inputs.
    // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
    vpxor(xtmp1, dst, src1, vlen_enc);
    vpxor(xtmp2, dst, src2, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);  // xtmp2 sign bit set per overflowed lane
  } else {
    assert(ideal_opc == Op_SaturatingSubV, "");
    // res = src1 - src2
    vpsub(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs when both inputs have opposite polarity and
    // result polarity does not comply with first input polarity.
    // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1;
    vpxor(xtmp1, src1, src2, vlen_enc);
    vpxor(xtmp2, dst, src1, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);  // xtmp2 sign bit set per overflowed lane
  }

  // Compute overflow detection mask.
  evpmov_vec_to_mask(elem_bt, ktmp1, xtmp2, xtmp2, xtmp1, vlen_enc);
  // Note: xtmp1 hold -1 in all its lanes after above call.

  // Compute mask based on first input polarity (reusing the -1 in xtmp1).
  evpmov_vec_to_mask(elem_bt, ktmp2, src1, xtmp2, xtmp1, vlen_enc, true);

  // xtmp2 = MAX value per lane, xtmp1 = MIN value per lane.
  vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc, true);
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);

  // Compose a vector of saturating (MAX/MIN) values, where lanes corresponding to
  // set bits in first input polarity mask holds a min value.
  evpblend(elem_bt, xtmp2, ktmp2, xtmp2, xtmp1, true, vlen_enc);
  // Blend destination lanes with saturated values using overflow detection mask.
  evpblend(elem_bt, dst, ktmp1, dst, xtmp2, true, vlen_enc);
}
6951
6952
// Saturating signed add/sub for T_INT/T_LONG vectors on AVX targets without
// opmask registers: the overflow and polarity masks are kept as sign-extended
// vectors and blended in with vpblendvb. xtmp1..xtmp4 are scratch vectors.
void C2_MacroAssembler::vector_addsub_dq_saturating_avx(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1,
                                                        XMMRegister src2, XMMRegister xtmp1, XMMRegister xtmp2,
                                                        XMMRegister xtmp3, XMMRegister xtmp4, int vlen_enc) {
  assert(elem_bt == T_INT || elem_bt == T_LONG, "");
  // Addition/Subtraction happens over two's complement representation of numbers and is agnostic to signed'ness.
  // Overflow detection based on Hacker's Delight section 2-13.
  if (ideal_opc == Op_SaturatingAddV) {
    // res = src1 + src2
    vpadd(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs if result polarity does not comply with equivalent polarity inputs.
    // overflow = (((res ^ src1) & (res ^ src2)) >>> 31(I)/63(L)) == 1
    vpxor(xtmp1, dst, src1, vlen_enc);
    vpxor(xtmp2, dst, src2, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);  // xtmp2 sign bit set per overflowed lane
  } else {
    assert(ideal_opc == Op_SaturatingSubV, "");
    // res = src1 - src2
    vpsub(elem_bt, dst, src1, src2, vlen_enc);
    // Overflow occurs when both inputs have opposite polarity and
    // result polarity does not comply with first input polarity.
    // overflow = ((src1 ^ src2) & (res ^ src1) >>> 31(I)/63(L)) == 1;
    vpxor(xtmp1, src1, src2, vlen_enc);
    vpxor(xtmp2, dst, src1, vlen_enc);
    vpand(xtmp2, xtmp1, xtmp2, vlen_enc);  // xtmp2 sign bit set per overflowed lane
  }

  // Sign-extend to compute overflow detection mask.
  vpsign_extend_dq(elem_bt, xtmp3, xtmp2, vlen_enc);

  // xtmp1 = all-ones, then xtmp2 = per-lane MAX, xtmp1 = per-lane MIN.
  vpcmpeqd(xtmp1, xtmp1, xtmp1, vlen_enc);
  vpgenmax_value(elem_bt, xtmp2, xtmp1, vlen_enc);
  vpgenmin_value(elem_bt, xtmp1, xtmp1, vlen_enc);

  // Compose saturating min/max vector using first input polarity mask:
  // lanes where src1 is negative receive MIN, the rest receive MAX.
  vpsign_extend_dq(elem_bt, xtmp4, src1, vlen_enc);
  vpblendvb(xtmp1, xtmp2, xtmp1, xtmp4, vlen_enc);

  // Blend result with saturating vector using overflow detection mask.
  vpblendvb(dst, dst, xtmp1, xtmp3, vlen_enc);
}
6993
6994 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
6995 switch(elem_bt) {
6996 case T_BYTE:
6997 if (ideal_opc == Op_SaturatingAddV) {
6998 vpaddsb(dst, src1, src2, vlen_enc);
6999 } else {
7000 assert(ideal_opc == Op_SaturatingSubV, "");
7001 vpsubsb(dst, src1, src2, vlen_enc);
7002 }
7003 break;
7004 case T_SHORT:
7005 if (ideal_opc == Op_SaturatingAddV) {
7006 vpaddsw(dst, src1, src2, vlen_enc);
7007 } else {
7008 assert(ideal_opc == Op_SaturatingSubV, "");
7009 vpsubsw(dst, src1, src2, vlen_enc);
7010 }
7011 break;
7012 default:
7013 fatal("Unsupported type %s", type2name(elem_bt));
7014 break;
7015 }
7016 }
7017
7018 void C2_MacroAssembler::vector_saturating_unsigned_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
7019 switch(elem_bt) {
7020 case T_BYTE:
7021 if (ideal_opc == Op_SaturatingAddV) {
7022 vpaddusb(dst, src1, src2, vlen_enc);
7023 } else {
7024 assert(ideal_opc == Op_SaturatingSubV, "");
7025 vpsubusb(dst, src1, src2, vlen_enc);
7026 }
7027 break;
7028 case T_SHORT:
7029 if (ideal_opc == Op_SaturatingAddV) {
7030 vpaddusw(dst, src1, src2, vlen_enc);
7031 } else {
7032 assert(ideal_opc == Op_SaturatingSubV, "");
7033 vpsubusw(dst, src1, src2, vlen_enc);
7034 }
7035 break;
7036 default:
7037 fatal("Unsupported type %s", type2name(elem_bt));
7038 break;
7039 }
7040 }
7041
7042 void C2_MacroAssembler::select_from_two_vectors_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1,
7043 XMMRegister src2, int vlen_enc) {
7044 switch(elem_bt) {
7045 case T_BYTE:
7046 evpermi2b(dst, src1, src2, vlen_enc);
7047 break;
7048 case T_SHORT:
7049 evpermi2w(dst, src1, src2, vlen_enc);
7050 break;
7051 case T_INT:
7052 evpermi2d(dst, src1, src2, vlen_enc);
7053 break;
7054 case T_LONG:
7055 evpermi2q(dst, src1, src2, vlen_enc);
7056 break;
7057 case T_FLOAT:
7058 evpermi2ps(dst, src1, src2, vlen_enc);
7059 break;
7060 case T_DOUBLE:
7061 evpermi2pd(dst, src1, src2, vlen_enc);
7062 break;
7063 default:
7064 fatal("Unsupported type %s", type2name(elem_bt));
7065 break;
7066 }
7067 }
7068
7069 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, bool is_unsigned, int vlen_enc) {
7070 if (is_unsigned) {
7071 vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7072 } else {
7073 vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7074 }
7075 }
7076
7077 void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, Address src2, bool is_unsigned, int vlen_enc) {
7078 if (is_unsigned) {
7079 vector_saturating_unsigned_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7080 } else {
7081 vector_saturating_op(ideal_opc, elem_bt, dst, src1, src2, vlen_enc);
7082 }
7083 }
7084
7085 void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
7086 switch(opcode) {
7087 case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
7088 case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
7089 case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
7090 case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
7091 default: assert(false, "%s", NodeClassNames[opcode]); break;
7092 }
7093 }
7094
7095 void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc) {
7096 switch(opcode) {
7097 case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
7098 case Op_SubVHF: evsubph(dst, src1, src2, vlen_enc); break;
7099 case Op_MulVHF: evmulph(dst, src1, src2, vlen_enc); break;
7100 case Op_DivVHF: evdivph(dst, src1, src2, vlen_enc); break;
7101 default: assert(false, "%s", NodeClassNames[opcode]); break;
7102 }
7103 }
7104
// Scalar FP16 max/min: reuses the vector implementation at 128-bit width,
// which computes the operation lane-wise (lane 0 carries the scalar result).
void C2_MacroAssembler::scalar_max_min_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                            KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2) {
  vector_max_min_fp16(opcode, dst, src1, src2, ktmp, xtmp1, xtmp2, Assembler::AVX_128bit);
}
7109
// FP16 vector max/min with Java semantics for NaN and signed zeros.
// The raw VMAXPH/VMINPH instructions return the second operand when the
// inputs are (+0.0, -0.0) or when one input is NaN, so the operands are
// pre-swapped per lane (using the relevant operand's sign as a mask) and a
// NaN fix-up pass is applied afterwards. ktmp, xtmp1, xtmp2 are scratch.
void C2_MacroAssembler::vector_max_min_fp16(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                            KRegister ktmp, XMMRegister xtmp1, XMMRegister xtmp2, int vlen_enc) {
  if (opcode == Op_MaxVHF || opcode == Op_MaxHF) {
    // Move sign bits of src2 to mask register.
    evpmovw2m(ktmp, src2, vlen_enc);
    // xtmp1 = src2 < 0 ? src2 : src1
    evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
    // xtmp2 = src2 < 0 ? src1 : src2
    evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
    // Idea behind above swapping is to make second source operand a +ve value.
    // As per instruction semantic, if the values being compared are both 0.0s (of either sign), the value in
    // the second source operand is returned. If only one value is a NaN (SNaN or QNaN) for this instruction,
    // the second source operand, either a NaN or a valid floating-point value, is returned
    // dst = max(xtmp1, xtmp2)
    evmaxph(dst, xtmp1, xtmp2, vlen_enc);
    // isNaN = is_unordered_quiet(xtmp1)
    evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
    // Final result is same as first source if its a NaN value,
    // in case second operand holds a NaN value then as per above semantics
    // result is same as second operand.
    Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
  } else {
    assert(opcode == Op_MinVHF || opcode == Op_MinHF, "");
    // Move sign bits of src1 to mask register.
    evpmovw2m(ktmp, src1, vlen_enc);
    // xtmp1 = src1 < 0 ? src2 : src1
    evpblendmw(xtmp1, ktmp, src1, src2, true, vlen_enc);
    // xtmp2 = src1 < 0 ? src1 : src2
    evpblendmw(xtmp2, ktmp, src2, src1, true, vlen_enc);
    // Idea behind above swapping is to make second source operand a -ve value.
    // As per instruction semantics, if the values being compared are both 0.0s (of either sign), the value in
    // the second source operand is returned.
    // If only one value is a NaN (SNaN or QNaN) for this instruction, the second source operand, either a NaN
    // or a valid floating-point value, is written to the result.
    // dst = min(xtmp1, xtmp2)
    evminph(dst, xtmp1, xtmp2, vlen_enc);
    // isNaN = is_unordered_quiet(xtmp1)
    evcmpph(ktmp, k0, xtmp1, xtmp1, Assembler::UNORD_Q, vlen_enc);
    // Final result is same as first source if its a NaN value,
    // in case second operand holds a NaN value then as per above semantics
    // result is same as second operand.
    Assembler::evmovdquw(dst, ktmp, xtmp1, true, vlen_enc);
  }
}