1 /*
2 * Copyright (c) 2020, 2021, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
24
25 #include "precompiled.hpp"
26 #include "asm/assembler.hpp"
27 #include "asm/assembler.inline.hpp"
28 #include "oops/methodData.hpp"
29 #include "opto/c2_CodeStubs.hpp"
30 #include "opto/c2_MacroAssembler.hpp"
31 #include "opto/intrinsicnode.hpp"
32 #include "opto/opcodes.hpp"
33 #include "opto/output.hpp"
34 #include "opto/subnode.hpp"
35 #include "runtime/biasedLocking.hpp"
36 #include "runtime/globals.hpp"
37 #include "runtime/objectMonitor.hpp"
38 #include "runtime/stubRoutines.hpp"
39 #include "utilities/globalDefinitions.hpp"
40 #include "utilities/powerOfTwo.hpp"
41 #include "utilities/sizes.hpp"
42
43 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
44 switch (vlen_in_bytes) {
45 case 4: // fall-through
46 case 8: // fall-through
47 case 16: return Assembler::AVX_128bit;
48 case 32: return Assembler::AVX_256bit;
49 case 64: return Assembler::AVX_512bit;
50
51 default: {
52 ShouldNotReachHere();
53 return Assembler::AVX_NoVec;
54 }
55 }
56 }
57
58 void C2_MacroAssembler::setvectmask(Register dst, Register src, KRegister mask) {
59 guarantee(PostLoopMultiversioning, "must be");
60 Assembler::movl(dst, 1);
61 Assembler::shlxl(dst, dst, src);
62 Assembler::decl(dst);
63 Assembler::kmovdl(mask, dst);
64 Assembler::movl(dst, src);
65 }
66
67 void C2_MacroAssembler::restorevectmask(KRegister mask) {
68 guarantee(PostLoopMultiversioning, "must be");
69 Assembler::knotwl(mask, k0);
70 }
71
72 #if INCLUDE_RTM_OPT
73
74 // Update rtm_counters based on abort status
75 // input: abort_status
76 // rtm_counters (RTMLockingCounters*)
77 // flags are killed
78 void C2_MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {
79
80 atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
81 if (PrintPreciseRTMLockingStatistics) {
82 for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
83 Label check_abort;
84 testl(abort_status, (1<<i));
85 jccb(Assembler::equal, check_abort);
86 atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
87 bind(check_abort);
88 }
89 }
90 }
91
92 // Branch if (random & (count-1) != 0), count is 2^n
93 // tmp, scr and flags are killed
94 void C2_MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
95 assert(tmp == rax, "");
96 assert(scr == rdx, "");
97 rdtsc(); // modifies EDX:EAX
98 andptr(tmp, count-1);
99 jccb(Assembler::notZero, brLabel);
100 }
101
102 // Perform abort ratio calculation, set no_rtm bit if high ratio
103 // input: rtm_counters_Reg (RTMLockingCounters* address)
104 // tmpReg, rtm_counters_Reg and flags are killed
105 void C2_MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
106 Register rtm_counters_Reg,
107 RTMLockingCounters* rtm_counters,
108 Metadata* method_data) {
109 Label L_done, L_check_always_rtm1, L_check_always_rtm2;
110
111 if (RTMLockingCalculationDelay > 0) {
112 // Delay calculation
113 movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()), tmpReg);
114 testptr(tmpReg, tmpReg);
115 jccb(Assembler::equal, L_done);
116 }
117 // Abort ratio calculation only if abort_count > RTMAbortThreshold
118 // Aborted transactions = abort_count * 100
119 // All transactions = total_count * RTMTotalCountIncrRate
120 // Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
121
122 movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
123 cmpptr(tmpReg, RTMAbortThreshold);
124 jccb(Assembler::below, L_check_always_rtm2);
125 imulptr(tmpReg, tmpReg, 100);
126
127 Register scrReg = rtm_counters_Reg;
128 movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
129 imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
130 imulptr(scrReg, scrReg, RTMAbortRatio);
131 cmpptr(tmpReg, scrReg);
132 jccb(Assembler::below, L_check_always_rtm1);
133 if (method_data != NULL) {
134 // set rtm_state to "no rtm" in MDO
135 mov_metadata(tmpReg, method_data);
136 lock();
137 orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), NoRTM);
138 }
139 jmpb(L_done);
140 bind(L_check_always_rtm1);
141 // Reload RTMLockingCounters* address
142 lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
143 bind(L_check_always_rtm2);
144 movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
145 cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
146 jccb(Assembler::below, L_done);
147 if (method_data != NULL) {
148 // set rtm_state to "always rtm" in MDO
149 mov_metadata(tmpReg, method_data);
150 lock();
151 orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), UseRTM);
152 }
153 bind(L_done);
154 }
155
156 // Update counters and perform abort ratio calculation
157 // input: abort_status_Reg
158 // rtm_counters_Reg, flags are killed
159 void C2_MacroAssembler::rtm_profiling(Register abort_status_Reg,
160 Register rtm_counters_Reg,
161 RTMLockingCounters* rtm_counters,
162 Metadata* method_data,
163 bool profile_rtm) {
164
165 assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
166 // update rtm counters based on rax value at abort
167 // reads abort_status_Reg, updates flags
168 lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
169 rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
170 if (profile_rtm) {
171 // Save abort status because abort_status_Reg is used by following code.
172 if (RTMRetryCount > 0) {
173 push(abort_status_Reg);
174 }
175 assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
176 rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
177 // restore abort status
178 if (RTMRetryCount > 0) {
179 pop(abort_status_Reg);
180 }
181 }
182 }
183
184 // Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
185 // inputs: retry_count_Reg
186 // : abort_status_Reg
187 // output: retry_count_Reg decremented by 1
188 // flags are killed
189 void C2_MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
190 Label doneRetry;
191 assert(abort_status_Reg == rax, "");
192 // The abort reason bits are in eax (see all states in rtmLocking.hpp)
193 // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
194 // if reason is in 0x6 and retry count != 0 then retry
195 andptr(abort_status_Reg, 0x6);
196 jccb(Assembler::zero, doneRetry);
197 testl(retry_count_Reg, retry_count_Reg);
198 jccb(Assembler::zero, doneRetry);
199 pause();
200 decrementl(retry_count_Reg);
201 jmp(retryLabel);
202 bind(doneRetry);
203 }
204
205 // Spin and retry if lock is busy,
206 // inputs: box_Reg (monitor address)
207 // : retry_count_Reg
208 // output: retry_count_Reg decremented by 1
209 // : clear z flag if retry count exceeded
210 // tmp_Reg, scr_Reg, flags are killed
211 void C2_MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
212 Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
213 Label SpinLoop, SpinExit, doneRetry;
214 int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
215
216 testl(retry_count_Reg, retry_count_Reg);
217 jccb(Assembler::zero, doneRetry);
218 decrementl(retry_count_Reg);
219 movptr(scr_Reg, RTMSpinLoopCount);
220
221 bind(SpinLoop);
222 pause();
223 decrementl(scr_Reg);
224 jccb(Assembler::lessEqual, SpinExit);
225 movptr(tmp_Reg, Address(box_Reg, owner_offset));
226 testptr(tmp_Reg, tmp_Reg);
227 jccb(Assembler::notZero, SpinLoop);
228
229 bind(SpinExit);
230 jmp(retryLabel);
231 bind(doneRetry);
232 incrementl(retry_count_Reg); // clear z flag
233 }
234
235 // Use RTM for normal stack locks
236 // Input: objReg (object to lock)
237 void C2_MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
238 Register retry_on_abort_count_Reg,
239 RTMLockingCounters* stack_rtm_counters,
240 Metadata* method_data, bool profile_rtm,
241 Label& DONE_LABEL, Label& IsInflated) {
242 assert(UseRTMForStackLocks, "why call this otherwise?");
243 assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
244 assert(tmpReg == rax, "");
245 assert(scrReg == rdx, "");
246 Label L_rtm_retry, L_decrement_retry, L_on_abort;
247
248 if (RTMRetryCount > 0) {
249 movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
250 bind(L_rtm_retry);
251 }
252 movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
253 testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral|biased
254 jcc(Assembler::notZero, IsInflated);
255
256 if (PrintPreciseRTMLockingStatistics || profile_rtm) {
257 Label L_noincrement;
258 if (RTMTotalCountIncrRate > 1) {
259 // tmpReg, scrReg and flags are killed
260 branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
261 }
262 assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
263 atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
264 bind(L_noincrement);
265 }
266 xbegin(L_on_abort);
267 movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
268 andptr(tmpReg, markWord::biased_lock_mask_in_place); // look at 3 lock bits
269 cmpptr(tmpReg, markWord::unlocked_value); // bits = 001 unlocked
270 jcc(Assembler::equal, DONE_LABEL); // all done if unlocked
271
272 Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
273 if (UseRTMXendForLockBusy) {
274 xend();
275 movptr(abort_status_Reg, 0x2); // Set the abort status to 2 (so we can retry)
276 jmp(L_decrement_retry);
277 }
278 else {
279 xabort(0);
280 }
281 bind(L_on_abort);
282 if (PrintPreciseRTMLockingStatistics || profile_rtm) {
283 rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
284 }
285 bind(L_decrement_retry);
286 if (RTMRetryCount > 0) {
287 // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
288 rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
289 }
290 }
291
292 // Use RTM for inflating locks
293 // inputs: objReg (object to lock)
294 // boxReg (on-stack box address (displaced header location) - KILLED)
295 // tmpReg (ObjectMonitor address + markWord::monitor_value)
296 void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
297 Register scrReg, Register retry_on_busy_count_Reg,
298 Register retry_on_abort_count_Reg,
299 RTMLockingCounters* rtm_counters,
300 Metadata* method_data, bool profile_rtm,
301 Label& DONE_LABEL) {
302 assert(UseRTMLocking, "why call this otherwise?");
303 assert(tmpReg == rax, "");
304 assert(scrReg == rdx, "");
305 Label L_rtm_retry, L_decrement_retry, L_on_abort;
306 int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
307
308 // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
309 movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
310 movptr(boxReg, tmpReg); // Save ObjectMonitor address
311
312 if (RTMRetryCount > 0) {
313 movl(retry_on_busy_count_Reg, RTMRetryCount); // Retry on lock busy
314 movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
315 bind(L_rtm_retry);
316 }
317 if (PrintPreciseRTMLockingStatistics || profile_rtm) {
318 Label L_noincrement;
319 if (RTMTotalCountIncrRate > 1) {
320 // tmpReg, scrReg and flags are killed
321 branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
322 }
323 assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
324 atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
325 bind(L_noincrement);
326 }
327 xbegin(L_on_abort);
328 movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
329 movptr(tmpReg, Address(tmpReg, owner_offset));
330 testptr(tmpReg, tmpReg);
331 jcc(Assembler::zero, DONE_LABEL);
332 if (UseRTMXendForLockBusy) {
333 xend();
334 jmp(L_decrement_retry);
335 }
336 else {
337 xabort(0);
338 }
339 bind(L_on_abort);
340 Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
341 if (PrintPreciseRTMLockingStatistics || profile_rtm) {
342 rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
343 }
344 if (RTMRetryCount > 0) {
345 // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
346 rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
347 }
348
349 movptr(tmpReg, Address(boxReg, owner_offset)) ;
350 testptr(tmpReg, tmpReg) ;
351 jccb(Assembler::notZero, L_decrement_retry) ;
352
353 // Appears unlocked - try to swing _owner from null to non-null.
354 // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
355 #ifdef _LP64
356 Register threadReg = r15_thread;
357 #else
358 get_thread(scrReg);
359 Register threadReg = scrReg;
360 #endif
361 lock();
362 cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg
363
364 if (RTMRetryCount > 0) {
365 // success done else retry
366 jccb(Assembler::equal, DONE_LABEL) ;
367 bind(L_decrement_retry);
368 // Spin and retry if lock is busy.
369 rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
370 }
371 else {
372 bind(L_decrement_retry);
373 }
374 }
375
376 #endif // INCLUDE_RTM_OPT
377
378 // fast_lock and fast_unlock used by C2
379
380 // Because the transitions from emitted code to the runtime
381 // monitorenter/exit helper stubs are so slow it's critical that
382 // we inline both the stack-locking fast path and the inflated fast path.
383 //
384 // See also: cmpFastLock and cmpFastUnlock.
385 //
386 // What follows is a specialized inline transliteration of the code
387 // in enter() and exit(). If we're concerned about I$ bloat another
388 // option would be to emit TrySlowEnter and TrySlowExit methods
389 // at startup-time. These methods would accept arguments as
390 // (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
391 // indications in the icc.ZFlag. fast_lock and fast_unlock would simply
392 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
393 // In practice, however, the # of lock sites is bounded and is usually small.
394 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
395 // if the processor uses simple bimodal branch predictors keyed by EIP
396 // Since the helper routines would be called from multiple synchronization
397 // sites.
398 //
399 // An even better approach would be write "MonitorEnter()" and "MonitorExit()"
400 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
401 // to those specialized methods. That'd give us a mostly platform-independent
402 // implementation that the JITs could optimize and inline at their pleasure.
403 // Done correctly, the only time we'd need to cross to native could would be
404 // to park() or unpark() threads. We'd also need a few more unsafe operators
405 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
406 // (b) explicit barriers or fence operations.
407 //
408 // TODO:
409 //
410 // * Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
411 // This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
412 // Given TLAB allocation, Self is usually manifested in a register, so passing it into
413 // the lock operators would typically be faster than reifying Self.
414 //
415 // * Ideally I'd define the primitives as:
416 // fast_lock (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
417 // fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
418 // Unfortunately ADLC bugs prevent us from expressing the ideal form.
419 // Instead, we're stuck with a rather awkward and brittle register assignments below.
420 // Furthermore the register assignments are overconstrained, possibly resulting in
421 // sub-optimal code near the synchronization site.
422 //
423 // * Eliminate the sp-proximity tests and just use "== Self" tests instead.
424 // Alternately, use a better sp-proximity test.
425 //
426 // * Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
427 // Either one is sufficient to uniquely identify a thread.
428 // TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
429 //
430 // * Intrinsify notify() and notifyAll() for the common cases where the
431 // object is locked by the calling thread but the waitlist is empty.
432 // avoid the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
433 //
434 // * use jccb and jmpb instead of jcc and jmp to improve code density.
435 // But beware of excessive branch density on AMD Opterons.
436 //
437 // * Both fast_lock and fast_unlock set the ICC.ZF to indicate success
438 // or failure of the fast path. If the fast path fails then we pass
439 // control to the slow path, typically in C. In fast_lock and
440 // fast_unlock we often branch to DONE_LABEL, just to find that C2
441 // will emit a conditional branch immediately after the node.
442 // So we have branches to branches and lots of ICC.ZF games.
443 // Instead, it might be better to have C2 pass a "FailureLabel"
444 // into fast_lock and fast_unlock. In the case of success, control
445 // will drop through the node. ICC.ZF is undefined at exit.
446 // In the case of failure, the node will branch directly to the
447 // FailureLabel
448
449
450 // obj: object to lock
451 // box: on-stack box address (displaced header location) - KILLED
452 // rax,: tmp -- KILLED
453 // scr: tmp -- KILLED
454 void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
455 Register scrReg, Register cx1Reg, Register cx2Reg, Register thread,
456 BiasedLockingCounters* counters,
457 RTMLockingCounters* rtm_counters,
458 RTMLockingCounters* stack_rtm_counters,
459 Metadata* method_data,
460 bool use_rtm, bool profile_rtm) {
461 assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
462 // Ensure the register assignments are disjoint
463 assert(tmpReg == rax, "");
464
465 if (use_rtm) {
466 assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
467 } else {
468 assert(cx2Reg == noreg, "");
469 assert_different_registers(objReg, boxReg, tmpReg, scrReg);
470 }
471
472 if (counters != NULL) {
473 atomic_incl(ExternalAddress((address)counters->total_entry_count_addr()), scrReg);
474 }
475
476 // Possible cases that we'll encounter in fast_lock
477 // ------------------------------------------------
478 // * Inflated
479 // -- unlocked
480 // -- Locked
481 // = by self
482 // = by other
483 // * biased
484 // -- by Self
485 // -- by other
486 // * neutral
487 // * stack-locked
488 // -- by self
489 // = sp-proximity test hits
490 // = sp-proximity test generates false-negative
491 // -- by other
492 //
493
494 Label IsInflated, DONE_LABEL;
495
496 if (DiagnoseSyncOnValueBasedClasses != 0) {
497 load_klass(tmpReg, objReg, cx1Reg);
498 movl(tmpReg, Address(tmpReg, Klass::access_flags_offset()));
499 testl(tmpReg, JVM_ACC_IS_VALUE_BASED_CLASS);
500 jcc(Assembler::notZero, DONE_LABEL);
501 }
502
503 // it's stack-locked, biased or neutral
504 // TODO: optimize away redundant LDs of obj->mark and improve the markword triage
505 // order to reduce the number of conditional branches in the most common cases.
506 // Beware -- there's a subtle invariant that fetch of the markword
507 // at [FETCH], below, will never observe a biased encoding (*101b).
508 // If this invariant is not held we risk exclusion (safety) failure.
509 if (UseBiasedLocking && !UseOptoBiasInlining) {
510 biased_locking_enter(boxReg, objReg, tmpReg, scrReg, cx1Reg, false, DONE_LABEL, NULL, counters);
511 }
512
513 #if INCLUDE_RTM_OPT
514 if (UseRTMForStackLocks && use_rtm) {
515 rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
516 stack_rtm_counters, method_data, profile_rtm,
517 DONE_LABEL, IsInflated);
518 }
519 #endif // INCLUDE_RTM_OPT
520
521 movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // [FETCH]
522 testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral|biased
523 jcc(Assembler::notZero, IsInflated);
524
525 if (LockingMode == LM_MONITOR) {
526 // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
527 testptr(objReg, objReg);
528 } else {
529 assert(LockingMode == LM_LEGACY, "must be");
530 // Attempt stack-locking ...
531 orptr (tmpReg, markWord::unlocked_value);
532 movptr(Address(boxReg, 0), tmpReg); // Anticipate successful CAS
533 lock();
534 cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Updates tmpReg
535 if (counters != NULL) {
536 cond_inc32(Assembler::equal,
537 ExternalAddress((address)counters->fast_path_entry_count_addr()));
538 }
539 jcc(Assembler::equal, DONE_LABEL); // Success
540
541 // Recursive locking.
542 // The object is stack-locked: markword contains stack pointer to BasicLock.
543 // Locked by current thread if difference with current SP is less than one page.
544 subptr(tmpReg, rsp);
545 // Next instruction set ZFlag == 1 (Success) if difference is less then one page.
546 andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) );
547 movptr(Address(boxReg, 0), tmpReg);
548 if (counters != NULL) {
549 cond_inc32(Assembler::equal,
550 ExternalAddress((address)counters->fast_path_entry_count_addr()));
551 }
552 }
553 jmp(DONE_LABEL);
554
555 bind(IsInflated);
556 // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value
557
558 #if INCLUDE_RTM_OPT
559 // Use the same RTM locking code in 32- and 64-bit VM.
560 if (use_rtm) {
561 rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
562 rtm_counters, method_data, profile_rtm, DONE_LABEL);
563 } else {
564 #endif // INCLUDE_RTM_OPT
565
566 #ifndef _LP64
567 // The object is inflated.
568
569 // boxReg refers to the on-stack BasicLock in the current frame.
570 // We'd like to write:
571 // set box->_displaced_header = markWord::unused_mark(). Any non-0 value suffices.
572 // This is convenient but results a ST-before-CAS penalty. The following CAS suffers
573 // additional latency as we have another ST in the store buffer that must drain.
574
575 // avoid ST-before-CAS
576 // register juggle because we need tmpReg for cmpxchgptr below
577 movptr(scrReg, boxReg);
578 movptr(boxReg, tmpReg); // consider: LEA box, [tmp-2]
579
580 // Optimistic form: consider XORL tmpReg,tmpReg
581 movptr(tmpReg, NULL_WORD);
582
583 // Appears unlocked - try to swing _owner from null to non-null.
584 // Ideally, I'd manifest "Self" with get_thread and then attempt
585 // to CAS the register containing Self into m->Owner.
586 // But we don't have enough registers, so instead we can either try to CAS
587 // rsp or the address of the box (in scr) into &m->owner. If the CAS succeeds
588 // we later store "Self" into m->Owner. Transiently storing a stack address
589 // (rsp or the address of the box) into m->owner is harmless.
590 // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
591 lock();
592 cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
593 movptr(Address(scrReg, 0), 3); // box->_displaced_header = 3
594 // If we weren't able to swing _owner from NULL to the BasicLock
595 // then take the slow path.
596 jccb (Assembler::notZero, DONE_LABEL);
597 // update _owner from BasicLock to thread
598 get_thread (scrReg); // beware: clobbers ICCs
599 movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
600 xorptr(boxReg, boxReg); // set icc.ZFlag = 1 to indicate success
601
602 // If the CAS fails we can either retry or pass control to the slow path.
603 // We use the latter tactic.
604 // Pass the CAS result in the icc.ZFlag into DONE_LABEL
605 // If the CAS was successful ...
606 // Self has acquired the lock
607 // Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
608 // Intentional fall-through into DONE_LABEL ...
609 #else // _LP64
610 // It's inflated and we use scrReg for ObjectMonitor* in this section.
611 movq(scrReg, tmpReg);
612 xorq(tmpReg, tmpReg);
613 lock();
614 cmpxchgptr(r15_thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
615 // Unconditionally set box->_displaced_header = markWord::unused_mark().
616 // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
617 movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
618 // Propagate ICC.ZF from CAS above into DONE_LABEL.
619 jcc(Assembler::equal, DONE_LABEL); // CAS above succeeded; propagate ZF = 1 (success)
620
621 cmpptr(r15_thread, rax); // Check if we are already the owner (recursive lock)
622 jcc(Assembler::notEqual, DONE_LABEL); // If not recursive, ZF = 0 at this point (fail)
623 incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
624 xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
625 #endif // _LP64
626 #if INCLUDE_RTM_OPT
627 } // use_rtm()
628 #endif
629 // DONE_LABEL is a hot target - we'd really like to place it at the
630 // start of cache line by padding with NOPs.
631 // See the AMD and Intel software optimization manuals for the
632 // most efficient "long" NOP encodings.
633 // Unfortunately none of our alignment mechanisms suffice.
634 bind(DONE_LABEL);
635
636 // At DONE_LABEL the icc ZFlag is set as follows ...
637 // fast_unlock uses the same protocol.
638 // ZFlag == 1 -> Success
639 // ZFlag == 0 -> Failure - force control through the slow path
640 }
641
642 // obj: object to unlock
643 // box: box address (displaced header location), killed. Must be EAX.
644 // tmp: killed, cannot be obj nor box.
645 //
646 // Some commentary on balanced locking:
647 //
648 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
649 // Methods that don't have provably balanced locking are forced to run in the
650 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
651 // The interpreter provides two properties:
652 // I1: At return-time the interpreter automatically and quietly unlocks any
653 // objects acquired the current activation (frame). Recall that the
654 // interpreter maintains an on-stack list of locks currently held by
655 // a frame.
656 // I2: If a method attempts to unlock an object that is not held by the
657 // the frame the interpreter throws IMSX.
658 //
659 // Lets say A(), which has provably balanced locking, acquires O and then calls B().
660 // B() doesn't have provably balanced locking so it runs in the interpreter.
661 // Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O
662 // is still locked by A().
663 //
664 // The only other source of unbalanced locking would be JNI. The "Java Native Interface:
665 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
666 // should not be unlocked by "normal" java-level locking and vice-versa. The specification
667 // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
668 // Arguably given that the spec legislates the JNI case as undefined our implementation
669 // could reasonably *avoid* checking owner in fast_unlock().
670 // In the interest of performance we elide m->Owner==Self check in unlock.
671 // A perfectly viable alternative is to elide the owner check except when
672 // Xcheck:jni is enabled.
673
674 void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
675 assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
676 assert(boxReg == rax, "");
677 assert_different_registers(objReg, boxReg, tmpReg);
678
679 Label DONE_LABEL, Stacked, CheckSucc;
680
681 // Critically, the biased locking test must have precedence over
682 // and appear before the (box->dhw == 0) recursive stack-lock test.
683 if (UseBiasedLocking && !UseOptoBiasInlining) {
684 biased_locking_exit(objReg, tmpReg, DONE_LABEL);
685 }
686
687 #if INCLUDE_RTM_OPT
688 if (UseRTMForStackLocks && use_rtm) {
689 assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
690 Label L_regular_unlock;
691 movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
692 andptr(tmpReg, markWord::biased_lock_mask_in_place); // look at 3 lock bits
693 cmpptr(tmpReg, markWord::unlocked_value); // bits = 001 unlocked
694 jccb(Assembler::notEqual, L_regular_unlock); // if !HLE RegularLock
695 xend(); // otherwise end...
696 jmp(DONE_LABEL); // ... and we're done
697 bind(L_regular_unlock);
698 }
699 #endif
700
701 if (LockingMode == LM_LEGACY) {
702 cmpptr(Address(boxReg, 0), (int32_t)NULL_WORD); // Examine the displaced header
703 jcc (Assembler::zero, DONE_LABEL); // 0 indicates recursive stack-lock
704 }
705 movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Examine the object's markword
706 if (LockingMode != LM_MONITOR) {
707 testptr(tmpReg, markWord::monitor_value); // Inflated?
708 jcc(Assembler::zero, Stacked);
709 }
710
711 // It's inflated.
712
713 #if INCLUDE_RTM_OPT
714 if (use_rtm) {
715 Label L_regular_inflated_unlock;
716 int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
717 movptr(boxReg, Address(tmpReg, owner_offset));
718 testptr(boxReg, boxReg);
719 jccb(Assembler::notZero, L_regular_inflated_unlock);
720 xend();
721 jmp(DONE_LABEL);
722 bind(L_regular_inflated_unlock);
723 }
724 #endif
725
726 // Despite our balanced locking property we still check that m->_owner == Self
727 // as java routines or native JNI code called by this thread might
728 // have released the lock.
729 // Refer to the comments in synchronizer.cpp for how we might encode extra
730 // state in _succ so we can avoid fetching EntryList|cxq.
731 //
732 // If there's no contention try a 1-0 exit. That is, exit without
733 // a costly MEMBAR or CAS. See synchronizer.cpp for details on how
734 // we detect and recover from the race that the 1-0 exit admits.
735 //
736 // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
737 // before it STs null into _owner, releasing the lock. Updates
738 // to data protected by the critical section must be visible before
739 // we drop the lock (and thus before any other thread could acquire
740 // the lock and observe the fields protected by the lock).
741 // IA32's memory-model is SPO, so STs are ordered with respect to
742 // each other and there's no need for an explicit barrier (fence).
743 // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
744 #ifndef _LP64
745 get_thread (boxReg);
746
747 // Note that we could employ various encoding schemes to reduce
748 // the number of loads below (currently 4) to just 2 or 3.
749 // Refer to the comments in synchronizer.cpp.
750 // In practice the chain of fetches doesn't seem to impact performance, however.
751 xorptr(boxReg, boxReg);
752 orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
753 jccb (Assembler::notZero, DONE_LABEL);
754 movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
755 orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
756 jccb (Assembler::notZero, DONE_LABEL);
757 movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
758 jmpb (DONE_LABEL);
759
760 // Intention fall-thru into DONE_LABEL
761
762 // DONE_LABEL is a hot target - we'd really like to place it at the
763 // start of cache line by padding with NOPs.
764 // See the AMD and Intel software optimization manuals for the
765 // most efficient "long" NOP encodings.
766 // Unfortunately none of our alignment mechanisms suffice.
767 bind (CheckSucc);
768 #else // _LP64
769 // It's inflated
770 Label LNotRecursive, LSuccess, LGoSlowPath;
771
772 cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
773 jccb(Assembler::equal, LNotRecursive);
774
775 // Recursive inflated unlock
776 decq(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
777 jmpb(LSuccess);
778
779 bind(LNotRecursive);
780 movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
781 orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
782 jccb (Assembler::notZero, CheckSucc);
783 // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
784 movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
785 jmpb (DONE_LABEL);
786
787 // Try to avoid passing control into the slow_path ...
788 bind (CheckSucc);
789
790 // The following optional optimization can be elided if necessary
791 // Effectively: if (succ == null) goto slow path
792 // The code reduces the window for a race, however,
793 // and thus benefits performance.
794 cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
795 jccb (Assembler::zero, LGoSlowPath);
796
797 xorptr(boxReg, boxReg);
798 // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
799 movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
800
801 // Memory barrier/fence
802 // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
803 // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
804 // This is faster on Nehalem and AMD Shanghai/Barcelona.
805 // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
806 // We might also restructure (ST Owner=0;barrier;LD _Succ) to
807 // (mov box,0; xchgq box, &m->Owner; LD _succ) .
808 lock(); addl(Address(rsp, 0), 0);
809
810 cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
811 jccb (Assembler::notZero, LSuccess);
812
813 // Rare inopportune interleaving - race.
814 // The successor vanished in the small window above.
815 // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
816 // We need to ensure progress and succession.
817 // Try to reacquire the lock.
818 // If that fails then the new owner is responsible for succession and this
819 // thread needs to take no further action and can exit via the fast path (success).
820 // If the re-acquire succeeds then pass control into the slow path.
821 // As implemented, this latter mode is horrible because we generated more
822 // coherence traffic on the lock *and* artifically extended the critical section
823 // length while by virtue of passing control into the slow path.
824
825 // box is really RAX -- the following CMPXCHG depends on that binding
826 // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
827 lock();
828 cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
829 // There's no successor so we tried to regrab the lock.
830 // If that didn't work, then another thread grabbed the
831 // lock so we're done (and exit was a success).
832 jccb (Assembler::notEqual, LSuccess);
833 // Intentional fall-through into slow path
834
835 bind (LGoSlowPath);
836 orl (boxReg, 1); // set ICC.ZF=0 to indicate failure
837 jmpb (DONE_LABEL);
838
839 bind (LSuccess);
840 testl (boxReg, 0); // set ICC.ZF=1 to indicate success
841 jmpb (DONE_LABEL);
842
843 #endif
844 if (LockingMode == LM_LEGACY) {
845 bind (Stacked);
846 movptr(tmpReg, Address (boxReg, 0)); // re-fetch
847 lock();
848 cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
849 // Intentional fall-thru into DONE_LABEL
850 }
851
852 bind(DONE_LABEL);
853 }
854
855 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register rax_reg,
856 Register t, Register thread) {
857 assert(LockingMode == LM_LIGHTWEIGHT, "must be");
858 assert(rax_reg == rax, "Used for CAS");
859 assert_different_registers(obj, box, rax_reg, t, thread);
860
861 // Handle inflated monitor.
862 Label inflated;
863 // Finish fast lock successfully. ZF value is irrelevant.
864 Label locked;
865 // Finish fast lock unsuccessfully. MUST jump with ZF == 0
866 Label slow_path;
867
868 if (DiagnoseSyncOnValueBasedClasses != 0) {
869 load_klass(rax_reg, obj, t);
870 movl(rax_reg, Address(rax_reg, Klass::access_flags_offset()));
871 testl(rax_reg, JVM_ACC_IS_VALUE_BASED_CLASS);
872 jcc(Assembler::notZero, slow_path);
873 }
874
875 const Register mark = t;
876
877 { // Lightweight Lock
878
879 Label push;
880
881 const Register top = box;
882
883 // Load the mark.
884 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
885
886 // Prefetch top.
887 movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
888
889 // Check for monitor (0b10).
890 testptr(mark, markWord::monitor_value);
891 jcc(Assembler::notZero, inflated);
892
893 // Check if lock-stack is full.
894 cmpl(top, LockStack::end_offset() - 1);
895 jcc(Assembler::greater, slow_path);
896
897 // Check if recursive.
898 cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
899 jccb(Assembler::equal, push);
900
901 // Try to lock. Transition lock bits 0b01 => 0b00
902 movptr(rax_reg, mark);
903 orptr(rax_reg, markWord::unlocked_value);
904 andptr(mark, ~(int32_t)markWord::unlocked_value);
905 lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
906 jcc(Assembler::notEqual, slow_path);
907
908 bind(push);
909 // After successful lock, push object on lock-stack.
910 movptr(Address(thread, top), obj);
911 addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
912 jmpb(locked);
913 }
914
915 { // Handle inflated monitor.
916 bind(inflated);
917
918 const Register tagged_monitor = mark;
919
920 // CAS owner (null => current thread).
921 xorptr(rax_reg, rax_reg);
922 lock(); cmpxchgptr(thread, Address(tagged_monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
923 jccb(Assembler::equal, locked);
924
925 // Check if recursive.
926 cmpptr(thread, rax_reg);
927 jccb(Assembler::notEqual, slow_path);
928
929 // Recursive.
930 increment(Address(tagged_monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
931 }
932
933 bind(locked);
934 // Set ZF = 1
935 xorl(rax_reg, rax_reg);
936
937 #ifdef ASSERT
938 // Check that locked label is reached with ZF set.
939 Label zf_correct;
940 jccb(Assembler::zero, zf_correct);
941 stop("Fast Lock ZF != 1");
942 #endif
943
944 bind(slow_path);
945 #ifdef ASSERT
946 // Check that slow_path label is reached with ZF not set.
947 jccb(Assembler::notZero, zf_correct);
948 stop("Fast Lock ZF != 0");
949 bind(zf_correct);
950 #endif
951 // C2 uses the value of ZF to determine the continuation.
952 }
953
954 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register reg_rax, Register t, Register thread) {
955 assert(LockingMode == LM_LIGHTWEIGHT, "must be");
956 assert(reg_rax == rax, "Used for CAS");
957 assert_different_registers(obj, reg_rax, t);
958
959 // Handle inflated monitor.
960 Label inflated, inflated_check_lock_stack;
961 // Finish fast unlock successfully. MUST jump with ZF == 1
962 Label unlocked;
963
964 const Register mark = t;
965 const Register top = reg_rax;
966
967 Label dummy;
968 C2FastUnlockLightweightStub* stub = nullptr;
969
970 if (!Compile::current()->output()->in_scratch_emit_size()) {
971 stub = new (Compile::current()->comp_arena()) C2FastUnlockLightweightStub(obj, mark, reg_rax, thread);
972 Compile::current()->output()->add_stub(stub);
973 }
974
975 Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();
976 Label& check_successor = stub == nullptr ? dummy : stub->check_successor();
977
978 { // Lightweight Unlock
979
980 // Load top.
981 movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
982
983 // Prefetch mark.
984 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
985
986 // Check if obj is top of lock-stack.
987 cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
988 // Top of lock stack was not obj. Must be monitor.
989 jcc(Assembler::notEqual, inflated_check_lock_stack);
990
991 // Pop lock-stack.
992 DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
993 subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
994
995 // Check if recursive.
996 cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
997 jcc(Assembler::equal, unlocked);
998
999 // We elide the monitor check, let the CAS fail instead.
1000
1001 // Try to unlock. Transition lock bits 0b00 => 0b01
1002 movptr(reg_rax, mark);
1003 andptr(reg_rax, ~(int32_t)markWord::lock_mask);
1004 orptr(mark, markWord::unlocked_value);
1005 lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
1006 jcc(Assembler::notEqual, push_and_slow_path);
1007 jmp(unlocked);
1008 }
1009
1010
1011 { // Handle inflated monitor.
1012 bind(inflated_check_lock_stack);
1013 #ifdef ASSERT
1014 Label check_done;
1015 subl(top, oopSize);
1016 cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
1017 jcc(Assembler::below, check_done);
1018 cmpptr(obj, Address(thread, top));
1019 jccb(Assembler::notEqual, inflated_check_lock_stack);
1020 stop("Fast Unlock lock on stack");
1021 bind(check_done);
1022 testptr(mark, markWord::monitor_value);
1023 jccb(Assembler::notZero, inflated);
1024 stop("Fast Unlock not monitor");
1025 #endif
1026
1027 bind(inflated);
1028
1029 // mark contains the tagged ObjectMonitor*.
1030 const Register monitor = mark;
1031
1032 #ifndef _LP64
1033 // Check if recursive.
1034 xorptr(reg_rax, reg_rax);
1035 orptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
1036 jcc(Assembler::notZero, check_successor);
1037
1038 // Check if the entry lists are empty.
1039 movptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
1040 orptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
1041 jcc(Assembler::notZero, check_successor);
1042
1043 // Release lock.
1044 movptr(Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
1045 #else // _LP64
1046 Label recursive;
1047
1048 // Check if recursive.
1049 cmpptr(Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
1050 jccb(Assembler::notEqual, recursive);
1051
1052 // Check if the entry lists are empty.
1053 movptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
1054 orptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
1055 jcc(Assembler::notZero, check_successor);
1056
1057 // Release lock.
1058 movptr(Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
1059 jmpb(unlocked);
1060
1061 // Recursive unlock.
1062 bind(recursive);
1063 decrement(Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
1064 xorl(t, t);
1065 #endif
1066 }
1067
1068 bind(unlocked);
1069 if (stub != nullptr) {
1070 bind(stub->unlocked_continuation());
1071 }
1072
1073 #ifdef ASSERT
1074 // Check that unlocked label is reached with ZF set.
1075 Label zf_correct;
1076 jccb(Assembler::zero, zf_correct);
1077 stop("Fast Unlock ZF != 1");
1078 #endif
1079
1080 if (stub != nullptr) {
1081 bind(stub->slow_path_continuation());
1082 }
1083 #ifdef ASSERT
1084 // Check that stub->continuation() label is reached with ZF not set.
1085 jccb(Assembler::notZero, zf_correct);
1086 stop("Fast Unlock ZF != 0");
1087 bind(zf_correct);
1088 #endif
1089 // C2 uses the value of ZF to determine the continuation.
1090 }
1091
1092 //-------------------------------------------------------------------------------------------
1093 // Generic instructions support for use in .ad files C2 code generation
1094
1095 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, Register scr) {
1096 if (dst != src) {
1097 movdqu(dst, src);
1098 }
1099 if (opcode == Op_AbsVD) {
1100 andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), scr);
1101 } else {
1102 assert((opcode == Op_NegVD),"opcode should be Op_NegD");
1103 xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), scr);
1104 }
1105 }
1106
1107 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
1108 if (opcode == Op_AbsVD) {
1109 vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, scr);
1110 } else {
1111 assert((opcode == Op_NegVD),"opcode should be Op_NegD");
1112 vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, scr);
1113 }
1114 }
1115
1116 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, Register scr) {
1117 if (dst != src) {
1118 movdqu(dst, src);
1119 }
1120 if (opcode == Op_AbsVF) {
1121 andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), scr);
1122 } else {
1123 assert((opcode == Op_NegVF),"opcode should be Op_NegF");
1124 xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), scr);
1125 }
1126 }
1127
1128 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
1129 if (opcode == Op_AbsVF) {
1130 vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, scr);
1131 } else {
1132 assert((opcode == Op_NegVF),"opcode should be Op_NegF");
1133 vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, scr);
1134 }
1135 }
1136
1137 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
1138 assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
1139 assert(tmp == xnoreg || elem_bt == T_LONG, "unused");
1140
1141 if (opcode == Op_MinV) {
1142 if (elem_bt == T_BYTE) {
1143 pminsb(dst, src);
1144 } else if (elem_bt == T_SHORT) {
1145 pminsw(dst, src);
1146 } else if (elem_bt == T_INT) {
1147 pminsd(dst, src);
1148 } else {
1149 assert(elem_bt == T_LONG, "required");
1150 assert(tmp == xmm0, "required");
1151 assert_different_registers(dst, src, tmp);
1152 movdqu(xmm0, dst);
1153 pcmpgtq(xmm0, src);
1154 blendvpd(dst, src); // xmm0 as mask
1155 }
1156 } else { // opcode == Op_MaxV
1157 if (elem_bt == T_BYTE) {
1158 pmaxsb(dst, src);
1159 } else if (elem_bt == T_SHORT) {
1160 pmaxsw(dst, src);
1161 } else if (elem_bt == T_INT) {
1162 pmaxsd(dst, src);
1163 } else {
1164 assert(elem_bt == T_LONG, "required");
1165 assert(tmp == xmm0, "required");
1166 assert_different_registers(dst, src, tmp);
1167 movdqu(xmm0, src);
1168 pcmpgtq(xmm0, dst);
1169 blendvpd(dst, src); // xmm0 as mask
1170 }
1171 }
1172 }
1173
1174 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
1175 XMMRegister dst, XMMRegister src1, XMMRegister src2,
1176 int vlen_enc) {
1177 assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
1178
1179 if (opcode == Op_MinV) {
1180 if (elem_bt == T_BYTE) {
1181 vpminsb(dst, src1, src2, vlen_enc);
1182 } else if (elem_bt == T_SHORT) {
1183 vpminsw(dst, src1, src2, vlen_enc);
1184 } else if (elem_bt == T_INT) {
1185 vpminsd(dst, src1, src2, vlen_enc);
1186 } else {
1187 assert(elem_bt == T_LONG, "required");
1188 if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
1189 vpminsq(dst, src1, src2, vlen_enc);
1190 } else {
1191 assert_different_registers(dst, src1, src2);
1192 vpcmpgtq(dst, src1, src2, vlen_enc);
1193 vblendvpd(dst, src1, src2, dst, vlen_enc);
1194 }
1195 }
1196 } else { // opcode == Op_MaxV
1197 if (elem_bt == T_BYTE) {
1198 vpmaxsb(dst, src1, src2, vlen_enc);
1199 } else if (elem_bt == T_SHORT) {
1200 vpmaxsw(dst, src1, src2, vlen_enc);
1201 } else if (elem_bt == T_INT) {
1202 vpmaxsd(dst, src1, src2, vlen_enc);
1203 } else {
1204 assert(elem_bt == T_LONG, "required");
1205 if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
1206 vpmaxsq(dst, src1, src2, vlen_enc);
1207 } else {
1208 assert_different_registers(dst, src1, src2);
1209 vpcmpgtq(dst, src1, src2, vlen_enc);
1210 vblendvpd(dst, src2, src1, dst, vlen_enc);
1211 }
1212 }
1213 }
1214 }
1215
1216 // Float/Double min max
1217
1218 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
1219 XMMRegister dst, XMMRegister a, XMMRegister b,
1220 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
1221 int vlen_enc) {
1222 assert(UseAVX > 0, "required");
1223 assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1224 opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1225 assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1226 assert_different_registers(a, b, tmp, atmp, btmp);
1227
1228 bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1229 bool is_double_word = is_double_word_type(elem_bt);
1230
1231 if (!is_double_word && is_min) {
1232 vblendvps(atmp, a, b, a, vlen_enc);
1233 vblendvps(btmp, b, a, a, vlen_enc);
1234 vminps(tmp, atmp, btmp, vlen_enc);
1235 vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1236 vblendvps(dst, tmp, atmp, btmp, vlen_enc);
1237 } else if (!is_double_word && !is_min) {
1238 vblendvps(btmp, b, a, b, vlen_enc);
1239 vblendvps(atmp, a, b, b, vlen_enc);
1240 vmaxps(tmp, atmp, btmp, vlen_enc);
1241 vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1242 vblendvps(dst, tmp, atmp, btmp, vlen_enc);
1243 } else if (is_double_word && is_min) {
1244 vblendvpd(atmp, a, b, a, vlen_enc);
1245 vblendvpd(btmp, b, a, a, vlen_enc);
1246 vminpd(tmp, atmp, btmp, vlen_enc);
1247 vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1248 vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
1249 } else {
1250 assert(is_double_word && !is_min, "sanity");
1251 vblendvpd(btmp, b, a, b, vlen_enc);
1252 vblendvpd(atmp, a, b, b, vlen_enc);
1253 vmaxpd(tmp, atmp, btmp, vlen_enc);
1254 vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1255 vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
1256 }
1257 }
1258
1259 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
1260 XMMRegister dst, XMMRegister a, XMMRegister b,
1261 KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
1262 int vlen_enc) {
1263 assert(UseAVX > 2, "required");
1264 assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1265 opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1266 assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1267 assert_different_registers(dst, a, b, atmp, btmp);
1268
1269 bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1270 bool is_double_word = is_double_word_type(elem_bt);
1271 bool merge = true;
1272
1273 if (!is_double_word && is_min) {
1274 evpmovd2m(ktmp, a, vlen_enc);
1275 evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1276 evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1277 vminps(dst, atmp, btmp, vlen_enc);
1278 evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1279 evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1280 } else if (!is_double_word && !is_min) {
1281 evpmovd2m(ktmp, b, vlen_enc);
1282 evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1283 evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1284 vmaxps(dst, atmp, btmp, vlen_enc);
1285 evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1286 evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1287 } else if (is_double_word && is_min) {
1288 evpmovq2m(ktmp, a, vlen_enc);
1289 evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1290 evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1291 vminpd(dst, atmp, btmp, vlen_enc);
1292 evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1293 evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1294 } else {
1295 assert(is_double_word && !is_min, "sanity");
1296 evpmovq2m(ktmp, b, vlen_enc);
1297 evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1298 evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1299 vmaxpd(dst, atmp, btmp, vlen_enc);
1300 evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1301 evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1302 }
1303 }
1304
1305 // Float/Double signum
1306 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst,
1307 XMMRegister zero, XMMRegister one,
1308 Register scratch) {
1309 assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");
1310
1311 Label DONE_LABEL;
1312
1313 if (opcode == Op_SignumF) {
1314 assert(UseSSE > 0, "required");
1315 ucomiss(dst, zero);
1316 jcc(Assembler::equal, DONE_LABEL); // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1317 jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN
1318 movflt(dst, one);
1319 jcc(Assembler::above, DONE_LABEL);
1320 xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), scratch);
1321 } else if (opcode == Op_SignumD) {
1322 assert(UseSSE > 1, "required");
1323 ucomisd(dst, zero);
1324 jcc(Assembler::equal, DONE_LABEL); // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
1325 jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN
1326 movdbl(dst, one);
1327 jcc(Assembler::above, DONE_LABEL);
1328 xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), scratch);
1329 }
1330
1331 bind(DONE_LABEL);
1332 }
1333
1334 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1335 if (sign) {
1336 pmovsxbw(dst, src);
1337 } else {
1338 pmovzxbw(dst, src);
1339 }
1340 }
1341
1342 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1343 if (sign) {
1344 vpmovsxbw(dst, src, vector_len);
1345 } else {
1346 vpmovzxbw(dst, src, vector_len);
1347 }
1348 }
1349
1350 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1351 if (sign) {
1352 vpmovsxbd(dst, src, vector_len);
1353 } else {
1354 vpmovzxbd(dst, src, vector_len);
1355 }
1356 }
1357
1358 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1359 if (sign) {
1360 vpmovsxwd(dst, src, vector_len);
1361 } else {
1362 vpmovzxwd(dst, src, vector_len);
1363 }
1364 }
1365
1366 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1367 int shift, int vector_len) {
1368 if (opcode == Op_RotateLeftV) {
1369 if (etype == T_INT) {
1370 evprold(dst, src, shift, vector_len);
1371 } else {
1372 assert(etype == T_LONG, "expected type T_LONG");
1373 evprolq(dst, src, shift, vector_len);
1374 }
1375 } else {
1376 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1377 if (etype == T_INT) {
1378 evprord(dst, src, shift, vector_len);
1379 } else {
1380 assert(etype == T_LONG, "expected type T_LONG");
1381 evprorq(dst, src, shift, vector_len);
1382 }
1383 }
1384 }
1385
1386 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1387 XMMRegister shift, int vector_len) {
1388 if (opcode == Op_RotateLeftV) {
1389 if (etype == T_INT) {
1390 evprolvd(dst, src, shift, vector_len);
1391 } else {
1392 assert(etype == T_LONG, "expected type T_LONG");
1393 evprolvq(dst, src, shift, vector_len);
1394 }
1395 } else {
1396 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1397 if (etype == T_INT) {
1398 evprorvd(dst, src, shift, vector_len);
1399 } else {
1400 assert(etype == T_LONG, "expected type T_LONG");
1401 evprorvq(dst, src, shift, vector_len);
1402 }
1403 }
1404 }
1405
1406 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1407 if (opcode == Op_RShiftVI) {
1408 psrad(dst, shift);
1409 } else if (opcode == Op_LShiftVI) {
1410 pslld(dst, shift);
1411 } else {
1412 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1413 psrld(dst, shift);
1414 }
1415 }
1416
1417 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1418 switch (opcode) {
1419 case Op_RShiftVI: psrad(dst, shift); break;
1420 case Op_LShiftVI: pslld(dst, shift); break;
1421 case Op_URShiftVI: psrld(dst, shift); break;
1422
1423 default: assert(false, "%s", NodeClassNames[opcode]);
1424 }
1425 }
1426
1427 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1428 if (opcode == Op_RShiftVI) {
1429 vpsrad(dst, nds, shift, vector_len);
1430 } else if (opcode == Op_LShiftVI) {
1431 vpslld(dst, nds, shift, vector_len);
1432 } else {
1433 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1434 vpsrld(dst, nds, shift, vector_len);
1435 }
1436 }
1437
1438 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1439 switch (opcode) {
1440 case Op_RShiftVI: vpsrad(dst, src, shift, vlen_enc); break;
1441 case Op_LShiftVI: vpslld(dst, src, shift, vlen_enc); break;
1442 case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1443
1444 default: assert(false, "%s", NodeClassNames[opcode]);
1445 }
1446 }
1447
1448 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1449 switch (opcode) {
1450 case Op_RShiftVB: // fall-through
1451 case Op_RShiftVS: psraw(dst, shift); break;
1452
1453 case Op_LShiftVB: // fall-through
1454 case Op_LShiftVS: psllw(dst, shift); break;
1455
1456 case Op_URShiftVS: // fall-through
1457 case Op_URShiftVB: psrlw(dst, shift); break;
1458
1459 default: assert(false, "%s", NodeClassNames[opcode]);
1460 }
1461 }
1462
1463 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1464 switch (opcode) {
1465 case Op_RShiftVB: // fall-through
1466 case Op_RShiftVS: vpsraw(dst, src, shift, vlen_enc); break;
1467
1468 case Op_LShiftVB: // fall-through
1469 case Op_LShiftVS: vpsllw(dst, src, shift, vlen_enc); break;
1470
1471 case Op_URShiftVS: // fall-through
1472 case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1473
1474 default: assert(false, "%s", NodeClassNames[opcode]);
1475 }
1476 }
1477
1478 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1479 switch (opcode) {
1480 case Op_RShiftVL: psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
1481 case Op_LShiftVL: psllq(dst, shift); break;
1482 case Op_URShiftVL: psrlq(dst, shift); break;
1483
1484 default: assert(false, "%s", NodeClassNames[opcode]);
1485 }
1486 }
1487
1488 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1489 if (opcode == Op_RShiftVL) {
1490 psrlq(dst, shift); // using srl to implement sra on pre-avx512 systems
1491 } else if (opcode == Op_LShiftVL) {
1492 psllq(dst, shift);
1493 } else {
1494 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1495 psrlq(dst, shift);
1496 }
1497 }
1498
1499 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1500 switch (opcode) {
1501 case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1502 case Op_LShiftVL: vpsllq(dst, src, shift, vlen_enc); break;
1503 case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1504
1505 default: assert(false, "%s", NodeClassNames[opcode]);
1506 }
1507 }
1508
1509 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1510 if (opcode == Op_RShiftVL) {
1511 evpsraq(dst, nds, shift, vector_len);
1512 } else if (opcode == Op_LShiftVL) {
1513 vpsllq(dst, nds, shift, vector_len);
1514 } else {
1515 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1516 vpsrlq(dst, nds, shift, vector_len);
1517 }
1518 }
1519
1520 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1521 switch (opcode) {
1522 case Op_RShiftVB: // fall-through
1523 case Op_RShiftVS: // fall-through
1524 case Op_RShiftVI: vpsravd(dst, src, shift, vlen_enc); break;
1525
1526 case Op_LShiftVB: // fall-through
1527 case Op_LShiftVS: // fall-through
1528 case Op_LShiftVI: vpsllvd(dst, src, shift, vlen_enc); break;
1529
1530 case Op_URShiftVB: // fall-through
1531 case Op_URShiftVS: // fall-through
1532 case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1533
1534 default: assert(false, "%s", NodeClassNames[opcode]);
1535 }
1536 }
1537
1538 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1539 switch (opcode) {
1540 case Op_RShiftVB: // fall-through
1541 case Op_RShiftVS: evpsravw(dst, src, shift, vlen_enc); break;
1542
1543 case Op_LShiftVB: // fall-through
1544 case Op_LShiftVS: evpsllvw(dst, src, shift, vlen_enc); break;
1545
1546 case Op_URShiftVB: // fall-through
1547 case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1548
1549 default: assert(false, "%s", NodeClassNames[opcode]);
1550 }
1551 }
1552
1553 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
1554 assert(UseAVX >= 2, "required");
1555 switch (opcode) {
1556 case Op_RShiftVL: {
1557 if (UseAVX > 2) {
1558 assert(tmp == xnoreg, "not used");
1559 if (!VM_Version::supports_avx512vl()) {
1560 vlen_enc = Assembler::AVX_512bit;
1561 }
1562 evpsravq(dst, src, shift, vlen_enc);
1563 } else {
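// AVX2 has no arithmetic right shift for 64-bit lanes; emulate it with logical
// shifts: with m = (sign bit >>> shift), sra(x) == ((x >>> shift) ^ m) - m.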
1564 vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
1565 vpsrlvq(dst, src, shift, vlen_enc);
1566 vpsrlvq(tmp, tmp, shift, vlen_enc);
1567 vpxor(dst, dst, tmp, vlen_enc);
1568 vpsubq(dst, dst, tmp, vlen_enc);
1569 }
1570 break;
1571 }
1572 case Op_LShiftVL: {
1573 assert(tmp == xnoreg, "not used");
1574 vpsllvq(dst, src, shift, vlen_enc);
1575 break;
1576 }
1577 case Op_URShiftVL: {
1578 assert(tmp == xnoreg, "not used");
1579 vpsrlvq(dst, src, shift, vlen_enc);
1580 break;
1581 }
1582 default: assert(false, "%s", NodeClassNames[opcode]);
1583 }
1584 }
1585
1586 // Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst
1587 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch) {
1588 assert(opcode == Op_LShiftVB ||
1589 opcode == Op_RShiftVB ||
1590 opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1591 bool sign = (opcode != Op_URShiftVB);
1592 assert(vector_len == 0, "required");
1593 vextendbd(sign, dst, src, 1);
1594 vpmovzxbd(vtmp, shift, 1);
1595 varshiftd(opcode, dst, dst, vtmp, 1);
1596 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, scratch);
1597 vextracti128_high(vtmp, dst);
1598 vpackusdw(dst, dst, vtmp, 0);
1599 }
1600
1601 // Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst
1602 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch) {
1603 assert(opcode == Op_LShiftVB ||
1604 opcode == Op_RShiftVB ||
1605 opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1606 bool sign = (opcode != Op_URShiftVB);
1607 int ext_vector_len = vector_len + 1;
1608 vextendbw(sign, dst, src, ext_vector_len);
1609 vpmovzxbw(vtmp, shift, ext_vector_len);
1610 varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
1611 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, scratch);
1612 if (vector_len == 0) {
1613 vextracti128_high(vtmp, dst);
1614 vpackuswb(dst, dst, vtmp, vector_len);
1615 } else {
1616 vextracti64x4_high(vtmp, dst);
1617 vpackuswb(dst, dst, vtmp, vector_len);
1618 vpermq(dst, dst, 0xD8, vector_len);
1619 }
1620 }
1621
1622 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1623 switch(typ) {
1624 case T_BYTE:
1625 pinsrb(dst, val, idx);
1626 break;
1627 case T_SHORT:
1628 pinsrw(dst, val, idx);
1629 break;
1630 case T_INT:
1631 pinsrd(dst, val, idx);
1632 break;
1633 case T_LONG:
1634 pinsrq(dst, val, idx);
1635 break;
1636 default:
1637 assert(false,"Should not reach here.");
1638 break;
1639 }
1640 }
1641
1642 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1643 switch(typ) {
1644 case T_BYTE:
1645 vpinsrb(dst, src, val, idx);
1646 break;
1647 case T_SHORT:
1648 vpinsrw(dst, src, val, idx);
1649 break;
1650 case T_INT:
1651 vpinsrd(dst, src, val, idx);
1652 break;
1653 case T_LONG:
1654 vpinsrq(dst, src, val, idx);
1655 break;
1656 default:
1657 assert(false,"Should not reach here.");
1658 break;
1659 }
1660 }
1661
1662 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1663 switch(typ) {
1664 case T_INT:
1665 vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1666 break;
1667 case T_FLOAT:
1668 vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1669 break;
1670 case T_LONG:
1671 vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1672 break;
1673 case T_DOUBLE:
1674 vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1675 break;
1676 default:
1677 assert(false,"Should not reach here.");
1678 break;
1679 }
1680 }
1681
1682 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1683 switch(typ) {
1684 case T_INT:
1685 evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1686 break;
1687 case T_FLOAT:
1688 evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1689 break;
1690 case T_LONG:
1691 evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1692 break;
1693 case T_DOUBLE:
1694 evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1695 break;
1696 default:
1697 assert(false,"Should not reach here.");
1698 break;
1699 }
1700 }
1701
1702 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1703 switch(typ) {
1704 case T_INT:
1705 evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1706 break;
1707 case T_FLOAT:
1708 evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1709 break;
1710 case T_LONG:
1711 evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1712 break;
1713 case T_DOUBLE:
1714 evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1715 break;
1716 default:
1717 assert(false,"Should not reach here.");
1718 break;
1719 }
1720 }
1721
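// Turn a vector of booleans (one 0/1 byte per element) into a vector mask:
// negate the bytes (1 -> 0xFF, 0 -> 0x00), then sign-extend to the element width.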
1722 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
1723 if (vlen_in_bytes <= 16) {
1724 pxor (dst, dst);
1725 psubb(dst, src);
1726 switch (elem_bt) {
1727 case T_BYTE: /* nothing to do */ break;
1728 case T_SHORT: pmovsxbw(dst, dst); break;
1729 case T_INT: pmovsxbd(dst, dst); break;
1730 case T_FLOAT: pmovsxbd(dst, dst); break;
1731 case T_LONG: pmovsxbq(dst, dst); break;
1732 case T_DOUBLE: pmovsxbq(dst, dst); break;
1733
1734 default: assert(false, "%s", type2name(elem_bt));
1735 }
1736 } else {
1737 assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
1738 int vlen_enc = vector_length_encoding(vlen_in_bytes);
1739
1740 vpxor (dst, dst, dst, vlen_enc);
1741 vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);
1742
1743 switch (elem_bt) {
1744 case T_BYTE: /* nothing to do */ break;
1745 case T_SHORT: vpmovsxbw(dst, dst, vlen_enc); break;
1746 case T_INT: vpmovsxbd(dst, dst, vlen_enc); break;
1747 case T_FLOAT: vpmovsxbd(dst, dst, vlen_enc); break;
1748 case T_LONG: vpmovsxbq(dst, dst, vlen_enc); break;
1749 case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
1750
1751 default: assert(false, "%s", type2name(elem_bt));
1752 }
1753 }
1754 }
1755
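// Load the first vlen_in_bytes bytes of the iota index table (0, 1, 2, ...)
// to materialize a constant index vector.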
1756 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, Register scratch, int vlen_in_bytes) {
1757 ExternalAddress addr(StubRoutines::x86::vector_iota_indices());
1758 if (vlen_in_bytes == 4) {
1759 movdl(dst, addr);
1760 } else if (vlen_in_bytes == 8) {
1761 movq(dst, addr);
1762 } else if (vlen_in_bytes == 16) {
1763 movdqu(dst, addr, scratch);
1764 } else if (vlen_in_bytes == 32) {
1765 vmovdqu(dst, addr, scratch);
1766 } else {
1767 assert(vlen_in_bytes == 64, "%d", vlen_in_bytes);
1768 evmovdqub(dst, k0, addr, false /*merge*/, Assembler::AVX_512bit, scratch);
1769 }
1770 }
1771
1772 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
1773
1774 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1775 int vector_len = Assembler::AVX_128bit;
1776
1777 switch (opcode) {
1778 case Op_AndReductionV: pand(dst, src); break;
1779 case Op_OrReductionV: por (dst, src); break;
1780 case Op_XorReductionV: pxor(dst, src); break;
1781 case Op_MinReductionV:
1782 switch (typ) {
1783 case T_BYTE: pminsb(dst, src); break;
1784 case T_SHORT: pminsw(dst, src); break;
1785 case T_INT: pminsd(dst, src); break;
1786 case T_LONG: assert(UseAVX > 2, "required");
1787 vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
1788 default: assert(false, "wrong type");
1789 }
1790 break;
1791 case Op_MaxReductionV:
1792 switch (typ) {
1793 case T_BYTE: pmaxsb(dst, src); break;
1794 case T_SHORT: pmaxsw(dst, src); break;
1795 case T_INT: pmaxsd(dst, src); break;
1796 case T_LONG: assert(UseAVX > 2, "required");
1797 vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
1798 default: assert(false, "wrong type");
1799 }
1800 break;
1801 case Op_AddReductionVF: addss(dst, src); break;
1802 case Op_AddReductionVD: addsd(dst, src); break;
1803 case Op_AddReductionVI:
1804 switch (typ) {
1805 case T_BYTE: paddb(dst, src); break;
1806 case T_SHORT: paddw(dst, src); break;
1807 case T_INT: paddd(dst, src); break;
1808 default: assert(false, "wrong type");
1809 }
1810 break;
1811 case Op_AddReductionVL: paddq(dst, src); break;
1812 case Op_MulReductionVF: mulss(dst, src); break;
1813 case Op_MulReductionVD: mulsd(dst, src); break;
1814 case Op_MulReductionVI:
1815 switch (typ) {
1816 case T_SHORT: pmullw(dst, src); break;
1817 case T_INT: pmulld(dst, src); break;
1818 default: assert(false, "wrong type");
1819 }
1820 break;
1821 case Op_MulReductionVL: assert(UseAVX > 2, "required");
1822 vpmullq(dst, dst, src, vector_len); break;
1823 default: assert(false, "wrong opcode");
1824 }
1825 }
1826
1827 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
1828 int vector_len = Assembler::AVX_256bit;
1829
1830 switch (opcode) {
1831 case Op_AndReductionV: vpand(dst, src1, src2, vector_len); break;
1832 case Op_OrReductionV: vpor (dst, src1, src2, vector_len); break;
1833 case Op_XorReductionV: vpxor(dst, src1, src2, vector_len); break;
1834 case Op_MinReductionV:
1835 switch (typ) {
1836 case T_BYTE: vpminsb(dst, src1, src2, vector_len); break;
1837 case T_SHORT: vpminsw(dst, src1, src2, vector_len); break;
1838 case T_INT: vpminsd(dst, src1, src2, vector_len); break;
1839 case T_LONG: assert(UseAVX > 2, "required");
1840 vpminsq(dst, src1, src2, vector_len); break;
1841 default: assert(false, "wrong type");
1842 }
1843 break;
1844 case Op_MaxReductionV:
1845 switch (typ) {
1846 case T_BYTE: vpmaxsb(dst, src1, src2, vector_len); break;
1847 case T_SHORT: vpmaxsw(dst, src1, src2, vector_len); break;
1848 case T_INT: vpmaxsd(dst, src1, src2, vector_len); break;
1849 case T_LONG: assert(UseAVX > 2, "required");
1850 vpmaxsq(dst, src1, src2, vector_len); break;
1851 default: assert(false, "wrong type");
1852 }
1853 break;
1854 case Op_AddReductionVI:
1855 switch (typ) {
1856 case T_BYTE: vpaddb(dst, src1, src2, vector_len); break;
1857 case T_SHORT: vpaddw(dst, src1, src2, vector_len); break;
1858 case T_INT: vpaddd(dst, src1, src2, vector_len); break;
1859 default: assert(false, "wrong type");
1860 }
1861 break;
1862 case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
1863 case Op_MulReductionVI:
1864 switch (typ) {
1865 case T_SHORT: vpmullw(dst, src1, src2, vector_len); break;
1866 case T_INT: vpmulld(dst, src1, src2, vector_len); break;
1867 default: assert(false, "wrong type");
1868 }
1869 break;
1870 case Op_MulReductionVL: vpmullq(dst, src1, src2, vector_len); break;
1871 default: assert(false, "wrong opcode");
1872 }
1873 }
1874
1875 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1876 XMMRegister dst, XMMRegister src,
1877 XMMRegister vtmp1, XMMRegister vtmp2) {
1878 switch (opcode) {
1879 case Op_AddReductionVF:
1880 case Op_MulReductionVF:
1881 reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1882 break;
1883
1884 case Op_AddReductionVD:
1885 case Op_MulReductionVD:
1886 reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1887 break;
1888
1889 default: assert(false, "wrong opcode");
1890 }
1891 }
1892
1893 void C2_MacroAssembler::reduceB(int opcode, int vlen,
1894 Register dst, Register src1, XMMRegister src2,
1895 XMMRegister vtmp1, XMMRegister vtmp2) {
1896 switch (vlen) {
1897 case 8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1898 case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1899 case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1900 case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1901
1902 default: assert(false, "wrong vector length");
1903 }
1904 }
1905
1906 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
1907 Register dst, Register src1, XMMRegister src2,
1908 XMMRegister vtmp1, XMMRegister vtmp2) {
1909 switch (vlen) {
1910 case 8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1911 case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1912 case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1913 case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1914
1915 default: assert(false, "wrong vector length");
1916 }
1917 }
1918
1919 void C2_MacroAssembler::reduceS(int opcode, int vlen,
1920 Register dst, Register src1, XMMRegister src2,
1921 XMMRegister vtmp1, XMMRegister vtmp2) {
1922 switch (vlen) {
1923 case 4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1924 case 8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1925 case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1926 case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1927
1928 default: assert(false, "wrong vector length");
1929 }
1930 }
1931
1932 void C2_MacroAssembler::reduceI(int opcode, int vlen,
1933 Register dst, Register src1, XMMRegister src2,
1934 XMMRegister vtmp1, XMMRegister vtmp2) {
1935 switch (vlen) {
1936 case 2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1937 case 4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1938 case 8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1939 case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1940
1941 default: assert(false, "wrong vector length");
1942 }
1943 }
1944
1945 #ifdef _LP64
1946 void C2_MacroAssembler::reduceL(int opcode, int vlen,
1947 Register dst, Register src1, XMMRegister src2,
1948 XMMRegister vtmp1, XMMRegister vtmp2) {
1949 switch (vlen) {
1950 case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1951 case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1952 case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1953
1954 default: assert(false, "wrong vector length");
1955 }
1956 }
1957 #endif // _LP64
1958
1959 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1960 switch (vlen) {
1961 case 2:
1962 assert(vtmp2 == xnoreg, "");
1963 reduce2F(opcode, dst, src, vtmp1);
1964 break;
1965 case 4:
1966 assert(vtmp2 == xnoreg, "");
1967 reduce4F(opcode, dst, src, vtmp1);
1968 break;
1969 case 8:
1970 reduce8F(opcode, dst, src, vtmp1, vtmp2);
1971 break;
1972 case 16:
1973 reduce16F(opcode, dst, src, vtmp1, vtmp2);
1974 break;
1975 default: assert(false, "wrong vector length");
1976 }
1977 }
1978
1979 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1980 switch (vlen) {
1981 case 2:
1982 assert(vtmp2 == xnoreg, "");
1983 reduce2D(opcode, dst, src, vtmp1);
1984 break;
1985 case 4:
1986 reduce4D(opcode, dst, src, vtmp1, vtmp2);
1987 break;
1988 case 8:
1989 reduce8D(opcode, dst, src, vtmp1, vtmp2);
1990 break;
1991 default: assert(false, "wrong vector length");
1992 }
1993 }
1994
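// Reduce two ints: additions use a horizontal add (phaddd); other ops shuffle
// element 1 down and combine. The scalar accumulator src1 is folded in last.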
1995 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1996 if (opcode == Op_AddReductionVI) {
1997 if (vtmp1 != src2) {
1998 movdqu(vtmp1, src2);
1999 }
2000 phaddd(vtmp1, vtmp1);
2001 } else {
2002 pshufd(vtmp1, src2, 0x1);
2003 reduce_operation_128(T_INT, opcode, vtmp1, src2);
2004 }
2005 movdl(vtmp2, src1);
2006 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2007 movdl(dst, vtmp1);
2008 }
2009
2010 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2011 if (opcode == Op_AddReductionVI) {
2012 if (vtmp1 != src2) {
2013 movdqu(vtmp1, src2);
2014 }
2015 phaddd(vtmp1, src2);
2016 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2017 } else {
2018 pshufd(vtmp2, src2, 0xE);
2019 reduce_operation_128(T_INT, opcode, vtmp2, src2);
2020 reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2021 }
2022 }
2023
2024 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2025 if (opcode == Op_AddReductionVI) {
2026 vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
2027 vextracti128_high(vtmp2, vtmp1);
2028 vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
2029 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2030 } else {
2031 vextracti128_high(vtmp1, src2);
2032 reduce_operation_128(T_INT, opcode, vtmp1, src2);
2033 reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2034 }
2035 }
2036
2037 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2038 vextracti64x4_high(vtmp2, src2);
2039 reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
2040 reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2041 }
2042
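// Byte reductions repeatedly fold the upper half onto the lower half
// (pshufd/psrldq + combine), fold in the scalar accumulator, and finally
// sign-extend the low byte of the result into dst.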
2043 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2044 pshufd(vtmp2, src2, 0x1);
2045 reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2046 movdqu(vtmp1, vtmp2);
2047 psrldq(vtmp1, 2);
2048 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2049 movdqu(vtmp2, vtmp1);
2050 psrldq(vtmp2, 1);
2051 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
2052 movdl(vtmp2, src1);
2053 pmovsxbd(vtmp1, vtmp1);
2054 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2055 pextrb(dst, vtmp1, 0x0);
2056 movsbl(dst, dst);
2057 }
2058
2059 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2060 pshufd(vtmp1, src2, 0xE);
2061 reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
2062 reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2063 }
2064
2065 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2066 vextracti128_high(vtmp2, src2);
2067 reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
2068 reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2069 }
2070
2071 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2072 vextracti64x4_high(vtmp1, src2);
2073 reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
2074 reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2075 }
2076
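// Byte multiply reductions: x86 has no packed byte multiply, so widen the
// bytes to shorts and reuse the short reduction.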
2077 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2078 pmovsxbw(vtmp2, src2);
2079 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2080 }
2081
2082 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2083 if (UseAVX > 1) {
2084 int vector_len = Assembler::AVX_256bit;
2085 vpmovsxbw(vtmp1, src2, vector_len);
2086 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2087 } else {
2088 pmovsxbw(vtmp2, src2);
2089 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2090 pshufd(vtmp2, src2, 0xE); // move the upper 8 bytes down to the low half
2091 pmovsxbw(vtmp2, vtmp2); // and widen them to shorts
2092 reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2093 }
2094 }
2095
2096 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2097 if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
2098 int vector_len = Assembler::AVX_512bit;
2099 vpmovsxbw(vtmp1, src2, vector_len);
2100 reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2101 } else {
2102 assert(UseAVX >= 2,"Should not reach here.");
2103 mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
2104 vextracti128_high(vtmp2, src2);
2105 mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2106 }
2107 }
2108
2109 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2110 mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
2111 vextracti64x4_high(vtmp2, src2);
2112 mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
2113 }
2114
2115 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2116 if (opcode == Op_AddReductionVI) {
2117 if (vtmp1 != src2) {
2118 movdqu(vtmp1, src2);
2119 }
2120 phaddw(vtmp1, vtmp1);
2121 phaddw(vtmp1, vtmp1);
2122 } else {
2123 pshufd(vtmp2, src2, 0x1);
2124 reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2125 movdqu(vtmp1, vtmp2);
2126 psrldq(vtmp1, 2);
2127 reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
2128 }
2129 movdl(vtmp2, src1);
2130 pmovsxwd(vtmp1, vtmp1);
2131 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
2132 pextrw(dst, vtmp1, 0x0);
2133 movswl(dst, dst);
2134 }
2135
2136 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2137 if (opcode == Op_AddReductionVI) {
2138 if (vtmp1 != src2) {
2139 movdqu(vtmp1, src2);
2140 }
2141 phaddw(vtmp1, src2);
2142 } else {
2143 pshufd(vtmp1, src2, 0xE);
2144 reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
2145 }
2146 reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2147 }
2148
2149 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2150 if (opcode == Op_AddReductionVI) {
2151 int vector_len = Assembler::AVX_256bit;
2152 vphaddw(vtmp2, src2, src2, vector_len);
2153 vpermq(vtmp2, vtmp2, 0xD8, vector_len);
2154 } else {
2155 vextracti128_high(vtmp2, src2);
2156 reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
2157 }
2158 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2159 }
2160
2161 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2162 int vector_len = Assembler::AVX_256bit;
2163 vextracti64x4_high(vtmp1, src2);
2164 reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
2165 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2166 }
2167
2168 #ifdef _LP64
2169 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2170 pshufd(vtmp2, src2, 0xE);
2171 reduce_operation_128(T_LONG, opcode, vtmp2, src2);
2172 movdq(vtmp1, src1);
2173 reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
2174 movdq(dst, vtmp1);
2175 }
2176
2177 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2178 vextracti128_high(vtmp1, src2);
2179 reduce_operation_128(T_LONG, opcode, vtmp1, src2);
2180 reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
2181 }
2182
2183 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
2184 vextracti64x4_high(vtmp2, src2);
2185 reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
2186 reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
2187 }
2188
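// Build an opmask with the low 'len' bits set: start from all-ones and
// clear the bits above 'len' with bzhi.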
2189 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
2190 assert(ArrayOperationPartialInlineSize > 0 && ArrayOperationPartialInlineSize <= 64, "invalid");
2191 mov64(temp, -1L);
2192 bzhiq(temp, temp, len);
2193 kmovql(dst, temp);
2194 }
2195 #endif // _LP64
2196
2197 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2198 reduce_operation_128(T_FLOAT, opcode, dst, src);
2199 pshufd(vtmp, src, 0x1);
2200 reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2201 }
2202
2203 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2204 reduce2F(opcode, dst, src, vtmp);
2205 pshufd(vtmp, src, 0x2);
2206 reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2207 pshufd(vtmp, src, 0x3);
2208 reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
2209 }
2210
2211 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2212 reduce4F(opcode, dst, src, vtmp2);
2213 vextractf128_high(vtmp2, src);
2214 reduce4F(opcode, dst, vtmp2, vtmp1);
2215 }
2216
2217 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2218 reduce8F(opcode, dst, src, vtmp1, vtmp2);
2219 vextracti64x4_high(vtmp1, src);
2220 reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
2221 }
2222
2223 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
2224 reduce_operation_128(T_DOUBLE, opcode, dst, src);
2225 pshufd(vtmp, src, 0xE);
2226 reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
2227 }
2228
2229 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2230 reduce2D(opcode, dst, src, vtmp2);
2231 vextractf128_high(vtmp2, src);
2232 reduce2D(opcode, dst, vtmp2, vtmp1);
2233 }
2234
2235 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
2236 reduce4D(opcode, dst, src, vtmp1, vtmp2);
2237 vextracti64x4_high(vtmp1, src);
2238 reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
2239 }
2240
2241 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, int vector_len) {
2242 MacroAssembler::evmovdqu(type, kmask, dst, src, vector_len);
2243 }
2244
2245 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, int vector_len) {
2246 MacroAssembler::evmovdqu(type, kmask, dst, src, vector_len);
2247 }
2248
2249
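// Float min/max reductions halve the vector log2(vlen) times: extract the upper
// half (or permute within a 128-bit lane) and combine via vminmax_fp. When dst
// already holds a partial result (is_dst_valid), it is folded in at the end.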
2250 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
2251 XMMRegister dst, XMMRegister src,
2252 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2253 XMMRegister xmm_0, XMMRegister xmm_1) {
2254 int permconst[] = {1, 14};
2255 XMMRegister wsrc = src;
2256 XMMRegister wdst = xmm_0;
2257 XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2258
2259 int vlen_enc = Assembler::AVX_128bit;
2260 if (vlen == 16) {
2261 vlen_enc = Assembler::AVX_256bit;
2262 }
2263
2264 for (int i = log2(vlen) - 1; i >=0; i--) {
2265 if (i == 0 && !is_dst_valid) {
2266 wdst = dst;
2267 }
2268 if (i == 3) {
2269 vextracti64x4_high(wtmp, wsrc);
2270 } else if (i == 2) {
2271 vextracti128_high(wtmp, wsrc);
2272 } else { // i = [0,1]
2273 vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
2274 }
2275 vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2276 wsrc = wdst;
2277 vlen_enc = Assembler::AVX_128bit;
2278 }
2279 if (is_dst_valid) {
2280 vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2281 }
2282 }
2283
2284 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
2285 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2286 XMMRegister xmm_0, XMMRegister xmm_1) {
2287 XMMRegister wsrc = src;
2288 XMMRegister wdst = xmm_0;
2289 XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2290 int vlen_enc = Assembler::AVX_128bit;
2291 if (vlen == 8) {
2292 vlen_enc = Assembler::AVX_256bit;
2293 }
2294 for (int i = log2(vlen) - 1; i >=0; i--) {
2295 if (i == 0 && !is_dst_valid) {
2296 wdst = dst;
2297 }
2298 if (i == 1) {
2299 vextracti128_high(wtmp, wsrc);
2300 } else if (i == 2) {
2301 vextracti64x4_high(wtmp, wsrc);
2302 } else {
2303 assert(i == 0, "%d", i);
2304 vpermilpd(wtmp, wsrc, 1, vlen_enc);
2305 }
2306 vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2307 wsrc = wdst;
2308 vlen_enc = Assembler::AVX_128bit;
2309 }
2310 if (is_dst_valid) {
2311 vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2312 }
2313 }
2314
2315 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2316 switch (bt) {
2317 case T_BYTE: pextrb(dst, src, idx); break;
2318 case T_SHORT: pextrw(dst, src, idx); break;
2319 case T_INT: pextrd(dst, src, idx); break;
2320 case T_LONG: pextrq(dst, src, idx); break;
2321
2322 default:
2323 assert(false,"Should not reach here.");
2324 break;
2325 }
2326 }
2327
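// Return a register holding the 128-bit lane that contains 'elemindex':
// lane 0 is src itself; higher lanes are extracted into dst first.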
2328 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2329 int esize = type2aelembytes(typ);
2330 int elem_per_lane = 16/esize;
2331 int lane = elemindex / elem_per_lane;
2332 int eindex = elemindex % elem_per_lane;
2333
2334 if (lane >= 2) {
2335 assert(UseAVX > 2, "required");
2336 vextractf32x4(dst, src, lane & 3);
2337 return dst;
2338 } else if (lane > 0) {
2339 assert(UseAVX > 0, "required");
2340 vextractf128(dst, src, lane);
2341 return dst;
2342 } else {
2343 return src;
2344 }
2345 }
2346
2347 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) {
2348 if (typ == T_BYTE) {
2349 movsbl(dst, dst);
2350 } else if (typ == T_SHORT) {
2351 movswl(dst, dst);
2352 }
2353 }
2354
2355 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
2356 int esize = type2aelembytes(typ);
2357 int elem_per_lane = 16/esize;
2358 int eindex = elemindex % elem_per_lane;
2359 assert(is_integral_type(typ),"required");
2360
2361 if (eindex == 0) {
2362 if (typ == T_LONG) {
2363 movq(dst, src);
2364 } else {
2365 movdl(dst, src);
2366 movsxl(typ, dst);
2367 }
2368 } else {
2369 extract(typ, dst, src, eindex);
2370 movsxl(typ, dst);
2371 }
2372 }
2373
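// Extract a float/double element (within its 128-bit lane) into the low slot of
// dst and zero the unused upper bits (movq for doubles, a 32-bit mask for floats).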
2374 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, Register tmp, XMMRegister vtmp) {
2375 int esize = type2aelembytes(typ);
2376 int elem_per_lane = 16/esize;
2377 int eindex = elemindex % elem_per_lane;
2378 assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
2379
2380 if (eindex == 0) {
2381 movq(dst, src);
2382 } else {
2383 if (typ == T_FLOAT) {
2384 if (UseAVX == 0) {
2385 movdqu(dst, src);
2386 pshufps(dst, dst, eindex);
2387 } else {
2388 vpshufps(dst, src, src, eindex, Assembler::AVX_128bit);
2389 }
2390 } else {
2391 if (UseAVX == 0) {
2392 movdqu(dst, src);
2393 psrldq(dst, eindex*esize);
2394 } else {
2395 vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
2396 }
2397 movq(dst, dst);
2398 }
2399 }
2400 // Zero upper bits
2401 if (typ == T_FLOAT) {
2402 if (UseAVX == 0) {
2403 assert((vtmp != xnoreg) && (tmp != noreg), "required.");
2404 movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), tmp);
2405 pand(dst, vtmp);
2406 } else {
2407 assert((tmp != noreg), "required.");
2408 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, tmp);
2409 }
2410 }
2411 }
2412
2413 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2414 switch(typ) {
2415 case T_BYTE:
2416 case T_BOOLEAN:
2417 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2418 break;
2419 case T_SHORT:
2420 case T_CHAR:
2421 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2422 break;
2423 case T_INT:
2424 case T_FLOAT:
2425 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2426 break;
2427 case T_LONG:
2428 case T_DOUBLE:
2429 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2430 break;
2431 default:
2432 assert(false,"Should not reach here.");
2433 break;
2434 }
2435 }
2436
2437 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral adr, int comparison, int vector_len, Register scratch) {
2438 switch(typ) {
2439 case T_BOOLEAN:
2440 case T_BYTE:
2441 evpcmpb(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch);
2442 break;
2443 case T_CHAR:
2444 case T_SHORT:
2445 evpcmpw(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch);
2446 break;
2447 case T_INT:
2448 case T_FLOAT:
2449 evpcmpd(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch);
2450 break;
2451 case T_LONG:
2452 case T_DOUBLE:
2453 evpcmpq(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch);
2454 break;
2455 default:
2456 assert(false,"Should not reach here.");
2457 break;
2458 }
2459 }
2460
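// Unsigned compare without AVX512: zero-extend both operands to the next wider
// element type, do the (now equivalent) signed compare at that width, and
// narrow the result back to the original element size.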
2461 void C2_MacroAssembler::vpcmpu(BasicType typ, XMMRegister dst, XMMRegister src1, XMMRegister src2, ComparisonPredicate comparison,
2462 int vlen_in_bytes, XMMRegister vtmp1, XMMRegister vtmp2, Register scratch) {
2463 int vlen_enc = vector_length_encoding(vlen_in_bytes*2);
2464 switch (typ) {
2465 case T_BYTE:
2466 vpmovzxbw(vtmp1, src1, vlen_enc);
2467 vpmovzxbw(vtmp2, src2, vlen_enc);
2468 vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::W, vlen_enc, scratch);
2469 vpacksswb(dst, dst, dst, vlen_enc);
2470 break;
2471 case T_SHORT:
2472 vpmovzxwd(vtmp1, src1, vlen_enc);
2473 vpmovzxwd(vtmp2, src2, vlen_enc);
2474 vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::D, vlen_enc, scratch);
2475 vpackssdw(dst, dst, dst, vlen_enc);
2476 break;
2477 case T_INT:
2478 vpmovzxdq(vtmp1, src1, vlen_enc);
2479 vpmovzxdq(vtmp2, src2, vlen_enc);
2480 vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::Q, vlen_enc, scratch);
2481 vpermilps(dst, dst, 8, vlen_enc);
2482 break;
2483 default:
2484 assert(false, "Should not reach here");
2485 }
2486 if (vlen_in_bytes == 16) {
2487 vpermpd(dst, dst, 0x8, vlen_enc);
2488 }
2489 }
2490
2491 void C2_MacroAssembler::vpcmpu32(BasicType typ, XMMRegister dst, XMMRegister src1, XMMRegister src2, ComparisonPredicate comparison, int vlen_in_bytes,
2492 XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3, Register scratch) {
2493 int vlen_enc = vector_length_encoding(vlen_in_bytes);
2494 switch (typ) {
2495 case T_BYTE:
2496 vpmovzxbw(vtmp1, src1, vlen_enc);
2497 vpmovzxbw(vtmp2, src2, vlen_enc);
2498 vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::W, vlen_enc, scratch);
2499 vextracti128(vtmp1, src1, 1);
2500 vextracti128(vtmp2, src2, 1);
2501 vpmovzxbw(vtmp1, vtmp1, vlen_enc);
2502 vpmovzxbw(vtmp2, vtmp2, vlen_enc);
2503 vpcmpCCW(vtmp3, vtmp1, vtmp2, comparison, Assembler::W, vlen_enc, scratch);
2504 vpacksswb(dst, dst, vtmp3, vlen_enc);
2505 vpermpd(dst, dst, 0xd8, vlen_enc);
2506 break;
2507 case T_SHORT:
2508 vpmovzxwd(vtmp1, src1, vlen_enc);
2509 vpmovzxwd(vtmp2, src2, vlen_enc);
2510 vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::D, vlen_enc, scratch);
2511 vextracti128(vtmp1, src1, 1);
2512 vextracti128(vtmp2, src2, 1);
2513 vpmovzxwd(vtmp1, vtmp1, vlen_enc);
2514 vpmovzxwd(vtmp2, vtmp2, vlen_enc);
2515 vpcmpCCW(vtmp3, vtmp1, vtmp2, comparison, Assembler::D, vlen_enc, scratch);
2516 vpackssdw(dst, dst, vtmp3, vlen_enc);
2517 vpermpd(dst, dst, 0xd8, vlen_enc);
2518 break;
2519 case T_INT:
2520 vpmovzxdq(vtmp1, src1, vlen_enc);
2521 vpmovzxdq(vtmp2, src2, vlen_enc);
2522 vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::Q, vlen_enc, scratch);
2523 vpshufd(dst, dst, 8, vlen_enc);
2524 vpermq(dst, dst, 8, vlen_enc);
2525 vextracti128(vtmp1, src1, 1);
2526 vextracti128(vtmp2, src2, 1);
2527 vpmovzxdq(vtmp1, vtmp1, vlen_enc);
2528 vpmovzxdq(vtmp2, vtmp2, vlen_enc);
2529 vpcmpCCW(vtmp3, vtmp1, vtmp2, comparison, Assembler::Q, vlen_enc, scratch);
2530 vpshufd(vtmp3, vtmp3, 8, vlen_enc);
2531 vpermq(vtmp3, vtmp3, 0x80, vlen_enc);
2532 vpblendd(dst, dst, vtmp3, 0xf0, vlen_enc);
2533 break;
2534 default:
2535 assert(false, "Should not reach here");
2536 }
2537 }
2538
2539 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2540 switch(typ) {
2541 case T_BYTE:
2542 evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2543 break;
2544 case T_SHORT:
2545 evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2546 break;
2547 case T_INT:
2548 case T_FLOAT:
2549 evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2550 break;
2551 case T_LONG:
2552 case T_DOUBLE:
2553 evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2554 break;
2555 default:
2556 assert(false,"Should not reach here.");
2557 break;
2558 }
2559 }
2560
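// Emit a vector test of src1 against src2. Vectors shorter than 16 bytes are
// first broadcast to fill a 128-bit lane so ptest does not see stale upper bits;
// 64-byte vectors use an AVX-512 byte compare into an opmask instead of ptest.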
2561 void C2_MacroAssembler::vectortest(int bt, int vlen, XMMRegister src1, XMMRegister src2,
2562 XMMRegister vtmp1, XMMRegister vtmp2, KRegister mask) {
2563 switch(vlen) {
2564 case 4:
2565 assert(vtmp1 != xnoreg, "required.");
2566 // Broadcast lower 32 bits to 128 bits before ptest
2567 pshufd(vtmp1, src1, 0x0);
2568 if (bt == BoolTest::overflow) {
2569 assert(vtmp2 != xnoreg, "required.");
2570 pshufd(vtmp2, src2, 0x0);
2571 } else {
2572 assert(vtmp2 == xnoreg, "required.");
2573 vtmp2 = src2;
2574 }
2575 ptest(vtmp1, vtmp2);
2576 break;
2577 case 8:
2578 assert(vtmp1 != xnoreg, "required.");
2579 // Broadcast lower 64 bits to 128 bits before ptest
2580 pshufd(vtmp1, src1, 0x4);
2581 if (bt == BoolTest::overflow) {
2582 assert(vtmp2 != xnoreg, "required.");
2583 pshufd(vtmp2, src2, 0x4);
2584 } else {
2585 assert(vtmp2 == xnoreg, "required.");
2586 vtmp2 = src2;
2587 }
2588 ptest(vtmp1, vtmp2);
2589 break;
2590 case 16:
2591 assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required.");
2592 ptest(src1, src2);
2593 break;
2594 case 32:
2595 assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required.");
2596 vptest(src1, src2, Assembler::AVX_256bit);
2597 break;
2598 case 64:
2599 {
2600 assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required.");
2601 evpcmpeqb(mask, src1, src2, Assembler::AVX_512bit);
2602 if (bt == BoolTest::ne) {
2603 ktestql(mask, mask);
2604 } else {
2605 assert(bt == BoolTest::overflow, "required");
2606 kortestql(mask, mask);
2607 }
2608 }
2609 break;
2610 default:
2611 assert(false,"Should not reach here.");
2612 break;
2613 }
2614 }
2615
2616 //-------------------------------------------------------------------------------------------
2617
2618 // IndexOf for constant substrings with size >= 8 chars
2619 // which don't need to be loaded through the stack.
2620 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
2621 Register cnt1, Register cnt2,
2622 int int_cnt2, Register result,
2623 XMMRegister vec, Register tmp,
2624 int ae) {
2625 ShortBranchVerifier sbv(this);
2626 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2627 assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2628
2629 // This method uses the pcmpestri instruction with bound registers
2630 // inputs:
2631 // xmm - substring
2632 // rax - substring length (elements count)
2633 // mem - scanned string
2634 // rdx - string length (elements count)
2635 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2636 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2637 // outputs:
2638 // rcx - matched index in string
2639 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2640 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2641 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2642 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2643 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2644
2645 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
2646 RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
2647 MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
2648
2649 // Note, inline_string_indexOf() generates checks:
2650 // if (substr.count > string.count) return -1;
2651 // if (substr.count == 0) return 0;
2652 assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
2653
2654 // Load substring.
2655 if (ae == StrIntrinsicNode::UL) {
2656 pmovzxbw(vec, Address(str2, 0));
2657 } else {
2658 movdqu(vec, Address(str2, 0));
2659 }
2660 movl(cnt2, int_cnt2);
2661 movptr(result, str1); // string addr
2662
2663 if (int_cnt2 > stride) {
2664 jmpb(SCAN_TO_SUBSTR);
2665
2666 // Reload substr for rescan; this code
2667 // is executed only for large substrings (> 8 chars)
2668 bind(RELOAD_SUBSTR);
2669 if (ae == StrIntrinsicNode::UL) {
2670 pmovzxbw(vec, Address(str2, 0));
2671 } else {
2672 movdqu(vec, Address(str2, 0));
2673 }
2674 negptr(cnt2); // Jumped here with negative cnt2, convert to positive
2675
2676 bind(RELOAD_STR);
2677 // We came here after the beginning of the substring was
2678 // matched but the rest of it was not, so we need to search
2679 // again. Start from the next element after the previous match.
2680
2681 // cnt2 is the number of remaining substring elements and
2682 // cnt1 is the number of remaining string elements when cmp failed.
2683 // Restored cnt1 = cnt1 - cnt2 + int_cnt2
2684 subl(cnt1, cnt2);
2685 addl(cnt1, int_cnt2);
2686 movl(cnt2, int_cnt2); // Now restore cnt2
2687
2688 decrementl(cnt1); // Shift to next element
2689 cmpl(cnt1, cnt2);
2690 jcc(Assembler::negative, RET_NOT_FOUND); // Left less than substring
2691
2692 addptr(result, (1<<scale1));
2693
2694 } // (int_cnt2 > 8)
2695
2696 // Scan string for start of substr in 16-byte vectors
2697 bind(SCAN_TO_SUBSTR);
2698 pcmpestri(vec, Address(result, 0), mode);
2699 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1
2700 subl(cnt1, stride);
2701 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2702 cmpl(cnt1, cnt2);
2703 jccb(Assembler::negative, RET_NOT_FOUND); // Left less than substring
2704 addptr(result, 16);
2705 jmpb(SCAN_TO_SUBSTR);
2706
2707 // Found a potential substr
2708 bind(FOUND_CANDIDATE);
2709 // Matched whole vector if first element matched (tmp(rcx) == 0).
2710 if (int_cnt2 == stride) {
2711 jccb(Assembler::overflow, RET_FOUND); // OF == 1
2712 } else { // int_cnt2 > 8
2713 jccb(Assembler::overflow, FOUND_SUBSTR);
2714 }
2715 // After pcmpestri tmp(rcx) contains matched element index
2716 // Compute start addr of substr
2717 lea(result, Address(result, tmp, scale1));
2718
2719 // Make sure string is still long enough
2720 subl(cnt1, tmp);
2721 cmpl(cnt1, cnt2);
2722 if (int_cnt2 == stride) {
2723 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2724 } else { // int_cnt2 > 8
2725 jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
2726 }
2727 // Left less than substring.
2728
2729 bind(RET_NOT_FOUND);
2730 movl(result, -1);
2731 jmp(EXIT);
2732
2733 if (int_cnt2 > stride) {
2734 // This code is optimized for the case when whole substring
2735 // is matched if its head is matched.
2736 bind(MATCH_SUBSTR_HEAD);
2737 pcmpestri(vec, Address(result, 0), mode);
2738 // Reload only the string if it does not match
2739 jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
2740
2741 Label CONT_SCAN_SUBSTR;
2742 // Compare the rest of substring (> 8 chars).
2743 bind(FOUND_SUBSTR);
2744 // First 8 chars are already matched.
2745 negptr(cnt2);
2746 addptr(cnt2, stride);
2747
2748 bind(SCAN_SUBSTR);
2749 subl(cnt1, stride);
2750 cmpl(cnt2, -stride); // Do not read beyond substring
2751 jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
2752 // Back-up strings to avoid reading beyond substring:
2753 // cnt1 = cnt1 - cnt2 + 8
2754 addl(cnt1, cnt2); // cnt2 is negative
2755 addl(cnt1, stride);
2756 movl(cnt2, stride); negptr(cnt2);
2757 bind(CONT_SCAN_SUBSTR);
2758 if (int_cnt2 < (int)G) {
2759 int tail_off1 = int_cnt2<<scale1;
2760 int tail_off2 = int_cnt2<<scale2;
2761 if (ae == StrIntrinsicNode::UL) {
2762 pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
2763 } else {
2764 movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
2765 }
2766 pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
2767 } else {
2768 // calculate index in register to avoid integer overflow (int_cnt2*2)
2769 movl(tmp, int_cnt2);
2770 addptr(tmp, cnt2);
2771 if (ae == StrIntrinsicNode::UL) {
2772 pmovzxbw(vec, Address(str2, tmp, scale2, 0));
2773 } else {
2774 movdqu(vec, Address(str2, tmp, scale2, 0));
2775 }
2776 pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
2777 }
2778 // Need to reload string pointers if the whole vector did not match
2779 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2780 addptr(cnt2, stride);
2781 jcc(Assembler::negative, SCAN_SUBSTR);
2782 // Fall through if found full substring
2783
2784 } // (int_cnt2 > 8)
2785
2786 bind(RET_FOUND);
2787 // Found result if we matched full small substring.
2788 // Compute substr offset
2789 subptr(result, str1);
2790 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2791 shrl(result, 1); // index
2792 }
2793 bind(EXIT);
2794
2795 } // string_indexofC8
2796
2797 // Small strings are loaded through the stack if they cross a page boundary.
2798 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
2799 Register cnt1, Register cnt2,
2800 int int_cnt2, Register result,
2801 XMMRegister vec, Register tmp,
2802 int ae) {
2803 ShortBranchVerifier sbv(this);
2804 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2805 assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2806
2807 //
2808 // int_cnt2 is length of small (< 8 chars) constant substring
2809 // or (-1) for non constant substring in which case its length
2810 // is in cnt2 register.
2811 //
2812 // Note, inline_string_indexOf() generates checks:
2813 // if (substr.count > string.count) return -1;
2814 // if (substr.count == 0) return 0;
2815 //
2816 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2817 assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
2818 // This method uses the pcmpestri instruction with bound registers
2819 // inputs:
2820 // xmm - substring
2821 // rax - substring length (elements count)
2822 // mem - scanned string
2823 // rdx - string length (elements count)
2824 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2825 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2826 // outputs:
2827 // rcx - matched index in string
2828 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2829 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2830 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2831 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2832
2833 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
2834 RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
2835 FOUND_CANDIDATE;
2836
2837 { //========================================================
2838 // We don't know where these strings are located
2839 // and we can't read beyond them. Load them through stack.
2840 Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
2841
2842 movptr(tmp, rsp); // save old SP
2843
2844 if (int_cnt2 > 0) { // small (< 8 chars) constant substring
2845 if (int_cnt2 == (1>>scale2)) { // One byte
2846 assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
2847 load_unsigned_byte(result, Address(str2, 0));
2848 movdl(vec, result); // move 32 bits
2849 } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) { // Three bytes
2850 // Not enough header space in 32-bit VM: 12+3 = 15.
2851 movl(result, Address(str2, -1));
2852 shrl(result, 8);
2853 movdl(vec, result); // move 32 bits
2854 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) { // One char
2855 load_unsigned_short(result, Address(str2, 0));
2856 movdl(vec, result); // move 32 bits
2857 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
2858 movdl(vec, Address(str2, 0)); // move 32 bits
2859 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
2860 movq(vec, Address(str2, 0)); // move 64 bits
2861 } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 = {2, ..., 7})
2862 // Array header size is 12 bytes in 32-bit VM
2863 // + 6 bytes for 3 chars == 18 bytes,
2864 // enough space to load vec and shift.
2865 assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
2866 if (ae == StrIntrinsicNode::UL) {
2867 int tail_off = int_cnt2-8;
2868 pmovzxbw(vec, Address(str2, tail_off));
2869 psrldq(vec, -2*tail_off);
2870 }
2871 else {
2872 int tail_off = int_cnt2*(1<<scale2);
2873 movdqu(vec, Address(str2, tail_off-16));
2874 psrldq(vec, 16-tail_off);
2875 }
2876 }
2877 } else { // not constant substring
2878 cmpl(cnt2, stride);
2879 jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
2880
2881 // We can read beyond the string if str+16 does not cross a page boundary
2882 // since heaps are aligned and mapped by pages.
2883 assert(os::vm_page_size() < (int)G, "default page should be small");
2884 movl(result, str2); // We need only low 32 bits
2885 andl(result, (os::vm_page_size()-1));
2886 cmpl(result, (os::vm_page_size()-16));
2887 jccb(Assembler::belowEqual, CHECK_STR);
2888
2889 // Move small strings to stack to allow loading 16 bytes into vec.
2890 subptr(rsp, 16);
2891 int stk_offset = wordSize-(1<<scale2);
2892 push(cnt2);
2893
2894 bind(COPY_SUBSTR);
2895 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
2896 load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
2897 movb(Address(rsp, cnt2, scale2, stk_offset), result);
2898 } else if (ae == StrIntrinsicNode::UU) {
2899 load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
2900 movw(Address(rsp, cnt2, scale2, stk_offset), result);
2901 }
2902 decrement(cnt2);
2903 jccb(Assembler::notZero, COPY_SUBSTR);
2904
2905 pop(cnt2);
2906 movptr(str2, rsp); // New substring address
2907 } // non constant
2908
2909 bind(CHECK_STR);
2910 cmpl(cnt1, stride);
2911 jccb(Assembler::aboveEqual, BIG_STRINGS);
2912
2913 // Check cross page boundary.
2914 movl(result, str1); // We need only low 32 bits
2915 andl(result, (os::vm_page_size()-1));
2916 cmpl(result, (os::vm_page_size()-16));
2917 jccb(Assembler::belowEqual, BIG_STRINGS);
2918
2919 subptr(rsp, 16);
2920 int stk_offset = -(1<<scale1);
2921 if (int_cnt2 < 0) { // not constant
2922 push(cnt2);
2923 stk_offset += wordSize;
2924 }
2925 movl(cnt2, cnt1);
2926
2927 bind(COPY_STR);
2928 if (ae == StrIntrinsicNode::LL) {
2929 load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
2930 movb(Address(rsp, cnt2, scale1, stk_offset), result);
2931 } else {
2932 load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
2933 movw(Address(rsp, cnt2, scale1, stk_offset), result);
2934 }
2935 decrement(cnt2);
2936 jccb(Assembler::notZero, COPY_STR);
2937
2938 if (int_cnt2 < 0) { // not constant
2939 pop(cnt2);
2940 }
2941 movptr(str1, rsp); // New string address
2942
2943 bind(BIG_STRINGS);
2944 // Load substring.
2945 if (int_cnt2 < 0) { // -1
2946 if (ae == StrIntrinsicNode::UL) {
2947 pmovzxbw(vec, Address(str2, 0));
2948 } else {
2949 movdqu(vec, Address(str2, 0));
2950 }
2951 push(cnt2); // substr count
2952 push(str2); // substr addr
2953 push(str1); // string addr
2954 } else {
2955 // Small (< 8 chars) constant substrings are loaded already.
2956 movl(cnt2, int_cnt2);
2957 }
2958 push(tmp); // original SP
2959
2960 } // Finished loading
2961
2962 //========================================================
2963 // Start search
2964 //
2965
2966 movptr(result, str1); // string addr
2967
2968 if (int_cnt2 < 0) { // Only for non constant substring
2969 jmpb(SCAN_TO_SUBSTR);
2970
2971 // SP saved at sp+0
2972 // String saved at sp+1*wordSize
2973 // Substr saved at sp+2*wordSize
2974 // Substr count saved at sp+3*wordSize
2975
2976 // Reload substr for rescan, this code
2977 // is executed only for large substrings (> 8 chars)
2978 bind(RELOAD_SUBSTR);
2979 movptr(str2, Address(rsp, 2*wordSize));
2980 movl(cnt2, Address(rsp, 3*wordSize));
2981 if (ae == StrIntrinsicNode::UL) {
2982 pmovzxbw(vec, Address(str2, 0));
2983 } else {
2984 movdqu(vec, Address(str2, 0));
2985 }
2986 // We came here after the beginning of the substring was
// matched but the rest of it was not, so we need to search
2988 // again. Start from the next element after the previous match.
2989 subptr(str1, result); // Restore counter
2990 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2991 shrl(str1, 1);
2992 }
2993 addl(cnt1, str1);
2994 decrementl(cnt1); // Shift to next element
2995 cmpl(cnt1, cnt2);
jcc(Assembler::negative, RET_NOT_FOUND); // Left less than substring
2997
2998 addptr(result, (1<<scale1));
2999 } // non constant
3000
3001 // Scan string for start of substr in 16-byte vectors
3002 bind(SCAN_TO_SUBSTR);
3003 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
3004 pcmpestri(vec, Address(result, 0), mode);
3005 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1
3006 subl(cnt1, stride);
3007 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
3008 cmpl(cnt1, cnt2);
jccb(Assembler::negative, RET_NOT_FOUND); // Left less than substring
3010 addptr(result, 16);
3011
3012 bind(ADJUST_STR);
3013 cmpl(cnt1, stride); // Do not read beyond string
3014 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
3015 // Back-up string to avoid reading beyond string.
3016 lea(result, Address(result, cnt1, scale1, -16));
3017 movl(cnt1, stride);
3018 jmpb(SCAN_TO_SUBSTR);
3019
3020 // Found a potential substr
3021 bind(FOUND_CANDIDATE);
3022 // After pcmpestri tmp(rcx) contains matched element index
3023
3024 // Make sure string is still long enough
3025 subl(cnt1, tmp);
3026 cmpl(cnt1, cnt2);
3027 jccb(Assembler::greaterEqual, FOUND_SUBSTR);
// Left less than substring.
3029
3030 bind(RET_NOT_FOUND);
3031 movl(result, -1);
3032 jmp(CLEANUP);
3033
3034 bind(FOUND_SUBSTR);
3035 // Compute start addr of substr
3036 lea(result, Address(result, tmp, scale1));
3037 if (int_cnt2 > 0) { // Constant substring
3038 // Repeat search for small substring (< 8 chars)
3039 // from new point without reloading substring.
3040 // Have to check that we don't read beyond string.
3041 cmpl(tmp, stride-int_cnt2);
3042 jccb(Assembler::greater, ADJUST_STR);
3043 // Fall through if matched whole substring.
3044 } else { // non constant
assert(int_cnt2 == -1, "should be -1 for non-constant substring");
3046
3047 addl(tmp, cnt2);
3048 // Found result if we matched whole substring.
3049 cmpl(tmp, stride);
3050 jcc(Assembler::lessEqual, RET_FOUND);
3051
3052 // Repeat search for small substring (<= 8 chars)
3053 // from new point 'str1' without reloading substring.
3054 cmpl(cnt2, stride);
3055 // Have to check that we don't read beyond string.
3056 jccb(Assembler::lessEqual, ADJUST_STR);
3057
3058 Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
3059 // Compare the rest of substring (> 8 chars).
3060 movptr(str1, result);
3061
3062 cmpl(tmp, cnt2);
3063 // First 8 chars are already matched.
3064 jccb(Assembler::equal, CHECK_NEXT);
3065
3066 bind(SCAN_SUBSTR);
3067 pcmpestri(vec, Address(str1, 0), mode);
// Need to reload string pointers if we did not match the whole vector
3069 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
3070
3071 bind(CHECK_NEXT);
3072 subl(cnt2, stride);
3073 jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
3074 addptr(str1, 16);
3075 if (ae == StrIntrinsicNode::UL) {
3076 addptr(str2, 8);
3077 } else {
3078 addptr(str2, 16);
3079 }
3080 subl(cnt1, stride);
3081 cmpl(cnt2, stride); // Do not read beyond substring
3082 jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
3083 // Back-up strings to avoid reading beyond substring.
3084
3085 if (ae == StrIntrinsicNode::UL) {
3086 lea(str2, Address(str2, cnt2, scale2, -8));
3087 lea(str1, Address(str1, cnt2, scale1, -16));
3088 } else {
3089 lea(str2, Address(str2, cnt2, scale2, -16));
3090 lea(str1, Address(str1, cnt2, scale1, -16));
3091 }
3092 subl(cnt1, cnt2);
3093 movl(cnt2, stride);
3094 addl(cnt1, stride);
3095 bind(CONT_SCAN_SUBSTR);
3096 if (ae == StrIntrinsicNode::UL) {
3097 pmovzxbw(vec, Address(str2, 0));
3098 } else {
3099 movdqu(vec, Address(str2, 0));
3100 }
3101 jmp(SCAN_SUBSTR);
3102
3103 bind(RET_FOUND_LONG);
3104 movptr(str1, Address(rsp, wordSize));
3105 } // non constant
3106
3107 bind(RET_FOUND);
3108 // Compute substr offset
3109 subptr(result, str1);
3110 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3111 shrl(result, 1); // index
3112 }
3113 bind(CLEANUP);
3114 pop(rsp); // restore SP
3115
3116 } // string_indexof
3117
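// Searches a UTF-16 string of cnt1 chars at str1 for the char in 'ch' and returns the
// char index in 'result', or -1 if not found. A sketch of the intended semantics
// (illustrative Java, not the exact JDK source):
//   static int indexOfChar(char[] value, int ch, int cnt) {
//     for (int i = 0; i < cnt; i++) {
//       if (value[i] == ch) return i;
//     }
//     return -1;
//   }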
3118 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3119 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3120 ShortBranchVerifier sbv(this);
3121 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3122
3123 int stride = 8;
3124
3125 Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
3126 SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
3127 RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
3128 FOUND_SEQ_CHAR, DONE_LABEL;
3129
3130 movptr(result, str1);
3131 if (UseAVX >= 2) {
3132 cmpl(cnt1, stride);
3133 jcc(Assembler::less, SCAN_TO_CHAR);
3134 cmpl(cnt1, 2*stride);
3135 jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
3136 movdl(vec1, ch);
3137 vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
3138 vpxor(vec2, vec2);
3139 movl(tmp, cnt1);
3140 andl(tmp, 0xFFFFFFF0); //vector count (in chars)
3141 andl(cnt1,0x0000000F); //tail count (in chars)
3142
3143 bind(SCAN_TO_16_CHAR_LOOP);
3144 vmovdqu(vec3, Address(result, 0));
3145 vpcmpeqw(vec3, vec3, vec1, 1);
3146 vptest(vec2, vec3);
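// vec2 is all zeros, so vptest sets CF only when vec3 (the comparison result) is all
// zero; carryClear therefore means at least one lane matched 'ch'.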
3147 jcc(Assembler::carryClear, FOUND_CHAR);
3148 addptr(result, 32);
3149 subl(tmp, 2*stride);
3150 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
3151 jmp(SCAN_TO_8_CHAR);
3152 bind(SCAN_TO_8_CHAR_INIT);
3153 movdl(vec1, ch);
3154 pshuflw(vec1, vec1, 0x00);
3155 pshufd(vec1, vec1, 0);
3156 pxor(vec2, vec2);
3157 }
3158 bind(SCAN_TO_8_CHAR);
3159 cmpl(cnt1, stride);
3160 jcc(Assembler::less, SCAN_TO_CHAR);
3161 if (UseAVX < 2) {
3162 movdl(vec1, ch);
3163 pshuflw(vec1, vec1, 0x00);
3164 pshufd(vec1, vec1, 0);
3165 pxor(vec2, vec2);
3166 }
3167 movl(tmp, cnt1);
3168 andl(tmp, 0xFFFFFFF8); //vector count (in chars)
3169 andl(cnt1,0x00000007); //tail count (in chars)
3170
3171 bind(SCAN_TO_8_CHAR_LOOP);
3172 movdqu(vec3, Address(result, 0));
3173 pcmpeqw(vec3, vec1);
3174 ptest(vec2, vec3);
3175 jcc(Assembler::carryClear, FOUND_CHAR);
3176 addptr(result, 16);
3177 subl(tmp, stride);
3178 jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
3179 bind(SCAN_TO_CHAR);
3180 testl(cnt1, cnt1);
3181 jcc(Assembler::zero, RET_NOT_FOUND);
3182 bind(SCAN_TO_CHAR_LOOP);
3183 load_unsigned_short(tmp, Address(result, 0));
3184 cmpl(ch, tmp);
3185 jccb(Assembler::equal, FOUND_SEQ_CHAR);
3186 addptr(result, 2);
3187 subl(cnt1, 1);
3188 jccb(Assembler::zero, RET_NOT_FOUND);
3189 jmp(SCAN_TO_CHAR_LOOP);
3190
3191 bind(RET_NOT_FOUND);
3192 movl(result, -1);
3193 jmpb(DONE_LABEL);
3194
3195 bind(FOUND_CHAR);
3196 if (UseAVX >= 2) {
3197 vpmovmskb(tmp, vec3);
3198 } else {
3199 pmovmskb(tmp, vec3);
3200 }
3201 bsfl(ch, tmp);
3202 addptr(result, ch);
3203
3204 bind(FOUND_SEQ_CHAR);
3205 subptr(result, str1);
3206 shrl(result, 1);
3207
3208 bind(DONE_LABEL);
3209 } // string_indexof_char
3210
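// Latin-1 variant of string_indexof_char: searches a byte array of cnt1 bytes at str1
// for the byte value in 'ch' and returns its index in 'result', or -1 if not found.
// A sketch of the intended semantics (illustrative Java, not the exact JDK source):
//   static int indexOfLatin1(byte[] value, int ch, int cnt) {
//     for (int i = 0; i < cnt; i++) {
//       if ((value[i] & 0xff) == ch) return i;
//     }
//     return -1;
//   }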
3211 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
3212 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
3213 ShortBranchVerifier sbv(this);
3214 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3215
3216 int stride = 16;
3217
3218 Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
3219 SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
3220 RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
3221 FOUND_SEQ_CHAR, DONE_LABEL;
3222
3223 movptr(result, str1);
3224 if (UseAVX >= 2) {
3225 cmpl(cnt1, stride);
3226 jcc(Assembler::less, SCAN_TO_CHAR_INIT);
3227 cmpl(cnt1, stride*2);
3228 jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
3229 movdl(vec1, ch);
3230 vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
3231 vpxor(vec2, vec2);
3232 movl(tmp, cnt1);
3233 andl(tmp, 0xFFFFFFE0); //vector count (in chars)
3234 andl(cnt1,0x0000001F); //tail count (in chars)
3235
3236 bind(SCAN_TO_32_CHAR_LOOP);
3237 vmovdqu(vec3, Address(result, 0));
3238 vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
3239 vptest(vec2, vec3);
3240 jcc(Assembler::carryClear, FOUND_CHAR);
3241 addptr(result, 32);
3242 subl(tmp, stride*2);
3243 jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
3244 jmp(SCAN_TO_16_CHAR);
3245
3246 bind(SCAN_TO_16_CHAR_INIT);
3247 movdl(vec1, ch);
3248 pxor(vec2, vec2);
3249 pshufb(vec1, vec2);
3250 }
3251
3252 bind(SCAN_TO_16_CHAR);
3253 cmpl(cnt1, stride);
jcc(Assembler::less, SCAN_TO_CHAR_INIT); // less than 16 entries left
3255 if (UseAVX < 2) {
3256 movdl(vec1, ch);
3257 pxor(vec2, vec2);
3258 pshufb(vec1, vec2);
3259 }
3260 movl(tmp, cnt1);
3261 andl(tmp, 0xFFFFFFF0); //vector count (in bytes)
3262 andl(cnt1,0x0000000F); //tail count (in bytes)
3263
3264 bind(SCAN_TO_16_CHAR_LOOP);
3265 movdqu(vec3, Address(result, 0));
3266 pcmpeqb(vec3, vec1);
3267 ptest(vec2, vec3);
3268 jcc(Assembler::carryClear, FOUND_CHAR);
3269 addptr(result, 16);
3270 subl(tmp, stride);
3271 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items...
3272
3273 bind(SCAN_TO_CHAR_INIT);
3274 testl(cnt1, cnt1);
3275 jcc(Assembler::zero, RET_NOT_FOUND);
3276 bind(SCAN_TO_CHAR_LOOP);
3277 load_unsigned_byte(tmp, Address(result, 0));
3278 cmpl(ch, tmp);
3279 jccb(Assembler::equal, FOUND_SEQ_CHAR);
3280 addptr(result, 1);
3281 subl(cnt1, 1);
3282 jccb(Assembler::zero, RET_NOT_FOUND);
3283 jmp(SCAN_TO_CHAR_LOOP);
3284
3285 bind(RET_NOT_FOUND);
3286 movl(result, -1);
3287 jmpb(DONE_LABEL);
3288
3289 bind(FOUND_CHAR);
3290 if (UseAVX >= 2) {
3291 vpmovmskb(tmp, vec3);
3292 } else {
3293 pmovmskb(tmp, vec3);
3294 }
3295 bsfl(ch, tmp);
3296 addptr(result, ch);
3297
3298 bind(FOUND_SEQ_CHAR);
3299 subptr(result, str1);
3300
3301 bind(DONE_LABEL);
3302 } // stringL_indexof_char
3303
3304 // helper function for string_compare
3305 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3306 Address::ScaleFactor scale, Address::ScaleFactor scale1,
3307 Address::ScaleFactor scale2, Register index, int ae) {
3308 if (ae == StrIntrinsicNode::LL) {
3309 load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3310 load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3311 } else if (ae == StrIntrinsicNode::UU) {
3312 load_unsigned_short(elem1, Address(str1, index, scale, 0));
3313 load_unsigned_short(elem2, Address(str2, index, scale, 0));
3314 } else {
3315 load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3316 load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3317 }
3318 }
3319
3320 // Compare strings, used for char[] and byte[].
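// Returns a negative value, zero, or a positive value in 'result' depending on whether
// str1 is lexicographically smaller than, equal to, or greater than str2; 'ae' selects
// the LL/UU/LU/UL encoding combination. A sketch of the intended semantics
// (illustrative Java, not the exact JDK source):
//   static int compare(char[] s1, int len1, char[] s2, int len2) {
//     int min = Math.min(len1, len2);
//     for (int i = 0; i < min; i++) {
//       if (s1[i] != s2[i]) return s1[i] - s2[i];
//     }
//     return len1 - len2;
//   }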
3321 void C2_MacroAssembler::string_compare(Register str1, Register str2,
3322 Register cnt1, Register cnt2, Register result,
3323 XMMRegister vec1, int ae, KRegister mask) {
3324 ShortBranchVerifier sbv(this);
3325 Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
3326 Label COMPARE_WIDE_VECTORS_LOOP_FAILED; // used only _LP64 && AVX3
3327 int stride, stride2, adr_stride, adr_stride1, adr_stride2;
3328 int stride2x2 = 0x40;
3329 Address::ScaleFactor scale = Address::no_scale;
3330 Address::ScaleFactor scale1 = Address::no_scale;
3331 Address::ScaleFactor scale2 = Address::no_scale;
3332
3333 if (ae != StrIntrinsicNode::LL) {
3334 stride2x2 = 0x20;
3335 }
3336
3337 if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
3338 shrl(cnt2, 1);
3339 }
// Compute the minimum of the string lengths and push the
// difference of the string lengths onto the stack.
// The minimum is selected with the conditional move below.
3343 movl(result, cnt1);
3344 subl(cnt1, cnt2);
3345 push(cnt1);
3346 cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2)
3347
3348 // Is the minimum length zero?
3349 testl(cnt2, cnt2);
3350 jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3351 if (ae == StrIntrinsicNode::LL) {
3352 // Load first bytes
3353 load_unsigned_byte(result, Address(str1, 0)); // result = str1[0]
3354 load_unsigned_byte(cnt1, Address(str2, 0)); // cnt1 = str2[0]
3355 } else if (ae == StrIntrinsicNode::UU) {
3356 // Load first characters
3357 load_unsigned_short(result, Address(str1, 0));
3358 load_unsigned_short(cnt1, Address(str2, 0));
3359 } else {
3360 load_unsigned_byte(result, Address(str1, 0));
3361 load_unsigned_short(cnt1, Address(str2, 0));
3362 }
3363 subl(result, cnt1);
3364 jcc(Assembler::notZero, POP_LABEL);
3365
3366 if (ae == StrIntrinsicNode::UU) {
3367 // Divide length by 2 to get number of chars
3368 shrl(cnt2, 1);
3369 }
3370 cmpl(cnt2, 1);
3371 jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3372
3373 // Check if the strings start at the same location and setup scale and stride
3374 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3375 cmpptr(str1, str2);
3376 jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3377 if (ae == StrIntrinsicNode::LL) {
3378 scale = Address::times_1;
3379 stride = 16;
3380 } else {
3381 scale = Address::times_2;
3382 stride = 8;
3383 }
3384 } else {
3385 scale1 = Address::times_1;
3386 scale2 = Address::times_2;
3387 // scale not used
3388 stride = 8;
3389 }
3390
3391 if (UseAVX >= 2 && UseSSE42Intrinsics) {
3392 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
3393 Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
3394 Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
3395 Label COMPARE_TAIL_LONG;
3396 Label COMPARE_WIDE_VECTORS_LOOP_AVX3; // used only _LP64 && AVX3
3397
3398 int pcmpmask = 0x19;
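// imm8 0x19: unsigned words, 'equal each' aggregation, negated result, so rcx returns
// the index of the first mismatching element; LL clears bit 0 (0x18) for unsigned bytes.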
3399 if (ae == StrIntrinsicNode::LL) {
3400 pcmpmask &= ~0x01;
3401 }
3402
// Set up to compare 16-char (32-byte) vectors,
3404 // start from first character again because it has aligned address.
3405 if (ae == StrIntrinsicNode::LL) {
3406 stride2 = 32;
3407 } else {
3408 stride2 = 16;
3409 }
3410 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3411 adr_stride = stride << scale;
3412 } else {
3413 adr_stride1 = 8; //stride << scale1;
3414 adr_stride2 = 16; //stride << scale2;
3415 }
3416
3417 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
// rax and rdx are used by pcmpestri as element counters
3419 movl(result, cnt2);
3420 andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count
3421 jcc(Assembler::zero, COMPARE_TAIL_LONG);
3422
3423 // fast path : compare first 2 8-char vectors.
3424 bind(COMPARE_16_CHARS);
3425 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3426 movdqu(vec1, Address(str1, 0));
3427 } else {
3428 pmovzxbw(vec1, Address(str1, 0));
3429 }
3430 pcmpestri(vec1, Address(str2, 0), pcmpmask);
3431 jccb(Assembler::below, COMPARE_INDEX_CHAR);
3432
3433 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3434 movdqu(vec1, Address(str1, adr_stride));
3435 pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
3436 } else {
3437 pmovzxbw(vec1, Address(str1, adr_stride1));
3438 pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
3439 }
3440 jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
3441 addl(cnt1, stride);
3442
3443 // Compare the characters at index in cnt1
3444 bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
3445 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3446 subl(result, cnt2);
3447 jmp(POP_LABEL);
3448
3449 // Setup the registers to start vector comparison loop
3450 bind(COMPARE_WIDE_VECTORS);
3451 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3452 lea(str1, Address(str1, result, scale));
3453 lea(str2, Address(str2, result, scale));
3454 } else {
3455 lea(str1, Address(str1, result, scale1));
3456 lea(str2, Address(str2, result, scale2));
3457 }
3458 subl(result, stride2);
3459 subl(cnt2, stride2);
3460 jcc(Assembler::zero, COMPARE_WIDE_TAIL);
3461 negptr(result);
3462
3463 // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
3464 bind(COMPARE_WIDE_VECTORS_LOOP);
3465
3466 #ifdef _LP64
3467 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3468 cmpl(cnt2, stride2x2);
3469 jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3470 testl(cnt2, stride2x2-1); // cnt2 holds the vector count
3471 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2); // means we cannot subtract by 0x40
3472
3473 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3474 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3475 evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
3476 evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3477 } else {
3478 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
3479 evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
3480 }
3481 kortestql(mask, mask);
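// kortestql sets CF only when the mask is all ones, i.e. all compared bytes were equal;
// aboveEqual (CF == 0) therefore means a mismatch was found.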
3482 jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED); // miscompare
3483 addptr(result, stride2x2); // update since we already compared at this addr
3484 subl(cnt2, stride2x2); // and sub the size too
3485 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3486
3487 vpxor(vec1, vec1);
3488 jmpb(COMPARE_WIDE_TAIL);
3489 }//if (VM_Version::supports_avx512vlbw())
3490 #endif // _LP64
3491
3492
3493 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3494 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3495 vmovdqu(vec1, Address(str1, result, scale));
3496 vpxor(vec1, Address(str2, result, scale));
3497 } else {
3498 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
3499 vpxor(vec1, Address(str2, result, scale2));
3500 }
3501 vptest(vec1, vec1);
3502 jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
3503 addptr(result, stride2);
3504 subl(cnt2, stride2);
3505 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
3506 // clean upper bits of YMM registers
3507 vpxor(vec1, vec1);
3508
3509 // compare wide vectors tail
3510 bind(COMPARE_WIDE_TAIL);
3511 testptr(result, result);
3512 jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3513
3514 movl(result, stride2);
3515 movl(cnt2, result);
3516 negptr(result);
3517 jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3518
// Identifies the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
3520 bind(VECTOR_NOT_EQUAL);
3521 // clean upper bits of YMM registers
3522 vpxor(vec1, vec1);
3523 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3524 lea(str1, Address(str1, result, scale));
3525 lea(str2, Address(str2, result, scale));
3526 } else {
3527 lea(str1, Address(str1, result, scale1));
3528 lea(str2, Address(str2, result, scale2));
3529 }
3530 jmp(COMPARE_16_CHARS);
3531
// Compare tail chars, length between 1 and 15 chars
3533 bind(COMPARE_TAIL_LONG);
3534 movl(cnt2, result);
3535 cmpl(cnt2, stride);
3536 jcc(Assembler::less, COMPARE_SMALL_STR);
3537
3538 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3539 movdqu(vec1, Address(str1, 0));
3540 } else {
3541 pmovzxbw(vec1, Address(str1, 0));
3542 }
3543 pcmpestri(vec1, Address(str2, 0), pcmpmask);
3544 jcc(Assembler::below, COMPARE_INDEX_CHAR);
3545 subptr(cnt2, stride);
3546 jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3547 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3548 lea(str1, Address(str1, result, scale));
3549 lea(str2, Address(str2, result, scale));
3550 } else {
3551 lea(str1, Address(str1, result, scale1));
3552 lea(str2, Address(str2, result, scale2));
3553 }
3554 negptr(cnt2);
3555 jmpb(WHILE_HEAD_LABEL);
3556
3557 bind(COMPARE_SMALL_STR);
3558 } else if (UseSSE42Intrinsics) {
3559 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
3560 int pcmpmask = 0x19;
// Set up to compare 8-char (16-byte) vectors,
3562 // start from first character again because it has aligned address.
3563 movl(result, cnt2);
3564 andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count
3565 if (ae == StrIntrinsicNode::LL) {
3566 pcmpmask &= ~0x01;
3567 }
3568 jcc(Assembler::zero, COMPARE_TAIL);
3569 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3570 lea(str1, Address(str1, result, scale));
3571 lea(str2, Address(str2, result, scale));
3572 } else {
3573 lea(str1, Address(str1, result, scale1));
3574 lea(str2, Address(str2, result, scale2));
3575 }
3576 negptr(result);
3577
3578 // pcmpestri
3579 // inputs:
3580 // vec1- substring
3581 // rax - negative string length (elements count)
3582 // mem - scanned string
3583 // rdx - string length (elements count)
3584 // pcmpmask - cmp mode: 11000 (string compare with negated result)
3585 // + 00 (unsigned bytes) or + 01 (unsigned shorts)
3586 // outputs:
3587 // rcx - first mismatched element index
3588 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3589
3590 bind(COMPARE_WIDE_VECTORS);
3591 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3592 movdqu(vec1, Address(str1, result, scale));
3593 pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3594 } else {
3595 pmovzxbw(vec1, Address(str1, result, scale1));
3596 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3597 }
3598 // After pcmpestri cnt1(rcx) contains mismatched element index
3599
3600 jccb(Assembler::below, VECTOR_NOT_EQUAL); // CF==1
3601 addptr(result, stride);
3602 subptr(cnt2, stride);
3603 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3604
3605 // compare wide vectors tail
3606 testptr(result, result);
3607 jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3608
3609 movl(cnt2, stride);
3610 movl(result, stride);
3611 negptr(result);
3612 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3613 movdqu(vec1, Address(str1, result, scale));
3614 pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3615 } else {
3616 pmovzxbw(vec1, Address(str1, result, scale1));
3617 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3618 }
3619 jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
3620
3621 // Mismatched characters in the vectors
3622 bind(VECTOR_NOT_EQUAL);
3623 addptr(cnt1, result);
3624 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3625 subl(result, cnt2);
3626 jmpb(POP_LABEL);
3627
3628 bind(COMPARE_TAIL); // limit is zero
3629 movl(cnt2, result);
3630 // Fallthru to tail compare
3631 }
3632 // Shift str2 and str1 to the end of the arrays, negate min
3633 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3634 lea(str1, Address(str1, cnt2, scale));
3635 lea(str2, Address(str2, cnt2, scale));
3636 } else {
3637 lea(str1, Address(str1, cnt2, scale1));
3638 lea(str2, Address(str2, cnt2, scale2));
3639 }
3640 decrementl(cnt2); // first character was compared already
3641 negptr(cnt2);
3642
3643 // Compare the rest of the elements
3644 bind(WHILE_HEAD_LABEL);
3645 load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
3646 subl(result, cnt1);
3647 jccb(Assembler::notZero, POP_LABEL);
3648 increment(cnt2);
3649 jccb(Assembler::notZero, WHILE_HEAD_LABEL);
3650
3651 // Strings are equal up to min length. Return the length difference.
3652 bind(LENGTH_DIFF_LABEL);
3653 pop(result);
3654 if (ae == StrIntrinsicNode::UU) {
3655 // Divide diff by 2 to get number of chars
3656 sarl(result, 1);
3657 }
3658 jmpb(DONE_LABEL);
3659
3660 #ifdef _LP64
3661 if (VM_Version::supports_avx512vlbw()) {
3662
3663 bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
3664
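// mask has 1-bits where elements compared equal; invert it and find the lowest set bit
// to get the index of the first mismatching element.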
3665 kmovql(cnt1, mask);
3666 notq(cnt1);
3667 bsfq(cnt2, cnt1);
3668 if (ae != StrIntrinsicNode::LL) {
3669 // Divide diff by 2 to get number of chars
3670 sarl(cnt2, 1);
3671 }
3672 addq(result, cnt2);
3673 if (ae == StrIntrinsicNode::LL) {
3674 load_unsigned_byte(cnt1, Address(str2, result));
3675 load_unsigned_byte(result, Address(str1, result));
3676 } else if (ae == StrIntrinsicNode::UU) {
3677 load_unsigned_short(cnt1, Address(str2, result, scale));
3678 load_unsigned_short(result, Address(str1, result, scale));
3679 } else {
3680 load_unsigned_short(cnt1, Address(str2, result, scale2));
3681 load_unsigned_byte(result, Address(str1, result, scale1));
3682 }
3683 subl(result, cnt1);
3684 jmpb(POP_LABEL);
3685 }//if (VM_Version::supports_avx512vlbw())
3686 #endif // _LP64
3687
3688 // Discard the stored length difference
3689 bind(POP_LABEL);
3690 pop(cnt1);
3691
3692 // That's it
3693 bind(DONE_LABEL);
3694 if(ae == StrIntrinsicNode::UL) {
3695 negl(result);
3696 }
3697
3698 }
3699
3700 // Search for Non-ASCII character (Negative byte value) in a byte array,
3701 // return true if it has any and false otherwise.
3702 // ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
3703 // @IntrinsicCandidate
3704 // private static boolean hasNegatives(byte[] ba, int off, int len) {
3705 // for (int i = off; i < off + len; i++) {
3706 // if (ba[i] < 0) {
3707 // return true;
3708 // }
3709 // }
3710 // return false;
3711 // }
3712 void C2_MacroAssembler::has_negatives(Register ary1, Register len,
3713 Register result, Register tmp1,
3714 XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
3715 // rsi: byte array
3716 // rcx: len
3717 // rax: result
3718 ShortBranchVerifier sbv(this);
3719 assert_different_registers(ary1, len, result, tmp1);
3720 assert_different_registers(vec1, vec2);
3721 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
3722
3723 // len == 0
3724 testl(len, len);
3725 jcc(Assembler::zero, FALSE_LABEL);
3726
3727 if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
3728 VM_Version::supports_avx512vlbw() &&
3729 VM_Version::supports_bmi2()) {
3730
3731 Label test_64_loop, test_tail;
3732 Register tmp3_aliased = len;
3733
3734 movl(tmp1, len);
3735 vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
3736
3737 andl(tmp1, 64 - 1); // tail count (in chars) 0x3F
3738 andl(len, ~(64 - 1)); // vector count (in chars)
3739 jccb(Assembler::zero, test_tail);
3740
3741 lea(ary1, Address(ary1, len, Address::times_1));
3742 negptr(len);
3743
3744 bind(test_64_loop);
// Check whether any of our 64 byte-sized elements are negative
3746 evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
3747 kortestql(mask1, mask1);
3748 jcc(Assembler::notZero, TRUE_LABEL);
3749
3750 addptr(len, 64);
3751 jccb(Assembler::notZero, test_64_loop);
3752
3753
3754 bind(test_tail);
3755 // bail out when there is nothing to be done
3756 testl(tmp1, -1);
3757 jcc(Assembler::zero, FALSE_LABEL);
3758
3759 // ~(~0 << len) applied up to two times (for 32-bit scenario)
3760 #ifdef _LP64
3761 mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
3762 shlxq(tmp3_aliased, tmp3_aliased, tmp1);
3763 notq(tmp3_aliased);
3764 kmovql(mask2, tmp3_aliased);
3765 #else
3766 Label k_init;
3767 jmp(k_init);
3768
// On a 32-bit VM we cannot materialize a 64-bit mask in a general purpose register,
// so we move the data required to compose it into the instruction stream.
// We emit a 64-byte wide series of elements 0..63 which is later used as a compare
// target together with the tail count contained in the tmp1 register.
// The result is a k register holding tmp1 consecutive 1s, counting from the
// least significant bit.
3775 address tmp = pc();
3776 emit_int64(0x0706050403020100);
3777 emit_int64(0x0F0E0D0C0B0A0908);
3778 emit_int64(0x1716151413121110);
3779 emit_int64(0x1F1E1D1C1B1A1918);
3780 emit_int64(0x2726252423222120);
3781 emit_int64(0x2F2E2D2C2B2A2928);
3782 emit_int64(0x3736353433323130);
3783 emit_int64(0x3F3E3D3C3B3A3938);
3784
3785 bind(k_init);
3786 lea(len, InternalAddress(tmp));
3787 // create mask to test for negative byte inside a vector
3788 evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
3789 evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit);
3790
3791 #endif
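// Masked compare: mask1 gets a bit for each tail byte (selected by mask2) that is negative.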
3792 evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
3793 ktestq(mask1, mask2);
3794 jcc(Assembler::notZero, TRUE_LABEL);
3795
3796 jmp(FALSE_LABEL);
3797 } else {
3798 movl(result, len); // copy
3799
3800 if (UseAVX >= 2 && UseSSE >= 2) {
3801 // With AVX2, use 32-byte vector compare
3802 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
3803
3804 // Compare 32-byte vectors
3805 andl(result, 0x0000001f); // tail count (in bytes)
3806 andl(len, 0xffffffe0); // vector count (in bytes)
3807 jccb(Assembler::zero, COMPARE_TAIL);
3808
3809 lea(ary1, Address(ary1, len, Address::times_1));
3810 negptr(len);
3811
3812 movl(tmp1, 0x80808080); // create mask to test for Unicode chars in vector
3813 movdl(vec2, tmp1);
3814 vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
3815
3816 bind(COMPARE_WIDE_VECTORS);
3817 vmovdqu(vec1, Address(ary1, len, Address::times_1));
3818 vptest(vec1, vec2);
3819 jccb(Assembler::notZero, TRUE_LABEL);
3820 addptr(len, 32);
3821 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
3822
3823 testl(result, result);
3824 jccb(Assembler::zero, FALSE_LABEL);
3825
3826 vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
3827 vptest(vec1, vec2);
3828 jccb(Assembler::notZero, TRUE_LABEL);
3829 jmpb(FALSE_LABEL);
3830
3831 bind(COMPARE_TAIL); // len is zero
3832 movl(len, result);
3833 // Fallthru to tail compare
3834 } else if (UseSSE42Intrinsics) {
3835 // With SSE4.2, use double quad vector compare
3836 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
3837
3838 // Compare 16-byte vectors
3839 andl(result, 0x0000000f); // tail count (in bytes)
3840 andl(len, 0xfffffff0); // vector count (in bytes)
3841 jcc(Assembler::zero, COMPARE_TAIL);
3842
3843 lea(ary1, Address(ary1, len, Address::times_1));
3844 negptr(len);
3845
3846 movl(tmp1, 0x80808080);
3847 movdl(vec2, tmp1);
3848 pshufd(vec2, vec2, 0);
3849
3850 bind(COMPARE_WIDE_VECTORS);
3851 movdqu(vec1, Address(ary1, len, Address::times_1));
3852 ptest(vec1, vec2);
3853 jcc(Assembler::notZero, TRUE_LABEL);
3854 addptr(len, 16);
3855 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
3856
3857 testl(result, result);
3858 jcc(Assembler::zero, FALSE_LABEL);
3859
3860 movdqu(vec1, Address(ary1, result, Address::times_1, -16));
3861 ptest(vec1, vec2);
3862 jccb(Assembler::notZero, TRUE_LABEL);
3863 jmpb(FALSE_LABEL);
3864
3865 bind(COMPARE_TAIL); // len is zero
3866 movl(len, result);
3867 // Fallthru to tail compare
3868 }
3869 }
3870 // Compare 4-byte vectors
3871 andl(len, 0xfffffffc); // vector count (in bytes)
3872 jccb(Assembler::zero, COMPARE_CHAR);
3873
3874 lea(ary1, Address(ary1, len, Address::times_1));
3875 negptr(len);
3876
3877 bind(COMPARE_VECTORS);
3878 movl(tmp1, Address(ary1, len, Address::times_1));
3879 andl(tmp1, 0x80808080);
3880 jccb(Assembler::notZero, TRUE_LABEL);
3881 addptr(len, 4);
3882 jcc(Assembler::notZero, COMPARE_VECTORS);
3883
3884 // Compare trailing char (final 2 bytes), if any
3885 bind(COMPARE_CHAR);
3886 testl(result, 0x2); // tail char
3887 jccb(Assembler::zero, COMPARE_BYTE);
3888 load_unsigned_short(tmp1, Address(ary1, 0));
3889 andl(tmp1, 0x00008080);
3890 jccb(Assembler::notZero, TRUE_LABEL);
3891 subptr(result, 2);
3892 lea(ary1, Address(ary1, 2));
3893
3894 bind(COMPARE_BYTE);
3895 testl(result, 0x1); // tail byte
3896 jccb(Assembler::zero, FALSE_LABEL);
3897 load_unsigned_byte(tmp1, Address(ary1, 0));
3898 andl(tmp1, 0x00000080);
3899 jccb(Assembler::notEqual, TRUE_LABEL);
3900 jmpb(FALSE_LABEL);
3901
3902 bind(TRUE_LABEL);
3903 movl(result, 1); // return true
3904 jmpb(DONE);
3905
3906 bind(FALSE_LABEL);
3907 xorl(result, result); // return false
3908
3909 // That's it
3910 bind(DONE);
3911 if (UseAVX >= 2 && UseSSE >= 2) {
3912 // clean upper bits of YMM registers
3913 vpxor(vec1, vec1);
3914 vpxor(vec2, vec2);
3915 }
3916 }
3917 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
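// A sketch of the intended whole-array semantics (illustrative Java, not the exact JDK
// source; the substring form skips the identity, null and length checks):
//   static boolean equals(byte[] a, byte[] b) {
//     if (a == b) return true;
//     if (a == null || b == null || a.length != b.length) return false;
//     for (int i = 0; i < a.length; i++) {
//       if (a[i] != b[i]) return false;
//     }
//     return true;
//   }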
3918 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
3919 Register limit, Register result, Register chr,
3920 XMMRegister vec1, XMMRegister vec2, bool is_char, KRegister mask) {
3921 ShortBranchVerifier sbv(this);
3922 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
3923
3924 int length_offset = arrayOopDesc::length_offset_in_bytes();
3925 int base_offset = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
3926
3927 if (is_array_equ) {
3928 // Check the input args
3929 cmpoop(ary1, ary2);
3930 jcc(Assembler::equal, TRUE_LABEL);
3931
3932 // Need additional checks for arrays_equals.
3933 testptr(ary1, ary1);
3934 jcc(Assembler::zero, FALSE_LABEL);
3935 testptr(ary2, ary2);
3936 jcc(Assembler::zero, FALSE_LABEL);
3937
3938 // Check the lengths
3939 movl(limit, Address(ary1, length_offset));
3940 cmpl(limit, Address(ary2, length_offset));
3941 jcc(Assembler::notEqual, FALSE_LABEL);
3942 }
3943
3944 // count == 0
3945 testl(limit, limit);
3946 jcc(Assembler::zero, TRUE_LABEL);
3947
3948 if (is_array_equ) {
3949 // Load array address
3950 lea(ary1, Address(ary1, base_offset));
3951 lea(ary2, Address(ary2, base_offset));
3952 }
3953
3954 if (is_array_equ && is_char) {
3955 // arrays_equals when used for char[].
3956 shll(limit, 1); // byte count != 0
3957 }
3958 movl(result, limit); // copy
3959
3960 if (UseAVX >= 2) {
3961 // With AVX2, use 32-byte vector compare
3962 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
3963
3964 // Compare 32-byte vectors
3965 andl(result, 0x0000001f); // tail count (in bytes)
3966 andl(limit, 0xffffffe0); // vector count (in bytes)
3967 jcc(Assembler::zero, COMPARE_TAIL);
3968
3969 lea(ary1, Address(ary1, limit, Address::times_1));
3970 lea(ary2, Address(ary2, limit, Address::times_1));
3971 negptr(limit);
3972
3973 #ifdef _LP64
3974 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3975 Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
3976
3977 cmpl(limit, -64);
3978 jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3979
3980 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3981
3982 evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
3983 evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
3984 kortestql(mask, mask);
3985 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare
3986 addptr(limit, 64); // update since we already compared at this addr
3987 cmpl(limit, -64);
3988 jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3989
3990 // At this point we may still need to compare -limit+result bytes.
// We could execute the next two instructions and just continue via the non-wide path:
3992 // cmpl(limit, 0);
3993 // jcc(Assembler::equal, COMPARE_TAIL); // true
3994 // But since we stopped at the points ary{1,2}+limit which are
3995 // not farther than 64 bytes from the ends of arrays ary{1,2}+result
3996 // (|limit| <= 32 and result < 32),
3997 // we may just compare the last 64 bytes.
3998 //
addptr(result, -64); // it is safe because we just came from this area
4000 evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
4001 evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
4002 kortestql(mask, mask);
4003 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare
4004
4005 jmp(TRUE_LABEL);
4006
4007 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
4008
4009 }//if (VM_Version::supports_avx512vlbw())
4010 #endif //_LP64
4011 bind(COMPARE_WIDE_VECTORS);
4012 vmovdqu(vec1, Address(ary1, limit, Address::times_1));
4013 vmovdqu(vec2, Address(ary2, limit, Address::times_1));
4014 vpxor(vec1, vec2);
4015
4016 vptest(vec1, vec1);
4017 jcc(Assembler::notZero, FALSE_LABEL);
4018 addptr(limit, 32);
4019 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4020
4021 testl(result, result);
4022 jcc(Assembler::zero, TRUE_LABEL);
4023
4024 vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
4025 vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
4026 vpxor(vec1, vec2);
4027
4028 vptest(vec1, vec1);
4029 jccb(Assembler::notZero, FALSE_LABEL);
4030 jmpb(TRUE_LABEL);
4031
4032 bind(COMPARE_TAIL); // limit is zero
4033 movl(limit, result);
4034 // Fallthru to tail compare
4035 } else if (UseSSE42Intrinsics) {
4036 // With SSE4.2, use double quad vector compare
4037 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4038
4039 // Compare 16-byte vectors
4040 andl(result, 0x0000000f); // tail count (in bytes)
4041 andl(limit, 0xfffffff0); // vector count (in bytes)
4042 jcc(Assembler::zero, COMPARE_TAIL);
4043
4044 lea(ary1, Address(ary1, limit, Address::times_1));
4045 lea(ary2, Address(ary2, limit, Address::times_1));
4046 negptr(limit);
4047
4048 bind(COMPARE_WIDE_VECTORS);
4049 movdqu(vec1, Address(ary1, limit, Address::times_1));
4050 movdqu(vec2, Address(ary2, limit, Address::times_1));
4051 pxor(vec1, vec2);
4052
4053 ptest(vec1, vec1);
4054 jcc(Assembler::notZero, FALSE_LABEL);
4055 addptr(limit, 16);
4056 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
4057
4058 testl(result, result);
4059 jcc(Assembler::zero, TRUE_LABEL);
4060
4061 movdqu(vec1, Address(ary1, result, Address::times_1, -16));
4062 movdqu(vec2, Address(ary2, result, Address::times_1, -16));
4063 pxor(vec1, vec2);
4064
4065 ptest(vec1, vec1);
4066 jccb(Assembler::notZero, FALSE_LABEL);
4067 jmpb(TRUE_LABEL);
4068
4069 bind(COMPARE_TAIL); // limit is zero
4070 movl(limit, result);
4071 // Fallthru to tail compare
4072 }
4073
4074 // Compare 4-byte vectors
4075 andl(limit, 0xfffffffc); // vector count (in bytes)
4076 jccb(Assembler::zero, COMPARE_CHAR);
4077
4078 lea(ary1, Address(ary1, limit, Address::times_1));
4079 lea(ary2, Address(ary2, limit, Address::times_1));
4080 negptr(limit);
4081
4082 bind(COMPARE_VECTORS);
4083 movl(chr, Address(ary1, limit, Address::times_1));
4084 cmpl(chr, Address(ary2, limit, Address::times_1));
4085 jccb(Assembler::notEqual, FALSE_LABEL);
4086 addptr(limit, 4);
4087 jcc(Assembler::notZero, COMPARE_VECTORS);
4088
4089 // Compare trailing char (final 2 bytes), if any
4090 bind(COMPARE_CHAR);
4091 testl(result, 0x2); // tail char
4092 jccb(Assembler::zero, COMPARE_BYTE);
4093 load_unsigned_short(chr, Address(ary1, 0));
4094 load_unsigned_short(limit, Address(ary2, 0));
4095 cmpl(chr, limit);
4096 jccb(Assembler::notEqual, FALSE_LABEL);
4097
4098 if (is_array_equ && is_char) {
4099 bind(COMPARE_BYTE);
4100 } else {
4101 lea(ary1, Address(ary1, 2));
4102 lea(ary2, Address(ary2, 2));
4103
4104 bind(COMPARE_BYTE);
4105 testl(result, 0x1); // tail byte
4106 jccb(Assembler::zero, TRUE_LABEL);
4107 load_unsigned_byte(chr, Address(ary1, 0));
4108 load_unsigned_byte(limit, Address(ary2, 0));
4109 cmpl(chr, limit);
4110 jccb(Assembler::notEqual, FALSE_LABEL);
4111 }
4112 bind(TRUE_LABEL);
4113 movl(result, 1); // return true
4114 jmpb(DONE);
4115
4116 bind(FALSE_LABEL);
4117 xorl(result, result); // return false
4118
4119 // That's it
4120 bind(DONE);
4121 if (UseAVX >= 2) {
4122 // clean upper bits of YMM registers
4123 vpxor(vec1, vec1);
4124 vpxor(vec2, vec2);
4125 }
4126 }
4127
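// Reduces a vector of boolean lanes (one byte per lane, 0 or 1) held in 'mask' to a
// scalar in 'dst'. A sketch of the intended results, assuming the lanes have been
// collapsed to one bit per lane in 'bits':
//   trueCount = popcount(bits)
//   lastTrue  = (bits == 0) ? -1      : indexOfHighestSetBit(bits)
//   firstTrue = (bits == 0) ? masklen : indexOfLowestSetBit(bits)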
4128 #ifdef _LP64
4129 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
4130 Register tmp, KRegister ktmp, int masklen, int vec_enc) {
4131 assert(VM_Version::supports_avx512vlbw(), "");
4132 vpxor(xtmp, xtmp, xtmp, vec_enc);
4133 vpsubb(xtmp, xtmp, mask, vec_enc);
4134 evpmovb2m(ktmp, xtmp, vec_enc);
4135 kmovql(tmp, ktmp);
4136 switch(opc) {
4137 case Op_VectorMaskTrueCount:
4138 popcntq(dst, tmp);
4139 break;
4140 case Op_VectorMaskLastTrue:
4141 mov64(dst, -1);
4142 bsrq(tmp, tmp);
4143 cmov(Assembler::notZero, dst, tmp);
4144 break;
4145 case Op_VectorMaskFirstTrue:
4146 mov64(dst, masklen);
4147 bsfq(tmp, tmp);
4148 cmov(Assembler::notZero, dst, tmp);
4149 break;
4150 default: assert(false, "Unhandled mask operation");
4151 }
4152 }
4153
4154 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
4155 XMMRegister xtmp1, Register tmp, int masklen, int vec_enc) {
4156 assert(VM_Version::supports_avx(), "");
4157 vpxor(xtmp, xtmp, xtmp, vec_enc);
4158 vpsubb(xtmp, xtmp, mask, vec_enc);
4159 vpmovmskb(tmp, xtmp, vec_enc);
4160 if (masklen < 64) {
4161 andq(tmp, (((jlong)1 << masklen) - 1));
4162 }
4163 switch(opc) {
4164 case Op_VectorMaskTrueCount:
4165 popcntq(dst, tmp);
4166 break;
4167 case Op_VectorMaskLastTrue:
4168 mov64(dst, -1);
4169 bsrq(tmp, tmp);
4170 cmov(Assembler::notZero, dst, tmp);
4171 break;
4172 case Op_VectorMaskFirstTrue:
4173 mov64(dst, masklen);
4174 bsfq(tmp, tmp);
4175 cmov(Assembler::notZero, dst, tmp);
4176 break;
4177 default: assert(false, "Unhandled mask operation");
4178 }
4179 }
4180 #endif
4181
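// Cross-lane byte rearrange for 512-bit vectors: conceptually dst[i] = src[shuffle[i]]
// for i in 0..63, assuming in-range shuffle indices. vpshufb only shuffles within
// 128-bit lanes, so the code below builds the result from four masked in-lane passes,
// one per broadcast 128-bit source lane.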
4182 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
4183 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
4184 int vlen_enc) {
4185 assert(VM_Version::supports_avx512bw(), "");
// Byte shuffles (vpshufb) are in-lane operations: only the lower 4 bits of each
// shuffle index are used, so all shuffle indices are effectively normalized to the
// index range 0-15. Indices that differ by a multiple of 16 therefore select the same
// relative position within a 128-bit lane, e.g. shuffle indices 0, 16, 32 and 48 all
// select the first element of their respective 128-bit lanes.
4192 movl(rtmp, 16);
4193 evpbroadcastb(xtmp1, rtmp, vlen_enc);
4194
4195 // Compute a mask for shuffle vector by comparing indices with expression INDEX < 16,
4196 // Broadcast first 128 bit lane across entire vector, shuffle the vector lanes using
4197 // original shuffle indices and move the shuffled lanes corresponding to true
4198 // mask to destination vector.
4199 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
4200 evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
4201 evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);
4202
4203 // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
4204 // and broadcasting second 128 bit lane.
4205 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
4206 vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
4207 evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
4208 evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
4209 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
4210
4211 // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
4212 // and broadcasting third 128 bit lane.
4213 evpcmpb(ktmp, k0, shuffle, xtmp2, Assembler::nlt, true, vlen_enc);
4214 vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
4215 evpcmpb(ktmp, ktmp, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
4216 evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
4217 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
4218
4219 // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
// and broadcasting the fourth 128 bit lane.
4221 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
4222 vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
4223 evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
4224 evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
4225 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
4226 }
4227
4228 #ifdef _LP64
4229 void C2_MacroAssembler::load_nklass_compact_c2(Register dst, Register obj, Register index, Address::ScaleFactor scale, int disp) {
4230 C2LoadNKlassStub* stub = new (Compile::current()->comp_arena()) C2LoadNKlassStub(dst);
4231 Compile::current()->output()->add_stub(stub);
4232
// Note: Don't clobber obj anywhere in this method!
4234
4235 // The incoming address is pointing into obj-start + klass_offset_in_bytes. We need to extract
4236 // obj-start, so that we can load from the object's mark-word instead. Usually the address
4237 // comes as obj-start in obj and klass_offset_in_bytes in disp. However, sometimes C2
4238 // emits code that pre-computes obj-start + klass_offset_in_bytes into a register, and
4239 // then passes that register as obj and 0 in disp. The following code extracts the base
4240 // and offset to load the mark-word.
4241 int offset = oopDesc::mark_offset_in_bytes() + disp - oopDesc::klass_offset_in_bytes();
4242 movq(dst, Address(obj, index, scale, offset));
4243 testb(dst, markWord::monitor_value);
4244 jcc(Assembler::notZero, stub->entry());
4245 bind(stub->continuation());
4246 shrq(dst, markWord::klass_shift);
4247 }
4248 #endif