1 /*
2 * Copyright (c) 2020, 2021, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
24
25 #include "precompiled.hpp"
26 #include "asm/assembler.hpp"
27 #include "asm/assembler.inline.hpp"
28 #include "oops/methodData.hpp"
29 #include "opto/c2_MacroAssembler.hpp"
30 #include "opto/intrinsicnode.hpp"
31 #include "opto/opcodes.hpp"
32 #include "opto/subnode.hpp"
33 #include "runtime/biasedLocking.hpp"
34 #include "runtime/objectMonitor.hpp"
35 #include "runtime/stubRoutines.hpp"
36
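// Map a vector length in bytes to the VEX/EVEX vector-length encoding.
// Lengths below 16 bytes still use the 128-bit encoding, since no narrower
// encoding exists.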
37 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
38 switch (vlen_in_bytes) {
39 case 4: // fall-through
40 case 8: // fall-through
41 case 16: return Assembler::AVX_128bit;
42 case 32: return Assembler::AVX_256bit;
43 case 64: return Assembler::AVX_512bit;
44
45 default: {
46 ShouldNotReachHere();
47 return Assembler::AVX_NoVec;
48 }
49 }
50 }
51
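// Post-loop multiversioning support: build a k-register mask selecting the
// low 'src' lanes, i.e. (1 << src) - 1 (e.g. src == 3 gives 0b111), and leave
// the lane count in dst.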
52 void C2_MacroAssembler::setvectmask(Register dst, Register src, KRegister mask) {
53 guarantee(PostLoopMultiversioning, "must be");
54 Assembler::movl(dst, 1);
55 Assembler::shlxl(dst, dst, src);
56 Assembler::decl(dst);
57 Assembler::kmovdl(mask, dst);
58 Assembler::movl(dst, src);
59 }
60
61 void C2_MacroAssembler::restorevectmask(KRegister mask) {
62 guarantee(PostLoopMultiversioning, "must be");
63 Assembler::knotwl(mask, k0);
64 }
65
66 #if INCLUDE_RTM_OPT
67
68 // Update rtm_counters based on abort status
69 // input: abort_status
70 // rtm_counters (RTMLockingCounters*)
71 // flags are killed
72 void C2_MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {
73
74 atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
75 if (PrintPreciseRTMLockingStatistics) {
76 for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
77 Label check_abort;
78 testl(abort_status, (1<<i));
79 jccb(Assembler::equal, check_abort);
80 atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
81 bind(check_abort);
82 }
83 }
84 }
85
// Branch if ((random & (count-1)) != 0); count must be a power of two (2^n)
87 // tmp, scr and flags are killed
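// The low bits of the time-stamp counter serve as a cheap pseudo-random
// source: the branch is taken unless (rdtsc & (count-1)) == 0, so callers
// fall through to their counter update roughly once per 'count' executions.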
88 void C2_MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
89 assert(tmp == rax, "");
90 assert(scr == rdx, "");
91 rdtsc(); // modifies EDX:EAX
92 andptr(tmp, count-1);
93 jccb(Assembler::notZero, brLabel);
94 }
95
96 // Perform abort ratio calculation, set no_rtm bit if high ratio
97 // input: rtm_counters_Reg (RTMLockingCounters* address)
98 // tmpReg, rtm_counters_Reg and flags are killed
99 void C2_MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
100 Register rtm_counters_Reg,
101 RTMLockingCounters* rtm_counters,
102 Metadata* method_data) {
103 Label L_done, L_check_always_rtm1, L_check_always_rtm2;
104
105 if (RTMLockingCalculationDelay > 0) {
106 // Delay calculation
107 movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()), tmpReg);
108 testptr(tmpReg, tmpReg);
109 jccb(Assembler::equal, L_done);
110 }
111 // Abort ratio calculation only if abort_count > RTMAbortThreshold
112 // Aborted transactions = abort_count * 100
113 // All transactions = total_count * RTMTotalCountIncrRate
114 // Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
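  // e.g. with RTMAbortRatio == 50 the no_rtm bit is set once the abort count
  // reaches half of the estimated total number of transactions.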
115
116 movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
117 cmpptr(tmpReg, RTMAbortThreshold);
118 jccb(Assembler::below, L_check_always_rtm2);
119 imulptr(tmpReg, tmpReg, 100);
120
121 Register scrReg = rtm_counters_Reg;
122 movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
123 imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
124 imulptr(scrReg, scrReg, RTMAbortRatio);
125 cmpptr(tmpReg, scrReg);
126 jccb(Assembler::below, L_check_always_rtm1);
127 if (method_data != NULL) {
128 // set rtm_state to "no rtm" in MDO
129 mov_metadata(tmpReg, method_data);
130 lock();
131 orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), NoRTM);
132 }
133 jmpb(L_done);
134 bind(L_check_always_rtm1);
135 // Reload RTMLockingCounters* address
136 lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
137 bind(L_check_always_rtm2);
138 movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
139 cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
140 jccb(Assembler::below, L_done);
141 if (method_data != NULL) {
142 // set rtm_state to "always rtm" in MDO
143 mov_metadata(tmpReg, method_data);
144 lock();
145 orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), UseRTM);
146 }
147 bind(L_done);
148 }
149
150 // Update counters and perform abort ratio calculation
151 // input: abort_status_Reg
152 // rtm_counters_Reg, flags are killed
153 void C2_MacroAssembler::rtm_profiling(Register abort_status_Reg,
154 Register rtm_counters_Reg,
155 RTMLockingCounters* rtm_counters,
156 Metadata* method_data,
157 bool profile_rtm) {
158
159 assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
160 // update rtm counters based on rax value at abort
161 // reads abort_status_Reg, updates flags
162 lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
163 rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
164 if (profile_rtm) {
165 // Save abort status because abort_status_Reg is used by following code.
166 if (RTMRetryCount > 0) {
167 push(abort_status_Reg);
168 }
169 assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
170 rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
171 // restore abort status
172 if (RTMRetryCount > 0) {
173 pop(abort_status_Reg);
174 }
175 }
176 }
177
178 // Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
179 // inputs: retry_count_Reg
180 // : abort_status_Reg
181 // output: retry_count_Reg decremented by 1
182 // flags are killed
183 void C2_MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
184 Label doneRetry;
185 assert(abort_status_Reg == rax, "");
186 // The abort reason bits are in eax (see all states in rtmLocking.hpp)
187 // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
188 // if reason is in 0x6 and retry count != 0 then retry
189 andptr(abort_status_Reg, 0x6);
190 jccb(Assembler::zero, doneRetry);
191 testl(retry_count_Reg, retry_count_Reg);
192 jccb(Assembler::zero, doneRetry);
193 pause();
194 decrementl(retry_count_Reg);
195 jmp(retryLabel);
196 bind(doneRetry);
197 }
198
199 // Spin and retry if lock is busy,
200 // inputs: box_Reg (monitor address)
201 // : retry_count_Reg
202 // output: retry_count_Reg decremented by 1
203 // : clear z flag if retry count exceeded
204 // tmp_Reg, scr_Reg, flags are killed
205 void C2_MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
206 Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
207 Label SpinLoop, SpinExit, doneRetry;
208 int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
209
210 testl(retry_count_Reg, retry_count_Reg);
211 jccb(Assembler::zero, doneRetry);
212 decrementl(retry_count_Reg);
213 movptr(scr_Reg, RTMSpinLoopCount);
214
215 bind(SpinLoop);
216 pause();
217 decrementl(scr_Reg);
218 jccb(Assembler::lessEqual, SpinExit);
219 movptr(tmp_Reg, Address(box_Reg, owner_offset));
220 testptr(tmp_Reg, tmp_Reg);
221 jccb(Assembler::notZero, SpinLoop);
222
223 bind(SpinExit);
224 jmp(retryLabel);
225 bind(doneRetry);
226 incrementl(retry_count_Reg); // clear z flag
227 }
228
229 // Use RTM for normal stack locks
230 // Input: objReg (object to lock)
231 void C2_MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
232 Register retry_on_abort_count_Reg,
233 RTMLockingCounters* stack_rtm_counters,
234 Metadata* method_data, bool profile_rtm,
235 Label& DONE_LABEL, Label& IsInflated) {
236 assert(UseRTMForStackLocks, "why call this otherwise?");
237 assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
238 assert(tmpReg == rax, "");
239 assert(scrReg == rdx, "");
240 Label L_rtm_retry, L_decrement_retry, L_on_abort;
241
242 if (RTMRetryCount > 0) {
243 movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
244 bind(L_rtm_retry);
245 }
246 movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
247 testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral|biased
248 jcc(Assembler::notZero, IsInflated);
249
250 if (PrintPreciseRTMLockingStatistics || profile_rtm) {
251 Label L_noincrement;
252 if (RTMTotalCountIncrRate > 1) {
253 // tmpReg, scrReg and flags are killed
254 branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
255 }
256 assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
257 atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
258 bind(L_noincrement);
259 }
260 xbegin(L_on_abort);
261 movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
262 andptr(tmpReg, markWord::biased_lock_mask_in_place); // look at 3 lock bits
263 cmpptr(tmpReg, markWord::unlocked_value); // bits = 001 unlocked
264 jcc(Assembler::equal, DONE_LABEL); // all done if unlocked
265
266 Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
267 if (UseRTMXendForLockBusy) {
268 xend();
269 movptr(abort_status_Reg, 0x2); // Set the abort status to 2 (so we can retry)
270 jmp(L_decrement_retry);
271 }
272 else {
273 xabort(0);
274 }
275 bind(L_on_abort);
276 if (PrintPreciseRTMLockingStatistics || profile_rtm) {
277 rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
278 }
279 bind(L_decrement_retry);
280 if (RTMRetryCount > 0) {
281 // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
282 rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
283 }
284 }
285
// Use RTM for inflated locks
287 // inputs: objReg (object to lock)
288 // boxReg (on-stack box address (displaced header location) - KILLED)
289 // tmpReg (ObjectMonitor address + markWord::monitor_value)
290 void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
291 Register scrReg, Register retry_on_busy_count_Reg,
292 Register retry_on_abort_count_Reg,
293 RTMLockingCounters* rtm_counters,
294 Metadata* method_data, bool profile_rtm,
295 Label& DONE_LABEL) {
296 assert(UseRTMLocking, "why call this otherwise?");
297 assert(tmpReg == rax, "");
298 assert(scrReg == rdx, "");
299 Label L_rtm_retry, L_decrement_retry, L_on_abort;
300 int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
301
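  // Mark the box's displaced header as unused; any non-zero value keeps
  // fast_unlock's dhw == 0 recursive stack-lock test from treating this
  // inflated lock as a recursive stack-lock.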
302 // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
303 movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
304 movptr(boxReg, tmpReg); // Save ObjectMonitor address
305
306 if (RTMRetryCount > 0) {
307 movl(retry_on_busy_count_Reg, RTMRetryCount); // Retry on lock busy
308 movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
309 bind(L_rtm_retry);
310 }
311 if (PrintPreciseRTMLockingStatistics || profile_rtm) {
312 Label L_noincrement;
313 if (RTMTotalCountIncrRate > 1) {
314 // tmpReg, scrReg and flags are killed
315 branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
316 }
317 assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
318 atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
319 bind(L_noincrement);
320 }
321 xbegin(L_on_abort);
322 movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
323 movptr(tmpReg, Address(tmpReg, owner_offset));
324 testptr(tmpReg, tmpReg);
325 jcc(Assembler::zero, DONE_LABEL);
326 if (UseRTMXendForLockBusy) {
327 xend();
328 jmp(L_decrement_retry);
329 }
330 else {
331 xabort(0);
332 }
333 bind(L_on_abort);
334 Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
335 if (PrintPreciseRTMLockingStatistics || profile_rtm) {
336 rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
337 }
338 if (RTMRetryCount > 0) {
339 // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
340 rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
341 }
342
343 movptr(tmpReg, Address(boxReg, owner_offset)) ;
344 testptr(tmpReg, tmpReg) ;
345 jccb(Assembler::notZero, L_decrement_retry) ;
346
347 // Appears unlocked - try to swing _owner from null to non-null.
348 // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
349 #ifdef _LP64
350 Register threadReg = r15_thread;
351 #else
352 get_thread(scrReg);
353 Register threadReg = scrReg;
354 #endif
355 lock();
356 cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg
357
358 if (RTMRetryCount > 0) {
359 // success done else retry
360 jccb(Assembler::equal, DONE_LABEL) ;
361 bind(L_decrement_retry);
362 // Spin and retry if lock is busy.
363 rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
364 }
365 else {
366 bind(L_decrement_retry);
367 }
368 }
369
370 #endif // INCLUDE_RTM_OPT
371
372 // fast_lock and fast_unlock used by C2
373
374 // Because the transitions from emitted code to the runtime
375 // monitorenter/exit helper stubs are so slow it's critical that
376 // we inline both the stack-locking fast path and the inflated fast path.
377 //
378 // See also: cmpFastLock and cmpFastUnlock.
379 //
380 // What follows is a specialized inline transliteration of the code
381 // in enter() and exit(). If we're concerned about I$ bloat another
382 // option would be to emit TrySlowEnter and TrySlowExit methods
383 // at startup-time. These methods would accept arguments as
// (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
385 // indications in the icc.ZFlag. fast_lock and fast_unlock would simply
386 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
387 // In practice, however, the # of lock sites is bounded and is usually small.
388 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
392 //
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
394 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
395 // to those specialized methods. That'd give us a mostly platform-independent
396 // implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
// to park() or unpark() threads.  We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) issue explicit barriers or fence operations.
401 //
402 // TODO:
403 //
404 // * Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
405 // This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
406 // Given TLAB allocation, Self is usually manifested in a register, so passing it into
407 // the lock operators would typically be faster than reifying Self.
408 //
409 // * Ideally I'd define the primitives as:
410 // fast_lock (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
411 // fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
412 // Unfortunately ADLC bugs prevent us from expressing the ideal form.
//   Instead, we're stuck with the rather awkward and brittle register assignments below.
//   Furthermore, the register assignments are overconstrained, possibly resulting in
415 // sub-optimal code near the synchronization site.
416 //
417 // * Eliminate the sp-proximity tests and just use "== Self" tests instead.
418 // Alternately, use a better sp-proximity test.
419 //
420 // * Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
421 // Either one is sufficient to uniquely identify a thread.
422 // TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
423 //
424 // * Intrinsify notify() and notifyAll() for the common cases where the
425 // object is locked by the calling thread but the waitlist is empty.
//   This avoids the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
427 //
428 // * use jccb and jmpb instead of jcc and jmp to improve code density.
429 // But beware of excessive branch density on AMD Opterons.
430 //
431 // * Both fast_lock and fast_unlock set the ICC.ZF to indicate success
432 // or failure of the fast path. If the fast path fails then we pass
433 // control to the slow path, typically in C. In fast_lock and
434 // fast_unlock we often branch to DONE_LABEL, just to find that C2
435 // will emit a conditional branch immediately after the node.
436 // So we have branches to branches and lots of ICC.ZF games.
437 // Instead, it might be better to have C2 pass a "FailureLabel"
438 // into fast_lock and fast_unlock. In the case of success, control
439 // will drop through the node. ICC.ZF is undefined at exit.
440 // In the case of failure, the node will branch directly to the
441 // FailureLabel
442
443
444 // obj: object to lock
445 // box: on-stack box address (displaced header location) - KILLED
// rax: tmp -- KILLED
447 // scr: tmp -- KILLED
448 void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
449 Register scrReg, Register cx1Reg, Register cx2Reg,
450 BiasedLockingCounters* counters,
451 RTMLockingCounters* rtm_counters,
452 RTMLockingCounters* stack_rtm_counters,
453 Metadata* method_data,
454 bool use_rtm, bool profile_rtm) {
455 // Ensure the register assignments are disjoint
456 assert(tmpReg == rax, "");
457
458 if (use_rtm) {
459 assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
460 } else {
461 assert(cx2Reg == noreg, "");
462 assert_different_registers(objReg, boxReg, tmpReg, scrReg);
463 }
464
465 if (counters != NULL) {
466 atomic_incl(ExternalAddress((address)counters->total_entry_count_addr()), scrReg);
467 }
468
469 // Possible cases that we'll encounter in fast_lock
470 // ------------------------------------------------
471 // * Inflated
472 // -- unlocked
473 // -- Locked
474 // = by self
475 // = by other
476 // * biased
477 // -- by Self
478 // -- by other
479 // * neutral
480 // * stack-locked
481 // -- by self
482 // = sp-proximity test hits
483 // = sp-proximity test generates false-negative
484 // -- by other
485 //
486
487 Label IsInflated, DONE_LABEL;
488
489 if (DiagnoseSyncOnValueBasedClasses != 0) {
490 load_klass(tmpReg, objReg, cx1Reg);
491 movl(tmpReg, Address(tmpReg, Klass::access_flags_offset()));
492 testl(tmpReg, JVM_ACC_IS_VALUE_BASED_CLASS);
493 jcc(Assembler::notZero, DONE_LABEL);
494 }
495
496 // it's stack-locked, biased or neutral
497 // TODO: optimize away redundant LDs of obj->mark and improve the markword triage
498 // order to reduce the number of conditional branches in the most common cases.
499 // Beware -- there's a subtle invariant that fetch of the markword
500 // at [FETCH], below, will never observe a biased encoding (*101b).
501 // If this invariant is not held we risk exclusion (safety) failure.
502 if (UseBiasedLocking && !UseOptoBiasInlining) {
503 biased_locking_enter(boxReg, objReg, tmpReg, scrReg, cx1Reg, false, DONE_LABEL, NULL, counters);
504 }
505
506 #if INCLUDE_RTM_OPT
507 if (UseRTMForStackLocks && use_rtm) {
508 rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
509 stack_rtm_counters, method_data, profile_rtm,
510 DONE_LABEL, IsInflated);
511 }
512 #endif // INCLUDE_RTM_OPT
513
514 movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // [FETCH]
515 testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral|biased
516 jccb(Assembler::notZero, IsInflated);
517
518 // Attempt stack-locking ...
519 orptr (tmpReg, markWord::unlocked_value);
520 movptr(Address(boxReg, 0), tmpReg); // Anticipate successful CAS
521 lock();
522 cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Updates tmpReg
523 if (counters != NULL) {
524 cond_inc32(Assembler::equal,
525 ExternalAddress((address)counters->fast_path_entry_count_addr()));
526 }
527 jcc(Assembler::equal, DONE_LABEL); // Success
528
529 // Recursive locking.
530 // The object is stack-locked: markword contains stack pointer to BasicLock.
531 // Locked by current thread if difference with current SP is less than one page.
532 subptr(tmpReg, rsp);
  // The next instruction sets ZFlag == 1 (Success) if the difference is less than one page.
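  // On 64-bit, 7 - os::vm_page_size() is 0x...fffff007 for the common 4K page,
  // so the AND below leaves zero (ZF == 1) exactly when the markword is an
  // 8-byte-aligned address less than one page above rsp.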
534 andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) );
535 movptr(Address(boxReg, 0), tmpReg);
536 if (counters != NULL) {
537 cond_inc32(Assembler::equal,
538 ExternalAddress((address)counters->fast_path_entry_count_addr()));
539 }
540 jmp(DONE_LABEL);
541
542 bind(IsInflated);
  // The object is inflated. tmpReg contains the ObjectMonitor* + markWord::monitor_value
544
545 #if INCLUDE_RTM_OPT
546 // Use the same RTM locking code in 32- and 64-bit VM.
547 if (use_rtm) {
548 rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
549 rtm_counters, method_data, profile_rtm, DONE_LABEL);
550 } else {
551 #endif // INCLUDE_RTM_OPT
552
553 #ifndef _LP64
554 // The object is inflated.
555
556 // boxReg refers to the on-stack BasicLock in the current frame.
557 // We'd like to write:
558 // set box->_displaced_header = markWord::unused_mark(). Any non-0 value suffices.
  // This is convenient but results in an ST-before-CAS penalty.  The following CAS suffers
560 // additional latency as we have another ST in the store buffer that must drain.
561
562 // avoid ST-before-CAS
563 // register juggle because we need tmpReg for cmpxchgptr below
564 movptr(scrReg, boxReg);
565 movptr(boxReg, tmpReg); // consider: LEA box, [tmp-2]
566
567 // Optimistic form: consider XORL tmpReg,tmpReg
568 movptr(tmpReg, NULL_WORD);
569
570 // Appears unlocked - try to swing _owner from null to non-null.
571 // Ideally, I'd manifest "Self" with get_thread and then attempt
572 // to CAS the register containing Self into m->Owner.
573 // But we don't have enough registers, so instead we can either try to CAS
574 // rsp or the address of the box (in scr) into &m->owner. If the CAS succeeds
575 // we later store "Self" into m->Owner. Transiently storing a stack address
576 // (rsp or the address of the box) into m->owner is harmless.
577 // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
578 lock();
579 cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
580 movptr(Address(scrReg, 0), 3); // box->_displaced_header = 3
581 // If we weren't able to swing _owner from NULL to the BasicLock
582 // then take the slow path.
583 jccb (Assembler::notZero, DONE_LABEL);
584 // update _owner from BasicLock to thread
585 get_thread (scrReg); // beware: clobbers ICCs
586 movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
587 xorptr(boxReg, boxReg); // set icc.ZFlag = 1 to indicate success
588
589 // If the CAS fails we can either retry or pass control to the slow path.
590 // We use the latter tactic.
591 // Pass the CAS result in the icc.ZFlag into DONE_LABEL
592 // If the CAS was successful ...
593 // Self has acquired the lock
594 // Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
595 // Intentional fall-through into DONE_LABEL ...
596 #else // _LP64
597 // It's inflated and we use scrReg for ObjectMonitor* in this section.
598 movq(scrReg, tmpReg);
599 xorq(tmpReg, tmpReg);
600 lock();
601 cmpxchgptr(r15_thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
602 // Unconditionally set box->_displaced_header = markWord::unused_mark().
603 // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
604 movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
605 // Propagate ICC.ZF from CAS above into DONE_LABEL.
606 jcc(Assembler::equal, DONE_LABEL); // CAS above succeeded; propagate ZF = 1 (success)
607
608 cmpptr(r15_thread, rax); // Check if we are already the owner (recursive lock)
609 jcc(Assembler::notEqual, DONE_LABEL); // If not recursive, ZF = 0 at this point (fail)
610 incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
611 xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
612 #endif // _LP64
613 #if INCLUDE_RTM_OPT
614 } // use_rtm()
615 #endif
616 // DONE_LABEL is a hot target - we'd really like to place it at the
617 // start of cache line by padding with NOPs.
618 // See the AMD and Intel software optimization manuals for the
619 // most efficient "long" NOP encodings.
620 // Unfortunately none of our alignment mechanisms suffice.
621 bind(DONE_LABEL);
622
623 // At DONE_LABEL the icc ZFlag is set as follows ...
624 // fast_unlock uses the same protocol.
625 // ZFlag == 1 -> Success
626 // ZFlag == 0 -> Failure - force control through the slow path
627 }
628
629 // obj: object to unlock
630 // box: box address (displaced header location), killed. Must be EAX.
631 // tmp: killed, cannot be obj nor box.
632 //
633 // Some commentary on balanced locking:
634 //
635 // fast_lock and fast_unlock are emitted only for provably balanced lock sites.
636 // Methods that don't have provably balanced locking are forced to run in the
637 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
638 // The interpreter provides two properties:
639 // I1: At return-time the interpreter automatically and quietly unlocks any
//       objects acquired by the current activation (frame).  Recall that the
641 // interpreter maintains an on-stack list of locks currently held by
642 // a frame.
//  I2:  If a method attempts to unlock an object that is not held by
//       the frame, the interpreter throws IMSX.
645 //
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
647 // B() doesn't have provably balanced locking so it runs in the interpreter.
648 // Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O
649 // is still locked by A().
650 //
651 // The only other source of unbalanced locking would be JNI. The "Java Native Interface:
652 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
653 // should not be unlocked by "normal" java-level locking and vice-versa. The specification
654 // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably, given that the spec legislates the JNI case as undefined, our implementation
// could reasonably *avoid* checking the owner in fast_unlock().
657 // In the interest of performance we elide m->Owner==Self check in unlock.
658 // A perfectly viable alternative is to elide the owner check except when
659 // Xcheck:jni is enabled.
660
661 void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
662 assert(boxReg == rax, "");
663 assert_different_registers(objReg, boxReg, tmpReg);
664
665 Label DONE_LABEL, Stacked, CheckSucc;
666
667 // Critically, the biased locking test must have precedence over
668 // and appear before the (box->dhw == 0) recursive stack-lock test.
669 if (UseBiasedLocking && !UseOptoBiasInlining) {
670 biased_locking_exit(objReg, tmpReg, DONE_LABEL);
671 }
672
673 #if INCLUDE_RTM_OPT
674 if (UseRTMForStackLocks && use_rtm) {
675 assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
676 Label L_regular_unlock;
677 movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
678 andptr(tmpReg, markWord::biased_lock_mask_in_place); // look at 3 lock bits
679 cmpptr(tmpReg, markWord::unlocked_value); // bits = 001 unlocked
680 jccb(Assembler::notEqual, L_regular_unlock); // if !HLE RegularLock
681 xend(); // otherwise end...
682 jmp(DONE_LABEL); // ... and we're done
683 bind(L_regular_unlock);
684 }
685 #endif
686
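  // A successful recursive stack-lock in fast_lock stores 0 into the box's
  // displaced header, so dhw == 0 here means this is the inner unlock of a
  // recursive stack-lock and the object's markword is left untouched.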
687 cmpptr(Address(boxReg, 0), (int32_t)NULL_WORD); // Examine the displaced header
688 jcc (Assembler::zero, DONE_LABEL); // 0 indicates recursive stack-lock
689 movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Examine the object's markword
690 testptr(tmpReg, markWord::monitor_value); // Inflated?
691 jccb (Assembler::zero, Stacked);
692
693 // It's inflated.
694 #if INCLUDE_RTM_OPT
695 if (use_rtm) {
696 Label L_regular_inflated_unlock;
697 int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
698 movptr(boxReg, Address(tmpReg, owner_offset));
699 testptr(boxReg, boxReg);
700 jccb(Assembler::notZero, L_regular_inflated_unlock);
701 xend();
702 jmpb(DONE_LABEL);
703 bind(L_regular_inflated_unlock);
704 }
705 #endif
706
707 // Despite our balanced locking property we still check that m->_owner == Self
708 // as java routines or native JNI code called by this thread might
709 // have released the lock.
710 // Refer to the comments in synchronizer.cpp for how we might encode extra
711 // state in _succ so we can avoid fetching EntryList|cxq.
712 //
713 // If there's no contention try a 1-0 exit. That is, exit without
714 // a costly MEMBAR or CAS. See synchronizer.cpp for details on how
715 // we detect and recover from the race that the 1-0 exit admits.
716 //
717 // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
718 // before it STs null into _owner, releasing the lock. Updates
719 // to data protected by the critical section must be visible before
720 // we drop the lock (and thus before any other thread could acquire
721 // the lock and observe the fields protected by the lock).
  // IA32's memory model is TSO (total store order), so STs are ordered with respect to
723 // each other and there's no need for an explicit barrier (fence).
724 // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
725 #ifndef _LP64
726 get_thread (boxReg);
727
728 // Note that we could employ various encoding schemes to reduce
729 // the number of loads below (currently 4) to just 2 or 3.
730 // Refer to the comments in synchronizer.cpp.
731 // In practice the chain of fetches doesn't seem to impact performance, however.
732 xorptr(boxReg, boxReg);
733 orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
734 jccb (Assembler::notZero, DONE_LABEL);
735 movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
736 orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
737 jccb (Assembler::notZero, CheckSucc);
738 movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
739 jmpb (DONE_LABEL);
740
741 bind (Stacked);
742 // It's not inflated and it's not recursively stack-locked and it's not biased.
743 // It must be stack-locked.
744 // Try to reset the header to displaced header.
745 // The "box" value on the stack is stable, so we can reload
746 // and be assured we observe the same value as above.
747 movptr(tmpReg, Address(boxReg, 0));
748 lock();
749 cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
  // Intentional fall-through into DONE_LABEL
751
752 // DONE_LABEL is a hot target - we'd really like to place it at the
753 // start of cache line by padding with NOPs.
754 // See the AMD and Intel software optimization manuals for the
755 // most efficient "long" NOP encodings.
756 // Unfortunately none of our alignment mechanisms suffice.
757 bind (CheckSucc);
758 #else // _LP64
759 // It's inflated
760 Label LNotRecursive, LSuccess, LGoSlowPath;
761
762 cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
763 jccb(Assembler::equal, LNotRecursive);
764
765 // Recursive inflated unlock
766 decq(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
767 jmpb(LSuccess);
768
769 bind(LNotRecursive);
770 movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
771 orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
772 jccb (Assembler::notZero, CheckSucc);
773 // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
774 movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
775 jmpb (DONE_LABEL);
776
777 // Try to avoid passing control into the slow_path ...
778 bind (CheckSucc);
779
780 // The following optional optimization can be elided if necessary
781 // Effectively: if (succ == null) goto slow path
782 // The code reduces the window for a race, however,
783 // and thus benefits performance.
784 cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
785 jccb (Assembler::zero, LGoSlowPath);
786
787 xorptr(boxReg, boxReg);
788 // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
789 movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
790
791 // Memory barrier/fence
792 // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
793 // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
794 // This is faster on Nehalem and AMD Shanghai/Barcelona.
795 // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
796 // We might also restructure (ST Owner=0;barrier;LD _Succ) to
797 // (mov box,0; xchgq box, &m->Owner; LD _succ) .
798 lock(); addl(Address(rsp, 0), 0);
799
800 cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
801 jccb (Assembler::notZero, LSuccess);
802
803 // Rare inopportune interleaving - race.
804 // The successor vanished in the small window above.
805 // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
806 // We need to ensure progress and succession.
807 // Try to reacquire the lock.
808 // If that fails then the new owner is responsible for succession and this
809 // thread needs to take no further action and can exit via the fast path (success).
810 // If the re-acquire succeeds then pass control into the slow path.
  // As implemented, this latter mode is horrible because we generate more
  // coherence traffic on the lock *and* artificially extend the critical section
  // length by virtue of passing control into the slow path.
814
815 // box is really RAX -- the following CMPXCHG depends on that binding
816 // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
817 lock();
818 cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
819 // There's no successor so we tried to regrab the lock.
820 // If that didn't work, then another thread grabbed the
821 // lock so we're done (and exit was a success).
822 jccb (Assembler::notEqual, LSuccess);
823 // Intentional fall-through into slow path
824
825 bind (LGoSlowPath);
826 orl (boxReg, 1); // set ICC.ZF=0 to indicate failure
827 jmpb (DONE_LABEL);
828
829 bind (LSuccess);
830 testl (boxReg, 0); // set ICC.ZF=1 to indicate success
831 jmpb (DONE_LABEL);
832
833 bind (Stacked);
834 movptr(tmpReg, Address (boxReg, 0)); // re-fetch
835 lock();
836 cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
837
838 #endif
839 bind(DONE_LABEL);
840 }
841
842 //-------------------------------------------------------------------------------------------
// Generic instruction support used by C2 code generation from .ad files
844
845 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, Register scr) {
846 if (dst != src) {
847 movdqu(dst, src);
848 }
849 if (opcode == Op_AbsVD) {
850 andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), scr);
851 } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
853 xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), scr);
854 }
855 }
856
857 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
858 if (opcode == Op_AbsVD) {
859 vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, scr);
860 } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
862 vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, scr);
863 }
864 }
865
866 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, Register scr) {
867 if (dst != src) {
868 movdqu(dst, src);
869 }
870 if (opcode == Op_AbsVF) {
871 andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), scr);
872 } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
874 xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), scr);
875 }
876 }
877
878 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
879 if (opcode == Op_AbsVF) {
880 vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, scr);
881 } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
883 vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, scr);
884 }
885 }
886
887 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
888 assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
889 assert(tmp == xnoreg || elem_bt == T_LONG, "unused");
890
891 if (opcode == Op_MinV) {
892 if (elem_bt == T_BYTE) {
893 pminsb(dst, src);
894 } else if (elem_bt == T_SHORT) {
895 pminsw(dst, src);
896 } else if (elem_bt == T_INT) {
897 pminsd(dst, src);
898 } else {
899 assert(elem_bt == T_LONG, "required");
900 assert(tmp == xmm0, "required");
901 assert_different_registers(dst, src, tmp);
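      // SSE4.1 blendvpd uses xmm0 as its implicit mask: xmm0 = (dst > src)
      // per lane, so the blend replaces dst with src exactly where dst is
      // larger, leaving the element-wise minimum in dst.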
902 movdqu(xmm0, dst);
903 pcmpgtq(xmm0, src);
904 blendvpd(dst, src); // xmm0 as mask
905 }
906 } else { // opcode == Op_MaxV
907 if (elem_bt == T_BYTE) {
908 pmaxsb(dst, src);
909 } else if (elem_bt == T_SHORT) {
910 pmaxsw(dst, src);
911 } else if (elem_bt == T_INT) {
912 pmaxsd(dst, src);
913 } else {
914 assert(elem_bt == T_LONG, "required");
915 assert(tmp == xmm0, "required");
916 assert_different_registers(dst, src, tmp);
917 movdqu(xmm0, src);
918 pcmpgtq(xmm0, dst);
919 blendvpd(dst, src); // xmm0 as mask
920 }
921 }
922 }
923
924 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
925 XMMRegister dst, XMMRegister src1, XMMRegister src2,
926 int vlen_enc) {
927 assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
928
929 if (opcode == Op_MinV) {
930 if (elem_bt == T_BYTE) {
931 vpminsb(dst, src1, src2, vlen_enc);
932 } else if (elem_bt == T_SHORT) {
933 vpminsw(dst, src1, src2, vlen_enc);
934 } else if (elem_bt == T_INT) {
935 vpminsd(dst, src1, src2, vlen_enc);
936 } else {
937 assert(elem_bt == T_LONG, "required");
938 if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
939 vpminsq(dst, src1, src2, vlen_enc);
940 } else {
941 assert_different_registers(dst, src1, src2);
942 vpcmpgtq(dst, src1, src2, vlen_enc);
943 vblendvpd(dst, src1, src2, dst, vlen_enc);
944 }
945 }
946 } else { // opcode == Op_MaxV
947 if (elem_bt == T_BYTE) {
948 vpmaxsb(dst, src1, src2, vlen_enc);
949 } else if (elem_bt == T_SHORT) {
950 vpmaxsw(dst, src1, src2, vlen_enc);
951 } else if (elem_bt == T_INT) {
952 vpmaxsd(dst, src1, src2, vlen_enc);
953 } else {
954 assert(elem_bt == T_LONG, "required");
955 if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
956 vpmaxsq(dst, src1, src2, vlen_enc);
957 } else {
958 assert_different_registers(dst, src1, src2);
959 vpcmpgtq(dst, src1, src2, vlen_enc);
960 vblendvpd(dst, src2, src1, dst, vlen_enc);
961 }
962 }
963 }
964 }
965
966 // Float/Double min max
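//
// The x86 SIMD min/max instructions do not match Java semantics for -0.0 and
// NaN (when the inputs compare equal or unordered they simply return the
// second operand), so the operands are first blended on the sign of one input
// and a final unordered compare re-selects the result when an input is NaN.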
967
968 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
969 XMMRegister dst, XMMRegister a, XMMRegister b,
970 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
971 int vlen_enc) {
972 assert(UseAVX > 0, "required");
973 assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
974 opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
975 assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
976 assert_different_registers(a, b, tmp, atmp, btmp);
977
978 bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
979 bool is_double_word = is_double_word_type(elem_bt);
980
981 if (!is_double_word && is_min) {
982 vblendvps(atmp, a, b, a, vlen_enc);
983 vblendvps(btmp, b, a, a, vlen_enc);
984 vminps(tmp, atmp, btmp, vlen_enc);
985 vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
986 vblendvps(dst, tmp, atmp, btmp, vlen_enc);
987 } else if (!is_double_word && !is_min) {
988 vblendvps(btmp, b, a, b, vlen_enc);
989 vblendvps(atmp, a, b, b, vlen_enc);
990 vmaxps(tmp, atmp, btmp, vlen_enc);
991 vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
992 vblendvps(dst, tmp, atmp, btmp, vlen_enc);
993 } else if (is_double_word && is_min) {
994 vblendvpd(atmp, a, b, a, vlen_enc);
995 vblendvpd(btmp, b, a, a, vlen_enc);
996 vminpd(tmp, atmp, btmp, vlen_enc);
997 vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
998 vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
999 } else {
1000 assert(is_double_word && !is_min, "sanity");
1001 vblendvpd(btmp, b, a, b, vlen_enc);
1002 vblendvpd(atmp, a, b, b, vlen_enc);
1003 vmaxpd(tmp, atmp, btmp, vlen_enc);
1004 vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1005 vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
1006 }
1007 }
1008
1009 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
1010 XMMRegister dst, XMMRegister a, XMMRegister b,
1011 KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
1012 int vlen_enc) {
1013 assert(UseAVX > 2, "required");
1014 assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
1015 opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
1016 assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
1017 assert_different_registers(dst, a, b, atmp, btmp);
1018
1019 bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
1020 bool is_double_word = is_double_word_type(elem_bt);
1021 bool merge = true;
1022
1023 if (!is_double_word && is_min) {
1024 evpmovd2m(ktmp, a, vlen_enc);
1025 evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1026 evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1027 vminps(dst, atmp, btmp, vlen_enc);
1028 evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1029 evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1030 } else if (!is_double_word && !is_min) {
1031 evpmovd2m(ktmp, b, vlen_enc);
1032 evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
1033 evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
1034 vmaxps(dst, atmp, btmp, vlen_enc);
1035 evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1036 evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
1037 } else if (is_double_word && is_min) {
1038 evpmovq2m(ktmp, a, vlen_enc);
1039 evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1040 evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1041 vminpd(dst, atmp, btmp, vlen_enc);
1042 evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1043 evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1044 } else {
1045 assert(is_double_word && !is_min, "sanity");
1046 evpmovq2m(ktmp, b, vlen_enc);
1047 evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
1048 evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
1049 vmaxpd(dst, atmp, btmp, vlen_enc);
1050 evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
1051 evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
1052 }
1053 }
1054
1055 // Float/Double signum
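// dst is both the argument and the result: +-0.0 and NaN are returned
// unchanged, positive arguments produce 1.0 and negative arguments produce
// -1.0 (the sign-flip constant turns the loaded 1.0 into -1.0).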
1056 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst,
1057 XMMRegister zero, XMMRegister one,
1058 Register scratch) {
1059 assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");
1060
1061 Label DONE_LABEL;
1062
1063 if (opcode == Op_SignumF) {
1064 assert(UseSSE > 0, "required");
1065 ucomiss(dst, zero);
    jcc(Assembler::equal, DONE_LABEL);  // handle special case +0.0/-0.0: if the argument is +0.0/-0.0, return the argument
    jcc(Assembler::parity, DONE_LABEL); // handle special case NaN: if the argument is NaN, return NaN
1068 movflt(dst, one);
1069 jcc(Assembler::above, DONE_LABEL);
1070 xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), scratch);
1071 } else if (opcode == Op_SignumD) {
1072 assert(UseSSE > 1, "required");
1073 ucomisd(dst, zero);
    jcc(Assembler::equal, DONE_LABEL);  // handle special case +0.0/-0.0: if the argument is +0.0/-0.0, return the argument
    jcc(Assembler::parity, DONE_LABEL); // handle special case NaN: if the argument is NaN, return NaN
1076 movdbl(dst, one);
1077 jcc(Assembler::above, DONE_LABEL);
1078 xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), scratch);
1079 }
1080
1081 bind(DONE_LABEL);
1082 }
1083
1084 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
1085 if (sign) {
1086 pmovsxbw(dst, src);
1087 } else {
1088 pmovzxbw(dst, src);
1089 }
1090 }
1091
1092 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1093 if (sign) {
1094 vpmovsxbw(dst, src, vector_len);
1095 } else {
1096 vpmovzxbw(dst, src, vector_len);
1097 }
1098 }
1099
1100 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1101 if (sign) {
1102 vpmovsxbd(dst, src, vector_len);
1103 } else {
1104 vpmovzxbd(dst, src, vector_len);
1105 }
1106 }
1107
1108 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
1109 if (sign) {
1110 vpmovsxwd(dst, src, vector_len);
1111 } else {
1112 vpmovzxwd(dst, src, vector_len);
1113 }
1114 }
1115
1116 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1117 int shift, int vector_len) {
1118 if (opcode == Op_RotateLeftV) {
1119 if (etype == T_INT) {
1120 evprold(dst, src, shift, vector_len);
1121 } else {
1122 assert(etype == T_LONG, "expected type T_LONG");
1123 evprolq(dst, src, shift, vector_len);
1124 }
1125 } else {
1126 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1127 if (etype == T_INT) {
1128 evprord(dst, src, shift, vector_len);
1129 } else {
1130 assert(etype == T_LONG, "expected type T_LONG");
1131 evprorq(dst, src, shift, vector_len);
1132 }
1133 }
1134 }
1135
1136 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
1137 XMMRegister shift, int vector_len) {
1138 if (opcode == Op_RotateLeftV) {
1139 if (etype == T_INT) {
1140 evprolvd(dst, src, shift, vector_len);
1141 } else {
1142 assert(etype == T_LONG, "expected type T_LONG");
1143 evprolvq(dst, src, shift, vector_len);
1144 }
1145 } else {
1146 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
1147 if (etype == T_INT) {
1148 evprorvd(dst, src, shift, vector_len);
1149 } else {
1150 assert(etype == T_LONG, "expected type T_LONG");
1151 evprorvq(dst, src, shift, vector_len);
1152 }
1153 }
1154 }
1155
1156 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
1157 if (opcode == Op_RShiftVI) {
1158 psrad(dst, shift);
1159 } else if (opcode == Op_LShiftVI) {
1160 pslld(dst, shift);
1161 } else {
1162 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1163 psrld(dst, shift);
1164 }
1165 }
1166
1167 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
1168 switch (opcode) {
1169 case Op_RShiftVI: psrad(dst, shift); break;
1170 case Op_LShiftVI: pslld(dst, shift); break;
1171 case Op_URShiftVI: psrld(dst, shift); break;
1172
1173 default: assert(false, "%s", NodeClassNames[opcode]);
1174 }
1175 }
1176
1177 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1178 if (opcode == Op_RShiftVI) {
1179 vpsrad(dst, nds, shift, vector_len);
1180 } else if (opcode == Op_LShiftVI) {
1181 vpslld(dst, nds, shift, vector_len);
1182 } else {
1183 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
1184 vpsrld(dst, nds, shift, vector_len);
1185 }
1186 }
1187
1188 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1189 switch (opcode) {
1190 case Op_RShiftVI: vpsrad(dst, src, shift, vlen_enc); break;
1191 case Op_LShiftVI: vpslld(dst, src, shift, vlen_enc); break;
1192 case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;
1193
1194 default: assert(false, "%s", NodeClassNames[opcode]);
1195 }
1196 }
1197
1198 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
1199 switch (opcode) {
1200 case Op_RShiftVB: // fall-through
1201 case Op_RShiftVS: psraw(dst, shift); break;
1202
1203 case Op_LShiftVB: // fall-through
1204 case Op_LShiftVS: psllw(dst, shift); break;
1205
1206 case Op_URShiftVS: // fall-through
1207 case Op_URShiftVB: psrlw(dst, shift); break;
1208
1209 default: assert(false, "%s", NodeClassNames[opcode]);
1210 }
1211 }
1212
1213 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1214 switch (opcode) {
1215 case Op_RShiftVB: // fall-through
1216 case Op_RShiftVS: vpsraw(dst, src, shift, vlen_enc); break;
1217
1218 case Op_LShiftVB: // fall-through
1219 case Op_LShiftVS: vpsllw(dst, src, shift, vlen_enc); break;
1220
1221 case Op_URShiftVS: // fall-through
1222 case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;
1223
1224 default: assert(false, "%s", NodeClassNames[opcode]);
1225 }
1226 }
1227
1228 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
1229 switch (opcode) {
    case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
1231 case Op_LShiftVL: psllq(dst, shift); break;
1232 case Op_URShiftVL: psrlq(dst, shift); break;
1233
1234 default: assert(false, "%s", NodeClassNames[opcode]);
1235 }
1236 }
1237
1238 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
1239 if (opcode == Op_RShiftVL) {
    psrlq(dst, shift);  // using srl to implement sra on pre-avx512 systems
1241 } else if (opcode == Op_LShiftVL) {
1242 psllq(dst, shift);
1243 } else {
1244 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1245 psrlq(dst, shift);
1246 }
1247 }
1248
1249 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1250 switch (opcode) {
1251 case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break;
1252 case Op_LShiftVL: vpsllq(dst, src, shift, vlen_enc); break;
1253 case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
1254
1255 default: assert(false, "%s", NodeClassNames[opcode]);
1256 }
1257 }
1258
1259 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
1260 if (opcode == Op_RShiftVL) {
1261 evpsraq(dst, nds, shift, vector_len);
1262 } else if (opcode == Op_LShiftVL) {
1263 vpsllq(dst, nds, shift, vector_len);
1264 } else {
1265 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
1266 vpsrlq(dst, nds, shift, vector_len);
1267 }
1268 }
1269
1270 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1271 switch (opcode) {
1272 case Op_RShiftVB: // fall-through
1273 case Op_RShiftVS: // fall-through
1274 case Op_RShiftVI: vpsravd(dst, src, shift, vlen_enc); break;
1275
1276 case Op_LShiftVB: // fall-through
1277 case Op_LShiftVS: // fall-through
1278 case Op_LShiftVI: vpsllvd(dst, src, shift, vlen_enc); break;
1279
1280 case Op_URShiftVB: // fall-through
1281 case Op_URShiftVS: // fall-through
1282 case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;
1283
1284 default: assert(false, "%s", NodeClassNames[opcode]);
1285 }
1286 }
1287
1288 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
1289 switch (opcode) {
1290 case Op_RShiftVB: // fall-through
1291 case Op_RShiftVS: evpsravw(dst, src, shift, vlen_enc); break;
1292
1293 case Op_LShiftVB: // fall-through
1294 case Op_LShiftVS: evpsllvw(dst, src, shift, vlen_enc); break;
1295
1296 case Op_URShiftVB: // fall-through
1297 case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;
1298
1299 default: assert(false, "%s", NodeClassNames[opcode]);
1300 }
1301 }
1302
1303 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
1304 assert(UseAVX >= 2, "required");
1305 switch (opcode) {
1306 case Op_RShiftVL: {
1307 if (UseAVX > 2) {
1308 assert(tmp == xnoreg, "not used");
1309 if (!VM_Version::supports_avx512vl()) {
1310 vlen_enc = Assembler::AVX_512bit;
1311 }
1312 evpsravq(dst, src, shift, vlen_enc);
1313 } else {
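        // AVX2 lacks a 64-bit arithmetic right shift, so emulate it with
        // logical shifts: sra(x, s) == (srl(x, s) ^ srl(m, s)) - srl(m, s),
        // where m is the per-lane sign-bit mask loaded below.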
1314 vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
1315 vpsrlvq(dst, src, shift, vlen_enc);
1316 vpsrlvq(tmp, tmp, shift, vlen_enc);
1317 vpxor(dst, dst, tmp, vlen_enc);
1318 vpsubq(dst, dst, tmp, vlen_enc);
1319 }
1320 break;
1321 }
1322 case Op_LShiftVL: {
1323 assert(tmp == xnoreg, "not used");
1324 vpsllvq(dst, src, shift, vlen_enc);
1325 break;
1326 }
1327 case Op_URShiftVL: {
1328 assert(tmp == xnoreg, "not used");
1329 vpsrlvq(dst, src, shift, vlen_enc);
1330 break;
1331 }
1332 default: assert(false, "%s", NodeClassNames[opcode]);
1333 }
1334 }
1335
1336 // Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst
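// (x86 has no byte-granular variable shift, so the bytes are widened to
// dwords, shifted with the dword variable-shift form, masked back into byte
// range, and re-packed.)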
1337 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch) {
1338 assert(opcode == Op_LShiftVB ||
1339 opcode == Op_RShiftVB ||
1340 opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1341 bool sign = (opcode != Op_URShiftVB);
1342 assert(vector_len == 0, "required");
1343 vextendbd(sign, dst, src, 1);
1344 vpmovzxbd(vtmp, shift, 1);
1345 varshiftd(opcode, dst, dst, vtmp, 1);
1346 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, scratch);
1347 vextracti128_high(vtmp, dst);
1348 vpackusdw(dst, dst, vtmp, 0);
1349 }
1350
1351 // Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst
1352 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch) {
1353 assert(opcode == Op_LShiftVB ||
1354 opcode == Op_RShiftVB ||
1355 opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
1356 bool sign = (opcode != Op_URShiftVB);
1357 int ext_vector_len = vector_len + 1;
1358 vextendbw(sign, dst, src, ext_vector_len);
1359 vpmovzxbw(vtmp, shift, ext_vector_len);
1360 varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
1361 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, scratch);
1362 if (vector_len == 0) {
1363 vextracti128_high(vtmp, dst);
1364 vpackuswb(dst, dst, vtmp, vector_len);
1365 } else {
1366 vextracti64x4_high(vtmp, dst);
1367 vpackuswb(dst, dst, vtmp, vector_len);
1368 vpermq(dst, dst, 0xD8, vector_len);
1369 }
1370 }
1371
1372 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
1373 switch(typ) {
1374 case T_BYTE:
1375 pinsrb(dst, val, idx);
1376 break;
1377 case T_SHORT:
1378 pinsrw(dst, val, idx);
1379 break;
1380 case T_INT:
1381 pinsrd(dst, val, idx);
1382 break;
1383 case T_LONG:
1384 pinsrq(dst, val, idx);
1385 break;
1386 default:
1387 assert(false,"Should not reach here.");
1388 break;
1389 }
1390 }
1391
1392 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
1393 switch(typ) {
1394 case T_BYTE:
1395 vpinsrb(dst, src, val, idx);
1396 break;
1397 case T_SHORT:
1398 vpinsrw(dst, src, val, idx);
1399 break;
1400 case T_INT:
1401 vpinsrd(dst, src, val, idx);
1402 break;
1403 case T_LONG:
1404 vpinsrq(dst, src, val, idx);
1405 break;
1406 default:
1407 assert(false,"Should not reach here.");
1408 break;
1409 }
1410 }
1411
1412 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
1413 switch(typ) {
1414 case T_INT:
1415 vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
1416 break;
1417 case T_FLOAT:
1418 vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
1419 break;
1420 case T_LONG:
1421 vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
1422 break;
1423 case T_DOUBLE:
1424 vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
1425 break;
1426 default:
1427 assert(false,"Should not reach here.");
1428 break;
1429 }
1430 }
1431
1432 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
1433 switch(typ) {
1434 case T_INT:
1435 evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
1436 break;
1437 case T_FLOAT:
1438 evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
1439 break;
1440 case T_LONG:
1441 evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
1442 break;
1443 case T_DOUBLE:
1444 evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
1445 break;
1446 default:
1447 assert(false,"Should not reach here.");
1448 break;
1449 }
1450 }
1451
1452 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
1453 switch(typ) {
1454 case T_INT:
1455 evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
1456 break;
1457 case T_FLOAT:
1458 evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
1459 break;
1460 case T_LONG:
1461 evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
1462 break;
1463 case T_DOUBLE:
1464 evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
1465 break;
1466 default:
1467 assert(false,"Should not reach here.");
1468 break;
1469 }
1470 }
1471
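// Expand a vector of booleans (0/1 per byte) into a full element-wide mask:
// 0 - x turns each byte into 0 or -1, which is then sign-extended to the
// element size given by elem_bt.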
1472 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) {
1473 if (vlen_in_bytes <= 16) {
1474 pxor (dst, dst);
1475 psubb(dst, src);
1476 switch (elem_bt) {
1477 case T_BYTE: /* nothing to do */ break;
1478 case T_SHORT: pmovsxbw(dst, dst); break;
1479 case T_INT: pmovsxbd(dst, dst); break;
1480 case T_FLOAT: pmovsxbd(dst, dst); break;
1481 case T_LONG: pmovsxbq(dst, dst); break;
1482 case T_DOUBLE: pmovsxbq(dst, dst); break;
1483
1484 default: assert(false, "%s", type2name(elem_bt));
1485 }
1486 } else {
1487 assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, "");
1488 int vlen_enc = vector_length_encoding(vlen_in_bytes);
1489
1490 vpxor (dst, dst, dst, vlen_enc);
1491 vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc);
1492
1493 switch (elem_bt) {
1494 case T_BYTE: /* nothing to do */ break;
1495 case T_SHORT: vpmovsxbw(dst, dst, vlen_enc); break;
1496 case T_INT: vpmovsxbd(dst, dst, vlen_enc); break;
1497 case T_FLOAT: vpmovsxbd(dst, dst, vlen_enc); break;
1498 case T_LONG: vpmovsxbq(dst, dst, vlen_enc); break;
1499 case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;
1500
1501 default: assert(false, "%s", type2name(elem_bt));
1502 }
1503 }
1504 }
1505
1506 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, Register scratch, int vlen_in_bytes) {
1507 ExternalAddress addr(StubRoutines::x86::vector_iota_indices());
1508 if (vlen_in_bytes == 4) {
1509 movdl(dst, addr);
1510 } else if (vlen_in_bytes == 8) {
1511 movq(dst, addr);
1512 } else if (vlen_in_bytes == 16) {
1513 movdqu(dst, addr, scratch);
1514 } else if (vlen_in_bytes == 32) {
1515 vmovdqu(dst, addr, scratch);
1516 } else {
1517 assert(vlen_in_bytes == 64, "%d", vlen_in_bytes);
1518 evmovdqub(dst, k0, addr, false /*merge*/, Assembler::AVX_512bit, scratch);
1519 }
1520 }
1521
1522 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
1523
1524 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
1525 int vector_len = Assembler::AVX_128bit;
1526
1527 switch (opcode) {
1528 case Op_AndReductionV: pand(dst, src); break;
1529 case Op_OrReductionV: por (dst, src); break;
1530 case Op_XorReductionV: pxor(dst, src); break;
1531 case Op_MinReductionV:
1532 switch (typ) {
1533 case T_BYTE: pminsb(dst, src); break;
1534 case T_SHORT: pminsw(dst, src); break;
1535 case T_INT: pminsd(dst, src); break;
1536 case T_LONG: assert(UseAVX > 2, "required");
1537 vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
1538 default: assert(false, "wrong type");
1539 }
1540 break;
1541 case Op_MaxReductionV:
1542 switch (typ) {
1543 case T_BYTE: pmaxsb(dst, src); break;
1544 case T_SHORT: pmaxsw(dst, src); break;
1545 case T_INT: pmaxsd(dst, src); break;
1546 case T_LONG: assert(UseAVX > 2, "required");
1547 vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
1548 default: assert(false, "wrong type");
1549 }
1550 break;
1551 case Op_AddReductionVF: addss(dst, src); break;
1552 case Op_AddReductionVD: addsd(dst, src); break;
1553 case Op_AddReductionVI:
1554 switch (typ) {
1555 case T_BYTE: paddb(dst, src); break;
1556 case T_SHORT: paddw(dst, src); break;
1557 case T_INT: paddd(dst, src); break;
1558 default: assert(false, "wrong type");
1559 }
1560 break;
1561 case Op_AddReductionVL: paddq(dst, src); break;
1562 case Op_MulReductionVF: mulss(dst, src); break;
1563 case Op_MulReductionVD: mulsd(dst, src); break;
1564 case Op_MulReductionVI:
1565 switch (typ) {
1566 case T_SHORT: pmullw(dst, src); break;
1567 case T_INT: pmulld(dst, src); break;
1568 default: assert(false, "wrong type");
1569 }
1570 break;
1571 case Op_MulReductionVL: assert(UseAVX > 2, "required");
1572 vpmullq(dst, dst, src, vector_len); break;
1573 default: assert(false, "wrong opcode");
1574 }
1575 }
1576
1577 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
1578 int vector_len = Assembler::AVX_256bit;
1579
1580 switch (opcode) {
1581 case Op_AndReductionV: vpand(dst, src1, src2, vector_len); break;
1582 case Op_OrReductionV: vpor (dst, src1, src2, vector_len); break;
1583 case Op_XorReductionV: vpxor(dst, src1, src2, vector_len); break;
1584 case Op_MinReductionV:
1585 switch (typ) {
1586 case T_BYTE: vpminsb(dst, src1, src2, vector_len); break;
1587 case T_SHORT: vpminsw(dst, src1, src2, vector_len); break;
1588 case T_INT: vpminsd(dst, src1, src2, vector_len); break;
1589 case T_LONG: assert(UseAVX > 2, "required");
1590 vpminsq(dst, src1, src2, vector_len); break;
1591 default: assert(false, "wrong type");
1592 }
1593 break;
1594 case Op_MaxReductionV:
1595 switch (typ) {
1596 case T_BYTE: vpmaxsb(dst, src1, src2, vector_len); break;
1597 case T_SHORT: vpmaxsw(dst, src1, src2, vector_len); break;
1598 case T_INT: vpmaxsd(dst, src1, src2, vector_len); break;
1599 case T_LONG: assert(UseAVX > 2, "required");
1600 vpmaxsq(dst, src1, src2, vector_len); break;
1601 default: assert(false, "wrong type");
1602 }
1603 break;
1604 case Op_AddReductionVI:
1605 switch (typ) {
1606 case T_BYTE: vpaddb(dst, src1, src2, vector_len); break;
1607 case T_SHORT: vpaddw(dst, src1, src2, vector_len); break;
1608 case T_INT: vpaddd(dst, src1, src2, vector_len); break;
1609 default: assert(false, "wrong type");
1610 }
1611 break;
1612 case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
1613 case Op_MulReductionVI:
1614 switch (typ) {
1615 case T_SHORT: vpmullw(dst, src1, src2, vector_len); break;
1616 case T_INT: vpmulld(dst, src1, src2, vector_len); break;
1617 default: assert(false, "wrong type");
1618 }
1619 break;
1620 case Op_MulReductionVL: vpmullq(dst, src1, src2, vector_len); break;
1621 default: assert(false, "wrong opcode");
1622 }
1623 }
1624
1625 void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
1626 XMMRegister dst, XMMRegister src,
1627 XMMRegister vtmp1, XMMRegister vtmp2) {
1628 switch (opcode) {
1629 case Op_AddReductionVF:
1630 case Op_MulReductionVF:
1631 reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
1632 break;
1633
1634 case Op_AddReductionVD:
1635 case Op_MulReductionVD:
1636 reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
1637 break;
1638
1639 default: assert(false, "wrong opcode");
1640 }
1641 }
1642
1643 void C2_MacroAssembler::reduceB(int opcode, int vlen,
1644 Register dst, Register src1, XMMRegister src2,
1645 XMMRegister vtmp1, XMMRegister vtmp2) {
1646 switch (vlen) {
1647 case 8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1648 case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1649 case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1650 case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1651
1652 default: assert(false, "wrong vector length");
1653 }
1654 }
1655
1656 void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
1657 Register dst, Register src1, XMMRegister src2,
1658 XMMRegister vtmp1, XMMRegister vtmp2) {
1659 switch (vlen) {
1660 case 8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1661 case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1662 case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1663 case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1664
1665 default: assert(false, "wrong vector length");
1666 }
1667 }
1668
1669 void C2_MacroAssembler::reduceS(int opcode, int vlen,
1670 Register dst, Register src1, XMMRegister src2,
1671 XMMRegister vtmp1, XMMRegister vtmp2) {
1672 switch (vlen) {
1673 case 4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1674 case 8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1675 case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1676 case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1677
1678 default: assert(false, "wrong vector length");
1679 }
1680 }
1681
1682 void C2_MacroAssembler::reduceI(int opcode, int vlen,
1683 Register dst, Register src1, XMMRegister src2,
1684 XMMRegister vtmp1, XMMRegister vtmp2) {
1685 switch (vlen) {
1686 case 2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1687 case 4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1688 case 8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
1689 case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1690
1691 default: assert(false, "wrong vector length");
1692 }
1693 }
1694
1695 #ifdef _LP64
1696 void C2_MacroAssembler::reduceL(int opcode, int vlen,
1697 Register dst, Register src1, XMMRegister src2,
1698 XMMRegister vtmp1, XMMRegister vtmp2) {
1699 switch (vlen) {
1700 case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1701 case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1702 case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
1703
1704 default: assert(false, "wrong vector length");
1705 }
1706 }
1707 #endif // _LP64
1708
1709 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1710 switch (vlen) {
1711 case 2:
1712 assert(vtmp2 == xnoreg, "");
1713 reduce2F(opcode, dst, src, vtmp1);
1714 break;
1715 case 4:
1716 assert(vtmp2 == xnoreg, "");
1717 reduce4F(opcode, dst, src, vtmp1);
1718 break;
1719 case 8:
1720 reduce8F(opcode, dst, src, vtmp1, vtmp2);
1721 break;
1722 case 16:
1723 reduce16F(opcode, dst, src, vtmp1, vtmp2);
1724 break;
1725 default: assert(false, "wrong vector length");
1726 }
1727 }
1728
1729 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1730 switch (vlen) {
1731 case 2:
1732 assert(vtmp2 == xnoreg, "");
1733 reduce2D(opcode, dst, src, vtmp1);
1734 break;
1735 case 4:
1736 reduce4D(opcode, dst, src, vtmp1, vtmp2);
1737 break;
1738 case 8:
1739 reduce8D(opcode, dst, src, vtmp1, vtmp2);
1740 break;
1741 default: assert(false, "wrong vector length");
1742 }
1743 }
1744
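// Reduce a 2-element int vector: fold the two lanes together (horizontal add for
// Op_AddReductionVI, otherwise shuffle + op), then combine with the scalar in src1.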
1745 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1746 if (opcode == Op_AddReductionVI) {
1747 if (vtmp1 != src2) {
1748 movdqu(vtmp1, src2);
1749 }
1750 phaddd(vtmp1, vtmp1);
1751 } else {
1752 pshufd(vtmp1, src2, 0x1);
1753 reduce_operation_128(T_INT, opcode, vtmp1, src2);
1754 }
1755 movdl(vtmp2, src1);
1756 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1757 movdl(dst, vtmp1);
1758 }
1759
1760 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1761 if (opcode == Op_AddReductionVI) {
1762 if (vtmp1 != src2) {
1763 movdqu(vtmp1, src2);
1764 }
1765 phaddd(vtmp1, src2);
1766 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1767 } else {
1768 pshufd(vtmp2, src2, 0xE);
1769 reduce_operation_128(T_INT, opcode, vtmp2, src2);
1770 reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1771 }
1772 }
1773
1774 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1775 if (opcode == Op_AddReductionVI) {
1776 vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
1777 vextracti128_high(vtmp2, vtmp1);
1778 vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
1779 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1780 } else {
1781 vextracti128_high(vtmp1, src2);
1782 reduce_operation_128(T_INT, opcode, vtmp1, src2);
1783 reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1784 }
1785 }
1786
1787 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1788 vextracti64x4_high(vtmp2, src2);
1789 reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
1790 reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1791 }
1792
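// Byte reductions fold the vector in halves with shuffles and byte shifts until a
// single byte remains, then widen it to int to combine with the scalar in src1 and
// sign-extend the final result into dst.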
1793 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1794 pshufd(vtmp2, src2, 0x1);
1795 reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
1796 movdqu(vtmp1, vtmp2);
1797 psrldq(vtmp1, 2);
1798 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
1799 movdqu(vtmp2, vtmp1);
1800 psrldq(vtmp2, 1);
1801 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
1802 movdl(vtmp2, src1);
1803 pmovsxbd(vtmp1, vtmp1);
1804 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1805 pextrb(dst, vtmp1, 0x0);
1806 movsbl(dst, dst);
1807 }
1808
1809 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1810 pshufd(vtmp1, src2, 0xE);
1811 reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
1812 reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1813 }
1814
1815 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1816 vextracti128_high(vtmp2, src2);
1817 reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
1818 reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1819 }
1820
1821 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1822 vextracti64x4_high(vtmp1, src2);
1823 reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
1824 reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1825 }
1826
1827 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1828 pmovsxbw(vtmp2, src2);
1829 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1830 }
1831
1832 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1833 if (UseAVX > 1) {
1834 int vector_len = Assembler::AVX_256bit;
1835 vpmovsxbw(vtmp1, src2, vector_len);
1836 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1837 } else {
1838 pmovsxbw(vtmp2, src2);
1839 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1840 pshufd(vtmp2, src2, 0xE);
1841 pmovsxbw(vtmp2, vtmp2);
1842 reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
1843 }
1844 }
1845
1846 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1847 if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
1848 int vector_len = Assembler::AVX_512bit;
1849 vpmovsxbw(vtmp1, src2, vector_len);
1850 reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1851 } else {
1852 assert(UseAVX >= 2,"Should not reach here.");
1853 mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
1854 vextracti128_high(vtmp2, src2);
1855 mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
1856 }
1857 }
1858
1859 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1860 mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
1861 vextracti64x4_high(vtmp2, src2);
1862 mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
1863 }
1864
1865 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1866 if (opcode == Op_AddReductionVI) {
1867 if (vtmp1 != src2) {
1868 movdqu(vtmp1, src2);
1869 }
1870 phaddw(vtmp1, vtmp1);
1871 phaddw(vtmp1, vtmp1);
1872 } else {
1873 pshufd(vtmp2, src2, 0x1);
1874 reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
1875 movdqu(vtmp1, vtmp2);
1876 psrldq(vtmp1, 2);
1877 reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
1878 }
1879 movdl(vtmp2, src1);
1880 pmovsxwd(vtmp1, vtmp1);
1881 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
1882 pextrw(dst, vtmp1, 0x0);
1883 movswl(dst, dst);
1884 }
1885
1886 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1887 if (opcode == Op_AddReductionVI) {
1888 if (vtmp1 != src2) {
1889 movdqu(vtmp1, src2);
1890 }
1891 phaddw(vtmp1, src2);
1892 } else {
1893 pshufd(vtmp1, src2, 0xE);
1894 reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
1895 }
1896 reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1897 }
1898
1899 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1900 if (opcode == Op_AddReductionVI) {
1901 int vector_len = Assembler::AVX_256bit;
1902 vphaddw(vtmp2, src2, src2, vector_len);
1903 vpermq(vtmp2, vtmp2, 0xD8, vector_len);
1904 } else {
1905 vextracti128_high(vtmp2, src2);
1906 reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
1907 }
1908 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1909 }
1910
1911 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1912 int vector_len = Assembler::AVX_256bit;
1913 vextracti64x4_high(vtmp1, src2);
1914 reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
1915 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1916 }
1917
1918 #ifdef _LP64
1919 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1920 pshufd(vtmp2, src2, 0xE);
1921 reduce_operation_128(T_LONG, opcode, vtmp2, src2);
1922 movdq(vtmp1, src1);
1923 reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
1924 movdq(dst, vtmp1);
1925 }
1926
1927 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1928 vextracti128_high(vtmp1, src2);
1929 reduce_operation_128(T_LONG, opcode, vtmp1, src2);
1930 reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
1931 }
1932
1933 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
1934 vextracti64x4_high(vtmp2, src2);
1935 reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
1936 reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
1937 }
1938
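// Build a k-register mask with the low 'len' bits set: start from all ones and let
// bzhi clear every bit at position >= len.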
1939 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
1940 assert(ArrayOperationPartialInlineSize > 0 && ArrayOperationPartialInlineSize <= 64, "invalid");
1941 mov64(temp, -1L);
1942 bzhiq(temp, temp, len);
1943 kmovql(dst, temp);
1944 }
1945 #endif // _LP64
1946
1947 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
1948 reduce_operation_128(T_FLOAT, opcode, dst, src);
1949 pshufd(vtmp, src, 0x1);
1950 reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
1951 }
1952
1953 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
1954 reduce2F(opcode, dst, src, vtmp);
1955 pshufd(vtmp, src, 0x2);
1956 reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
1957 pshufd(vtmp, src, 0x3);
1958 reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
1959 }
1960
1961 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1962 reduce4F(opcode, dst, src, vtmp2);
1963 vextractf128_high(vtmp2, src);
1964 reduce4F(opcode, dst, vtmp2, vtmp1);
1965 }
1966
1967 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1968 reduce8F(opcode, dst, src, vtmp1, vtmp2);
1969 vextracti64x4_high(vtmp1, src);
1970 reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
1971 }
1972
1973 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
1974 reduce_operation_128(T_DOUBLE, opcode, dst, src);
1975 pshufd(vtmp, src, 0xE);
1976 reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
1977 }
1978
1979 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1980 reduce2D(opcode, dst, src, vtmp2);
1981 vextractf128_high(vtmp2, src);
1982 reduce2D(opcode, dst, vtmp2, vtmp1);
1983 }
1984
1985 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
1986 reduce4D(opcode, dst, src, vtmp1, vtmp2);
1987 vextracti64x4_high(vtmp1, src);
1988 reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
1989 }
1990
1991 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, int vector_len) {
1992 MacroAssembler::evmovdqu(type, kmask, dst, src, vector_len);
1993 }
1994
1995 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, int vector_len) {
1996 MacroAssembler::evmovdqu(type, kmask, dst, src, vector_len);
1997 }
1998
1999
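// Float min/max reduction: log2(vlen) folding steps. Each step brings the upper half
// of the current vector down (vextract for the 256/128-bit halves, vpermilps within a
// 128-bit lane) and combines it with the lower half via vminmax_fp; the incoming dst
// is folded in at the end when is_dst_valid.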
2000 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
2001 XMMRegister dst, XMMRegister src,
2002 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2003 XMMRegister xmm_0, XMMRegister xmm_1) {
2004 int permconst[] = {1, 14};
2005 XMMRegister wsrc = src;
2006 XMMRegister wdst = xmm_0;
2007 XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2008
2009 int vlen_enc = Assembler::AVX_128bit;
2010 if (vlen == 16) {
2011 vlen_enc = Assembler::AVX_256bit;
2012 }
2013
2014 for (int i = log2(vlen) - 1; i >=0; i--) {
2015 if (i == 0 && !is_dst_valid) {
2016 wdst = dst;
2017 }
2018 if (i == 3) {
2019 vextracti64x4_high(wtmp, wsrc);
2020 } else if (i == 2) {
2021 vextracti128_high(wtmp, wsrc);
2022 } else { // i = [0,1]
2023 vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
2024 }
2025 vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2026 wsrc = wdst;
2027 vlen_enc = Assembler::AVX_128bit;
2028 }
2029 if (is_dst_valid) {
2030 vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2031 }
2032 }
2033
2034 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
2035 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
2036 XMMRegister xmm_0, XMMRegister xmm_1) {
2037 XMMRegister wsrc = src;
2038 XMMRegister wdst = xmm_0;
2039 XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
2040 int vlen_enc = Assembler::AVX_128bit;
2041 if (vlen == 8) {
2042 vlen_enc = Assembler::AVX_256bit;
2043 }
2044 for (int i = log2(vlen) - 1; i >=0; i--) {
2045 if (i == 0 && !is_dst_valid) {
2046 wdst = dst;
2047 }
2048 if (i == 1) {
2049 vextracti128_high(wtmp, wsrc);
2050 } else if (i == 2) {
2051 vextracti64x4_high(wtmp, wsrc);
2052 } else {
2053 assert(i == 0, "%d", i);
2054 vpermilpd(wtmp, wsrc, 1, vlen_enc);
2055 }
2056 vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
2057 wsrc = wdst;
2058 vlen_enc = Assembler::AVX_128bit;
2059 }
2060 if (is_dst_valid) {
2061 vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
2062 }
2063 }
2064
2065 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
2066 switch (bt) {
2067 case T_BYTE: pextrb(dst, src, idx); break;
2068 case T_SHORT: pextrw(dst, src, idx); break;
2069 case T_INT: pextrd(dst, src, idx); break;
2070 case T_LONG: pextrq(dst, src, idx); break;
2071
2072 default:
2073 assert(false,"Should not reach here.");
2074 break;
2075 }
2076 }
2077
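// Return the XMM register holding the 128-bit lane that contains 'elemindex':
// lane 0 is src itself, higher lanes are extracted into dst first.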
2078 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
2079 int esize = type2aelembytes(typ);
2080 int elem_per_lane = 16/esize;
2081 int lane = elemindex / elem_per_lane;
2082 int eindex = elemindex % elem_per_lane;
2083
2084 if (lane >= 2) {
2085 assert(UseAVX > 2, "required");
2086 vextractf32x4(dst, src, lane & 3);
2087 return dst;
2088 } else if (lane > 0) {
2089 assert(UseAVX > 0, "required");
2090 vextractf128(dst, src, lane);
2091 return dst;
2092 } else {
2093 return src;
2094 }
2095 }
2096
2097 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) {
2098 if (typ == T_BYTE) {
2099 movsbl(dst, dst);
2100 } else if (typ == T_SHORT) {
2101 movswl(dst, dst);
2102 }
2103 }
2104
2105 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
2106 int esize = type2aelembytes(typ);
2107 int elem_per_lane = 16/esize;
2108 int eindex = elemindex % elem_per_lane;
2109 assert(is_integral_type(typ),"required");
2110
2111 if (eindex == 0) {
2112 if (typ == T_LONG) {
2113 movq(dst, src);
2114 } else {
2115 movdl(dst, src);
2116 movsxl(typ, dst);
2117 }
2118 } else {
2119 extract(typ, dst, src, eindex);
2120 movsxl(typ, dst);
2121 }
2122 }
2123
2124 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, Register tmp, XMMRegister vtmp) {
2125 int esize = type2aelembytes(typ);
2126 int elem_per_lane = 16/esize;
2127 int eindex = elemindex % elem_per_lane;
2128 assert((typ == T_FLOAT || typ == T_DOUBLE),"required");
2129
2130 if (eindex == 0) {
2131 movq(dst, src);
2132 } else {
2133 if (typ == T_FLOAT) {
2134 if (UseAVX == 0) {
2135 movdqu(dst, src);
2136 pshufps(dst, dst, eindex);
2137 } else {
2138 vpshufps(dst, src, src, eindex, Assembler::AVX_128bit);
2139 }
2140 } else {
2141 if (UseAVX == 0) {
2142 movdqu(dst, src);
2143 psrldq(dst, eindex*esize);
2144 } else {
2145 vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
2146 }
2147 movq(dst, dst);
2148 }
2149 }
2150 // Zero upper bits
2151 if (typ == T_FLOAT) {
2152 if (UseAVX == 0) {
2153 assert((vtmp != xnoreg) && (tmp != noreg), "required.");
2154 movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), tmp);
2155 pand(dst, vtmp);
2156 } else {
2157 assert((tmp != noreg), "required.");
2158 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, tmp);
2159 }
2160 }
2161 }
2162
2163 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
2164 switch(typ) {
2165 case T_BYTE:
2166 case T_BOOLEAN:
2167 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2168 break;
2169 case T_SHORT:
2170 case T_CHAR:
2171 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2172 break;
2173 case T_INT:
2174 case T_FLOAT:
2175 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2176 break;
2177 case T_LONG:
2178 case T_DOUBLE:
2179 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len);
2180 break;
2181 default:
2182 assert(false,"Should not reach here.");
2183 break;
2184 }
2185 }
2186
2187 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral adr, int comparison, int vector_len, Register scratch) {
2188 switch(typ) {
2189 case T_BOOLEAN:
2190 case T_BYTE:
2191 evpcmpb(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch);
2192 break;
2193 case T_CHAR:
2194 case T_SHORT:
2195 evpcmpw(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch);
2196 break;
2197 case T_INT:
2198 case T_FLOAT:
2199 evpcmpd(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch);
2200 break;
2201 case T_LONG:
2202 case T_DOUBLE:
2203 evpcmpq(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch);
2204 break;
2205 default:
2206 assert(false,"Should not reach here.");
2207 break;
2208 }
2209 }
2210
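// Unsigned element compare: zero-extend both operands to the next wider element size
// so a signed compare yields unsigned semantics, then pack the result back down to
// the original element width.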
2211 void C2_MacroAssembler::vpcmpu(BasicType typ, XMMRegister dst, XMMRegister src1, XMMRegister src2, ComparisonPredicate comparison,
2212 int vlen_in_bytes, XMMRegister vtmp1, XMMRegister vtmp2, Register scratch) {
2213 int vlen_enc = vector_length_encoding(vlen_in_bytes*2);
2214 switch (typ) {
2215 case T_BYTE:
2216 vpmovzxbw(vtmp1, src1, vlen_enc);
2217 vpmovzxbw(vtmp2, src2, vlen_enc);
2218 vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::W, vlen_enc, scratch);
2219 vpacksswb(dst, dst, dst, vlen_enc);
2220 break;
2221 case T_SHORT:
2222 vpmovzxwd(vtmp1, src1, vlen_enc);
2223 vpmovzxwd(vtmp2, src2, vlen_enc);
2224 vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::D, vlen_enc, scratch);
2225 vpackssdw(dst, dst, dst, vlen_enc);
2226 break;
2227 case T_INT:
2228 vpmovzxdq(vtmp1, src1, vlen_enc);
2229 vpmovzxdq(vtmp2, src2, vlen_enc);
2230 vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::Q, vlen_enc, scratch);
2231 vpermilps(dst, dst, 8, vlen_enc);
2232 break;
2233 default:
2234 assert(false, "Should not reach here");
2235 }
2236 if (vlen_in_bytes == 16) {
2237 vpermpd(dst, dst, 0x8, vlen_enc);
2238 }
2239 }
2240
2241 void C2_MacroAssembler::vpcmpu32(BasicType typ, XMMRegister dst, XMMRegister src1, XMMRegister src2, ComparisonPredicate comparison, int vlen_in_bytes,
2242 XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3, Register scratch) {
2243 int vlen_enc = vector_length_encoding(vlen_in_bytes);
2244 switch (typ) {
2245 case T_BYTE:
2246 vpmovzxbw(vtmp1, src1, vlen_enc);
2247 vpmovzxbw(vtmp2, src2, vlen_enc);
2248 vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::W, vlen_enc, scratch);
2249 vextracti128(vtmp1, src1, 1);
2250 vextracti128(vtmp2, src2, 1);
2251 vpmovzxbw(vtmp1, vtmp1, vlen_enc);
2252 vpmovzxbw(vtmp2, vtmp2, vlen_enc);
2253 vpcmpCCW(vtmp3, vtmp1, vtmp2, comparison, Assembler::W, vlen_enc, scratch);
2254 vpacksswb(dst, dst, vtmp3, vlen_enc);
2255 vpermpd(dst, dst, 0xd8, vlen_enc);
2256 break;
2257 case T_SHORT:
2258 vpmovzxwd(vtmp1, src1, vlen_enc);
2259 vpmovzxwd(vtmp2, src2, vlen_enc);
2260 vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::D, vlen_enc, scratch);
2261 vextracti128(vtmp1, src1, 1);
2262 vextracti128(vtmp2, src2, 1);
2263 vpmovzxwd(vtmp1, vtmp1, vlen_enc);
2264 vpmovzxwd(vtmp2, vtmp2, vlen_enc);
2265 vpcmpCCW(vtmp3, vtmp1, vtmp2, comparison, Assembler::D, vlen_enc, scratch);
2266 vpackssdw(dst, dst, vtmp3, vlen_enc);
2267 vpermpd(dst, dst, 0xd8, vlen_enc);
2268 break;
2269 case T_INT:
2270 vpmovzxdq(vtmp1, src1, vlen_enc);
2271 vpmovzxdq(vtmp2, src2, vlen_enc);
2272 vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::Q, vlen_enc, scratch);
2273 vpshufd(dst, dst, 8, vlen_enc);
2274 vpermq(dst, dst, 8, vlen_enc);
2275 vextracti128(vtmp1, src1, 1);
2276 vextracti128(vtmp2, src2, 1);
2277 vpmovzxdq(vtmp1, vtmp1, vlen_enc);
2278 vpmovzxdq(vtmp2, vtmp2, vlen_enc);
2279 vpcmpCCW(vtmp3, vtmp1, vtmp2, comparison, Assembler::Q, vlen_enc, scratch);
2280 vpshufd(vtmp3, vtmp3, 8, vlen_enc);
2281 vpermq(vtmp3, vtmp3, 0x80, vlen_enc);
2282 vpblendd(dst, dst, vtmp3, 0xf0, vlen_enc);
2283 break;
2284 default:
2285 assert(false, "Should not reach here");
2286 }
2287 }
2288
2289 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
2290 switch(typ) {
2291 case T_BYTE:
2292 evpblendmb(dst, kmask, src1, src2, merge, vector_len);
2293 break;
2294 case T_SHORT:
2295 evpblendmw(dst, kmask, src1, src2, merge, vector_len);
2296 break;
2297 case T_INT:
2298 case T_FLOAT:
2299 evpblendmd(dst, kmask, src1, src2, merge, vector_len);
2300 break;
2301 case T_LONG:
2302 case T_DOUBLE:
2303 evpblendmq(dst, kmask, src1, src2, merge, vector_len);
2304 break;
2305 default:
2306 assert(false,"Should not reach here.");
2307 break;
2308 }
2309 }
2310
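// Test a vector condition: 4/8-byte inputs are broadcast to 128 bits before ptest,
// 16/32-byte inputs use ptest/vptest directly, and 64-byte inputs compare into a
// k-register and use ktest/kortest depending on the BoolTest condition.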
2311 void C2_MacroAssembler::vectortest(int bt, int vlen, XMMRegister src1, XMMRegister src2,
2312 XMMRegister vtmp1, XMMRegister vtmp2, KRegister mask) {
2313 switch(vlen) {
2314 case 4:
2315 assert(vtmp1 != xnoreg, "required.");
2316 // Broadcast lower 32 bits to 128 bits before ptest
2317 pshufd(vtmp1, src1, 0x0);
2318 if (bt == BoolTest::overflow) {
2319 assert(vtmp2 != xnoreg, "required.");
2320 pshufd(vtmp2, src2, 0x0);
2321 } else {
2322 assert(vtmp2 == xnoreg, "required.");
2323 vtmp2 = src2;
2324 }
2325 ptest(vtmp1, vtmp2);
2326 break;
2327 case 8:
2328 assert(vtmp1 != xnoreg, "required.");
2329 // Broadcast lower 64 bits to 128 bits before ptest
2330 pshufd(vtmp1, src1, 0x4);
2331 if (bt == BoolTest::overflow) {
2332 assert(vtmp2 != xnoreg, "required.");
2333 pshufd(vtmp2, src2, 0x4);
2334 } else {
2335 assert(vtmp2 == xnoreg, "required.");
2336 vtmp2 = src2;
2337 }
2338 ptest(vtmp1, vtmp2);
2339 break;
2340 case 16:
2341 assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required.");
2342 ptest(src1, src2);
2343 break;
2344 case 32:
2345 assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required.");
2346 vptest(src1, src2, Assembler::AVX_256bit);
2347 break;
2348 case 64:
2349 {
2350 assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required.");
2351 evpcmpeqb(mask, src1, src2, Assembler::AVX_512bit);
2352 if (bt == BoolTest::ne) {
2353 ktestql(mask, mask);
2354 } else {
2355 assert(bt == BoolTest::overflow, "required");
2356 kortestql(mask, mask);
2357 }
2358 }
2359 break;
2360 default:
2361 assert(false,"Should not reach here.");
2362 break;
2363 }
2364 }
2365
2366 //-------------------------------------------------------------------------------------------
2367
2368 // IndexOf for constant substrings with size >= 8 chars
2369 // which don't need to be loaded through the stack.
2370 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
2371 Register cnt1, Register cnt2,
2372 int int_cnt2, Register result,
2373 XMMRegister vec, Register tmp,
2374 int ae) {
2375 ShortBranchVerifier sbv(this);
2376 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2377 assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2378
2379 // This method uses the pcmpestri instruction with bound registers
2380 // inputs:
2381 // xmm - substring
2382 // rax - substring length (elements count)
2383 // mem - scanned string
2384 // rdx - string length (elements count)
2385 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2386 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2387 // outputs:
2388 // rcx - matched index in string
2389 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2390 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2391 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2392 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2393 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2394
2395 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
2396 RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
2397 MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
2398
2399 // Note, inline_string_indexOf() generates checks:
2400 // if (substr.count > string.count) return -1;
2401 // if (substr.count == 0) return 0;
2402 assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
2403
2404 // Load substring.
2405 if (ae == StrIntrinsicNode::UL) {
2406 pmovzxbw(vec, Address(str2, 0));
2407 } else {
2408 movdqu(vec, Address(str2, 0));
2409 }
2410 movl(cnt2, int_cnt2);
2411 movptr(result, str1); // string addr
2412
2413 if (int_cnt2 > stride) {
2414 jmpb(SCAN_TO_SUBSTR);
2415
2416 // Reload substr for rescan, this code
2417 // is executed only for large substrings (> 8 chars)
2418 bind(RELOAD_SUBSTR);
2419 if (ae == StrIntrinsicNode::UL) {
2420 pmovzxbw(vec, Address(str2, 0));
2421 } else {
2422 movdqu(vec, Address(str2, 0));
2423 }
2424 negptr(cnt2); // Jumped here with negative cnt2, convert to positive
2425
2426 bind(RELOAD_STR);
2427 // We came here after the beginning of the substring was
2428 // matched but the rest of it was not, so we need to search
2429 // again. Start from the next element after the previous match.
2430
2431 // cnt2 is the number of remaining substring elements and
2432 // cnt1 is the number of remaining string elements when the compare failed.
2433 // Restored cnt1 = cnt1 - cnt2 + int_cnt2
2434 subl(cnt1, cnt2);
2435 addl(cnt1, int_cnt2);
2436 movl(cnt2, int_cnt2); // Now restore cnt2
2437
2438 decrementl(cnt1); // Shift to next element
2439 cmpl(cnt1, cnt2);
2440 jcc(Assembler::negative, RET_NOT_FOUND); // Left less than substring
2441
2442 addptr(result, (1<<scale1));
2443
2444 } // (int_cnt2 > 8)
2445
2446 // Scan string for start of substr in 16-byte vectors
2447 bind(SCAN_TO_SUBSTR);
2448 pcmpestri(vec, Address(result, 0), mode);
2449 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1
2450 subl(cnt1, stride);
2451 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2452 cmpl(cnt1, cnt2);
2453 jccb(Assembler::negative, RET_NOT_FOUND); // Left less than substring
2454 addptr(result, 16);
2455 jmpb(SCAN_TO_SUBSTR);
2456
2457 // Found a potential substr
2458 bind(FOUND_CANDIDATE);
2459 // Matched whole vector if first element matched (tmp(rcx) == 0).
2460 if (int_cnt2 == stride) {
2461 jccb(Assembler::overflow, RET_FOUND); // OF == 1
2462 } else { // int_cnt2 > 8
2463 jccb(Assembler::overflow, FOUND_SUBSTR);
2464 }
2465 // After pcmpestri tmp(rcx) contains matched element index
2466 // Compute start addr of substr
2467 lea(result, Address(result, tmp, scale1));
2468
2469 // Make sure string is still long enough
2470 subl(cnt1, tmp);
2471 cmpl(cnt1, cnt2);
2472 if (int_cnt2 == stride) {
2473 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2474 } else { // int_cnt2 > 8
2475 jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
2476 }
2477 // Left less than substring.
2478
2479 bind(RET_NOT_FOUND);
2480 movl(result, -1);
2481 jmp(EXIT);
2482
2483 if (int_cnt2 > stride) {
2484 // This code is optimized for the case when the whole substring
2485 // is matched if its head is matched.
2486 bind(MATCH_SUBSTR_HEAD);
2487 pcmpestri(vec, Address(result, 0), mode);
2488 // Reload only the string if it does not match
2489 jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
2490
2491 Label CONT_SCAN_SUBSTR;
2492 // Compare the rest of substring (> 8 chars).
2493 bind(FOUND_SUBSTR);
2494 // First 8 chars are already matched.
2495 negptr(cnt2);
2496 addptr(cnt2, stride);
2497
2498 bind(SCAN_SUBSTR);
2499 subl(cnt1, stride);
2500 cmpl(cnt2, -stride); // Do not read beyond substring
2501 jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
2502 // Back-up strings to avoid reading beyond substring:
2503 // cnt1 = cnt1 - cnt2 + 8
2504 addl(cnt1, cnt2); // cnt2 is negative
2505 addl(cnt1, stride);
2506 movl(cnt2, stride); negptr(cnt2);
2507 bind(CONT_SCAN_SUBSTR);
2508 if (int_cnt2 < (int)G) {
2509 int tail_off1 = int_cnt2<<scale1;
2510 int tail_off2 = int_cnt2<<scale2;
2511 if (ae == StrIntrinsicNode::UL) {
2512 pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
2513 } else {
2514 movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
2515 }
2516 pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
2517 } else {
2518 // calculate index in register to avoid integer overflow (int_cnt2*2)
2519 movl(tmp, int_cnt2);
2520 addptr(tmp, cnt2);
2521 if (ae == StrIntrinsicNode::UL) {
2522 pmovzxbw(vec, Address(str2, tmp, scale2, 0));
2523 } else {
2524 movdqu(vec, Address(str2, tmp, scale2, 0));
2525 }
2526 pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
2527 }
2528 // Need to reload string pointers if we did not match the whole vector
2529 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2530 addptr(cnt2, stride);
2531 jcc(Assembler::negative, SCAN_SUBSTR);
2532 // Fall through if found full substring
2533
2534 } // (int_cnt2 > 8)
2535
2536 bind(RET_FOUND);
2537 // Found result if we matched full small substring.
2538 // Compute substr offset
2539 subptr(result, str1);
2540 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2541 shrl(result, 1); // index
2542 }
2543 bind(EXIT);
2544
2545 } // string_indexofC8
2546
2547 // Small strings are loaded through the stack if they cross a page boundary.
2548 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
2549 Register cnt1, Register cnt2,
2550 int int_cnt2, Register result,
2551 XMMRegister vec, Register tmp,
2552 int ae) {
2553 ShortBranchVerifier sbv(this);
2554 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2555 assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
2556
2557 //
2558 // int_cnt2 is the length of a small (< 8 chars) constant substring,
2559 // or (-1) for a non-constant substring, in which case its length
2560 // is in the cnt2 register.
2561 //
2562 // Note, inline_string_indexOf() generates checks:
2563 // if (substr.count > string.count) return -1;
2564 // if (substr.count == 0) return 0;
2565 //
2566 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
2567 assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
2568 // This method uses the pcmpestri instruction with bound registers
2569 // inputs:
2570 // xmm - substring
2571 // rax - substring length (elements count)
2572 // mem - scanned string
2573 // rdx - string length (elements count)
2574 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
2575 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
2576 // outputs:
2577 // rcx - matched index in string
2578 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2579 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
2580 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
2581 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
2582
2583 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
2584 RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
2585 FOUND_CANDIDATE;
2586
2587 { //========================================================
2588 // We don't know where these strings are located
2589 // and we can't read beyond them. Load them through stack.
2590 Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
2591
2592 movptr(tmp, rsp); // save old SP
2593
2594 if (int_cnt2 > 0) { // small (< 8 chars) constant substring
2595 if (int_cnt2 == (1>>scale2)) { // One byte
2596 assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
2597 load_unsigned_byte(result, Address(str2, 0));
2598 movdl(vec, result); // move 32 bits
2599 } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) { // Three bytes
2600 // Not enough header space in 32-bit VM: 12+3 = 15.
2601 movl(result, Address(str2, -1));
2602 shrl(result, 8);
2603 movdl(vec, result); // move 32 bits
2604 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) { // One char
2605 load_unsigned_short(result, Address(str2, 0));
2606 movdl(vec, result); // move 32 bits
2607 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
2608 movdl(vec, Address(str2, 0)); // move 32 bits
2609 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
2610 movq(vec, Address(str2, 0)); // move 64 bits
2611 } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 = {2, ..., 7})
2612 // Array header size is 12 bytes in 32-bit VM
2613 // + 6 bytes for 3 chars == 18 bytes,
2614 // enough space to load vec and shift.
2615 assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
2616 if (ae == StrIntrinsicNode::UL) {
2617 int tail_off = int_cnt2-8;
2618 pmovzxbw(vec, Address(str2, tail_off));
2619 psrldq(vec, -2*tail_off);
2620 }
2621 else {
2622 int tail_off = int_cnt2*(1<<scale2);
2623 movdqu(vec, Address(str2, tail_off-16));
2624 psrldq(vec, 16-tail_off);
2625 }
2626 }
2627 } else { // not constant substring
2628 cmpl(cnt2, stride);
2629 jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
2630
2631 // We can read beyond the string if str+16 does not cross a page boundary
2632 // since heaps are aligned and mapped by pages.
2633 assert(os::vm_page_size() < (int)G, "default page should be small");
2634 movl(result, str2); // We need only low 32 bits
2635 andl(result, (os::vm_page_size()-1));
2636 cmpl(result, (os::vm_page_size()-16));
2637 jccb(Assembler::belowEqual, CHECK_STR);
2638
2639 // Move small strings to the stack so we can load 16 bytes into vec.
2640 subptr(rsp, 16);
2641 int stk_offset = wordSize-(1<<scale2);
2642 push(cnt2);
2643
2644 bind(COPY_SUBSTR);
2645 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
2646 load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
2647 movb(Address(rsp, cnt2, scale2, stk_offset), result);
2648 } else if (ae == StrIntrinsicNode::UU) {
2649 load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
2650 movw(Address(rsp, cnt2, scale2, stk_offset), result);
2651 }
2652 decrement(cnt2);
2653 jccb(Assembler::notZero, COPY_SUBSTR);
2654
2655 pop(cnt2);
2656 movptr(str2, rsp); // New substring address
2657 } // non constant
2658
2659 bind(CHECK_STR);
2660 cmpl(cnt1, stride);
2661 jccb(Assembler::aboveEqual, BIG_STRINGS);
2662
2663 // Check cross page boundary.
2664 movl(result, str1); // We need only low 32 bits
2665 andl(result, (os::vm_page_size()-1));
2666 cmpl(result, (os::vm_page_size()-16));
2667 jccb(Assembler::belowEqual, BIG_STRINGS);
2668
2669 subptr(rsp, 16);
2670 int stk_offset = -(1<<scale1);
2671 if (int_cnt2 < 0) { // not constant
2672 push(cnt2);
2673 stk_offset += wordSize;
2674 }
2675 movl(cnt2, cnt1);
2676
2677 bind(COPY_STR);
2678 if (ae == StrIntrinsicNode::LL) {
2679 load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
2680 movb(Address(rsp, cnt2, scale1, stk_offset), result);
2681 } else {
2682 load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
2683 movw(Address(rsp, cnt2, scale1, stk_offset), result);
2684 }
2685 decrement(cnt2);
2686 jccb(Assembler::notZero, COPY_STR);
2687
2688 if (int_cnt2 < 0) { // not constant
2689 pop(cnt2);
2690 }
2691 movptr(str1, rsp); // New string address
2692
2693 bind(BIG_STRINGS);
2694 // Load substring.
2695 if (int_cnt2 < 0) { // -1
2696 if (ae == StrIntrinsicNode::UL) {
2697 pmovzxbw(vec, Address(str2, 0));
2698 } else {
2699 movdqu(vec, Address(str2, 0));
2700 }
2701 push(cnt2); // substr count
2702 push(str2); // substr addr
2703 push(str1); // string addr
2704 } else {
2705 // Small (< 8 chars) constant substrings are loaded already.
2706 movl(cnt2, int_cnt2);
2707 }
2708 push(tmp); // original SP
2709
2710 } // Finished loading
2711
2712 //========================================================
2713 // Start search
2714 //
2715
2716 movptr(result, str1); // string addr
2717
2718 if (int_cnt2 < 0) { // Only for non constant substring
2719 jmpb(SCAN_TO_SUBSTR);
2720
2721 // SP saved at sp+0
2722 // String saved at sp+1*wordSize
2723 // Substr saved at sp+2*wordSize
2724 // Substr count saved at sp+3*wordSize
2725
2726 // Reload substr for rescan, this code
2727 // is executed only for large substrings (> 8 chars)
2728 bind(RELOAD_SUBSTR);
2729 movptr(str2, Address(rsp, 2*wordSize));
2730 movl(cnt2, Address(rsp, 3*wordSize));
2731 if (ae == StrIntrinsicNode::UL) {
2732 pmovzxbw(vec, Address(str2, 0));
2733 } else {
2734 movdqu(vec, Address(str2, 0));
2735 }
2736 // We came here after the beginning of the substring was
2737 // matched but the rest of it was not, so we need to search
2738 // again. Start from the next element after the previous match.
2739 subptr(str1, result); // Restore counter
2740 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2741 shrl(str1, 1);
2742 }
2743 addl(cnt1, str1);
2744 decrementl(cnt1); // Shift to next element
2745 cmpl(cnt1, cnt2);
2746 jcc(Assembler::negative, RET_NOT_FOUND); // Left less than substring
2747
2748 addptr(result, (1<<scale1));
2749 } // non constant
2750
2751 // Scan string for start of substr in 16-byte vectors
2752 bind(SCAN_TO_SUBSTR);
2753 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
2754 pcmpestri(vec, Address(result, 0), mode);
2755 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1
2756 subl(cnt1, stride);
2757 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
2758 cmpl(cnt1, cnt2);
2759 jccb(Assembler::negative, RET_NOT_FOUND); // Left less than substring
2760 addptr(result, 16);
2761
2762 bind(ADJUST_STR);
2763 cmpl(cnt1, stride); // Do not read beyond string
2764 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
2765 // Back-up string to avoid reading beyond string.
2766 lea(result, Address(result, cnt1, scale1, -16));
2767 movl(cnt1, stride);
2768 jmpb(SCAN_TO_SUBSTR);
2769
2770 // Found a potential substr
2771 bind(FOUND_CANDIDATE);
2772 // After pcmpestri tmp(rcx) contains matched element index
2773
2774 // Make sure string is still long enough
2775 subl(cnt1, tmp);
2776 cmpl(cnt1, cnt2);
2777 jccb(Assembler::greaterEqual, FOUND_SUBSTR);
2778 // Left less than substring.
2779
2780 bind(RET_NOT_FOUND);
2781 movl(result, -1);
2782 jmp(CLEANUP);
2783
2784 bind(FOUND_SUBSTR);
2785 // Compute start addr of substr
2786 lea(result, Address(result, tmp, scale1));
2787 if (int_cnt2 > 0) { // Constant substring
2788 // Repeat search for small substring (< 8 chars)
2789 // from new point without reloading substring.
2790 // Have to check that we don't read beyond string.
2791 cmpl(tmp, stride-int_cnt2);
2792 jccb(Assembler::greater, ADJUST_STR);
2793 // Fall through if matched whole substring.
2794 } else { // non constant
2795 assert(int_cnt2 == -1, "should be != 0");
2796
2797 addl(tmp, cnt2);
2798 // Found result if we matched whole substring.
2799 cmpl(tmp, stride);
2800 jcc(Assembler::lessEqual, RET_FOUND);
2801
2802 // Repeat search for small substring (<= 8 chars)
2803 // from new point 'str1' without reloading substring.
2804 cmpl(cnt2, stride);
2805 // Have to check that we don't read beyond string.
2806 jccb(Assembler::lessEqual, ADJUST_STR);
2807
2808 Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
2809 // Compare the rest of substring (> 8 chars).
2810 movptr(str1, result);
2811
2812 cmpl(tmp, cnt2);
2813 // First 8 chars are already matched.
2814 jccb(Assembler::equal, CHECK_NEXT);
2815
2816 bind(SCAN_SUBSTR);
2817 pcmpestri(vec, Address(str1, 0), mode);
2818 // Need to reload string pointers if we did not match the whole vector
2819 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
2820
2821 bind(CHECK_NEXT);
2822 subl(cnt2, stride);
2823 jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
2824 addptr(str1, 16);
2825 if (ae == StrIntrinsicNode::UL) {
2826 addptr(str2, 8);
2827 } else {
2828 addptr(str2, 16);
2829 }
2830 subl(cnt1, stride);
2831 cmpl(cnt2, stride); // Do not read beyond substring
2832 jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
2833 // Back-up strings to avoid reading beyond substring.
2834
2835 if (ae == StrIntrinsicNode::UL) {
2836 lea(str2, Address(str2, cnt2, scale2, -8));
2837 lea(str1, Address(str1, cnt2, scale1, -16));
2838 } else {
2839 lea(str2, Address(str2, cnt2, scale2, -16));
2840 lea(str1, Address(str1, cnt2, scale1, -16));
2841 }
2842 subl(cnt1, cnt2);
2843 movl(cnt2, stride);
2844 addl(cnt1, stride);
2845 bind(CONT_SCAN_SUBSTR);
2846 if (ae == StrIntrinsicNode::UL) {
2847 pmovzxbw(vec, Address(str2, 0));
2848 } else {
2849 movdqu(vec, Address(str2, 0));
2850 }
2851 jmp(SCAN_SUBSTR);
2852
2853 bind(RET_FOUND_LONG);
2854 movptr(str1, Address(rsp, wordSize));
2855 } // non constant
2856
2857 bind(RET_FOUND);
2858 // Compute substr offset
2859 subptr(result, str1);
2860 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
2861 shrl(result, 1); // index
2862 }
2863 bind(CLEANUP);
2864 pop(rsp); // restore SP
2865
2866 } // string_indexof
2867
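// Find the first occurrence of the char 'ch' in a UTF-16 string: broadcast the char
// and scan 16-char (AVX2) and 8-char (SSE) chunks with packed compares, then handle
// the tail with a scalar loop. On return result holds the char index, or -1 if not found.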
2868 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
2869 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
2870 ShortBranchVerifier sbv(this);
2871 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2872
2873 int stride = 8;
2874
2875 Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
2876 SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
2877 RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
2878 FOUND_SEQ_CHAR, DONE_LABEL;
2879
2880 movptr(result, str1);
2881 if (UseAVX >= 2) {
2882 cmpl(cnt1, stride);
2883 jcc(Assembler::less, SCAN_TO_CHAR);
2884 cmpl(cnt1, 2*stride);
2885 jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
2886 movdl(vec1, ch);
2887 vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
2888 vpxor(vec2, vec2);
2889 movl(tmp, cnt1);
2890 andl(tmp, 0xFFFFFFF0); //vector count (in chars)
2891 andl(cnt1,0x0000000F); //tail count (in chars)
2892
2893 bind(SCAN_TO_16_CHAR_LOOP);
2894 vmovdqu(vec3, Address(result, 0));
2895 vpcmpeqw(vec3, vec3, vec1, 1);
2896 vptest(vec2, vec3);
2897 jcc(Assembler::carryClear, FOUND_CHAR);
2898 addptr(result, 32);
2899 subl(tmp, 2*stride);
2900 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
2901 jmp(SCAN_TO_8_CHAR);
2902 bind(SCAN_TO_8_CHAR_INIT);
2903 movdl(vec1, ch);
2904 pshuflw(vec1, vec1, 0x00);
2905 pshufd(vec1, vec1, 0);
2906 pxor(vec2, vec2);
2907 }
2908 bind(SCAN_TO_8_CHAR);
2909 cmpl(cnt1, stride);
2910 jcc(Assembler::less, SCAN_TO_CHAR);
2911 if (UseAVX < 2) {
2912 movdl(vec1, ch);
2913 pshuflw(vec1, vec1, 0x00);
2914 pshufd(vec1, vec1, 0);
2915 pxor(vec2, vec2);
2916 }
2917 movl(tmp, cnt1);
2918 andl(tmp, 0xFFFFFFF8); //vector count (in chars)
2919 andl(cnt1,0x00000007); //tail count (in chars)
2920
2921 bind(SCAN_TO_8_CHAR_LOOP);
2922 movdqu(vec3, Address(result, 0));
2923 pcmpeqw(vec3, vec1);
2924 ptest(vec2, vec3);
2925 jcc(Assembler::carryClear, FOUND_CHAR);
2926 addptr(result, 16);
2927 subl(tmp, stride);
2928 jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
2929 bind(SCAN_TO_CHAR);
2930 testl(cnt1, cnt1);
2931 jcc(Assembler::zero, RET_NOT_FOUND);
2932 bind(SCAN_TO_CHAR_LOOP);
2933 load_unsigned_short(tmp, Address(result, 0));
2934 cmpl(ch, tmp);
2935 jccb(Assembler::equal, FOUND_SEQ_CHAR);
2936 addptr(result, 2);
2937 subl(cnt1, 1);
2938 jccb(Assembler::zero, RET_NOT_FOUND);
2939 jmp(SCAN_TO_CHAR_LOOP);
2940
2941 bind(RET_NOT_FOUND);
2942 movl(result, -1);
2943 jmpb(DONE_LABEL);
2944
2945 bind(FOUND_CHAR);
2946 if (UseAVX >= 2) {
2947 vpmovmskb(tmp, vec3);
2948 } else {
2949 pmovmskb(tmp, vec3);
2950 }
2951 bsfl(ch, tmp);
2952 addptr(result, ch);
2953
2954 bind(FOUND_SEQ_CHAR);
2955 subptr(result, str1);
2956 shrl(result, 1);
2957
2958 bind(DONE_LABEL);
2959 } // string_indexof_char
2960
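// Latin-1 variant of the char search: broadcast the byte and scan 32-byte (AVX2) and
// 16-byte (SSE) chunks, then a scalar tail. On return result holds the byte index,
// or -1 if not found.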
2961 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
2962 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
2963 ShortBranchVerifier sbv(this);
2964 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
2965
2966 int stride = 16;
2967
2968 Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
2969 SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
2970 RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
2971 FOUND_SEQ_CHAR, DONE_LABEL;
2972
2973 movptr(result, str1);
2974 if (UseAVX >= 2) {
2975 cmpl(cnt1, stride);
2976 jcc(Assembler::less, SCAN_TO_CHAR_INIT);
2977 cmpl(cnt1, stride*2);
2978 jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
2979 movdl(vec1, ch);
2980 vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
2981 vpxor(vec2, vec2);
2982 movl(tmp, cnt1);
2983     andl(tmp, 0xFFFFFFE0); // vector count (in chars)
2984     andl(cnt1, 0x0000001F); // tail count (in chars)
2985
2986 bind(SCAN_TO_32_CHAR_LOOP);
2987 vmovdqu(vec3, Address(result, 0));
2988 vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
2989 vptest(vec2, vec3);
2990 jcc(Assembler::carryClear, FOUND_CHAR);
2991 addptr(result, 32);
2992 subl(tmp, stride*2);
2993 jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
2994 jmp(SCAN_TO_16_CHAR);
2995
2996 bind(SCAN_TO_16_CHAR_INIT);
2997 movdl(vec1, ch);
2998 pxor(vec2, vec2);
2999 pshufb(vec1, vec2);
3000 }
3001
3002 bind(SCAN_TO_16_CHAR);
3003 cmpl(cnt1, stride);
3004   jcc(Assembler::less, SCAN_TO_CHAR_INIT); // fewer than 16 entries left
3005 if (UseAVX < 2) {
3006 movdl(vec1, ch);
3007 pxor(vec2, vec2);
3008 pshufb(vec1, vec2);
3009 }
3010 movl(tmp, cnt1);
3011   andl(tmp, 0xFFFFFFF0); // vector count (in bytes)
3012   andl(cnt1, 0x0000000F); // tail count (in bytes)
3013
3014 bind(SCAN_TO_16_CHAR_LOOP);
3015 movdqu(vec3, Address(result, 0));
3016 pcmpeqb(vec3, vec1);
3017 ptest(vec2, vec3);
3018 jcc(Assembler::carryClear, FOUND_CHAR);
3019 addptr(result, 16);
3020 subl(tmp, stride);
3021   jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); // last 16 items...
3022
3023 bind(SCAN_TO_CHAR_INIT);
3024 testl(cnt1, cnt1);
3025 jcc(Assembler::zero, RET_NOT_FOUND);
3026 bind(SCAN_TO_CHAR_LOOP);
3027 load_unsigned_byte(tmp, Address(result, 0));
3028 cmpl(ch, tmp);
3029 jccb(Assembler::equal, FOUND_SEQ_CHAR);
3030 addptr(result, 1);
3031 subl(cnt1, 1);
3032 jccb(Assembler::zero, RET_NOT_FOUND);
3033 jmp(SCAN_TO_CHAR_LOOP);
3034
3035 bind(RET_NOT_FOUND);
3036 movl(result, -1);
3037 jmpb(DONE_LABEL);
3038
3039 bind(FOUND_CHAR);
3040 if (UseAVX >= 2) {
3041 vpmovmskb(tmp, vec3);
3042 } else {
3043 pmovmskb(tmp, vec3);
3044 }
3045 bsfl(ch, tmp);
3046 addptr(result, ch);
3047
3048 bind(FOUND_SEQ_CHAR);
3049 subptr(result, str1);
3050
3051 bind(DONE_LABEL);
3052 } // stringL_indexof_char
3053
3054 // helper function for string_compare
3055 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
3056 Address::ScaleFactor scale, Address::ScaleFactor scale1,
3057 Address::ScaleFactor scale2, Register index, int ae) {
3058 if (ae == StrIntrinsicNode::LL) {
3059 load_unsigned_byte(elem1, Address(str1, index, scale, 0));
3060 load_unsigned_byte(elem2, Address(str2, index, scale, 0));
3061 } else if (ae == StrIntrinsicNode::UU) {
3062 load_unsigned_short(elem1, Address(str1, index, scale, 0));
3063 load_unsigned_short(elem2, Address(str2, index, scale, 0));
3064 } else {
3065 load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
3066 load_unsigned_short(elem2, Address(str2, index, scale2, 0));
3067 }
3068 }
3069
3070 // Compare strings, used for char[] and byte[].
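// A rough Java-level sketch of the result (illustrative only; the element loads depend on
// the encoding ae, and for the UL case the final result is negated at DONE_LABEL):
//   static int compare(char[] s1, int len1, char[] s2, int len2) {
//     int min = Math.min(len1, len2);
//     for (int i = 0; i < min; i++) {
//       if (s1[i] != s2[i]) return s1[i] - s2[i];
//     }
//     return len1 - len2;
//   }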
3071 void C2_MacroAssembler::string_compare(Register str1, Register str2,
3072 Register cnt1, Register cnt2, Register result,
3073 XMMRegister vec1, int ae, KRegister mask) {
3074 ShortBranchVerifier sbv(this);
3075 Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
3076   Label COMPARE_WIDE_VECTORS_LOOP_FAILED; // used only for _LP64 && AVX3
3077 int stride, stride2, adr_stride, adr_stride1, adr_stride2;
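  // stride2x2 is the element count consumed per 64-byte AVX-512 compare iteration:
  // 64 for byte elements (LL), 32 otherwise (adjusted below).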
3078 int stride2x2 = 0x40;
3079 Address::ScaleFactor scale = Address::no_scale;
3080 Address::ScaleFactor scale1 = Address::no_scale;
3081 Address::ScaleFactor scale2 = Address::no_scale;
3082
3083 if (ae != StrIntrinsicNode::LL) {
3084 stride2x2 = 0x20;
3085 }
3086
3087 if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
3088 shrl(cnt2, 1);
3089 }
3090   // Compute the minimum of the string lengths and push the
3091   // difference of the string lengths onto the stack.
3092   // cnt2 becomes min(cnt1, cnt2) via the conditional move below.
3093 movl(result, cnt1);
3094 subl(cnt1, cnt2);
3095 push(cnt1);
3096 cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2)
3097
3098 // Is the minimum length zero?
3099 testl(cnt2, cnt2);
3100 jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3101 if (ae == StrIntrinsicNode::LL) {
3102 // Load first bytes
3103 load_unsigned_byte(result, Address(str1, 0)); // result = str1[0]
3104 load_unsigned_byte(cnt1, Address(str2, 0)); // cnt1 = str2[0]
3105 } else if (ae == StrIntrinsicNode::UU) {
3106 // Load first characters
3107 load_unsigned_short(result, Address(str1, 0));
3108 load_unsigned_short(cnt1, Address(str2, 0));
3109 } else {
3110 load_unsigned_byte(result, Address(str1, 0));
3111 load_unsigned_short(cnt1, Address(str2, 0));
3112 }
3113 subl(result, cnt1);
3114 jcc(Assembler::notZero, POP_LABEL);
3115
3116 if (ae == StrIntrinsicNode::UU) {
3117 // Divide length by 2 to get number of chars
3118 shrl(cnt2, 1);
3119 }
3120 cmpl(cnt2, 1);
3121 jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3122
3123 // Check if the strings start at the same location and setup scale and stride
3124 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3125 cmpptr(str1, str2);
3126 jcc(Assembler::equal, LENGTH_DIFF_LABEL);
3127 if (ae == StrIntrinsicNode::LL) {
3128 scale = Address::times_1;
3129 stride = 16;
3130 } else {
3131 scale = Address::times_2;
3132 stride = 8;
3133 }
3134 } else {
3135 scale1 = Address::times_1;
3136 scale2 = Address::times_2;
3137 // scale not used
3138 stride = 8;
3139 }
3140
3141 if (UseAVX >= 2 && UseSSE42Intrinsics) {
3142 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
3143 Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
3144 Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
3145 Label COMPARE_TAIL_LONG;
3146     Label COMPARE_WIDE_VECTORS_LOOP_AVX3; // used only for _LP64 && AVX3
3147
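    // pcmpestri imm8 0x19 = 0b011001: unsigned word elements (bits 1:0 = 01),
    // 'equal each' aggregation (bits 3:2 = 10), negative polarity (bits 5:4 = 01);
    // bit 0 is cleared below so LL compares unsigned byte elements instead.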
3148 int pcmpmask = 0x19;
3149 if (ae == StrIntrinsicNode::LL) {
3150 pcmpmask &= ~0x01;
3151 }
3152
3153     // Set up to compare 16-char (32-byte) vectors,
3154     // starting from the first character again because its address is aligned.
3155 if (ae == StrIntrinsicNode::LL) {
3156 stride2 = 32;
3157 } else {
3158 stride2 = 16;
3159 }
3160 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3161 adr_stride = stride << scale;
3162 } else {
3163       adr_stride1 = 8;  // stride << scale1;
3164       adr_stride2 = 16; // stride << scale2;
3165 }
3166
3167 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3168     // rax and rdx are used by pcmpestri as element counters
3169 movl(result, cnt2);
3170 andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count
3171 jcc(Assembler::zero, COMPARE_TAIL_LONG);
3172
3173     // Fast path: compare the first two 8-char vectors.
3174 bind(COMPARE_16_CHARS);
3175 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3176 movdqu(vec1, Address(str1, 0));
3177 } else {
3178 pmovzxbw(vec1, Address(str1, 0));
3179 }
3180 pcmpestri(vec1, Address(str2, 0), pcmpmask);
3181 jccb(Assembler::below, COMPARE_INDEX_CHAR);
3182
3183 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3184 movdqu(vec1, Address(str1, adr_stride));
3185 pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
3186 } else {
3187 pmovzxbw(vec1, Address(str1, adr_stride1));
3188 pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
3189 }
3190 jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
3191 addl(cnt1, stride);
3192
3193 // Compare the characters at index in cnt1
3194 bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
3195 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3196 subl(result, cnt2);
3197 jmp(POP_LABEL);
3198
3199 // Setup the registers to start vector comparison loop
3200 bind(COMPARE_WIDE_VECTORS);
3201 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3202 lea(str1, Address(str1, result, scale));
3203 lea(str2, Address(str2, result, scale));
3204 } else {
3205 lea(str1, Address(str1, result, scale1));
3206 lea(str2, Address(str2, result, scale2));
3207 }
3208 subl(result, stride2);
3209 subl(cnt2, stride2);
3210 jcc(Assembler::zero, COMPARE_WIDE_TAIL);
3211 negptr(result);
3212
3213 // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
3214 bind(COMPARE_WIDE_VECTORS_LOOP);
3215
3216 #ifdef _LP64
3217 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3218 cmpl(cnt2, stride2x2);
3219 jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3220 testl(cnt2, stride2x2-1); // cnt2 holds the vector count
3221       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2); // means we cannot subtract 0x40 at a time
3222
3223 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3224 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3225 evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
3226         evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // mask == 11..11 if the operands are equal, otherwise mask has some 0 bits
3227 } else {
3228 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
3229         evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // mask == 11..11 if the operands are equal, otherwise mask has some 0 bits
3230 }
3231 kortestql(mask, mask);
3232 jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED); // miscompare
3233 addptr(result, stride2x2); // update since we already compared at this addr
3234 subl(cnt2, stride2x2); // and sub the size too
3235 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3236
3237 vpxor(vec1, vec1);
3238 jmpb(COMPARE_WIDE_TAIL);
3239 }//if (VM_Version::supports_avx512vlbw())
3240 #endif // _LP64
3241
3242
3243 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3244 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3245 vmovdqu(vec1, Address(str1, result, scale));
3246 vpxor(vec1, Address(str2, result, scale));
3247 } else {
3248 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
3249 vpxor(vec1, Address(str2, result, scale2));
3250 }
3251 vptest(vec1, vec1);
3252 jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
3253 addptr(result, stride2);
3254 subl(cnt2, stride2);
3255 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
3256 // clean upper bits of YMM registers
3257 vpxor(vec1, vec1);
3258
3259 // compare wide vectors tail
3260 bind(COMPARE_WIDE_TAIL);
3261 testptr(result, result);
3262 jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3263
3264 movl(result, stride2);
3265 movl(cnt2, result);
3266 negptr(result);
3267 jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3268
3269     // Identify the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
3270 bind(VECTOR_NOT_EQUAL);
3271 // clean upper bits of YMM registers
3272 vpxor(vec1, vec1);
3273 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3274 lea(str1, Address(str1, result, scale));
3275 lea(str2, Address(str2, result, scale));
3276 } else {
3277 lea(str1, Address(str1, result, scale1));
3278 lea(str2, Address(str2, result, scale2));
3279 }
3280 jmp(COMPARE_16_CHARS);
3281
3282     // Compare tail chars, length between 1 and 15 chars
3283 bind(COMPARE_TAIL_LONG);
3284 movl(cnt2, result);
3285 cmpl(cnt2, stride);
3286 jcc(Assembler::less, COMPARE_SMALL_STR);
3287
3288 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3289 movdqu(vec1, Address(str1, 0));
3290 } else {
3291 pmovzxbw(vec1, Address(str1, 0));
3292 }
3293 pcmpestri(vec1, Address(str2, 0), pcmpmask);
3294 jcc(Assembler::below, COMPARE_INDEX_CHAR);
3295 subptr(cnt2, stride);
3296 jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3297 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3298 lea(str1, Address(str1, result, scale));
3299 lea(str2, Address(str2, result, scale));
3300 } else {
3301 lea(str1, Address(str1, result, scale1));
3302 lea(str2, Address(str2, result, scale2));
3303 }
3304 negptr(cnt2);
3305 jmpb(WHILE_HEAD_LABEL);
3306
3307 bind(COMPARE_SMALL_STR);
3308 } else if (UseSSE42Intrinsics) {
3309 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
3310 int pcmpmask = 0x19;
3311     // Set up to compare 8-char (16-byte) vectors,
3312     // starting from the first character again because its address is aligned.
3313 movl(result, cnt2);
3314 andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count
3315 if (ae == StrIntrinsicNode::LL) {
3316 pcmpmask &= ~0x01;
3317 }
3318 jcc(Assembler::zero, COMPARE_TAIL);
3319 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3320 lea(str1, Address(str1, result, scale));
3321 lea(str2, Address(str2, result, scale));
3322 } else {
3323 lea(str1, Address(str1, result, scale1));
3324 lea(str2, Address(str2, result, scale2));
3325 }
3326 negptr(result);
3327
3328 // pcmpestri
3329 // inputs:
3330     //   vec1 - substring
3331     //   rax  - negative string length (element count)
3332     //   mem  - scanned string
3333     //   rdx  - string length (element count)
3334 // pcmpmask - cmp mode: 11000 (string compare with negated result)
3335 // + 00 (unsigned bytes) or + 01 (unsigned shorts)
3336 // outputs:
3337 // rcx - first mismatched element index
3338 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
3339
3340 bind(COMPARE_WIDE_VECTORS);
3341 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3342 movdqu(vec1, Address(str1, result, scale));
3343 pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3344 } else {
3345 pmovzxbw(vec1, Address(str1, result, scale1));
3346 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3347 }
3348     // After pcmpestri, cnt1 (rcx) contains the mismatched element index
3349
3350 jccb(Assembler::below, VECTOR_NOT_EQUAL); // CF==1
3351 addptr(result, stride);
3352 subptr(cnt2, stride);
3353 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3354
3355 // compare wide vectors tail
3356 testptr(result, result);
3357 jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3358
3359 movl(cnt2, stride);
3360 movl(result, stride);
3361 negptr(result);
3362 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3363 movdqu(vec1, Address(str1, result, scale));
3364 pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
3365 } else {
3366 pmovzxbw(vec1, Address(str1, result, scale1));
3367 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
3368 }
3369 jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
3370
3371 // Mismatched characters in the vectors
3372 bind(VECTOR_NOT_EQUAL);
3373 addptr(cnt1, result);
3374 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
3375 subl(result, cnt2);
3376 jmpb(POP_LABEL);
3377
3378 bind(COMPARE_TAIL); // limit is zero
3379 movl(cnt2, result);
3380 // Fallthru to tail compare
3381 }
3382 // Shift str2 and str1 to the end of the arrays, negate min
3383 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3384 lea(str1, Address(str1, cnt2, scale));
3385 lea(str2, Address(str2, cnt2, scale));
3386 } else {
3387 lea(str1, Address(str1, cnt2, scale1));
3388 lea(str2, Address(str2, cnt2, scale2));
3389 }
3390 decrementl(cnt2); // first character was compared already
3391 negptr(cnt2);
3392
3393 // Compare the rest of the elements
3394 bind(WHILE_HEAD_LABEL);
3395 load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
3396 subl(result, cnt1);
3397 jccb(Assembler::notZero, POP_LABEL);
3398 increment(cnt2);
3399 jccb(Assembler::notZero, WHILE_HEAD_LABEL);
3400
3401 // Strings are equal up to min length. Return the length difference.
3402 bind(LENGTH_DIFF_LABEL);
3403 pop(result);
3404 if (ae == StrIntrinsicNode::UU) {
3405 // Divide diff by 2 to get number of chars
3406 sarl(result, 1);
3407 }
3408 jmpb(DONE_LABEL);
3409
3410 #ifdef _LP64
3411 if (VM_Version::supports_avx512vlbw()) {
3412
3413 bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
3414
3415 kmovql(cnt1, mask);
3416 notq(cnt1);
3417 bsfq(cnt2, cnt1);
3418 if (ae != StrIntrinsicNode::LL) {
3419 // Divide diff by 2 to get number of chars
3420 sarl(cnt2, 1);
3421 }
3422 addq(result, cnt2);
3423 if (ae == StrIntrinsicNode::LL) {
3424 load_unsigned_byte(cnt1, Address(str2, result));
3425 load_unsigned_byte(result, Address(str1, result));
3426 } else if (ae == StrIntrinsicNode::UU) {
3427 load_unsigned_short(cnt1, Address(str2, result, scale));
3428 load_unsigned_short(result, Address(str1, result, scale));
3429 } else {
3430 load_unsigned_short(cnt1, Address(str2, result, scale2));
3431 load_unsigned_byte(result, Address(str1, result, scale1));
3432 }
3433 subl(result, cnt1);
3434 jmpb(POP_LABEL);
3435 }//if (VM_Version::supports_avx512vlbw())
3436 #endif // _LP64
3437
3438 // Discard the stored length difference
3439 bind(POP_LABEL);
3440 pop(cnt1);
3441
3442 // That's it
3443 bind(DONE_LABEL);
3444   if (ae == StrIntrinsicNode::UL) {
3445 negl(result);
3446 }
3447
3448 }
3449
3450 // Search for a non-ASCII character (negative byte value) in a byte array;
3451 // return true if any is found and false otherwise.
3452 // ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
3453 // @IntrinsicCandidate
3454 // private static boolean hasNegatives(byte[] ba, int off, int len) {
3455 // for (int i = off; i < off + len; i++) {
3456 // if (ba[i] < 0) {
3457 // return true;
3458 // }
3459 // }
3460 // return false;
3461 // }
3462 void C2_MacroAssembler::has_negatives(Register ary1, Register len,
3463 Register result, Register tmp1,
3464 XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
3465 // rsi: byte array
3466 // rcx: len
3467 // rax: result
3468 ShortBranchVerifier sbv(this);
3469 assert_different_registers(ary1, len, result, tmp1);
3470 assert_different_registers(vec1, vec2);
3471 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
3472
3473 // len == 0
3474 testl(len, len);
3475 jcc(Assembler::zero, FALSE_LABEL);
3476
3477 if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
3478 VM_Version::supports_avx512vlbw() &&
3479 VM_Version::supports_bmi2()) {
3480
3481 Label test_64_loop, test_tail;
3482 Register tmp3_aliased = len;
3483
3484 movl(tmp1, len);
3485 vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
3486
3487 andl(tmp1, 64 - 1); // tail count (in chars) 0x3F
3488 andl(len, ~(64 - 1)); // vector count (in chars)
3489 jccb(Assembler::zero, test_tail);
3490
3491 lea(ary1, Address(ary1, len, Address::times_1));
3492 negptr(len);
3493
3494 bind(test_64_loop);
3495     // Check whether the current 64 byte-sized elements contain any negatives
3496 evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
3497 kortestql(mask1, mask1);
3498 jcc(Assembler::notZero, TRUE_LABEL);
3499
3500 addptr(len, 64);
3501 jccb(Assembler::notZero, test_64_loop);
3502
3503
3504 bind(test_tail);
3505 // bail out when there is nothing to be done
3506 testl(tmp1, -1);
3507 jcc(Assembler::zero, FALSE_LABEL);
3508
3509     // Build a k-mask with the tmp1 least significant bits set: ~(~0 << tmp1) on 64-bit, a byte-table compare on 32-bit
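    // e.g. tmp1 == 5 gives mask2 == 0b11111, i.e. only the 5 least significant bits set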
3510 #ifdef _LP64
3511 mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
3512 shlxq(tmp3_aliased, tmp3_aliased, tmp1);
3513 notq(tmp3_aliased);
3514 kmovql(mask2, tmp3_aliased);
3515 #else
3516 Label k_init;
3517 jmp(k_init);
3518
3519     // We cannot load 64 bits from a general purpose register on 32-bit VMs, so we
3520     // place the data required to compose 64 ones into the instruction stream:
3521     // a 64-byte series of the values 0..63, which is later used as the compare
3522     // target against the tail count held in the tmp1 register.
3523     // The result is a k register with tmp1 consecutive 1 bits,
3524     // counting from the least significant bit.
3525 address tmp = pc();
3526 emit_int64(0x0706050403020100);
3527 emit_int64(0x0F0E0D0C0B0A0908);
3528 emit_int64(0x1716151413121110);
3529 emit_int64(0x1F1E1D1C1B1A1918);
3530 emit_int64(0x2726252423222120);
3531 emit_int64(0x2F2E2D2C2B2A2928);
3532 emit_int64(0x3736353433323130);
3533 emit_int64(0x3F3E3D3C3B3A3938);
3534
3535 bind(k_init);
3536 lea(len, InternalAddress(tmp));
3537 // create mask to test for negative byte inside a vector
3538 evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
3539 evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit);
3540
3541 #endif
3542 evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
3543 ktestq(mask1, mask2);
3544 jcc(Assembler::notZero, TRUE_LABEL);
3545
3546 jmp(FALSE_LABEL);
3547 } else {
3548 movl(result, len); // copy
3549
3550 if (UseAVX >= 2 && UseSSE >= 2) {
3551 // With AVX2, use 32-byte vector compare
3552 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
3553
3554 // Compare 32-byte vectors
3555 andl(result, 0x0000001f); // tail count (in bytes)
3556 andl(len, 0xffffffe0); // vector count (in bytes)
3557 jccb(Assembler::zero, COMPARE_TAIL);
3558
3559 lea(ary1, Address(ary1, len, Address::times_1));
3560 negptr(len);
3561
3562       movl(tmp1, 0x80808080); // create mask to test for negative (non-ASCII) bytes in the vector
3563 movdl(vec2, tmp1);
3564 vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
3565
3566 bind(COMPARE_WIDE_VECTORS);
3567 vmovdqu(vec1, Address(ary1, len, Address::times_1));
3568 vptest(vec1, vec2);
3569 jccb(Assembler::notZero, TRUE_LABEL);
3570 addptr(len, 32);
3571 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
3572
3573 testl(result, result);
3574 jccb(Assembler::zero, FALSE_LABEL);
3575
3576 vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
3577 vptest(vec1, vec2);
3578 jccb(Assembler::notZero, TRUE_LABEL);
3579 jmpb(FALSE_LABEL);
3580
3581 bind(COMPARE_TAIL); // len is zero
3582 movl(len, result);
3583 // Fallthru to tail compare
3584 } else if (UseSSE42Intrinsics) {
3585 // With SSE4.2, use double quad vector compare
3586 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
3587
3588 // Compare 16-byte vectors
3589 andl(result, 0x0000000f); // tail count (in bytes)
3590 andl(len, 0xfffffff0); // vector count (in bytes)
3591 jcc(Assembler::zero, COMPARE_TAIL);
3592
3593 lea(ary1, Address(ary1, len, Address::times_1));
3594 negptr(len);
3595
3596 movl(tmp1, 0x80808080);
3597 movdl(vec2, tmp1);
3598 pshufd(vec2, vec2, 0);
3599
3600 bind(COMPARE_WIDE_VECTORS);
3601 movdqu(vec1, Address(ary1, len, Address::times_1));
3602 ptest(vec1, vec2);
3603 jcc(Assembler::notZero, TRUE_LABEL);
3604 addptr(len, 16);
3605 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
3606
3607 testl(result, result);
3608 jcc(Assembler::zero, FALSE_LABEL);
3609
3610 movdqu(vec1, Address(ary1, result, Address::times_1, -16));
3611 ptest(vec1, vec2);
3612 jccb(Assembler::notZero, TRUE_LABEL);
3613 jmpb(FALSE_LABEL);
3614
3615 bind(COMPARE_TAIL); // len is zero
3616 movl(len, result);
3617 // Fallthru to tail compare
3618 }
3619 }
3620 // Compare 4-byte vectors
3621 andl(len, 0xfffffffc); // vector count (in bytes)
3622 jccb(Assembler::zero, COMPARE_CHAR);
3623
3624 lea(ary1, Address(ary1, len, Address::times_1));
3625 negptr(len);
3626
3627 bind(COMPARE_VECTORS);
3628 movl(tmp1, Address(ary1, len, Address::times_1));
3629 andl(tmp1, 0x80808080);
3630 jccb(Assembler::notZero, TRUE_LABEL);
3631 addptr(len, 4);
3632 jcc(Assembler::notZero, COMPARE_VECTORS);
3633
3634 // Compare trailing char (final 2 bytes), if any
3635 bind(COMPARE_CHAR);
3636 testl(result, 0x2); // tail char
3637 jccb(Assembler::zero, COMPARE_BYTE);
3638 load_unsigned_short(tmp1, Address(ary1, 0));
3639 andl(tmp1, 0x00008080);
3640 jccb(Assembler::notZero, TRUE_LABEL);
3641 subptr(result, 2);
3642 lea(ary1, Address(ary1, 2));
3643
3644 bind(COMPARE_BYTE);
3645 testl(result, 0x1); // tail byte
3646 jccb(Assembler::zero, FALSE_LABEL);
3647 load_unsigned_byte(tmp1, Address(ary1, 0));
3648 andl(tmp1, 0x00000080);
3649 jccb(Assembler::notEqual, TRUE_LABEL);
3650 jmpb(FALSE_LABEL);
3651
3652 bind(TRUE_LABEL);
3653 movl(result, 1); // return true
3654 jmpb(DONE);
3655
3656 bind(FALSE_LABEL);
3657 xorl(result, result); // return false
3658
3659 // That's it
3660 bind(DONE);
3661 if (UseAVX >= 2 && UseSSE >= 2) {
3662 // clean upper bits of YMM registers
3663 vpxor(vec1, vec1);
3664 vpxor(vec2, vec2);
3665 }
3666 }
3667 // Compare char[] or byte[] arrays (aligned to 4 bytes) or substrings.
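// For the is_array_equ case this is roughly (illustrative sketch only; when is_array_equ
// is false, ary1/ary2 are raw data pointers and limit is the element count):
//   static boolean equals(byte[] a, byte[] b) {
//     if (a == b) return true;
//     if (a == null || b == null || a.length != b.length) return false;
//     for (int i = 0; i < a.length; i++) {
//       if (a[i] != b[i]) return false;
//     }
//     return true;
//   }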
3668 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
3669 Register limit, Register result, Register chr,
3670 XMMRegister vec1, XMMRegister vec2, bool is_char, KRegister mask) {
3671 ShortBranchVerifier sbv(this);
3672 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
3673
3674 int length_offset = arrayOopDesc::length_offset_in_bytes();
3675 int base_offset = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
3676
3677 if (is_array_equ) {
3678 // Check the input args
3679 cmpoop(ary1, ary2);
3680 jcc(Assembler::equal, TRUE_LABEL);
3681
3682 // Need additional checks for arrays_equals.
3683 testptr(ary1, ary1);
3684 jcc(Assembler::zero, FALSE_LABEL);
3685 testptr(ary2, ary2);
3686 jcc(Assembler::zero, FALSE_LABEL);
3687
3688 // Check the lengths
3689 movl(limit, Address(ary1, length_offset));
3690 cmpl(limit, Address(ary2, length_offset));
3691 jcc(Assembler::notEqual, FALSE_LABEL);
3692 }
3693
3694 // count == 0
3695 testl(limit, limit);
3696 jcc(Assembler::zero, TRUE_LABEL);
3697
3698 if (is_array_equ) {
3699 // Load array address
3700 lea(ary1, Address(ary1, base_offset));
3701 lea(ary2, Address(ary2, base_offset));
3702 }
3703
3704 if (is_array_equ && is_char) {
3705 // arrays_equals when used for char[].
3706     shll(limit, 1); // convert char count to byte count (still non-zero)
3707 }
3708 movl(result, limit); // copy
3709
3710 if (UseAVX >= 2) {
3711 // With AVX2, use 32-byte vector compare
3712 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
3713
3714 // Compare 32-byte vectors
3715 andl(result, 0x0000001f); // tail count (in bytes)
3716 andl(limit, 0xffffffe0); // vector count (in bytes)
3717 jcc(Assembler::zero, COMPARE_TAIL);
3718
3719 lea(ary1, Address(ary1, limit, Address::times_1));
3720 lea(ary2, Address(ary2, limit, Address::times_1));
3721 negptr(limit);
3722
3723 #ifdef _LP64
3724 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3725 Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
3726
3727 cmpl(limit, -64);
3728 jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3729
3730 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3731
3732 evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
3733 evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
3734 kortestql(mask, mask);
3735 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare
3736 addptr(limit, 64); // update since we already compared at this addr
3737 cmpl(limit, -64);
3738 jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3739
3740 // At this point we may still need to compare -limit+result bytes.
3741       // We could execute the next two instructions and just continue via the non-wide path:
3742 // cmpl(limit, 0);
3743 // jcc(Assembler::equal, COMPARE_TAIL); // true
3744 // But since we stopped at the points ary{1,2}+limit which are
3745 // not farther than 64 bytes from the ends of arrays ary{1,2}+result
3746 // (|limit| <= 32 and result < 32),
3747 // we may just compare the last 64 bytes.
3748 //
3749       addptr(result, -64);   // it is safe because we just came from this area
3750 evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
3751 evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
3752 kortestql(mask, mask);
3753 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare
3754
3755 jmp(TRUE_LABEL);
3756
3757 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3758
3759 }//if (VM_Version::supports_avx512vlbw())
3760 #endif //_LP64
3761 bind(COMPARE_WIDE_VECTORS);
3762 vmovdqu(vec1, Address(ary1, limit, Address::times_1));
3763 vmovdqu(vec2, Address(ary2, limit, Address::times_1));
3764 vpxor(vec1, vec2);
3765
3766 vptest(vec1, vec1);
3767 jcc(Assembler::notZero, FALSE_LABEL);
3768 addptr(limit, 32);
3769 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
3770
3771 testl(result, result);
3772 jcc(Assembler::zero, TRUE_LABEL);
3773
3774 vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
3775 vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
3776 vpxor(vec1, vec2);
3777
3778 vptest(vec1, vec1);
3779 jccb(Assembler::notZero, FALSE_LABEL);
3780 jmpb(TRUE_LABEL);
3781
3782 bind(COMPARE_TAIL); // limit is zero
3783 movl(limit, result);
3784 // Fallthru to tail compare
3785 } else if (UseSSE42Intrinsics) {
3786 // With SSE4.2, use double quad vector compare
3787 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
3788
3789 // Compare 16-byte vectors
3790 andl(result, 0x0000000f); // tail count (in bytes)
3791 andl(limit, 0xfffffff0); // vector count (in bytes)
3792 jcc(Assembler::zero, COMPARE_TAIL);
3793
3794 lea(ary1, Address(ary1, limit, Address::times_1));
3795 lea(ary2, Address(ary2, limit, Address::times_1));
3796 negptr(limit);
3797
3798 bind(COMPARE_WIDE_VECTORS);
3799 movdqu(vec1, Address(ary1, limit, Address::times_1));
3800 movdqu(vec2, Address(ary2, limit, Address::times_1));
3801 pxor(vec1, vec2);
3802
3803 ptest(vec1, vec1);
3804 jcc(Assembler::notZero, FALSE_LABEL);
3805 addptr(limit, 16);
3806 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
3807
3808 testl(result, result);
3809 jcc(Assembler::zero, TRUE_LABEL);
3810
3811 movdqu(vec1, Address(ary1, result, Address::times_1, -16));
3812 movdqu(vec2, Address(ary2, result, Address::times_1, -16));
3813 pxor(vec1, vec2);
3814
3815 ptest(vec1, vec1);
3816 jccb(Assembler::notZero, FALSE_LABEL);
3817 jmpb(TRUE_LABEL);
3818
3819 bind(COMPARE_TAIL); // limit is zero
3820 movl(limit, result);
3821 // Fallthru to tail compare
3822 }
3823
3824 // Compare 4-byte vectors
3825 andl(limit, 0xfffffffc); // vector count (in bytes)
3826 jccb(Assembler::zero, COMPARE_CHAR);
3827
3828 lea(ary1, Address(ary1, limit, Address::times_1));
3829 lea(ary2, Address(ary2, limit, Address::times_1));
3830 negptr(limit);
3831
3832 bind(COMPARE_VECTORS);
3833 movl(chr, Address(ary1, limit, Address::times_1));
3834 cmpl(chr, Address(ary2, limit, Address::times_1));
3835 jccb(Assembler::notEqual, FALSE_LABEL);
3836 addptr(limit, 4);
3837 jcc(Assembler::notZero, COMPARE_VECTORS);
3838
3839 // Compare trailing char (final 2 bytes), if any
3840 bind(COMPARE_CHAR);
3841 testl(result, 0x2); // tail char
3842 jccb(Assembler::zero, COMPARE_BYTE);
3843 load_unsigned_short(chr, Address(ary1, 0));
3844 load_unsigned_short(limit, Address(ary2, 0));
3845 cmpl(chr, limit);
3846 jccb(Assembler::notEqual, FALSE_LABEL);
3847
3848 if (is_array_equ && is_char) {
3849 bind(COMPARE_BYTE);
3850 } else {
3851 lea(ary1, Address(ary1, 2));
3852 lea(ary2, Address(ary2, 2));
3853
3854 bind(COMPARE_BYTE);
3855 testl(result, 0x1); // tail byte
3856 jccb(Assembler::zero, TRUE_LABEL);
3857 load_unsigned_byte(chr, Address(ary1, 0));
3858 load_unsigned_byte(limit, Address(ary2, 0));
3859 cmpl(chr, limit);
3860 jccb(Assembler::notEqual, FALSE_LABEL);
3861 }
3862 bind(TRUE_LABEL);
3863 movl(result, 1); // return true
3864 jmpb(DONE);
3865
3866 bind(FALSE_LABEL);
3867 xorl(result, result); // return false
3868
3869 // That's it
3870 bind(DONE);
3871 if (UseAVX >= 2) {
3872 // clean upper bits of YMM registers
3873 vpxor(vec1, vec1);
3874 vpxor(vec2, vec2);
3875 }
3876 }
3877
3878 #ifdef _LP64
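// Reduce a boolean byte-vector mask (one 0/1 byte per lane) to a scalar, for both
// overloads below:
//   Op_VectorMaskTrueCount -> number of set lanes
//   Op_VectorMaskFirstTrue -> index of the first set lane, or masklen if none is set
//   Op_VectorMaskLastTrue  -> index of the last set lane, or -1 if none is set
// The 0/1 lanes are negated to 0/0xFF so that evpmovb2m / vpmovmskb can collapse them
// into a scalar bitmask (the 0/1 lane layout is an assumption about the C2 mask
// representation, inferred from the vpsubb below).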
3879 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
3880 Register tmp, KRegister ktmp, int masklen, int vec_enc) {
3881 assert(VM_Version::supports_avx512vlbw(), "");
3882 vpxor(xtmp, xtmp, xtmp, vec_enc);
3883 vpsubb(xtmp, xtmp, mask, vec_enc);
3884 evpmovb2m(ktmp, xtmp, vec_enc);
3885 kmovql(tmp, ktmp);
3886 switch(opc) {
3887 case Op_VectorMaskTrueCount:
3888 popcntq(dst, tmp);
3889 break;
3890 case Op_VectorMaskLastTrue:
3891 mov64(dst, -1);
3892 bsrq(tmp, tmp);
3893 cmov(Assembler::notZero, dst, tmp);
3894 break;
3895 case Op_VectorMaskFirstTrue:
3896 mov64(dst, masklen);
3897 bsfq(tmp, tmp);
3898 cmov(Assembler::notZero, dst, tmp);
3899 break;
3900 default: assert(false, "Unhandled mask operation");
3901 }
3902 }
3903
3904 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
3905 XMMRegister xtmp1, Register tmp, int masklen, int vec_enc) {
3906 assert(VM_Version::supports_avx(), "");
3907 vpxor(xtmp, xtmp, xtmp, vec_enc);
3908 vpsubb(xtmp, xtmp, mask, vec_enc);
3909 vpmovmskb(tmp, xtmp, vec_enc);
3910 if (masklen < 64) {
3911 andq(tmp, (((jlong)1 << masklen) - 1));
3912 }
3913 switch(opc) {
3914 case Op_VectorMaskTrueCount:
3915 popcntq(dst, tmp);
3916 break;
3917 case Op_VectorMaskLastTrue:
3918 mov64(dst, -1);
3919 bsrq(tmp, tmp);
3920 cmov(Assembler::notZero, dst, tmp);
3921 break;
3922 case Op_VectorMaskFirstTrue:
3923 mov64(dst, masklen);
3924 bsfq(tmp, tmp);
3925 cmov(Assembler::notZero, dst, tmp);
3926 break;
3927 default: assert(false, "Unhandled mask operation");
3928 }
3929 }
3930 #endif
3931
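// Cross-lane byte rearrange: conceptually dst[i] = src[shuffle[i]] for every byte lane.
// vpshufb only shuffles within a 128-bit lane, so the result is assembled from four
// masked in-lane shuffles, one per 128-bit source lane selected by evshufi64x2.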
3932 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
3933 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
3934 int vlen_enc) {
3935 assert(VM_Version::supports_avx512bw(), "");
3936   // Byte shuffles are in-lane operations: vpshufb takes the index for each byte from
3937   // the lower 4 bits of the corresponding shuffle byte, so all shuffle indices are
3938   // effectively reduced modulo 16 to the range 0-15. Indices that differ by a
3939   // multiple of 16 (e.g. 0, 16, 32 and 48) therefore select the same relative
3940   // position within a 128-bit lane; the masked shuffles below decide which 128-bit
3941   // source lane each destination byte is taken from.
3942 movl(rtmp, 16);
3943 evpbroadcastb(xtmp1, rtmp, vlen_enc);
3944
3945   // Compute a mask for the shuffle vector by comparing the indices against INDEX < 16.
3946   // Broadcast the first 128-bit lane of src across the entire vector, shuffle it with
3947   // the original shuffle indices, and move the shuffled bytes selected by the true
3948   // mask into the destination vector.
3949 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
3950 evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
3951 evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);
3952
3953 // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
3954 // and broadcasting second 128 bit lane.
3955 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
3956 vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
3957 evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
3958 evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
3959 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
3960
3961 // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
3962 // and broadcasting third 128 bit lane.
3963 evpcmpb(ktmp, k0, shuffle, xtmp2, Assembler::nlt, true, vlen_enc);
3964 vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
3965 evpcmpb(ktmp, ktmp, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
3966 evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
3967 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
3968
3969   // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
3970   // and broadcasting the fourth 128 bit lane.
3971 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
3972 vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
3973 evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
3974 evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
3975 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
3976 }