/*
 * Copyright (c) 2020, 2021, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "oops/methodData.hpp"
#include "opto/c2_CodeStubs.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/opcodes.hpp"
#include "opto/output.hpp"
#include "opto/subnode.hpp"
#include "runtime/biasedLocking.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/stubRoutines.hpp"

inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
  switch (vlen_in_bytes) {
    case  4: // fall-through
    case  8: // fall-through
    case 16: return Assembler::AVX_128bit;
    case 32: return Assembler::AVX_256bit;
    case 64: return Assembler::AVX_512bit;

    default: {
      ShouldNotReachHere();
      return Assembler::AVX_NoVec;
    }
  }
}

void C2_MacroAssembler::setvectmask(Register dst, Register src, KRegister mask) {
  guarantee(PostLoopMultiversioning, "must be");
  Assembler::movl(dst, 1);
  Assembler::shlxl(dst, dst, src);
  Assembler::decl(dst);
  Assembler::kmovdl(mask, dst);
  Assembler::movl(dst, src);
}

void C2_MacroAssembler::restorevectmask(KRegister mask) {
  guarantee(PostLoopMultiversioning, "must be");
  Assembler::knotwl(mask, k0);
}

#if INCLUDE_RTM_OPT

// Update rtm_counters based on abort status
// input: abort_status
//        rtm_counters (RTMLockingCounters*)
// flags are killed
void C2_MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {

  atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
  if (PrintPreciseRTMLockingStatistics) {
    for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
      Label check_abort;
      testl(abort_status, (1<<i));
      jccb(Assembler::equal, check_abort);
      atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
      bind(check_abort);
    }
  }
}

// Branch if (random & (count-1) != 0), count is 2^n
// tmp, scr and flags are killed
void C2_MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
  assert(tmp == rax, "");
  assert(scr == rdx, "");
  rdtsc(); // modifies EDX:EAX
  andptr(tmp, count-1);
  jccb(Assembler::notZero, brLabel);
}
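// Note (added for clarity): because 'count' is a power of two, the low bits of the
// time-stamp counter act as a cheap pseudo-random sample via 'and tmp, count-1'.
// For example, with count == RTMTotalCountIncrRate == 64 (an illustrative value),
// the branch above falls through only about once in 64 calls, so callers update
// their total counters on a sampled basis rather than on every lock acquisition.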
// Perform abort ratio calculation, set no_rtm bit if high ratio
// input:  rtm_counters_Reg (RTMLockingCounters* address)
// tmpReg, rtm_counters_Reg and flags are killed
void C2_MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
                                                    Register rtm_counters_Reg,
                                                    RTMLockingCounters* rtm_counters,
                                                    Metadata* method_data) {
  Label L_done, L_check_always_rtm1, L_check_always_rtm2;

  if (RTMLockingCalculationDelay > 0) {
    // Delay calculation
    movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()), tmpReg);
    testptr(tmpReg, tmpReg);
    jccb(Assembler::equal, L_done);
  }
  // Abort ratio calculation only if abort_count > RTMAbortThreshold
  //   Aborted transactions = abort_count * 100
  //   All transactions = total_count * RTMTotalCountIncrRate
  //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)

  movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
  cmpptr(tmpReg, RTMAbortThreshold);
  jccb(Assembler::below, L_check_always_rtm2);
  imulptr(tmpReg, tmpReg, 100);

  Register scrReg = rtm_counters_Reg;
  movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
  imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
  imulptr(scrReg, scrReg, RTMAbortRatio);
  cmpptr(tmpReg, scrReg);
  jccb(Assembler::below, L_check_always_rtm1);
  if (method_data != NULL) {
    // set rtm_state to "no rtm" in MDO
    mov_metadata(tmpReg, method_data);
    lock();
    orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), NoRTM);
  }
  jmpb(L_done);
  bind(L_check_always_rtm1);
  // Reload RTMLockingCounters* address
  lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
  bind(L_check_always_rtm2);
  movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
  cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
  jccb(Assembler::below, L_done);
  if (method_data != NULL) {
    // set rtm_state to "always rtm" in MDO
    mov_metadata(tmpReg, method_data);
    lock();
    orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), UseRTM);
  }
  bind(L_done);
}
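// Illustration of the ratio check above (hypothetical flag values): with
// RTMTotalCountIncrRate == 64 and RTMAbortRatio == 50, the no_rtm state is set once
// abort_count * 100 >= total_count * 64 * 50, i.e. once roughly half of the estimated
// transactions (total_count is only incremented for every 64th transaction) have
// aborted. Otherwise, once total_count reaches RTMLockingThreshold / RTMTotalCountIncrRate,
// the always_rtm state is set.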
// Update counters and perform abort ratio calculation
// input:  abort_status_Reg
// rtm_counters_Reg, flags are killed
void C2_MacroAssembler::rtm_profiling(Register abort_status_Reg,
                                      Register rtm_counters_Reg,
                                      RTMLockingCounters* rtm_counters,
                                      Metadata* method_data,
                                      bool profile_rtm) {

  assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
  // update rtm counters based on rax value at abort
  // reads abort_status_Reg, updates flags
  lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
  rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
  if (profile_rtm) {
    // Save abort status because abort_status_Reg is used by following code.
    if (RTMRetryCount > 0) {
      push(abort_status_Reg);
    }
    assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
    rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
    // restore abort status
    if (RTMRetryCount > 0) {
      pop(abort_status_Reg);
    }
  }
}

// Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
// inputs: retry_count_Reg
//       : abort_status_Reg
// output: retry_count_Reg decremented by 1
// flags are killed
void C2_MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
  Label doneRetry;
  assert(abort_status_Reg == rax, "");
  // The abort reason bits are in eax (see all states in rtmLocking.hpp)
  // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
  // if reason is in 0x6 and retry count != 0 then retry
  andptr(abort_status_Reg, 0x6);
  jccb(Assembler::zero, doneRetry);
  testl(retry_count_Reg, retry_count_Reg);
  jccb(Assembler::zero, doneRetry);
  pause();
  decrementl(retry_count_Reg);
  jmp(retryLabel);
  bind(doneRetry);
}

// Spin and retry if lock is busy,
// inputs: box_Reg (monitor address)
//       : retry_count_Reg
// output: retry_count_Reg decremented by 1
//       : clear z flag if retry count exceeded
// tmp_Reg, scr_Reg, flags are killed
void C2_MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
                                               Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
  Label SpinLoop, SpinExit, doneRetry;
  int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);

  testl(retry_count_Reg, retry_count_Reg);
  jccb(Assembler::zero, doneRetry);
  decrementl(retry_count_Reg);
  movptr(scr_Reg, RTMSpinLoopCount);

  bind(SpinLoop);
  pause();
  decrementl(scr_Reg);
  jccb(Assembler::lessEqual, SpinExit);
  movptr(tmp_Reg, Address(box_Reg, owner_offset));
  testptr(tmp_Reg, tmp_Reg);
  jccb(Assembler::notZero, SpinLoop);

  bind(SpinExit);
  jmp(retryLabel);
  bind(doneRetry);
  incrementl(retry_count_Reg); // clear z flag
}

// Use RTM for normal stack locks
// Input: objReg (object to lock)
void C2_MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
                                          Register retry_on_abort_count_Reg,
                                          RTMLockingCounters* stack_rtm_counters,
                                          Metadata* method_data, bool profile_rtm,
                                          Label& DONE_LABEL, Label& IsInflated) {
  assert(UseRTMForStackLocks, "why call this otherwise?");
  assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
  assert(tmpReg == rax, "");
  assert(scrReg == rdx, "");
  Label L_rtm_retry, L_decrement_retry, L_on_abort;

  if (RTMRetryCount > 0) {
    movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
    bind(L_rtm_retry);
  }
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
  testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral|biased
  jcc(Assembler::notZero, IsInflated);

  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    Label L_noincrement;
    if (RTMTotalCountIncrRate > 1) {
      // tmpReg, scrReg and flags are killed
      branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
    }
    assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
    atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
    bind(L_noincrement);
  }
  xbegin(L_on_abort);
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
  andptr(tmpReg, markWord::biased_lock_mask_in_place);              // look at 3 lock bits
  cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 001 unlocked
  jcc(Assembler::equal, DONE_LABEL);                                // all done if unlocked

  Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
  if (UseRTMXendForLockBusy) {
    xend();
    movptr(abort_status_Reg, 0x2); // Set the abort status to 2 (so we can retry)
    jmp(L_decrement_retry);
  }
  else {
    xabort(0);
  }
  bind(L_on_abort);
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
  }
  bind(L_decrement_retry);
  if (RTMRetryCount > 0) {
    // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
  }
}

// Use RTM for inflating locks
// inputs: objReg (object to lock)
//         boxReg (on-stack box address (displaced header location) - KILLED)
//         tmpReg (ObjectMonitor address + markWord::monitor_value)
void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
                                             Register scrReg, Register retry_on_busy_count_Reg,
                                             Register retry_on_abort_count_Reg,
                                             RTMLockingCounters* rtm_counters,
                                             Metadata* method_data, bool profile_rtm,
                                             Label& DONE_LABEL) {
  assert(UseRTMLocking, "why call this otherwise?");
  assert(tmpReg == rax, "");
  assert(scrReg == rdx, "");
  Label L_rtm_retry, L_decrement_retry, L_on_abort;
  int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);

  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
  movptr(boxReg, tmpReg); // Save ObjectMonitor address

  if (RTMRetryCount > 0) {
    movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
    movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
    bind(L_rtm_retry);
  }
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    Label L_noincrement;
    if (RTMTotalCountIncrRate > 1) {
      // tmpReg, scrReg and flags are killed
      branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
    }
    assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
    atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
    bind(L_noincrement);
  }
  xbegin(L_on_abort);
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
  movptr(tmpReg, Address(tmpReg, owner_offset));
  testptr(tmpReg, tmpReg);
  jcc(Assembler::zero, DONE_LABEL);
  if (UseRTMXendForLockBusy) {
    xend();
    jmp(L_decrement_retry);
  }
  else {
    xabort(0);
  }
  bind(L_on_abort);
  Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
  }
  if (RTMRetryCount > 0) {
    // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
  }

  movptr(tmpReg, Address(boxReg, owner_offset));
  testptr(tmpReg, tmpReg);
  jccb(Assembler::notZero, L_decrement_retry);

  // Appears unlocked - try to swing _owner from null to non-null.
  // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
#ifdef _LP64
  Register threadReg = r15_thread;
#else
  get_thread(scrReg);
  Register threadReg = scrReg;
#endif
  lock();
  cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg

  if (RTMRetryCount > 0) {
    // success done else retry
    jccb(Assembler::equal, DONE_LABEL);
    bind(L_decrement_retry);
    // Spin and retry if lock is busy.
    rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
  }
  else {
    bind(L_decrement_retry);
  }
}

#endif // INCLUDE_RTM_OPT

// fast_lock and fast_unlock used by C2

// Because the transitions from emitted code to the runtime
// monitorenter/exit helper stubs are so slow it's critical that
// we inline both the stack-locking fast path and the inflated fast path.
//
// See also: cmpFastLock and cmpFastUnlock.
//
// What follows is a specialized inline transliteration of the code
// in enter() and exit(). If we're concerned about I$ bloat another
// option would be to emit TrySlowEnter and TrySlowExit methods
// at startup-time. These methods would accept arguments as
// (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
// indications in the icc.ZFlag. fast_lock and fast_unlock would simply
// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
// In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
//
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods. That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
// to park() or unpark() threads. We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) explicit barriers or fence operations.
//
// TODO:
//
// *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
//    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
//    Given TLAB allocation, Self is usually manifested in a register, so passing it into
//    the lock operators would typically be faster than reifying Self.
//
// *  Ideally I'd define the primitives as:
//       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
//       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
//    Unfortunately ADLC bugs prevent us from expressing the ideal form.
//    Instead, we're stuck with the rather awkward and brittle register assignments below.
//    Furthermore the register assignments are overconstrained, possibly resulting in
//    sub-optimal code near the synchronization site.
//
// *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
//    Alternately, use a better sp-proximity test.
//
// *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
//    Either one is sufficient to uniquely identify a thread.
//    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
//
// *  Intrinsify notify() and notifyAll() for the common cases where the
//    object is locked by the calling thread but the waitlist is empty.
//    Avoid the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
//
// *  Use jccb and jmpb instead of jcc and jmp to improve code density.
//    But beware of excessive branch density on AMD Opterons.
//
// *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
//    or failure of the fast path. If the fast path fails then we pass
//    control to the slow path, typically in C. In fast_lock and
//    fast_unlock we often branch to DONE_LABEL, just to find that C2
//    will emit a conditional branch immediately after the node.
//    So we have branches to branches and lots of ICC.ZF games.
//    Instead, it might be better to have C2 pass a "FailureLabel"
//    into fast_lock and fast_unlock. In the case of success, control
//    will drop through the node. ICC.ZF is undefined at exit.
//    In the case of failure, the node will branch directly to the
//    FailureLabel.


// obj: object to lock
// box: on-stack box address (displaced header location) - KILLED
// rax,: tmp -- KILLED
// scr: tmp -- KILLED
void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
                                  Register scrReg, Register cx1Reg, Register cx2Reg, Register thread,
                                  BiasedLockingCounters* counters,
                                  RTMLockingCounters* rtm_counters,
                                  RTMLockingCounters* stack_rtm_counters,
                                  Metadata* method_data,
                                  bool use_rtm, bool profile_rtm) {
  // Ensure the register assignments are disjoint
  assert(tmpReg == rax, "");

  if (use_rtm) {
    assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
  } else {
    assert(cx2Reg == noreg, "");
    assert_different_registers(objReg, boxReg, tmpReg, scrReg);
  }

  if (counters != NULL) {
    atomic_incl(ExternalAddress((address)counters->total_entry_count_addr()), scrReg);
  }

  // Possible cases that we'll encounter in fast_lock
  // ------------------------------------------------
  // * Inflated
  //    -- unlocked
  //    -- Locked
  //       = by self
  //       = by other
  // * biased
  //    -- by Self
  //    -- by other
  // * neutral
  // * stack-locked
  //    -- by self
  //       = sp-proximity test hits
  //       = sp-proximity test generates false-negative
  //    -- by other
  //

  Label IsInflated, DONE_LABEL;

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(tmpReg, objReg, cx1Reg);
    movl(tmpReg, Address(tmpReg, Klass::access_flags_offset()));
    testl(tmpReg, JVM_ACC_IS_VALUE_BASED_CLASS);
    jcc(Assembler::notZero, DONE_LABEL);
  }

  // it's stack-locked, biased or neutral
  // TODO: optimize away redundant LDs of obj->mark and improve the markword triage
  // order to reduce the number of conditional branches in the most common cases.
  // Beware -- there's a subtle invariant that fetch of the markword
  // at [FETCH], below, will never observe a biased encoding (*101b).
  // If this invariant is not held we risk exclusion (safety) failure.
  if (UseBiasedLocking && !UseOptoBiasInlining) {
    biased_locking_enter(boxReg, objReg, tmpReg, scrReg, cx1Reg, false, DONE_LABEL, NULL, counters);
  }

#if INCLUDE_RTM_OPT
  if (UseRTMForStackLocks && use_rtm) {
    rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
                      stack_rtm_counters, method_data, profile_rtm,
                      DONE_LABEL, IsInflated);
  }
#endif // INCLUDE_RTM_OPT

  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // [FETCH]
  testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral|biased
  jccb(Assembler::notZero, IsInflated);

  if (UseFastLocking) {
#ifdef _LP64
    fast_lock_impl(objReg, tmpReg, thread, scrReg, DONE_LABEL, false);
    xorl(tmpReg, tmpReg); // Set ZF=1 to indicate success
#else
    // We cannot emit the lock-stack-check in verified_entry() because we don't have enough
    // registers (for thread ptr). Therefore we have to emit the lock-stack-check in
    // fast_lock_impl(). However, that check can take a slow-path with ZF=1, therefore
    // we need to handle it specially and force ZF=0 before taking the actual slow-path.
    Label slow;
    fast_lock_impl(objReg, tmpReg, thread, scrReg, slow);
    xorl(tmpReg, tmpReg);
    jmp(DONE_LABEL);
    bind(slow);
    testptr(objReg, objReg); // ZF=0 to indicate failure
#endif
  } else {
    // Attempt stack-locking ...
    orptr (tmpReg, markWord::unlocked_value);
    movptr(Address(boxReg, 0), tmpReg);  // Anticipate successful CAS
    lock();
    cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Updates tmpReg
    if (counters != NULL) {
      cond_inc32(Assembler::equal,
                 ExternalAddress((address)counters->fast_path_entry_count_addr()));
    }
    jcc(Assembler::equal, DONE_LABEL);   // Success

    // Recursive locking.
    // The object is stack-locked: markword contains stack pointer to BasicLock.
    // Locked by current thread if difference with current SP is less than one page.
    subptr(tmpReg, rsp);
    // Next instruction sets ZFlag == 1 (Success) if difference is less than one page.
    andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) );
    movptr(Address(boxReg, 0), tmpReg);
    if (counters != NULL) {
      cond_inc32(Assembler::equal,
                 ExternalAddress((address)counters->fast_path_entry_count_addr()));
    }
  }
  jmp(DONE_LABEL);

  bind(IsInflated);
  // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value

#if INCLUDE_RTM_OPT
  // Use the same RTM locking code in 32- and 64-bit VM.
  if (use_rtm) {
    rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
                         rtm_counters, method_data, profile_rtm, DONE_LABEL);
  } else {
#endif // INCLUDE_RTM_OPT

#ifndef _LP64
  // The object is inflated.

  // boxReg refers to the on-stack BasicLock in the current frame.
  // We'd like to write:
  //   set box->_displaced_header = markWord::unused_mark(). Any non-0 value suffices.
  // This is convenient but results in a ST-before-CAS penalty. The following CAS suffers
  // additional latency as we have another ST in the store buffer that must drain.

  // avoid ST-before-CAS
  // register juggle because we need tmpReg for cmpxchgptr below
  movptr(scrReg, boxReg);
  movptr(boxReg, tmpReg);  // consider: LEA box, [tmp-2]

  // Optimistic form: consider XORL tmpReg,tmpReg
  movptr(tmpReg, NULL_WORD);

  // Appears unlocked - try to swing _owner from null to non-null.
  // Ideally, I'd manifest "Self" with get_thread and then attempt
  // to CAS the register containing Self into m->Owner.
  // But we don't have enough registers, so instead we can either try to CAS
  // rsp or the address of the box (in scr) into &m->owner. If the CAS succeeds
  // we later store "Self" into m->Owner. Transiently storing a stack address
  // (rsp or the address of the box) into m->owner is harmless.
  // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
  lock();
  cmpxchgptr(thread, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  movptr(Address(scrReg, 0), 3); // box->_displaced_header = 3
  // If the CAS fails we can either retry or pass control to the slow path.
  // We use the latter tactic.
  // Pass the CAS result in the icc.ZFlag into DONE_LABEL
  // If the CAS was successful ...
  //   Self has acquired the lock
  //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
  // Intentional fall-through into DONE_LABEL ...
#else // _LP64
  // It's inflated and we use scrReg for ObjectMonitor* in this section.
  movq(scrReg, tmpReg);
  xorq(tmpReg, tmpReg);
  lock();
  cmpxchgptr(r15_thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // Unconditionally set box->_displaced_header = markWord::unused_mark().
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
  // Propagate ICC.ZF from CAS above into DONE_LABEL.
  jcc(Assembler::equal, DONE_LABEL);    // CAS above succeeded; propagate ZF = 1 (success)

  cmpptr(r15_thread, rax);              // Check if we are already the owner (recursive lock)
  jcc(Assembler::notEqual, DONE_LABEL); // If not recursive, ZF = 0 at this point (fail)
  incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
#endif // _LP64
#if INCLUDE_RTM_OPT
  } // use_rtm()
#endif
  // DONE_LABEL is a hot target - we'd really like to place it at the
  // start of cache line by padding with NOPs.
  // See the AMD and Intel software optimization manuals for the
  // most efficient "long" NOP encodings.
  // Unfortunately none of our alignment mechanisms suffice.
  bind(DONE_LABEL);

  // At DONE_LABEL the icc ZFlag is set as follows ...
  // fast_unlock uses the same protocol.
  // ZFlag == 1 -> Success
  // ZFlag == 0 -> Failure - force control through the slow path
}

// obj: object to unlock
// box: box address (displaced header location), killed. Must be EAX.
// tmp: killed, cannot be obj nor box.
//
// Some commentary on balanced locking:
//
// fast_lock and fast_unlock are emitted only for provably balanced lock sites.
// Methods that don't have provably balanced locking are forced to run in the
// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
// The interpreter provides two properties:
// I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired by the current activation (frame). Recall that the
//      interpreter maintains an on-stack list of locks currently held by
//      a frame.
// I2:  If a method attempts to unlock an object that is not held by the
//      frame the interpreter throws IMSX.
//
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
// B() doesn't have provably balanced locking so it runs in the interpreter.
// Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O
// is still locked by A().
//
// The only other source of unbalanced locking would be JNI. The "Java Native Interface:
// Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
// should not be unlocked by "normal" java-level locking and vice-versa. The specification
// doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably given that the spec legislates the JNI case as undefined our implementation
// could reasonably *avoid* checking owner in fast_unlock().
// In the interest of performance we elide m->Owner==Self check in unlock.
// A perfectly viable alternative is to elide the owner check except when
// Xcheck:jni is enabled.

void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
  assert(boxReg == rax, "");
  assert_different_registers(objReg, boxReg, tmpReg);

  Label DONE_LABEL, Stacked, CheckSucc;

  // Critically, the biased locking test must have precedence over
  // and appear before the (box->dhw == 0) recursive stack-lock test.
  if (UseBiasedLocking && !UseOptoBiasInlining) {
    biased_locking_exit(objReg, tmpReg, DONE_LABEL);
  }

#if INCLUDE_RTM_OPT
  if (UseRTMForStackLocks && use_rtm) {
    assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
    Label L_regular_unlock;
    movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
    andptr(tmpReg, markWord::biased_lock_mask_in_place);              // look at 3 lock bits
    cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 001 unlocked
    jccb(Assembler::notEqual, L_regular_unlock);                      // if !HLE RegularLock
    xend();                                                           // otherwise end...
    jmp(DONE_LABEL);                                                  // ... and we're done
    bind(L_regular_unlock);
  }
#endif

  if (!UseFastLocking) {
    cmpptr(Address(boxReg, 0), (int32_t)NULL_WORD);                   // Examine the displaced header
    jcc   (Assembler::zero, DONE_LABEL);                              // 0 indicates recursive stack-lock
  }
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));   // Examine the object's markword
  testptr(tmpReg, markWord::monitor_value);                           // Inflated?
  jcc(Assembler::zero, Stacked);

  if (UseFastLocking) {
    // If the owner is ANONYMOUS, we need to fix it.
    testb(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int) (intptr_t) ANONYMOUS_OWNER);
#ifdef _LP64
    C2HandleAnonOMOwnerStub* stub = new (Compile::current()->comp_arena()) C2HandleAnonOMOwnerStub(tmpReg);
    Compile::current()->output()->add_stub(stub);
    jcc(Assembler::notEqual, stub->entry());
    bind(stub->continuation());
#else
    // We can't easily implement this optimization on 32 bit because we don't have a thread register.
    // Call the slow-path instead.
    jcc(Assembler::notEqual, DONE_LABEL);
#endif
  }

  // It's inflated.
#if INCLUDE_RTM_OPT
  if (use_rtm) {
    Label L_regular_inflated_unlock;
    int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
    movptr(boxReg, Address(tmpReg, owner_offset));
    testptr(boxReg, boxReg);
    jccb(Assembler::notZero, L_regular_inflated_unlock);
    xend();
    jmpb(DONE_LABEL);
    bind(L_regular_inflated_unlock);
  }
#endif

  // Despite our balanced locking property we still check that m->_owner == Self
  // as java routines or native JNI code called by this thread might
  // have released the lock.
  // Refer to the comments in synchronizer.cpp for how we might encode extra
  // state in _succ so we can avoid fetching EntryList|cxq.
  //
  // If there's no contention try a 1-0 exit. That is, exit without
  // a costly MEMBAR or CAS. See synchronizer.cpp for details on how
  // we detect and recover from the race that the 1-0 exit admits.
  //
  // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
  // before it STs null into _owner, releasing the lock. Updates
  // to data protected by the critical section must be visible before
  // we drop the lock (and thus before any other thread could acquire
  // the lock and observe the fields protected by the lock).
  // IA32's memory-model is SPO, so STs are ordered with respect to
  // each other and there's no need for an explicit barrier (fence).
  // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
#ifndef _LP64
  get_thread (boxReg);

  // Note that we could employ various encoding schemes to reduce
  // the number of loads below (currently 4) to just 2 or 3.
  // Refer to the comments in synchronizer.cpp.
  // In practice the chain of fetches doesn't seem to impact performance, however.
  xorptr(boxReg, boxReg);
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jccb  (Assembler::notZero, DONE_LABEL);
  movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
  jccb  (Assembler::notZero, CheckSucc);
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
  jmpb  (DONE_LABEL);

  bind (Stacked);
  if (UseFastLocking) {
    mov(boxReg, tmpReg);
    fast_unlock_impl(objReg, boxReg, tmpReg, DONE_LABEL);
    xorl(tmpReg, tmpReg);
  } else {
    // It's not inflated and it's not recursively stack-locked and it's not biased.
    // It must be stack-locked.
    // Try to reset the header to displaced header.
    // The "box" value on the stack is stable, so we can reload
    // and be assured we observe the same value as above.
    movptr(tmpReg, Address(boxReg, 0));
    lock();
    cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
  }
  // Intentional fall-through into DONE_LABEL

  // DONE_LABEL is a hot target - we'd really like to place it at the
  // start of cache line by padding with NOPs.
  // See the AMD and Intel software optimization manuals for the
  // most efficient "long" NOP encodings.
  // Unfortunately none of our alignment mechanisms suffice.
  bind (CheckSucc);
#else // _LP64
  // It's inflated
  Label LNotRecursive, LSuccess, LGoSlowPath;

  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
  jccb(Assembler::equal, LNotRecursive);

  // Recursive inflated unlock
  decq(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jmpb(LSuccess);

  bind(LNotRecursive);
  movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
  jccb  (Assembler::notZero, CheckSucc);
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
  jmpb  (DONE_LABEL);

  // Try to avoid passing control into the slow_path ...
  bind  (CheckSucc);

  // The following optional optimization can be elided if necessary
  // Effectively: if (succ == null) goto slow path
  // The code reduces the window for a race, however,
  // and thus benefits performance.
  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
  jccb  (Assembler::zero, LGoSlowPath);

  xorptr(boxReg, boxReg);
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);

  // Memory barrier/fence
  // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
  // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
  // This is faster on Nehalem and AMD Shanghai/Barcelona.
  // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
  // We might also restructure (ST Owner=0;barrier;LD _Succ) to
  // (mov box,0; xchgq box, &m->Owner; LD _succ) .
  lock(); addl(Address(rsp, 0), 0);

  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
  jccb  (Assembler::notZero, LSuccess);

  // Rare inopportune interleaving - race.
  // The successor vanished in the small window above.
  // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
  // We need to ensure progress and succession.
  // Try to reacquire the lock.
  // If that fails then the new owner is responsible for succession and this
  // thread needs to take no further action and can exit via the fast path (success).
  // If the re-acquire succeeds then pass control into the slow path.
  // As implemented, this latter mode is horrible because we generated more
  // coherence traffic on the lock *and* artificially extended the critical section
  // length by virtue of passing control into the slow path.

  // box is really RAX -- the following CMPXCHG depends on that binding
  // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
  lock();
  cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // There's no successor so we tried to regrab the lock.
  // If that didn't work, then another thread grabbed the
  // lock so we're done (and exit was a success).
  jccb  (Assembler::notEqual, LSuccess);
  // Intentional fall-through into slow path

  bind  (LGoSlowPath);
  orl   (boxReg, 1);   // set ICC.ZF=0 to indicate failure
  jmpb  (DONE_LABEL);

  bind  (LSuccess);
  testl (boxReg, 0);   // set ICC.ZF=1 to indicate success
  jmpb  (DONE_LABEL);

  bind  (Stacked);

  if (UseFastLocking) {
    mov(boxReg, tmpReg);
    fast_unlock_impl(objReg, boxReg, tmpReg, DONE_LABEL);
    xorl(tmpReg, tmpReg);
  } else {
    movptr(tmpReg, Address (boxReg, 0)); // re-fetch
    lock();
    cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
  }

#endif
  bind(DONE_LABEL);
}

//-------------------------------------------------------------------------------------------
// Generic instructions support for use in .ad files C2 code generation

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, Register scr) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVD) {
    andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), scr);
  } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegD");
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), scr);
  }
}

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
  if (opcode == Op_AbsVD) {
    vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, scr);
  } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegD");
    vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, scr);
  }
}
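// Note (added for clarity): the double variants above and the float variants below
// rely on the same bit trick - absolute value is an AND with a constant that clears
// only the sign bit, and negation is an XOR with a constant that flips only the sign
// bit - so no floating-point arithmetic (and no NaN special-casing) is required.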
void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, Register scr) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVF) {
    andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), scr);
  } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegF");
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), scr);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
  if (opcode == Op_AbsVF) {
    vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, scr);
  } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegF");
    vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, scr);
  }
}

void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
  assert(tmp == xnoreg || elem_bt == T_LONG, "unused");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      pminsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pminsw(dst, src);
    } else if (elem_bt == T_INT) {
      pminsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      movdqu(xmm0, dst);
      pcmpgtq(xmm0, src);
      blendvpd(dst, src); // xmm0 as mask
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      pmaxsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pmaxsw(dst, src);
    } else if (elem_bt == T_INT) {
      pmaxsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      movdqu(xmm0, src);
      pcmpgtq(xmm0, dst);
      blendvpd(dst, src); // xmm0 as mask
    }
  }
}

void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
                                 XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                 int vlen_enc) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      vpminsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpminsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpminsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpminsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src1, src2, dst, vlen_enc);
      }
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      vpmaxsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpmaxsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpmaxsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpmaxsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src2, src1, dst, vlen_enc);
      }
    }
  }
}

// Float/Double min max
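// Note (added for clarity): the float/double min/max helpers below follow Java
// Math.min/max semantics - a NaN in either input produces NaN, and -0.0 is treated
// as strictly less than +0.0 - which the raw SSE/AVX min/max instructions do not
// provide. The leading blends arrange the inputs so that the operand that must win
// a signed-zero tie sits in the position the hardware instruction favors when the
// inputs compare equal, and the unordered self-compare plus final blend make sure
// a NaN operand reaches the result.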
void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
                                   XMMRegister dst, XMMRegister a, XMMRegister b,
                                   XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                   int vlen_enc) {
  assert(UseAVX > 0, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(a, b, tmp, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);

  if (!is_double_word && is_min) {
    vblendvps(atmp, a, b, a, vlen_enc);
    vblendvps(btmp, b, a, a, vlen_enc);
    vminps(tmp, atmp, btmp, vlen_enc);
    vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvps(dst, tmp, atmp, btmp, vlen_enc);
  } else if (!is_double_word && !is_min) {
    vblendvps(btmp, b, a, b, vlen_enc);
    vblendvps(atmp, a, b, b, vlen_enc);
    vmaxps(tmp, atmp, btmp, vlen_enc);
    vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvps(dst, tmp, atmp, btmp, vlen_enc);
  } else if (is_double_word && is_min) {
    vblendvpd(atmp, a, b, a, vlen_enc);
    vblendvpd(btmp, b, a, a, vlen_enc);
    vminpd(tmp, atmp, btmp, vlen_enc);
    vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
  } else {
    assert(is_double_word && !is_min, "sanity");
    vblendvpd(btmp, b, a, b, vlen_enc);
    vblendvpd(atmp, a, b, b, vlen_enc);
    vmaxpd(tmp, atmp, btmp, vlen_enc);
    vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
  }
}

void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
                                    XMMRegister dst, XMMRegister a, XMMRegister b,
                                    KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
                                    int vlen_enc) {
  assert(UseAVX > 2, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(dst, a, b, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);
  bool merge = true;

  if (!is_double_word && is_min) {
    evpmovd2m(ktmp, a, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vminps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (!is_double_word && !is_min) {
    evpmovd2m(ktmp, b, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (is_double_word && is_min) {
    evpmovq2m(ktmp, a, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vminpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  } else {
    assert(is_double_word && !is_min, "sanity");
    evpmovq2m(ktmp, b, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  }
}

// Float/Double signum
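// Note (added for clarity): signum_fp below operates in place on dst. The compare
// against zero sends +/-0.0 and NaN straight to DONE_LABEL, returning the argument
// unchanged; the load of 1.0 into dst does not modify the flags, so the later
// 'above' branch still tests the original compare - above zero returns +1.0,
// otherwise the sign bit of 1.0 is flipped to yield -1.0.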
void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst,
                                  XMMRegister zero, XMMRegister one,
                                  Register scratch) {
  assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");

  Label DONE_LABEL;

  if (opcode == Op_SignumF) {
    assert(UseSSE > 0, "required");
    ucomiss(dst, zero);
    jcc(Assembler::equal, DONE_LABEL);  // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
    jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN
    movflt(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), scratch);
  } else if (opcode == Op_SignumD) {
    assert(UseSSE > 1, "required");
    ucomisd(dst, zero);
    jcc(Assembler::equal, DONE_LABEL);  // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
    jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN
    movdbl(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), scratch);
  }

  bind(DONE_LABEL);
}

void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
  if (sign) {
    pmovsxbw(dst, src);
  } else {
    pmovzxbw(dst, src);
  }
}

void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbw(dst, src, vector_len);
  } else {
    vpmovzxbw(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbd(dst, src, vector_len);
  } else {
    vpmovzxbd(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxwd(dst, src, vector_len);
  } else {
    vpmovzxwd(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
                                     int shift, int vector_len) {
  if (opcode == Op_RotateLeftV) {
    if (etype == T_INT) {
      evprold(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprolq(dst, src, shift, vector_len);
    }
  } else {
    assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
    if (etype == T_INT) {
      evprord(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprorq(dst, src, shift, vector_len);
    }
  }
}

void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
                                     XMMRegister shift, int vector_len) {
  if (opcode == Op_RotateLeftV) {
    if (etype == T_INT) {
      evprolvd(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprolvq(dst, src, shift, vector_len);
    }
  } else {
    assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
    if (etype == T_INT) {
      evprorvd(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprorvq(dst, src, shift, vector_len);
    }
  }
}

void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
  if (opcode == Op_RShiftVI) {
    psrad(dst, shift);
  } else if (opcode == Op_LShiftVI) {
    pslld(dst, shift);
  } else {
    assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
    psrld(dst, shift);
  }
}

void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVI:  psrad(dst, shift); break;
    case Op_LShiftVI:  pslld(dst, shift); break;
    case Op_URShiftVI: psrld(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
  if (opcode == Op_RShiftVI) {
    vpsrad(dst, nds, shift, vector_len);
  } else if (opcode == Op_LShiftVI) {
    vpslld(dst, nds, shift, vector_len);
  } else {
    assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
    vpsrld(dst, nds, shift, vector_len);
  }
}

void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
    case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
    case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  psraw(dst, shift); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  psllw(dst, shift); break;

    case Op_URShiftVS: // fall-through
    case Op_URShiftVB: psrlw(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;

    case Op_URShiftVS: // fall-through
    case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
    case Op_LShiftVL:  psllq(dst, shift); break;
    case Op_URShiftVL: psrlq(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
  if (opcode == Op_RShiftVL) {
    psrlq(dst, shift); // using srl to implement sra on pre-avx512 systems
  } else if (opcode == Op_LShiftVL) {
    psllq(dst, shift);
  } else {
    assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
    psrlq(dst, shift);
  }
}

void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVL:  evpsraq(dst, src, shift, vlen_enc); break;
    case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
    case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;
"%s", NodeClassNames[opcode]); 1298 } 1299 } 1300 1301 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 1302 if (opcode == Op_RShiftVL) { 1303 evpsraq(dst, nds, shift, vector_len); 1304 } else if (opcode == Op_LShiftVL) { 1305 vpsllq(dst, nds, shift, vector_len); 1306 } else { 1307 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL"); 1308 vpsrlq(dst, nds, shift, vector_len); 1309 } 1310 } 1311 1312 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1313 switch (opcode) { 1314 case Op_RShiftVB: // fall-through 1315 case Op_RShiftVS: // fall-through 1316 case Op_RShiftVI: vpsravd(dst, src, shift, vlen_enc); break; 1317 1318 case Op_LShiftVB: // fall-through 1319 case Op_LShiftVS: // fall-through 1320 case Op_LShiftVI: vpsllvd(dst, src, shift, vlen_enc); break; 1321 1322 case Op_URShiftVB: // fall-through 1323 case Op_URShiftVS: // fall-through 1324 case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break; 1325 1326 default: assert(false, "%s", NodeClassNames[opcode]); 1327 } 1328 } 1329 1330 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1331 switch (opcode) { 1332 case Op_RShiftVB: // fall-through 1333 case Op_RShiftVS: evpsravw(dst, src, shift, vlen_enc); break; 1334 1335 case Op_LShiftVB: // fall-through 1336 case Op_LShiftVS: evpsllvw(dst, src, shift, vlen_enc); break; 1337 1338 case Op_URShiftVB: // fall-through 1339 case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break; 1340 1341 default: assert(false, "%s", NodeClassNames[opcode]); 1342 } 1343 } 1344 1345 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) { 1346 assert(UseAVX >= 2, "required"); 1347 switch (opcode) { 1348 case Op_RShiftVL: { 1349 if (UseAVX > 2) { 1350 assert(tmp == xnoreg, "not used"); 1351 if (!VM_Version::supports_avx512vl()) { 1352 vlen_enc = Assembler::AVX_512bit; 1353 } 1354 evpsravq(dst, src, shift, vlen_enc); 1355 } else { 1356 vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask())); 1357 vpsrlvq(dst, src, shift, vlen_enc); 1358 vpsrlvq(tmp, tmp, shift, vlen_enc); 1359 vpxor(dst, dst, tmp, vlen_enc); 1360 vpsubq(dst, dst, tmp, vlen_enc); 1361 } 1362 break; 1363 } 1364 case Op_LShiftVL: { 1365 assert(tmp == xnoreg, "not used"); 1366 vpsllvq(dst, src, shift, vlen_enc); 1367 break; 1368 } 1369 case Op_URShiftVL: { 1370 assert(tmp == xnoreg, "not used"); 1371 vpsrlvq(dst, src, shift, vlen_enc); 1372 break; 1373 } 1374 default: assert(false, "%s", NodeClassNames[opcode]); 1375 } 1376 } 1377 1378 // Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst 1379 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch) { 1380 assert(opcode == Op_LShiftVB || 1381 opcode == Op_RShiftVB || 1382 opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]); 1383 bool sign = (opcode != Op_URShiftVB); 1384 assert(vector_len == 0, "required"); 1385 vextendbd(sign, dst, src, 1); 1386 vpmovzxbd(vtmp, shift, 1); 1387 varshiftd(opcode, dst, dst, vtmp, 1); 1388 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, scratch); 1389 vextracti128_high(vtmp, dst); 1390 vpackusdw(dst, dst, vtmp, 0); 1391 } 1392 1393 // Variable shift src by shift using vtmp and scratch as TEMPs giving 
byte result in dst 1394 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch) { 1395 assert(opcode == Op_LShiftVB || 1396 opcode == Op_RShiftVB || 1397 opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]); 1398 bool sign = (opcode != Op_URShiftVB); 1399 int ext_vector_len = vector_len + 1; 1400 vextendbw(sign, dst, src, ext_vector_len); 1401 vpmovzxbw(vtmp, shift, ext_vector_len); 1402 varshiftw(opcode, dst, dst, vtmp, ext_vector_len); 1403 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, scratch); 1404 if (vector_len == 0) { 1405 vextracti128_high(vtmp, dst); 1406 vpackuswb(dst, dst, vtmp, vector_len); 1407 } else { 1408 vextracti64x4_high(vtmp, dst); 1409 vpackuswb(dst, dst, vtmp, vector_len); 1410 vpermq(dst, dst, 0xD8, vector_len); 1411 } 1412 } 1413 1414 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) { 1415 switch(typ) { 1416 case T_BYTE: 1417 pinsrb(dst, val, idx); 1418 break; 1419 case T_SHORT: 1420 pinsrw(dst, val, idx); 1421 break; 1422 case T_INT: 1423 pinsrd(dst, val, idx); 1424 break; 1425 case T_LONG: 1426 pinsrq(dst, val, idx); 1427 break; 1428 default: 1429 assert(false,"Should not reach here."); 1430 break; 1431 } 1432 } 1433 1434 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) { 1435 switch(typ) { 1436 case T_BYTE: 1437 vpinsrb(dst, src, val, idx); 1438 break; 1439 case T_SHORT: 1440 vpinsrw(dst, src, val, idx); 1441 break; 1442 case T_INT: 1443 vpinsrd(dst, src, val, idx); 1444 break; 1445 case T_LONG: 1446 vpinsrq(dst, src, val, idx); 1447 break; 1448 default: 1449 assert(false,"Should not reach here."); 1450 break; 1451 } 1452 } 1453 1454 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) { 1455 switch(typ) { 1456 case T_INT: 1457 vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len); 1458 break; 1459 case T_FLOAT: 1460 vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len); 1461 break; 1462 case T_LONG: 1463 vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len); 1464 break; 1465 case T_DOUBLE: 1466 vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len); 1467 break; 1468 default: 1469 assert(false,"Should not reach here."); 1470 break; 1471 } 1472 } 1473 1474 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) { 1475 switch(typ) { 1476 case T_INT: 1477 evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len); 1478 break; 1479 case T_FLOAT: 1480 evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len); 1481 break; 1482 case T_LONG: 1483 evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len); 1484 break; 1485 case T_DOUBLE: 1486 evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len); 1487 break; 1488 default: 1489 assert(false,"Should not reach here."); 1490 break; 1491 } 1492 } 1493 1494 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) { 1495 switch(typ) { 1496 case T_INT: 1497 evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len); 1498 break; 1499 case T_FLOAT: 1500 evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len); 1501 
break; 1502 case T_LONG: 1503 evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len); 1504 break; 1505 case T_DOUBLE: 1506 evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len); 1507 break; 1508 default: 1509 assert(false,"Should not reach here."); 1510 break; 1511 } 1512 } 1513 1514 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) { 1515 if (vlen_in_bytes <= 16) { 1516 pxor (dst, dst); 1517 psubb(dst, src); 1518 switch (elem_bt) { 1519 case T_BYTE: /* nothing to do */ break; 1520 case T_SHORT: pmovsxbw(dst, dst); break; 1521 case T_INT: pmovsxbd(dst, dst); break; 1522 case T_FLOAT: pmovsxbd(dst, dst); break; 1523 case T_LONG: pmovsxbq(dst, dst); break; 1524 case T_DOUBLE: pmovsxbq(dst, dst); break; 1525 1526 default: assert(false, "%s", type2name(elem_bt)); 1527 } 1528 } else { 1529 assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, ""); 1530 int vlen_enc = vector_length_encoding(vlen_in_bytes); 1531 1532 vpxor (dst, dst, dst, vlen_enc); 1533 vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc); 1534 1535 switch (elem_bt) { 1536 case T_BYTE: /* nothing to do */ break; 1537 case T_SHORT: vpmovsxbw(dst, dst, vlen_enc); break; 1538 case T_INT: vpmovsxbd(dst, dst, vlen_enc); break; 1539 case T_FLOAT: vpmovsxbd(dst, dst, vlen_enc); break; 1540 case T_LONG: vpmovsxbq(dst, dst, vlen_enc); break; 1541 case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break; 1542 1543 default: assert(false, "%s", type2name(elem_bt)); 1544 } 1545 } 1546 } 1547 1548 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, Register scratch, int vlen_in_bytes) { 1549 ExternalAddress addr(StubRoutines::x86::vector_iota_indices()); 1550 if (vlen_in_bytes == 4) { 1551 movdl(dst, addr); 1552 } else if (vlen_in_bytes == 8) { 1553 movq(dst, addr); 1554 } else if (vlen_in_bytes == 16) { 1555 movdqu(dst, addr, scratch); 1556 } else if (vlen_in_bytes == 32) { 1557 vmovdqu(dst, addr, scratch); 1558 } else { 1559 assert(vlen_in_bytes == 64, "%d", vlen_in_bytes); 1560 evmovdqub(dst, k0, addr, false /*merge*/, Assembler::AVX_512bit, scratch); 1561 } 1562 } 1563 1564 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles. 
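// The lane-reduction helpers below all use the same halving scheme: fold the upper
// half of the vector onto the lower half with the reduction op, repeat until a
// single lane remains, then combine that lane with the scalar input (src1).
// A minimal scalar sketch of the intended result (illustration only, not generated
// code; the element type and the add are placeholders for whatever the node specifies):
//
//   int reduce_add_sketch(int src1, const int* lanes, int n) {   // n = lane count
//     int acc = lanes[0];
//     for (int i = 1; i < n; i++) {
//       acc += lanes[i];              // same associative op the vector code applies
//     }
//     return src1 + acc;              // the scalar input is folded in last
//   }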
1565 1566 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) { 1567 int vector_len = Assembler::AVX_128bit; 1568 1569 switch (opcode) { 1570 case Op_AndReductionV: pand(dst, src); break; 1571 case Op_OrReductionV: por (dst, src); break; 1572 case Op_XorReductionV: pxor(dst, src); break; 1573 case Op_MinReductionV: 1574 switch (typ) { 1575 case T_BYTE: pminsb(dst, src); break; 1576 case T_SHORT: pminsw(dst, src); break; 1577 case T_INT: pminsd(dst, src); break; 1578 case T_LONG: assert(UseAVX > 2, "required"); 1579 vpminsq(dst, dst, src, Assembler::AVX_128bit); break; 1580 default: assert(false, "wrong type"); 1581 } 1582 break; 1583 case Op_MaxReductionV: 1584 switch (typ) { 1585 case T_BYTE: pmaxsb(dst, src); break; 1586 case T_SHORT: pmaxsw(dst, src); break; 1587 case T_INT: pmaxsd(dst, src); break; 1588 case T_LONG: assert(UseAVX > 2, "required"); 1589 vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break; 1590 default: assert(false, "wrong type"); 1591 } 1592 break; 1593 case Op_AddReductionVF: addss(dst, src); break; 1594 case Op_AddReductionVD: addsd(dst, src); break; 1595 case Op_AddReductionVI: 1596 switch (typ) { 1597 case T_BYTE: paddb(dst, src); break; 1598 case T_SHORT: paddw(dst, src); break; 1599 case T_INT: paddd(dst, src); break; 1600 default: assert(false, "wrong type"); 1601 } 1602 break; 1603 case Op_AddReductionVL: paddq(dst, src); break; 1604 case Op_MulReductionVF: mulss(dst, src); break; 1605 case Op_MulReductionVD: mulsd(dst, src); break; 1606 case Op_MulReductionVI: 1607 switch (typ) { 1608 case T_SHORT: pmullw(dst, src); break; 1609 case T_INT: pmulld(dst, src); break; 1610 default: assert(false, "wrong type"); 1611 } 1612 break; 1613 case Op_MulReductionVL: assert(UseAVX > 2, "required"); 1614 vpmullq(dst, dst, src, vector_len); break; 1615 default: assert(false, "wrong opcode"); 1616 } 1617 } 1618 1619 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 1620 int vector_len = Assembler::AVX_256bit; 1621 1622 switch (opcode) { 1623 case Op_AndReductionV: vpand(dst, src1, src2, vector_len); break; 1624 case Op_OrReductionV: vpor (dst, src1, src2, vector_len); break; 1625 case Op_XorReductionV: vpxor(dst, src1, src2, vector_len); break; 1626 case Op_MinReductionV: 1627 switch (typ) { 1628 case T_BYTE: vpminsb(dst, src1, src2, vector_len); break; 1629 case T_SHORT: vpminsw(dst, src1, src2, vector_len); break; 1630 case T_INT: vpminsd(dst, src1, src2, vector_len); break; 1631 case T_LONG: assert(UseAVX > 2, "required"); 1632 vpminsq(dst, src1, src2, vector_len); break; 1633 default: assert(false, "wrong type"); 1634 } 1635 break; 1636 case Op_MaxReductionV: 1637 switch (typ) { 1638 case T_BYTE: vpmaxsb(dst, src1, src2, vector_len); break; 1639 case T_SHORT: vpmaxsw(dst, src1, src2, vector_len); break; 1640 case T_INT: vpmaxsd(dst, src1, src2, vector_len); break; 1641 case T_LONG: assert(UseAVX > 2, "required"); 1642 vpmaxsq(dst, src1, src2, vector_len); break; 1643 default: assert(false, "wrong type"); 1644 } 1645 break; 1646 case Op_AddReductionVI: 1647 switch (typ) { 1648 case T_BYTE: vpaddb(dst, src1, src2, vector_len); break; 1649 case T_SHORT: vpaddw(dst, src1, src2, vector_len); break; 1650 case T_INT: vpaddd(dst, src1, src2, vector_len); break; 1651 default: assert(false, "wrong type"); 1652 } 1653 break; 1654 case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break; 1655 case Op_MulReductionVI: 1656 switch (typ) { 1657 case 
T_SHORT: vpmullw(dst, src1, src2, vector_len); break; 1658 case T_INT: vpmulld(dst, src1, src2, vector_len); break; 1659 default: assert(false, "wrong type"); 1660 } 1661 break; 1662 case Op_MulReductionVL: vpmullq(dst, src1, src2, vector_len); break; 1663 default: assert(false, "wrong opcode"); 1664 } 1665 } 1666 1667 void C2_MacroAssembler::reduce_fp(int opcode, int vlen, 1668 XMMRegister dst, XMMRegister src, 1669 XMMRegister vtmp1, XMMRegister vtmp2) { 1670 switch (opcode) { 1671 case Op_AddReductionVF: 1672 case Op_MulReductionVF: 1673 reduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 1674 break; 1675 1676 case Op_AddReductionVD: 1677 case Op_MulReductionVD: 1678 reduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 1679 break; 1680 1681 default: assert(false, "wrong opcode"); 1682 } 1683 } 1684 1685 void C2_MacroAssembler::reduceB(int opcode, int vlen, 1686 Register dst, Register src1, XMMRegister src2, 1687 XMMRegister vtmp1, XMMRegister vtmp2) { 1688 switch (vlen) { 1689 case 8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1690 case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1691 case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1692 case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1693 1694 default: assert(false, "wrong vector length"); 1695 } 1696 } 1697 1698 void C2_MacroAssembler::mulreduceB(int opcode, int vlen, 1699 Register dst, Register src1, XMMRegister src2, 1700 XMMRegister vtmp1, XMMRegister vtmp2) { 1701 switch (vlen) { 1702 case 8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1703 case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1704 case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1705 case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1706 1707 default: assert(false, "wrong vector length"); 1708 } 1709 } 1710 1711 void C2_MacroAssembler::reduceS(int opcode, int vlen, 1712 Register dst, Register src1, XMMRegister src2, 1713 XMMRegister vtmp1, XMMRegister vtmp2) { 1714 switch (vlen) { 1715 case 4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1716 case 8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1717 case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1718 case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1719 1720 default: assert(false, "wrong vector length"); 1721 } 1722 } 1723 1724 void C2_MacroAssembler::reduceI(int opcode, int vlen, 1725 Register dst, Register src1, XMMRegister src2, 1726 XMMRegister vtmp1, XMMRegister vtmp2) { 1727 switch (vlen) { 1728 case 2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1729 case 4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1730 case 8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1731 case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1732 1733 default: assert(false, "wrong vector length"); 1734 } 1735 } 1736 1737 #ifdef _LP64 1738 void C2_MacroAssembler::reduceL(int opcode, int vlen, 1739 Register dst, Register src1, XMMRegister src2, 1740 XMMRegister vtmp1, XMMRegister vtmp2) { 1741 switch (vlen) { 1742 case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1743 case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1744 case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1745 1746 default: assert(false, "wrong vector length"); 1747 } 1748 } 1749 #endif // _LP64 1750 1751 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, 
XMMRegister vtmp1, XMMRegister vtmp2) { 1752 switch (vlen) { 1753 case 2: 1754 assert(vtmp2 == xnoreg, ""); 1755 reduce2F(opcode, dst, src, vtmp1); 1756 break; 1757 case 4: 1758 assert(vtmp2 == xnoreg, ""); 1759 reduce4F(opcode, dst, src, vtmp1); 1760 break; 1761 case 8: 1762 reduce8F(opcode, dst, src, vtmp1, vtmp2); 1763 break; 1764 case 16: 1765 reduce16F(opcode, dst, src, vtmp1, vtmp2); 1766 break; 1767 default: assert(false, "wrong vector length"); 1768 } 1769 } 1770 1771 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 1772 switch (vlen) { 1773 case 2: 1774 assert(vtmp2 == xnoreg, ""); 1775 reduce2D(opcode, dst, src, vtmp1); 1776 break; 1777 case 4: 1778 reduce4D(opcode, dst, src, vtmp1, vtmp2); 1779 break; 1780 case 8: 1781 reduce8D(opcode, dst, src, vtmp1, vtmp2); 1782 break; 1783 default: assert(false, "wrong vector length"); 1784 } 1785 } 1786 1787 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1788 if (opcode == Op_AddReductionVI) { 1789 if (vtmp1 != src2) { 1790 movdqu(vtmp1, src2); 1791 } 1792 phaddd(vtmp1, vtmp1); 1793 } else { 1794 pshufd(vtmp1, src2, 0x1); 1795 reduce_operation_128(T_INT, opcode, vtmp1, src2); 1796 } 1797 movdl(vtmp2, src1); 1798 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 1799 movdl(dst, vtmp1); 1800 } 1801 1802 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1803 if (opcode == Op_AddReductionVI) { 1804 if (vtmp1 != src2) { 1805 movdqu(vtmp1, src2); 1806 } 1807 phaddd(vtmp1, src2); 1808 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1809 } else { 1810 pshufd(vtmp2, src2, 0xE); 1811 reduce_operation_128(T_INT, opcode, vtmp2, src2); 1812 reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1813 } 1814 } 1815 1816 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1817 if (opcode == Op_AddReductionVI) { 1818 vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit); 1819 vextracti128_high(vtmp2, vtmp1); 1820 vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit); 1821 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1822 } else { 1823 vextracti128_high(vtmp1, src2); 1824 reduce_operation_128(T_INT, opcode, vtmp1, src2); 1825 reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1826 } 1827 } 1828 1829 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1830 vextracti64x4_high(vtmp2, src2); 1831 reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2); 1832 reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1833 } 1834 1835 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1836 pshufd(vtmp2, src2, 0x1); 1837 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 1838 movdqu(vtmp1, vtmp2); 1839 psrldq(vtmp1, 2); 1840 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 1841 movdqu(vtmp2, vtmp1); 1842 psrldq(vtmp2, 1); 1843 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 1844 movdl(vtmp2, src1); 1845 pmovsxbd(vtmp1, vtmp1); 1846 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 1847 pextrb(dst, vtmp1, 0x0); 1848 movsbl(dst, dst); 1849 } 1850 1851 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1852 
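  // pshufd with immediate 0xE moves the high quadword of src2 into the low quadword
  // of vtmp1; reducing that against src2 folds the upper 8 bytes onto the lower 8,
  // and reduce8B below finishes the remaining lanes and folds in src1.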
pshufd(vtmp1, src2, 0xE); 1853 reduce_operation_128(T_BYTE, opcode, vtmp1, src2); 1854 reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1855 } 1856 1857 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1858 vextracti128_high(vtmp2, src2); 1859 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 1860 reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1861 } 1862 1863 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1864 vextracti64x4_high(vtmp1, src2); 1865 reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2); 1866 reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1867 } 1868 1869 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1870 pmovsxbw(vtmp2, src2); 1871 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1872 } 1873 1874 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1875 if (UseAVX > 1) { 1876 int vector_len = Assembler::AVX_256bit; 1877 vpmovsxbw(vtmp1, src2, vector_len); 1878 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1879 } else { 1880 pmovsxbw(vtmp2, src2); 1881 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1882 pshufd(vtmp2, src2, 0x1); 1883 pmovsxbw(vtmp2, src2); 1884 reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 1885 } 1886 } 1887 1888 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1889 if (UseAVX > 2 && VM_Version::supports_avx512bw()) { 1890 int vector_len = Assembler::AVX_512bit; 1891 vpmovsxbw(vtmp1, src2, vector_len); 1892 reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1893 } else { 1894 assert(UseAVX >= 2,"Should not reach here."); 1895 mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); 1896 vextracti128_high(vtmp2, src2); 1897 mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 1898 } 1899 } 1900 1901 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1902 mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); 1903 vextracti64x4_high(vtmp2, src2); 1904 mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 1905 } 1906 1907 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1908 if (opcode == Op_AddReductionVI) { 1909 if (vtmp1 != src2) { 1910 movdqu(vtmp1, src2); 1911 } 1912 phaddw(vtmp1, vtmp1); 1913 phaddw(vtmp1, vtmp1); 1914 } else { 1915 pshufd(vtmp2, src2, 0x1); 1916 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 1917 movdqu(vtmp1, vtmp2); 1918 psrldq(vtmp1, 2); 1919 reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2); 1920 } 1921 movdl(vtmp2, src1); 1922 pmovsxwd(vtmp1, vtmp1); 1923 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 1924 pextrw(dst, vtmp1, 0x0); 1925 movswl(dst, dst); 1926 } 1927 1928 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1929 if (opcode == Op_AddReductionVI) { 1930 if (vtmp1 != src2) { 1931 movdqu(vtmp1, src2); 1932 } 1933 phaddw(vtmp1, src2); 1934 } else { 1935 pshufd(vtmp1, src2, 0xE); 1936 reduce_operation_128(T_SHORT, opcode, vtmp1, src2); 1937 } 1938 reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1939 } 1940 1941 void 
C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1942 if (opcode == Op_AddReductionVI) { 1943 int vector_len = Assembler::AVX_256bit; 1944 vphaddw(vtmp2, src2, src2, vector_len); 1945 vpermq(vtmp2, vtmp2, 0xD8, vector_len); 1946 } else { 1947 vextracti128_high(vtmp2, src2); 1948 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 1949 } 1950 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1951 } 1952 1953 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1954 int vector_len = Assembler::AVX_256bit; 1955 vextracti64x4_high(vtmp1, src2); 1956 reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2); 1957 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1958 } 1959 1960 #ifdef _LP64 1961 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1962 pshufd(vtmp2, src2, 0xE); 1963 reduce_operation_128(T_LONG, opcode, vtmp2, src2); 1964 movdq(vtmp1, src1); 1965 reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2); 1966 movdq(dst, vtmp1); 1967 } 1968 1969 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1970 vextracti128_high(vtmp1, src2); 1971 reduce_operation_128(T_LONG, opcode, vtmp1, src2); 1972 reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1973 } 1974 1975 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1976 vextracti64x4_high(vtmp2, src2); 1977 reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2); 1978 reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1979 } 1980 1981 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) { 1982 assert(ArrayOperationPartialInlineSize > 0 && ArrayOperationPartialInlineSize <= 64, "invalid"); 1983 mov64(temp, -1L); 1984 bzhiq(temp, temp, len); 1985 kmovql(dst, temp); 1986 } 1987 #endif // _LP64 1988 1989 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 1990 reduce_operation_128(T_FLOAT, opcode, dst, src); 1991 pshufd(vtmp, src, 0x1); 1992 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 1993 } 1994 1995 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 1996 reduce2F(opcode, dst, src, vtmp); 1997 pshufd(vtmp, src, 0x2); 1998 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 1999 pshufd(vtmp, src, 0x3); 2000 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2001 } 2002 2003 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2004 reduce4F(opcode, dst, src, vtmp2); 2005 vextractf128_high(vtmp2, src); 2006 reduce4F(opcode, dst, vtmp2, vtmp1); 2007 } 2008 2009 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2010 reduce8F(opcode, dst, src, vtmp1, vtmp2); 2011 vextracti64x4_high(vtmp1, src); 2012 reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2); 2013 } 2014 2015 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2016 reduce_operation_128(T_DOUBLE, opcode, dst, src); 2017 pshufd(vtmp, src, 0xE); 2018 reduce_operation_128(T_DOUBLE, opcode, dst, vtmp); 2019 } 2020 2021 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, 
XMMRegister vtmp2) { 2022 reduce2D(opcode, dst, src, vtmp2); 2023 vextractf128_high(vtmp2, src); 2024 reduce2D(opcode, dst, vtmp2, vtmp1); 2025 } 2026 2027 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2028 reduce4D(opcode, dst, src, vtmp1, vtmp2); 2029 vextracti64x4_high(vtmp1, src); 2030 reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2); 2031 } 2032 2033 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, int vector_len) { 2034 MacroAssembler::evmovdqu(type, kmask, dst, src, vector_len); 2035 } 2036 2037 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, int vector_len) { 2038 MacroAssembler::evmovdqu(type, kmask, dst, src, vector_len); 2039 } 2040 2041 2042 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid, 2043 XMMRegister dst, XMMRegister src, 2044 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2045 XMMRegister xmm_0, XMMRegister xmm_1) { 2046 int permconst[] = {1, 14}; 2047 XMMRegister wsrc = src; 2048 XMMRegister wdst = xmm_0; 2049 XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1; 2050 2051 int vlen_enc = Assembler::AVX_128bit; 2052 if (vlen == 16) { 2053 vlen_enc = Assembler::AVX_256bit; 2054 } 2055 2056 for (int i = log2(vlen) - 1; i >=0; i--) { 2057 if (i == 0 && !is_dst_valid) { 2058 wdst = dst; 2059 } 2060 if (i == 3) { 2061 vextracti64x4_high(wtmp, wsrc); 2062 } else if (i == 2) { 2063 vextracti128_high(wtmp, wsrc); 2064 } else { // i = [0,1] 2065 vpermilps(wtmp, wsrc, permconst[i], vlen_enc); 2066 } 2067 vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2068 wsrc = wdst; 2069 vlen_enc = Assembler::AVX_128bit; 2070 } 2071 if (is_dst_valid) { 2072 vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2073 } 2074 } 2075 2076 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src, 2077 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2078 XMMRegister xmm_0, XMMRegister xmm_1) { 2079 XMMRegister wsrc = src; 2080 XMMRegister wdst = xmm_0; 2081 XMMRegister wtmp = (xmm_1 == xnoreg) ? 
xmm_0: xmm_1; 2082 int vlen_enc = Assembler::AVX_128bit; 2083 if (vlen == 8) { 2084 vlen_enc = Assembler::AVX_256bit; 2085 } 2086 for (int i = log2(vlen) - 1; i >=0; i--) { 2087 if (i == 0 && !is_dst_valid) { 2088 wdst = dst; 2089 } 2090 if (i == 1) { 2091 vextracti128_high(wtmp, wsrc); 2092 } else if (i == 2) { 2093 vextracti64x4_high(wtmp, wsrc); 2094 } else { 2095 assert(i == 0, "%d", i); 2096 vpermilpd(wtmp, wsrc, 1, vlen_enc); 2097 } 2098 vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2099 wsrc = wdst; 2100 vlen_enc = Assembler::AVX_128bit; 2101 } 2102 if (is_dst_valid) { 2103 vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2104 } 2105 } 2106 2107 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) { 2108 switch (bt) { 2109 case T_BYTE: pextrb(dst, src, idx); break; 2110 case T_SHORT: pextrw(dst, src, idx); break; 2111 case T_INT: pextrd(dst, src, idx); break; 2112 case T_LONG: pextrq(dst, src, idx); break; 2113 2114 default: 2115 assert(false,"Should not reach here."); 2116 break; 2117 } 2118 } 2119 2120 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) { 2121 int esize = type2aelembytes(typ); 2122 int elem_per_lane = 16/esize; 2123 int lane = elemindex / elem_per_lane; 2124 int eindex = elemindex % elem_per_lane; 2125 2126 if (lane >= 2) { 2127 assert(UseAVX > 2, "required"); 2128 vextractf32x4(dst, src, lane & 3); 2129 return dst; 2130 } else if (lane > 0) { 2131 assert(UseAVX > 0, "required"); 2132 vextractf128(dst, src, lane); 2133 return dst; 2134 } else { 2135 return src; 2136 } 2137 } 2138 2139 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) { 2140 int esize = type2aelembytes(typ); 2141 int elem_per_lane = 16/esize; 2142 int eindex = elemindex % elem_per_lane; 2143 assert(is_integral_type(typ),"required"); 2144 2145 if (eindex == 0) { 2146 if (typ == T_LONG) { 2147 movq(dst, src); 2148 } else { 2149 movdl(dst, src); 2150 if (typ == T_BYTE) 2151 movsbl(dst, dst); 2152 else if (typ == T_SHORT) 2153 movswl(dst, dst); 2154 } 2155 } else { 2156 extract(typ, dst, src, eindex); 2157 } 2158 } 2159 2160 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, Register tmp, XMMRegister vtmp) { 2161 int esize = type2aelembytes(typ); 2162 int elem_per_lane = 16/esize; 2163 int eindex = elemindex % elem_per_lane; 2164 assert((typ == T_FLOAT || typ == T_DOUBLE),"required"); 2165 2166 if (eindex == 0) { 2167 movq(dst, src); 2168 } else { 2169 if (typ == T_FLOAT) { 2170 if (UseAVX == 0) { 2171 movdqu(dst, src); 2172 pshufps(dst, dst, eindex); 2173 } else { 2174 vpshufps(dst, src, src, eindex, Assembler::AVX_128bit); 2175 } 2176 } else { 2177 if (UseAVX == 0) { 2178 movdqu(dst, src); 2179 psrldq(dst, eindex*esize); 2180 } else { 2181 vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit); 2182 } 2183 movq(dst, dst); 2184 } 2185 } 2186 // Zero upper bits 2187 if (typ == T_FLOAT) { 2188 if (UseAVX == 0) { 2189 assert((vtmp != xnoreg) && (tmp != noreg), "required."); 2190 movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), tmp); 2191 pand(dst, vtmp); 2192 } else { 2193 assert((tmp != noreg), "required."); 2194 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, tmp); 2195 } 2196 } 2197 } 2198 2199 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, 
XMMRegister src2, int comparison, int vector_len) { 2200 switch(typ) { 2201 case T_BYTE: 2202 case T_BOOLEAN: 2203 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2204 break; 2205 case T_SHORT: 2206 case T_CHAR: 2207 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2208 break; 2209 case T_INT: 2210 case T_FLOAT: 2211 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2212 break; 2213 case T_LONG: 2214 case T_DOUBLE: 2215 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2216 break; 2217 default: 2218 assert(false,"Should not reach here."); 2219 break; 2220 } 2221 } 2222 2223 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral adr, int comparison, int vector_len, Register scratch) { 2224 switch(typ) { 2225 case T_BOOLEAN: 2226 case T_BYTE: 2227 evpcmpb(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch); 2228 break; 2229 case T_CHAR: 2230 case T_SHORT: 2231 evpcmpw(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch); 2232 break; 2233 case T_INT: 2234 case T_FLOAT: 2235 evpcmpd(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch); 2236 break; 2237 case T_LONG: 2238 case T_DOUBLE: 2239 evpcmpq(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch); 2240 break; 2241 default: 2242 assert(false,"Should not reach here."); 2243 break; 2244 } 2245 } 2246 2247 void C2_MacroAssembler::vpcmpu(BasicType typ, XMMRegister dst, XMMRegister src1, XMMRegister src2, ComparisonPredicate comparison, 2248 int vlen_in_bytes, XMMRegister vtmp1, XMMRegister vtmp2, Register scratch) { 2249 int vlen_enc = vector_length_encoding(vlen_in_bytes*2); 2250 switch (typ) { 2251 case T_BYTE: 2252 vpmovzxbw(vtmp1, src1, vlen_enc); 2253 vpmovzxbw(vtmp2, src2, vlen_enc); 2254 vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::W, vlen_enc, scratch); 2255 vpacksswb(dst, dst, dst, vlen_enc); 2256 break; 2257 case T_SHORT: 2258 vpmovzxwd(vtmp1, src1, vlen_enc); 2259 vpmovzxwd(vtmp2, src2, vlen_enc); 2260 vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::D, vlen_enc, scratch); 2261 vpackssdw(dst, dst, dst, vlen_enc); 2262 break; 2263 case T_INT: 2264 vpmovzxdq(vtmp1, src1, vlen_enc); 2265 vpmovzxdq(vtmp2, src2, vlen_enc); 2266 vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::Q, vlen_enc, scratch); 2267 vpermilps(dst, dst, 8, vlen_enc); 2268 break; 2269 default: 2270 assert(false, "Should not reach here"); 2271 } 2272 if (vlen_in_bytes == 16) { 2273 vpermpd(dst, dst, 0x8, vlen_enc); 2274 } 2275 } 2276 2277 void C2_MacroAssembler::vpcmpu32(BasicType typ, XMMRegister dst, XMMRegister src1, XMMRegister src2, ComparisonPredicate comparison, int vlen_in_bytes, 2278 XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3, Register scratch) { 2279 int vlen_enc = vector_length_encoding(vlen_in_bytes); 2280 switch (typ) { 2281 case T_BYTE: 2282 vpmovzxbw(vtmp1, src1, vlen_enc); 2283 vpmovzxbw(vtmp2, src2, vlen_enc); 2284 vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::W, vlen_enc, scratch); 2285 vextracti128(vtmp1, src1, 1); 2286 vextracti128(vtmp2, src2, 1); 2287 vpmovzxbw(vtmp1, vtmp1, vlen_enc); 2288 vpmovzxbw(vtmp2, vtmp2, vlen_enc); 2289 vpcmpCCW(vtmp3, vtmp1, vtmp2, comparison, Assembler::W, vlen_enc, scratch); 2290 vpacksswb(dst, dst, vtmp3, vlen_enc); 2291 vpermpd(dst, dst, 0xd8, vlen_enc); 2292 break; 2293 case T_SHORT: 2294 vpmovzxwd(vtmp1, src1, vlen_enc); 
2295 vpmovzxwd(vtmp2, src2, vlen_enc); 2296 vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::D, vlen_enc, scratch); 2297 vextracti128(vtmp1, src1, 1); 2298 vextracti128(vtmp2, src2, 1); 2299 vpmovzxwd(vtmp1, vtmp1, vlen_enc); 2300 vpmovzxwd(vtmp2, vtmp2, vlen_enc); 2301 vpcmpCCW(vtmp3, vtmp1, vtmp2, comparison, Assembler::D, vlen_enc, scratch); 2302 vpackssdw(dst, dst, vtmp3, vlen_enc); 2303 vpermpd(dst, dst, 0xd8, vlen_enc); 2304 break; 2305 case T_INT: 2306 vpmovzxdq(vtmp1, src1, vlen_enc); 2307 vpmovzxdq(vtmp2, src2, vlen_enc); 2308 vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::Q, vlen_enc, scratch); 2309 vpshufd(dst, dst, 8, vlen_enc); 2310 vpermq(dst, dst, 8, vlen_enc); 2311 vextracti128(vtmp1, src1, 1); 2312 vextracti128(vtmp2, src2, 1); 2313 vpmovzxdq(vtmp1, vtmp1, vlen_enc); 2314 vpmovzxdq(vtmp2, vtmp2, vlen_enc); 2315 vpcmpCCW(vtmp3, vtmp1, vtmp2, comparison, Assembler::Q, vlen_enc, scratch); 2316 vpshufd(vtmp3, vtmp3, 8, vlen_enc); 2317 vpermq(vtmp3, vtmp3, 0x80, vlen_enc); 2318 vpblendd(dst, dst, vtmp3, 0xf0, vlen_enc); 2319 break; 2320 default: 2321 assert(false, "Should not reach here"); 2322 } 2323 } 2324 2325 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) { 2326 switch(typ) { 2327 case T_BYTE: 2328 evpblendmb(dst, kmask, src1, src2, merge, vector_len); 2329 break; 2330 case T_SHORT: 2331 evpblendmw(dst, kmask, src1, src2, merge, vector_len); 2332 break; 2333 case T_INT: 2334 case T_FLOAT: 2335 evpblendmd(dst, kmask, src1, src2, merge, vector_len); 2336 break; 2337 case T_LONG: 2338 case T_DOUBLE: 2339 evpblendmq(dst, kmask, src1, src2, merge, vector_len); 2340 break; 2341 default: 2342 assert(false,"Should not reach here."); 2343 break; 2344 } 2345 } 2346 2347 void C2_MacroAssembler::vectortest(int bt, int vlen, XMMRegister src1, XMMRegister src2, 2348 XMMRegister vtmp1, XMMRegister vtmp2, KRegister mask) { 2349 switch(vlen) { 2350 case 4: 2351 assert(vtmp1 != xnoreg, "required."); 2352 // Broadcast lower 32 bits to 128 bits before ptest 2353 pshufd(vtmp1, src1, 0x0); 2354 if (bt == BoolTest::overflow) { 2355 assert(vtmp2 != xnoreg, "required."); 2356 pshufd(vtmp2, src2, 0x0); 2357 } else { 2358 assert(vtmp2 == xnoreg, "required."); 2359 vtmp2 = src2; 2360 } 2361 ptest(vtmp1, vtmp2); 2362 break; 2363 case 8: 2364 assert(vtmp1 != xnoreg, "required."); 2365 // Broadcast lower 64 bits to 128 bits before ptest 2366 pshufd(vtmp1, src1, 0x4); 2367 if (bt == BoolTest::overflow) { 2368 assert(vtmp2 != xnoreg, "required."); 2369 pshufd(vtmp2, src2, 0x4); 2370 } else { 2371 assert(vtmp2 == xnoreg, "required."); 2372 vtmp2 = src2; 2373 } 2374 ptest(vtmp1, vtmp2); 2375 break; 2376 case 16: 2377 assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required."); 2378 ptest(src1, src2); 2379 break; 2380 case 32: 2381 assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required."); 2382 vptest(src1, src2, Assembler::AVX_256bit); 2383 break; 2384 case 64: 2385 { 2386 assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required."); 2387 evpcmpeqb(mask, src1, src2, Assembler::AVX_512bit); 2388 if (bt == BoolTest::ne) { 2389 ktestql(mask, mask); 2390 } else { 2391 assert(bt == BoolTest::overflow, "required"); 2392 kortestql(mask, mask); 2393 } 2394 } 2395 break; 2396 default: 2397 assert(false,"Should not reach here."); 2398 break; 2399 } 2400 } 2401 2402 //------------------------------------------------------------------------------------------- 2403 2404 // IndexOf for constant substrings with size >= 
8 chars 2405 // which don't need to be loaded through stack. 2406 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2, 2407 Register cnt1, Register cnt2, 2408 int int_cnt2, Register result, 2409 XMMRegister vec, Register tmp, 2410 int ae) { 2411 ShortBranchVerifier sbv(this); 2412 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2413 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2414 2415 // This method uses the pcmpestri instruction with bound registers 2416 // inputs: 2417 // xmm - substring 2418 // rax - substring length (elements count) 2419 // mem - scanned string 2420 // rdx - string length (elements count) 2421 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2422 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2423 // outputs: 2424 // rcx - matched index in string 2425 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2426 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2427 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2428 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2429 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1; 2430 2431 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, 2432 RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR, 2433 MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE; 2434 2435 // Note, inline_string_indexOf() generates checks: 2436 // if (substr.count > string.count) return -1; 2437 // if (substr.count == 0) return 0; 2438 assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars"); 2439 2440 // Load substring. 2441 if (ae == StrIntrinsicNode::UL) { 2442 pmovzxbw(vec, Address(str2, 0)); 2443 } else { 2444 movdqu(vec, Address(str2, 0)); 2445 } 2446 movl(cnt2, int_cnt2); 2447 movptr(result, str1); // string addr 2448 2449 if (int_cnt2 > stride) { 2450 jmpb(SCAN_TO_SUBSTR); 2451 2452 // Reload substr for rescan, this code 2453 // is executed only for large substrings (> 8 chars) 2454 bind(RELOAD_SUBSTR); 2455 if (ae == StrIntrinsicNode::UL) { 2456 pmovzxbw(vec, Address(str2, 0)); 2457 } else { 2458 movdqu(vec, Address(str2, 0)); 2459 } 2460 negptr(cnt2); // Jumped here with negative cnt2, convert to positive 2461 2462 bind(RELOAD_STR); 2463 // We came here after the beginning of the substring was 2464 // matched but the rest of it was not, so we need to search 2465 // again. Start from the next element after the previous match. 2466 2467 // cnt2 is the number of remaining substring elements and 2468 // cnt1 is the number of remaining string elements when cmp failed.
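  // Hypothetical numbers (illustration only, not taken from the source): with
  // int_cnt2 == 12, a mismatch that leaves cnt2 == 4 substring elements and
  // cnt1 == 30 string elements unscanned restores cnt1 to 30 - 4 + 12 == 38,
  // i.e. the string elements remaining when counted from the start of the failed
  // candidate; the decrement below then moves the scan to the next element.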
2469 // Restored cnt1 = cnt1 - cnt2 + int_cnt2 2470 subl(cnt1, cnt2); 2471 addl(cnt1, int_cnt2); 2472 movl(cnt2, int_cnt2); // Now restore cnt2 2473 2474 decrementl(cnt1); // Shift to next element 2475 cmpl(cnt1, cnt2); 2476 jcc(Assembler::negative, RET_NOT_FOUND); // Left less than substring 2477 2478 addptr(result, (1<<scale1)); 2479 2480 } // (int_cnt2 > 8) 2481 2482 // Scan string for start of substr in 16-byte vectors 2483 bind(SCAN_TO_SUBSTR); 2484 pcmpestri(vec, Address(result, 0), mode); 2485 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 2486 subl(cnt1, stride); 2487 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 2488 cmpl(cnt1, cnt2); 2489 jccb(Assembler::negative, RET_NOT_FOUND); // Left less than substring 2490 addptr(result, 16); 2491 jmpb(SCAN_TO_SUBSTR); 2492 2493 // Found a potential substr 2494 bind(FOUND_CANDIDATE); 2495 // Matched whole vector if first element matched (tmp(rcx) == 0). 2496 if (int_cnt2 == stride) { 2497 jccb(Assembler::overflow, RET_FOUND); // OF == 1 2498 } else { // int_cnt2 > 8 2499 jccb(Assembler::overflow, FOUND_SUBSTR); 2500 } 2501 // After pcmpestri tmp(rcx) contains matched element index 2502 // Compute start addr of substr 2503 lea(result, Address(result, tmp, scale1)); 2504 2505 // Make sure string is still long enough 2506 subl(cnt1, tmp); 2507 cmpl(cnt1, cnt2); 2508 if (int_cnt2 == stride) { 2509 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 2510 } else { // int_cnt2 > 8 2511 jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD); 2512 } 2513 // Left less than substring. 2514 2515 bind(RET_NOT_FOUND); 2516 movl(result, -1); 2517 jmp(EXIT); 2518 2519 if (int_cnt2 > stride) { 2520 // This code is optimized for the case when whole substring 2521 // is matched if its head is matched. 2522 bind(MATCH_SUBSTR_HEAD); 2523 pcmpestri(vec, Address(result, 0), mode); 2524 // Reload only string if it does not match 2525 jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0 2526 2527 Label CONT_SCAN_SUBSTR; 2528 // Compare the rest of substring (> 8 chars). 2529 bind(FOUND_SUBSTR); 2530 // First 8 chars are already matched.
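  // cnt2 is turned into a negative element index relative to the substring tail
  // (cnt2 = stride - int_cnt2 below): the loads in the scan loop add a fixed tail
  // offset plus this negative index, and cnt2 is stepped toward zero, so a
  // non-negative cnt2 means the whole substring has been compared.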
2531 negptr(cnt2); 2532 addptr(cnt2, stride); 2533 2534 bind(SCAN_SUBSTR); 2535 subl(cnt1, stride); 2536 cmpl(cnt2, -stride); // Do not read beyond substring 2537 jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR); 2538 // Back-up strings to avoid reading beyond substring: 2539 // cnt1 = cnt1 - cnt2 + 8 2540 addl(cnt1, cnt2); // cnt2 is negative 2541 addl(cnt1, stride); 2542 movl(cnt2, stride); negptr(cnt2); 2543 bind(CONT_SCAN_SUBSTR); 2544 if (int_cnt2 < (int)G) { 2545 int tail_off1 = int_cnt2<<scale1; 2546 int tail_off2 = int_cnt2<<scale2; 2547 if (ae == StrIntrinsicNode::UL) { 2548 pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2)); 2549 } else { 2550 movdqu(vec, Address(str2, cnt2, scale2, tail_off2)); 2551 } 2552 pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode); 2553 } else { 2554 // calculate index in register to avoid integer overflow (int_cnt2*2) 2555 movl(tmp, int_cnt2); 2556 addptr(tmp, cnt2); 2557 if (ae == StrIntrinsicNode::UL) { 2558 pmovzxbw(vec, Address(str2, tmp, scale2, 0)); 2559 } else { 2560 movdqu(vec, Address(str2, tmp, scale2, 0)); 2561 } 2562 pcmpestri(vec, Address(result, tmp, scale1, 0), mode); 2563 } 2564 // Need to reload strings pointers if not matched whole vector 2565 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 2566 addptr(cnt2, stride); 2567 jcc(Assembler::negative, SCAN_SUBSTR); 2568 // Fall through if found full substring 2569 2570 } // (int_cnt2 > 8) 2571 2572 bind(RET_FOUND); 2573 // Found result if we matched full small substring. 2574 // Compute substr offset 2575 subptr(result, str1); 2576 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 2577 shrl(result, 1); // index 2578 } 2579 bind(EXIT); 2580 2581 } // string_indexofC8 2582 2583 // Small strings are loaded through stack if they cross page boundary. 2584 void C2_MacroAssembler::string_indexof(Register str1, Register str2, 2585 Register cnt1, Register cnt2, 2586 int int_cnt2, Register result, 2587 XMMRegister vec, Register tmp, 2588 int ae) { 2589 ShortBranchVerifier sbv(this); 2590 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2591 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2592 2593 // 2594 // int_cnt2 is length of small (< 8 chars) constant substring 2595 // or (-1) for non constant substring in which case its length 2596 // is in cnt2 register. 2597 // 2598 // Note, inline_string_indexOf() generates checks: 2599 // if (substr.count > string.count) return -1; 2600 // if (substr.count == 0) return 0; 2601 // 2602 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2603 assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0"); 2604 // This method uses the pcmpestri instruction with bound registers 2605 // inputs: 2606 // xmm - substring 2607 // rax - substring length (elements count) 2608 // mem - scanned string 2609 // rdx - string length (elements count) 2610 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2611 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2612 // outputs: 2613 // rcx - matched index in string 2614 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2615 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2616 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2617 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? 
Address::times_1 : scale1; 2618 2619 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR, 2620 RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR, 2621 FOUND_CANDIDATE; 2622 2623 { //======================================================== 2624 // We don't know where these strings are located 2625 // and we can't read beyond them. Load them through stack. 2626 Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR; 2627 2628 movptr(tmp, rsp); // save old SP 2629 2630 if (int_cnt2 > 0) { // small (< 8 chars) constant substring 2631 if (int_cnt2 == (1>>scale2)) { // One byte 2632 assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding"); 2633 load_unsigned_byte(result, Address(str2, 0)); 2634 movdl(vec, result); // move 32 bits 2635 } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) { // Three bytes 2636 // Not enough header space in 32-bit VM: 12+3 = 15. 2637 movl(result, Address(str2, -1)); 2638 shrl(result, 8); 2639 movdl(vec, result); // move 32 bits 2640 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) { // One char 2641 load_unsigned_short(result, Address(str2, 0)); 2642 movdl(vec, result); // move 32 bits 2643 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars 2644 movdl(vec, Address(str2, 0)); // move 32 bits 2645 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars 2646 movq(vec, Address(str2, 0)); // move 64 bits 2647 } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7}) 2648 // Array header size is 12 bytes in 32-bit VM 2649 // + 6 bytes for 3 chars == 18 bytes, 2650 // enough space to load vec and shift. 2651 assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity"); 2652 if (ae == StrIntrinsicNode::UL) { 2653 int tail_off = int_cnt2-8; 2654 pmovzxbw(vec, Address(str2, tail_off)); 2655 psrldq(vec, -2*tail_off); 2656 } 2657 else { 2658 int tail_off = int_cnt2*(1<<scale2); 2659 movdqu(vec, Address(str2, tail_off-16)); 2660 psrldq(vec, 16-tail_off); 2661 } 2662 } 2663 } else { // not constant substring 2664 cmpl(cnt2, stride); 2665 jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough 2666 2667 // We can read beyond string if str+16 does not cross page boundary 2668 // since heaps are aligned and mapped by pages. 2669 assert(os::vm_page_size() < (int)G, "default page should be small"); 2670 movl(result, str2); // We need only low 32 bits 2671 andl(result, (os::vm_page_size()-1)); 2672 cmpl(result, (os::vm_page_size()-16)); 2673 jccb(Assembler::belowEqual, CHECK_STR); 2674 2675 // Move small strings to stack to allow loading 16 bytes into vec. 2676 subptr(rsp, 16); 2677 int stk_offset = wordSize-(1<<scale2); 2678 push(cnt2); 2679 2680 bind(COPY_SUBSTR); 2681 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) { 2682 load_unsigned_byte(result, Address(str2, cnt2, scale2, -1)); 2683 movb(Address(rsp, cnt2, scale2, stk_offset), result); 2684 } else if (ae == StrIntrinsicNode::UU) { 2685 load_unsigned_short(result, Address(str2, cnt2, scale2, -2)); 2686 movw(Address(rsp, cnt2, scale2, stk_offset), result); 2687 } 2688 decrement(cnt2); 2689 jccb(Assembler::notZero, COPY_SUBSTR); 2690 2691 pop(cnt2); 2692 movptr(str2, rsp); // New substring address 2693 } // non constant 2694 2695 bind(CHECK_STR); 2696 cmpl(cnt1, stride); 2697 jccb(Assembler::aboveEqual, BIG_STRINGS); 2698 2699 // Check cross page boundary.
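  // Same page-crossing test as for str2 above: a 16-byte load starting at address
  // 'a' stays inside one page iff the offset of 'a' within its page leaves at
  // least 16 bytes. Minimal sketch of the predicate (illustration only, not
  // generated code):
  //
  //   bool fits_in_page(uintptr_t a, uintptr_t page_size) {
  //     return (a & (page_size - 1)) <= page_size - 16;
  //   }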
2700 movl(result, str1); // We need only low 32 bits 2701 andl(result, (os::vm_page_size()-1)); 2702 cmpl(result, (os::vm_page_size()-16)); 2703 jccb(Assembler::belowEqual, BIG_STRINGS); 2704 2705 subptr(rsp, 16); 2706 int stk_offset = -(1<<scale1); 2707 if (int_cnt2 < 0) { // not constant 2708 push(cnt2); 2709 stk_offset += wordSize; 2710 } 2711 movl(cnt2, cnt1); 2712 2713 bind(COPY_STR); 2714 if (ae == StrIntrinsicNode::LL) { 2715 load_unsigned_byte(result, Address(str1, cnt2, scale1, -1)); 2716 movb(Address(rsp, cnt2, scale1, stk_offset), result); 2717 } else { 2718 load_unsigned_short(result, Address(str1, cnt2, scale1, -2)); 2719 movw(Address(rsp, cnt2, scale1, stk_offset), result); 2720 } 2721 decrement(cnt2); 2722 jccb(Assembler::notZero, COPY_STR); 2723 2724 if (int_cnt2 < 0) { // not constant 2725 pop(cnt2); 2726 } 2727 movptr(str1, rsp); // New string address 2728 2729 bind(BIG_STRINGS); 2730 // Load substring. 2731 if (int_cnt2 < 0) { // -1 2732 if (ae == StrIntrinsicNode::UL) { 2733 pmovzxbw(vec, Address(str2, 0)); 2734 } else { 2735 movdqu(vec, Address(str2, 0)); 2736 } 2737 push(cnt2); // substr count 2738 push(str2); // substr addr 2739 push(str1); // string addr 2740 } else { 2741 // Small (< 8 chars) constant substrings are loaded already. 2742 movl(cnt2, int_cnt2); 2743 } 2744 push(tmp); // original SP 2745 2746 } // Finished loading 2747 2748 //======================================================== 2749 // Start search 2750 // 2751 2752 movptr(result, str1); // string addr 2753 2754 if (int_cnt2 < 0) { // Only for non constant substring 2755 jmpb(SCAN_TO_SUBSTR); 2756 2757 // SP saved at sp+0 2758 // String saved at sp+1*wordSize 2759 // Substr saved at sp+2*wordSize 2760 // Substr count saved at sp+3*wordSize 2761 2762 // Reload substr for rescan, this code 2763 // is executed only for large substrings (> 8 chars) 2764 bind(RELOAD_SUBSTR); 2765 movptr(str2, Address(rsp, 2*wordSize)); 2766 movl(cnt2, Address(rsp, 3*wordSize)); 2767 if (ae == StrIntrinsicNode::UL) { 2768 pmovzxbw(vec, Address(str2, 0)); 2769 } else { 2770 movdqu(vec, Address(str2, 0)); 2771 } 2772 // We came here after the beginning of the substring was 2773 // matched but the rest of it was not so we need to search 2774 // again. Start from the next element after the previous match. 2775 subptr(str1, result); // Restore counter 2776 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 2777 shrl(str1, 1); 2778 } 2779 addl(cnt1, str1); 2780 decrementl(cnt1); // Shift to next element 2781 cmpl(cnt1, cnt2); 2782 jcc(Assembler::negative, RET_NOT_FOUND); // Left less then substring 2783 2784 addptr(result, (1<<scale1)); 2785 } // non constant 2786 2787 // Scan string for start of substr in 16-byte vectors 2788 bind(SCAN_TO_SUBSTR); 2789 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2790 pcmpestri(vec, Address(result, 0), mode); 2791 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 2792 subl(cnt1, stride); 2793 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 2794 cmpl(cnt1, cnt2); 2795 jccb(Assembler::negative, RET_NOT_FOUND); // Left less then substring 2796 addptr(result, 16); 2797 2798 bind(ADJUST_STR); 2799 cmpl(cnt1, stride); // Do not read beyond string 2800 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 2801 // Back-up string to avoid reading beyond string. 
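  // The pointer is rewound so the final 16-byte load ends exactly at the last
  // string element (result + cnt1*scale1 - 16) and cnt1 is reset to a full stride;
  // elements in the overlap are simply scanned a second time, which is harmless.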
2802 lea(result, Address(result, cnt1, scale1, -16)); 2803 movl(cnt1, stride); 2804 jmpb(SCAN_TO_SUBSTR); 2805 2806 // Found a potential substr 2807 bind(FOUND_CANDIDATE); 2808 // After pcmpestri tmp(rcx) contains matched element index 2809 2810 // Make sure string is still long enough 2811 subl(cnt1, tmp); 2812 cmpl(cnt1, cnt2); 2813 jccb(Assembler::greaterEqual, FOUND_SUBSTR); 2814 // Left less then substring. 2815 2816 bind(RET_NOT_FOUND); 2817 movl(result, -1); 2818 jmp(CLEANUP); 2819 2820 bind(FOUND_SUBSTR); 2821 // Compute start addr of substr 2822 lea(result, Address(result, tmp, scale1)); 2823 if (int_cnt2 > 0) { // Constant substring 2824 // Repeat search for small substring (< 8 chars) 2825 // from new point without reloading substring. 2826 // Have to check that we don't read beyond string. 2827 cmpl(tmp, stride-int_cnt2); 2828 jccb(Assembler::greater, ADJUST_STR); 2829 // Fall through if matched whole substring. 2830 } else { // non constant 2831 assert(int_cnt2 == -1, "should be != 0"); 2832 2833 addl(tmp, cnt2); 2834 // Found result if we matched whole substring. 2835 cmpl(tmp, stride); 2836 jcc(Assembler::lessEqual, RET_FOUND); 2837 2838 // Repeat search for small substring (<= 8 chars) 2839 // from new point 'str1' without reloading substring. 2840 cmpl(cnt2, stride); 2841 // Have to check that we don't read beyond string. 2842 jccb(Assembler::lessEqual, ADJUST_STR); 2843 2844 Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG; 2845 // Compare the rest of substring (> 8 chars). 2846 movptr(str1, result); 2847 2848 cmpl(tmp, cnt2); 2849 // First 8 chars are already matched. 2850 jccb(Assembler::equal, CHECK_NEXT); 2851 2852 bind(SCAN_SUBSTR); 2853 pcmpestri(vec, Address(str1, 0), mode); 2854 // Need to reload strings pointers if not matched whole vector 2855 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 2856 2857 bind(CHECK_NEXT); 2858 subl(cnt2, stride); 2859 jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring 2860 addptr(str1, 16); 2861 if (ae == StrIntrinsicNode::UL) { 2862 addptr(str2, 8); 2863 } else { 2864 addptr(str2, 16); 2865 } 2866 subl(cnt1, stride); 2867 cmpl(cnt2, stride); // Do not read beyond substring 2868 jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR); 2869 // Back-up strings to avoid reading beyond substring. 
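  // Both pointers are pulled back by the remaining count so the next load ends
  // exactly at the substring tail (an 8-byte load of Latin-1 substring bytes for
  // UL, a 16-byte load otherwise), and cnt1/cnt2 are re-based to one full stride;
  // the overlapping elements were already matched, so re-comparing them is safe.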
2870 2871 if (ae == StrIntrinsicNode::UL) { 2872 lea(str2, Address(str2, cnt2, scale2, -8)); 2873 lea(str1, Address(str1, cnt2, scale1, -16)); 2874 } else { 2875 lea(str2, Address(str2, cnt2, scale2, -16)); 2876 lea(str1, Address(str1, cnt2, scale1, -16)); 2877 } 2878 subl(cnt1, cnt2); 2879 movl(cnt2, stride); 2880 addl(cnt1, stride); 2881 bind(CONT_SCAN_SUBSTR); 2882 if (ae == StrIntrinsicNode::UL) { 2883 pmovzxbw(vec, Address(str2, 0)); 2884 } else { 2885 movdqu(vec, Address(str2, 0)); 2886 } 2887 jmp(SCAN_SUBSTR); 2888 2889 bind(RET_FOUND_LONG); 2890 movptr(str1, Address(rsp, wordSize)); 2891 } // non constant 2892 2893 bind(RET_FOUND); 2894 // Compute substr offset 2895 subptr(result, str1); 2896 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 2897 shrl(result, 1); // index 2898 } 2899 bind(CLEANUP); 2900 pop(rsp); // restore SP 2901 2902 } // string_indexof 2903 2904 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result, 2905 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 2906 ShortBranchVerifier sbv(this); 2907 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2908 2909 int stride = 8; 2910 2911 Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP, 2912 SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP, 2913 RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT, 2914 FOUND_SEQ_CHAR, DONE_LABEL; 2915 2916 movptr(result, str1); 2917 if (UseAVX >= 2) { 2918 cmpl(cnt1, stride); 2919 jcc(Assembler::less, SCAN_TO_CHAR); 2920 cmpl(cnt1, 2*stride); 2921 jcc(Assembler::less, SCAN_TO_8_CHAR_INIT); 2922 movdl(vec1, ch); 2923 vpbroadcastw(vec1, vec1, Assembler::AVX_256bit); 2924 vpxor(vec2, vec2); 2925 movl(tmp, cnt1); 2926 andl(tmp, 0xFFFFFFF0); //vector count (in chars) 2927 andl(cnt1,0x0000000F); //tail count (in chars) 2928 2929 bind(SCAN_TO_16_CHAR_LOOP); 2930 vmovdqu(vec3, Address(result, 0)); 2931 vpcmpeqw(vec3, vec3, vec1, 1); 2932 vptest(vec2, vec3); 2933 jcc(Assembler::carryClear, FOUND_CHAR); 2934 addptr(result, 32); 2935 subl(tmp, 2*stride); 2936 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); 2937 jmp(SCAN_TO_8_CHAR); 2938 bind(SCAN_TO_8_CHAR_INIT); 2939 movdl(vec1, ch); 2940 pshuflw(vec1, vec1, 0x00); 2941 pshufd(vec1, vec1, 0); 2942 pxor(vec2, vec2); 2943 } 2944 bind(SCAN_TO_8_CHAR); 2945 cmpl(cnt1, stride); 2946 jcc(Assembler::less, SCAN_TO_CHAR); 2947 if (UseAVX < 2) { 2948 movdl(vec1, ch); 2949 pshuflw(vec1, vec1, 0x00); 2950 pshufd(vec1, vec1, 0); 2951 pxor(vec2, vec2); 2952 } 2953 movl(tmp, cnt1); 2954 andl(tmp, 0xFFFFFFF8); //vector count (in chars) 2955 andl(cnt1,0x00000007); //tail count (in chars) 2956 2957 bind(SCAN_TO_8_CHAR_LOOP); 2958 movdqu(vec3, Address(result, 0)); 2959 pcmpeqw(vec3, vec1); 2960 ptest(vec2, vec3); 2961 jcc(Assembler::carryClear, FOUND_CHAR); 2962 addptr(result, 16); 2963 subl(tmp, stride); 2964 jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP); 2965 bind(SCAN_TO_CHAR); 2966 testl(cnt1, cnt1); 2967 jcc(Assembler::zero, RET_NOT_FOUND); 2968 bind(SCAN_TO_CHAR_LOOP); 2969 load_unsigned_short(tmp, Address(result, 0)); 2970 cmpl(ch, tmp); 2971 jccb(Assembler::equal, FOUND_SEQ_CHAR); 2972 addptr(result, 2); 2973 subl(cnt1, 1); 2974 jccb(Assembler::zero, RET_NOT_FOUND); 2975 jmp(SCAN_TO_CHAR_LOOP); 2976 2977 bind(RET_NOT_FOUND); 2978 movl(result, -1); 2979 jmpb(DONE_LABEL); 2980 2981 bind(FOUND_CHAR); 2982 if (UseAVX >= 2) { 2983 vpmovmskb(tmp, vec3); 2984 } else { 2985 pmovmskb(tmp, vec3); 2986 } 2987 bsfl(ch, tmp); 2988 addptr(result, ch); 2989 2990 bind(FOUND_SEQ_CHAR); 2991 
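  // result currently holds the address of the matching char; subtracting the base
  // and halving (2-byte UTF-16 chars) converts it to an element index.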
subptr(result, str1);
2992   shrl(result, 1);
2993
2994   bind(DONE_LABEL);
2995 } // string_indexof_char
2996
2997 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
2998                                              XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
2999   ShortBranchVerifier sbv(this);
3000   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3001
3002   int stride = 16;
3003
3004   Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
3005         SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
3006         RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
3007         FOUND_SEQ_CHAR, DONE_LABEL;
3008
3009   movptr(result, str1);
3010   if (UseAVX >= 2) {
3011     cmpl(cnt1, stride);
3012     jcc(Assembler::less, SCAN_TO_CHAR_INIT);
3013     cmpl(cnt1, stride*2);
3014     jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
3015     movdl(vec1, ch);
3016     vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
3017     vpxor(vec2, vec2);
3018     movl(tmp, cnt1);
3019     andl(tmp, 0xFFFFFFE0);  //vector count (in chars)
3020     andl(cnt1,0x0000001F);  //tail count (in chars)
3021
3022     bind(SCAN_TO_32_CHAR_LOOP);
3023     vmovdqu(vec3, Address(result, 0));
3024     vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
3025     vptest(vec2, vec3);
3026     jcc(Assembler::carryClear, FOUND_CHAR);
3027     addptr(result, 32);
3028     subl(tmp, stride*2);
3029     jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
3030     jmp(SCAN_TO_16_CHAR);
3031
3032     bind(SCAN_TO_16_CHAR_INIT);
3033     movdl(vec1, ch);
3034     pxor(vec2, vec2);
3035     pshufb(vec1, vec2);
3036   }
3037
3038   bind(SCAN_TO_16_CHAR);
3039   cmpl(cnt1, stride);
3040   jcc(Assembler::less, SCAN_TO_CHAR_INIT); //less than 16 entries left
3041   if (UseAVX < 2) {
3042     movdl(vec1, ch);
3043     pxor(vec2, vec2);
3044     pshufb(vec1, vec2);
3045   }
3046   movl(tmp, cnt1);
3047   andl(tmp, 0xFFFFFFF0);  //vector count (in bytes)
3048   andl(cnt1,0x0000000F);  //tail count (in bytes)
3049
3050   bind(SCAN_TO_16_CHAR_LOOP);
3051   movdqu(vec3, Address(result, 0));
3052   pcmpeqb(vec3, vec1);
3053   ptest(vec2, vec3);
3054   jcc(Assembler::carryClear, FOUND_CHAR);
3055   addptr(result, 16);
3056   subl(tmp, stride);
3057   jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); //last 16 items...
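// Fewer than 16 bytes remain here; fall through to the scalar byte-at-a-time tail scan below.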
3058 3059 bind(SCAN_TO_CHAR_INIT); 3060 testl(cnt1, cnt1); 3061 jcc(Assembler::zero, RET_NOT_FOUND); 3062 bind(SCAN_TO_CHAR_LOOP); 3063 load_unsigned_byte(tmp, Address(result, 0)); 3064 cmpl(ch, tmp); 3065 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3066 addptr(result, 1); 3067 subl(cnt1, 1); 3068 jccb(Assembler::zero, RET_NOT_FOUND); 3069 jmp(SCAN_TO_CHAR_LOOP); 3070 3071 bind(RET_NOT_FOUND); 3072 movl(result, -1); 3073 jmpb(DONE_LABEL); 3074 3075 bind(FOUND_CHAR); 3076 if (UseAVX >= 2) { 3077 vpmovmskb(tmp, vec3); 3078 } else { 3079 pmovmskb(tmp, vec3); 3080 } 3081 bsfl(ch, tmp); 3082 addptr(result, ch); 3083 3084 bind(FOUND_SEQ_CHAR); 3085 subptr(result, str1); 3086 3087 bind(DONE_LABEL); 3088 } // stringL_indexof_char 3089 3090 // helper function for string_compare 3091 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2, 3092 Address::ScaleFactor scale, Address::ScaleFactor scale1, 3093 Address::ScaleFactor scale2, Register index, int ae) { 3094 if (ae == StrIntrinsicNode::LL) { 3095 load_unsigned_byte(elem1, Address(str1, index, scale, 0)); 3096 load_unsigned_byte(elem2, Address(str2, index, scale, 0)); 3097 } else if (ae == StrIntrinsicNode::UU) { 3098 load_unsigned_short(elem1, Address(str1, index, scale, 0)); 3099 load_unsigned_short(elem2, Address(str2, index, scale, 0)); 3100 } else { 3101 load_unsigned_byte(elem1, Address(str1, index, scale1, 0)); 3102 load_unsigned_short(elem2, Address(str2, index, scale2, 0)); 3103 } 3104 } 3105 3106 // Compare strings, used for char[] and byte[]. 3107 void C2_MacroAssembler::string_compare(Register str1, Register str2, 3108 Register cnt1, Register cnt2, Register result, 3109 XMMRegister vec1, int ae, KRegister mask) { 3110 ShortBranchVerifier sbv(this); 3111 Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL; 3112 Label COMPARE_WIDE_VECTORS_LOOP_FAILED; // used only _LP64 && AVX3 3113 int stride, stride2, adr_stride, adr_stride1, adr_stride2; 3114 int stride2x2 = 0x40; 3115 Address::ScaleFactor scale = Address::no_scale; 3116 Address::ScaleFactor scale1 = Address::no_scale; 3117 Address::ScaleFactor scale2 = Address::no_scale; 3118 3119 if (ae != StrIntrinsicNode::LL) { 3120 stride2x2 = 0x20; 3121 } 3122 3123 if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) { 3124 shrl(cnt2, 1); 3125 } 3126 // Compute the minimum of the string lengths and the 3127 // difference of the string lengths (stack). 3128 // Do the conditional move stuff 3129 movl(result, cnt1); 3130 subl(cnt1, cnt2); 3131 push(cnt1); 3132 cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2) 3133 3134 // Is the minimum length zero? 
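// If so, the result is just the length difference that was pushed on the stack above.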
3135 testl(cnt2, cnt2); 3136 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3137 if (ae == StrIntrinsicNode::LL) { 3138 // Load first bytes 3139 load_unsigned_byte(result, Address(str1, 0)); // result = str1[0] 3140 load_unsigned_byte(cnt1, Address(str2, 0)); // cnt1 = str2[0] 3141 } else if (ae == StrIntrinsicNode::UU) { 3142 // Load first characters 3143 load_unsigned_short(result, Address(str1, 0)); 3144 load_unsigned_short(cnt1, Address(str2, 0)); 3145 } else { 3146 load_unsigned_byte(result, Address(str1, 0)); 3147 load_unsigned_short(cnt1, Address(str2, 0)); 3148 } 3149 subl(result, cnt1); 3150 jcc(Assembler::notZero, POP_LABEL); 3151 3152 if (ae == StrIntrinsicNode::UU) { 3153 // Divide length by 2 to get number of chars 3154 shrl(cnt2, 1); 3155 } 3156 cmpl(cnt2, 1); 3157 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3158 3159 // Check if the strings start at the same location and setup scale and stride 3160 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3161 cmpptr(str1, str2); 3162 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3163 if (ae == StrIntrinsicNode::LL) { 3164 scale = Address::times_1; 3165 stride = 16; 3166 } else { 3167 scale = Address::times_2; 3168 stride = 8; 3169 } 3170 } else { 3171 scale1 = Address::times_1; 3172 scale2 = Address::times_2; 3173 // scale not used 3174 stride = 8; 3175 } 3176 3177 if (UseAVX >= 2 && UseSSE42Intrinsics) { 3178 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR; 3179 Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR; 3180 Label COMPARE_WIDE_VECTORS_LOOP_AVX2; 3181 Label COMPARE_TAIL_LONG; 3182 Label COMPARE_WIDE_VECTORS_LOOP_AVX3; // used only _LP64 && AVX3 3183 3184 int pcmpmask = 0x19; 3185 if (ae == StrIntrinsicNode::LL) { 3186 pcmpmask &= ~0x01; 3187 } 3188 3189 // Setup to compare 16-chars (32-bytes) vectors, 3190 // start from first character again because it has aligned address. 3191 if (ae == StrIntrinsicNode::LL) { 3192 stride2 = 32; 3193 } else { 3194 stride2 = 16; 3195 } 3196 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3197 adr_stride = stride << scale; 3198 } else { 3199 adr_stride1 = 8; //stride << scale1; 3200 adr_stride2 = 16; //stride << scale2; 3201 } 3202 3203 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3204 // rax and rdx are used by pcmpestri as elements counters 3205 movl(result, cnt2); 3206 andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count 3207 jcc(Assembler::zero, COMPARE_TAIL_LONG); 3208 3209 // fast path : compare first 2 8-char vectors. 
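// pcmpestri performs a string compare with negated result: CF (below) is set when a mismatch
// is found and rcx (cnt1) receives the index of the first mismatching element
// (see the pcmpestri input/output comment further below).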
3210 bind(COMPARE_16_CHARS); 3211 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3212 movdqu(vec1, Address(str1, 0)); 3213 } else { 3214 pmovzxbw(vec1, Address(str1, 0)); 3215 } 3216 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3217 jccb(Assembler::below, COMPARE_INDEX_CHAR); 3218 3219 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3220 movdqu(vec1, Address(str1, adr_stride)); 3221 pcmpestri(vec1, Address(str2, adr_stride), pcmpmask); 3222 } else { 3223 pmovzxbw(vec1, Address(str1, adr_stride1)); 3224 pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask); 3225 } 3226 jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS); 3227 addl(cnt1, stride); 3228 3229 // Compare the characters at index in cnt1 3230 bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character 3231 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 3232 subl(result, cnt2); 3233 jmp(POP_LABEL); 3234 3235 // Setup the registers to start vector comparison loop 3236 bind(COMPARE_WIDE_VECTORS); 3237 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3238 lea(str1, Address(str1, result, scale)); 3239 lea(str2, Address(str2, result, scale)); 3240 } else { 3241 lea(str1, Address(str1, result, scale1)); 3242 lea(str2, Address(str2, result, scale2)); 3243 } 3244 subl(result, stride2); 3245 subl(cnt2, stride2); 3246 jcc(Assembler::zero, COMPARE_WIDE_TAIL); 3247 negptr(result); 3248 3249 // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest) 3250 bind(COMPARE_WIDE_VECTORS_LOOP); 3251 3252 #ifdef _LP64 3253 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 3254 cmpl(cnt2, stride2x2); 3255 jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2); 3256 testl(cnt2, stride2x2-1); // cnt2 holds the vector count 3257 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2); // means we cannot subtract by 0x40 3258 3259 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 3260 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3261 evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit); 3262 evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3263 } else { 3264 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit); 3265 evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3266 } 3267 kortestql(mask, mask); 3268 jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED); // miscompare 3269 addptr(result, stride2x2); // update since we already compared at this addr 3270 subl(cnt2, stride2x2); // and sub the size too 3271 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3); 3272 3273 vpxor(vec1, vec1); 3274 jmpb(COMPARE_WIDE_TAIL); 3275 }//if (VM_Version::supports_avx512vlbw()) 3276 #endif // _LP64 3277 3278 3279 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3280 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3281 vmovdqu(vec1, Address(str1, result, scale)); 3282 vpxor(vec1, Address(str2, result, scale)); 3283 } else { 3284 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit); 3285 vpxor(vec1, Address(str2, result, scale2)); 3286 } 3287 vptest(vec1, vec1); 3288 jcc(Assembler::notZero, VECTOR_NOT_EQUAL); 3289 addptr(result, stride2); 3290 subl(cnt2, stride2); 3291 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP); 3292 // clean upper bits of YMM registers 
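// (keeping the upper YMM bits zero avoids AVX/SSE transition penalties in the legacy SSE code that follows)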
3293     vpxor(vec1, vec1);
3294
3295     // compare wide vectors tail
3296     bind(COMPARE_WIDE_TAIL);
3297     testptr(result, result);
3298     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3299
3300     movl(result, stride2);
3301     movl(cnt2, result);
3302     negptr(result);
3303     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3304
3305     // Identifies the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
3306     bind(VECTOR_NOT_EQUAL);
3307     // clean upper bits of YMM registers
3308     vpxor(vec1, vec1);
3309     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3310       lea(str1, Address(str1, result, scale));
3311       lea(str2, Address(str2, result, scale));
3312     } else {
3313       lea(str1, Address(str1, result, scale1));
3314       lea(str2, Address(str2, result, scale2));
3315     }
3316     jmp(COMPARE_16_CHARS);
3317
3318     // Compare tail chars, length between 1 and 15 chars
3319     bind(COMPARE_TAIL_LONG);
3320     movl(cnt2, result);
3321     cmpl(cnt2, stride);
3322     jcc(Assembler::less, COMPARE_SMALL_STR);
3323
3324     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3325       movdqu(vec1, Address(str1, 0));
3326     } else {
3327       pmovzxbw(vec1, Address(str1, 0));
3328     }
3329     pcmpestri(vec1, Address(str2, 0), pcmpmask);
3330     jcc(Assembler::below, COMPARE_INDEX_CHAR);
3331     subptr(cnt2, stride);
3332     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3333     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3334       lea(str1, Address(str1, result, scale));
3335       lea(str2, Address(str2, result, scale));
3336     } else {
3337       lea(str1, Address(str1, result, scale1));
3338       lea(str2, Address(str2, result, scale2));
3339     }
3340     negptr(cnt2);
3341     jmpb(WHILE_HEAD_LABEL);
3342
3343     bind(COMPARE_SMALL_STR);
3344   } else if (UseSSE42Intrinsics) {
3345     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
3346     int pcmpmask = 0x19;
3347     // Setup to compare 8-char (16-byte) vectors,
3348     // start from first character again because it has aligned address.
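// result keeps the total element count; cnt2 is rounded down to a multiple of stride
// for the vector loop, with the remainder handled by the tail code.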
3349 movl(result, cnt2); 3350 andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count 3351 if (ae == StrIntrinsicNode::LL) { 3352 pcmpmask &= ~0x01; 3353 } 3354 jcc(Assembler::zero, COMPARE_TAIL); 3355 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3356 lea(str1, Address(str1, result, scale)); 3357 lea(str2, Address(str2, result, scale)); 3358 } else { 3359 lea(str1, Address(str1, result, scale1)); 3360 lea(str2, Address(str2, result, scale2)); 3361 } 3362 negptr(result); 3363 3364 // pcmpestri 3365 // inputs: 3366 // vec1- substring 3367 // rax - negative string length (elements count) 3368 // mem - scanned string 3369 // rdx - string length (elements count) 3370 // pcmpmask - cmp mode: 11000 (string compare with negated result) 3371 // + 00 (unsigned bytes) or + 01 (unsigned shorts) 3372 // outputs: 3373 // rcx - first mismatched element index 3374 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3375 3376 bind(COMPARE_WIDE_VECTORS); 3377 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3378 movdqu(vec1, Address(str1, result, scale)); 3379 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 3380 } else { 3381 pmovzxbw(vec1, Address(str1, result, scale1)); 3382 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 3383 } 3384 // After pcmpestri cnt1(rcx) contains mismatched element index 3385 3386 jccb(Assembler::below, VECTOR_NOT_EQUAL); // CF==1 3387 addptr(result, stride); 3388 subptr(cnt2, stride); 3389 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 3390 3391 // compare wide vectors tail 3392 testptr(result, result); 3393 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3394 3395 movl(cnt2, stride); 3396 movl(result, stride); 3397 negptr(result); 3398 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3399 movdqu(vec1, Address(str1, result, scale)); 3400 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 3401 } else { 3402 pmovzxbw(vec1, Address(str1, result, scale1)); 3403 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 3404 } 3405 jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL); 3406 3407 // Mismatched characters in the vectors 3408 bind(VECTOR_NOT_EQUAL); 3409 addptr(cnt1, result); 3410 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 3411 subl(result, cnt2); 3412 jmpb(POP_LABEL); 3413 3414 bind(COMPARE_TAIL); // limit is zero 3415 movl(cnt2, result); 3416 // Fallthru to tail compare 3417 } 3418 // Shift str2 and str1 to the end of the arrays, negate min 3419 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3420 lea(str1, Address(str1, cnt2, scale)); 3421 lea(str2, Address(str2, cnt2, scale)); 3422 } else { 3423 lea(str1, Address(str1, cnt2, scale1)); 3424 lea(str2, Address(str2, cnt2, scale2)); 3425 } 3426 decrementl(cnt2); // first character was compared already 3427 negptr(cnt2); 3428 3429 // Compare the rest of the elements 3430 bind(WHILE_HEAD_LABEL); 3431 load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae); 3432 subl(result, cnt1); 3433 jccb(Assembler::notZero, POP_LABEL); 3434 increment(cnt2); 3435 jccb(Assembler::notZero, WHILE_HEAD_LABEL); 3436 3437 // Strings are equal up to min length. Return the length difference. 
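// The length difference (original cnt1 - cnt2) was pushed onto the stack at the start of this
// method and is popped here as the result.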
3438 bind(LENGTH_DIFF_LABEL); 3439 pop(result); 3440 if (ae == StrIntrinsicNode::UU) { 3441 // Divide diff by 2 to get number of chars 3442 sarl(result, 1); 3443 } 3444 jmpb(DONE_LABEL); 3445 3446 #ifdef _LP64 3447 if (VM_Version::supports_avx512vlbw()) { 3448 3449 bind(COMPARE_WIDE_VECTORS_LOOP_FAILED); 3450 3451 kmovql(cnt1, mask); 3452 notq(cnt1); 3453 bsfq(cnt2, cnt1); 3454 if (ae != StrIntrinsicNode::LL) { 3455 // Divide diff by 2 to get number of chars 3456 sarl(cnt2, 1); 3457 } 3458 addq(result, cnt2); 3459 if (ae == StrIntrinsicNode::LL) { 3460 load_unsigned_byte(cnt1, Address(str2, result)); 3461 load_unsigned_byte(result, Address(str1, result)); 3462 } else if (ae == StrIntrinsicNode::UU) { 3463 load_unsigned_short(cnt1, Address(str2, result, scale)); 3464 load_unsigned_short(result, Address(str1, result, scale)); 3465 } else { 3466 load_unsigned_short(cnt1, Address(str2, result, scale2)); 3467 load_unsigned_byte(result, Address(str1, result, scale1)); 3468 } 3469 subl(result, cnt1); 3470 jmpb(POP_LABEL); 3471 }//if (VM_Version::supports_avx512vlbw()) 3472 #endif // _LP64 3473 3474 // Discard the stored length difference 3475 bind(POP_LABEL); 3476 pop(cnt1); 3477 3478 // That's it 3479 bind(DONE_LABEL); 3480 if(ae == StrIntrinsicNode::UL) { 3481 negl(result); 3482 } 3483 3484 } 3485 3486 // Search for Non-ASCII character (Negative byte value) in a byte array, 3487 // return true if it has any and false otherwise. 3488 // ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java 3489 // @IntrinsicCandidate 3490 // private static boolean hasNegatives(byte[] ba, int off, int len) { 3491 // for (int i = off; i < off + len; i++) { 3492 // if (ba[i] < 0) { 3493 // return true; 3494 // } 3495 // } 3496 // return false; 3497 // } 3498 void C2_MacroAssembler::has_negatives(Register ary1, Register len, 3499 Register result, Register tmp1, 3500 XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) { 3501 // rsi: byte array 3502 // rcx: len 3503 // rax: result 3504 ShortBranchVerifier sbv(this); 3505 assert_different_registers(ary1, len, result, tmp1); 3506 assert_different_registers(vec1, vec2); 3507 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE; 3508 3509 // len == 0 3510 testl(len, len); 3511 jcc(Assembler::zero, FALSE_LABEL); 3512 3513 if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512 3514 VM_Version::supports_avx512vlbw() && 3515 VM_Version::supports_bmi2()) { 3516 3517 Label test_64_loop, test_tail; 3518 Register tmp3_aliased = len; 3519 3520 movl(tmp1, len); 3521 vpxor(vec2, vec2, vec2, Assembler::AVX_512bit); 3522 3523 andl(tmp1, 64 - 1); // tail count (in chars) 0x3F 3524 andl(len, ~(64 - 1)); // vector count (in chars) 3525 jccb(Assembler::zero, test_tail); 3526 3527 lea(ary1, Address(ary1, len, Address::times_1)); 3528 negptr(len); 3529 3530 bind(test_64_loop); 3531 // Check whether our 64 elements of size byte contain negatives 3532 evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit); 3533 kortestql(mask1, mask1); 3534 jcc(Assembler::notZero, TRUE_LABEL); 3535 3536 addptr(len, 64); 3537 jccb(Assembler::notZero, test_64_loop); 3538 3539 3540 bind(test_tail); 3541 // bail out when there is nothing to be done 3542 testl(tmp1, -1); 3543 jcc(Assembler::zero, FALSE_LABEL); 3544 3545 // ~(~0 << len) applied up to two times (for 32-bit scenario) 3546 #ifdef _LP64 3547 mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF); 3548 shlxq(tmp3_aliased, tmp3_aliased, tmp1); 3549 notq(tmp3_aliased); 3550 kmovql(mask2, 
tmp3_aliased);
3551 #else
3552     Label k_init;
3553     jmp(k_init);
3554
3555     // We cannot read 64 bits from a general purpose register, thus we move
3556     // the data required to compose 64 1's into the instruction stream:
3557     // we emit a 64-byte wide series of elements from 0..63 which are later
3558     // used as compare targets against the tail count contained in the tmp1 register.
3559     // The result is a k register having tmp1 consecutive 1's counting from
3560     // the least significant bit.
3561     address tmp = pc();
3562     emit_int64(0x0706050403020100);
3563     emit_int64(0x0F0E0D0C0B0A0908);
3564     emit_int64(0x1716151413121110);
3565     emit_int64(0x1F1E1D1C1B1A1918);
3566     emit_int64(0x2726252423222120);
3567     emit_int64(0x2F2E2D2C2B2A2928);
3568     emit_int64(0x3736353433323130);
3569     emit_int64(0x3F3E3D3C3B3A3938);
3570
3571     bind(k_init);
3572     lea(len, InternalAddress(tmp));
3573     // create mask to test for negative byte inside a vector
3574     evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
3575     evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit);
3576
3577 #endif
3578     evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
3579     ktestq(mask1, mask2);
3580     jcc(Assembler::notZero, TRUE_LABEL);
3581
3582     jmp(FALSE_LABEL);
3583   } else {
3584     movl(result, len); // copy
3585
3586     if (UseAVX >= 2 && UseSSE >= 2) {
3587       // With AVX2, use 32-byte vector compare
3588       Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
3589
3590       // Compare 32-byte vectors
3591       andl(result, 0x0000001f); // tail count (in bytes)
3592       andl(len, 0xffffffe0);    // vector count (in bytes)
3593       jccb(Assembler::zero, COMPARE_TAIL);
3594
3595       lea(ary1, Address(ary1, len, Address::times_1));
3596       negptr(len);
3597
3598       movl(tmp1, 0x80808080); // create mask to test for negative bytes in vector
3599       movdl(vec2, tmp1);
3600       vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
3601
3602       bind(COMPARE_WIDE_VECTORS);
3603       vmovdqu(vec1, Address(ary1, len, Address::times_1));
3604       vptest(vec1, vec2);
3605       jccb(Assembler::notZero, TRUE_LABEL);
3606       addptr(len, 32);
3607       jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
3608
3609       testl(result, result);
3610       jccb(Assembler::zero, FALSE_LABEL);
3611
3612       vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
3613       vptest(vec1, vec2);
3614       jccb(Assembler::notZero, TRUE_LABEL);
3615       jmpb(FALSE_LABEL);
3616
3617       bind(COMPARE_TAIL); // len is zero
3618       movl(len, result);
3619       // Fallthru to tail compare
3620     } else if (UseSSE42Intrinsics) {
3621       // With SSE4.2, use double quad vector compare
3622       Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
3623
3624       // Compare 16-byte vectors
3625       andl(result, 0x0000000f); // tail count (in bytes)
3626       andl(len, 0xfffffff0);    // vector count (in bytes)
3627       jcc(Assembler::zero, COMPARE_TAIL);
3628
3629       lea(ary1, Address(ary1, len, Address::times_1));
3630       negptr(len);
3631
3632       movl(tmp1, 0x80808080);
3633       movdl(vec2, tmp1);
3634       pshufd(vec2, vec2, 0);
3635
3636       bind(COMPARE_WIDE_VECTORS);
3637       movdqu(vec1, Address(ary1, len, Address::times_1));
3638       ptest(vec1, vec2);
3639       jcc(Assembler::notZero, TRUE_LABEL);
3640       addptr(len, 16);
3641       jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
3642
3643       testl(result, result);
3644       jcc(Assembler::zero, FALSE_LABEL);
3645
3646       movdqu(vec1, Address(ary1, result, Address::times_1, -16));
3647       ptest(vec1, vec2);
3648       jccb(Assembler::notZero, TRUE_LABEL);
3649       jmpb(FALSE_LABEL);
3650
3651       bind(COMPARE_TAIL); // len is zero
3652       movl(len, result);
3653       // Fallthru to tail compare
3654     }
3655   }
3656   // Compare 4-byte vectors
3657
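// Scalar tail: check four bytes at a time by testing their sign bits against 0x80808080.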
andl(len, 0xfffffffc); // vector count (in bytes) 3658 jccb(Assembler::zero, COMPARE_CHAR); 3659 3660 lea(ary1, Address(ary1, len, Address::times_1)); 3661 negptr(len); 3662 3663 bind(COMPARE_VECTORS); 3664 movl(tmp1, Address(ary1, len, Address::times_1)); 3665 andl(tmp1, 0x80808080); 3666 jccb(Assembler::notZero, TRUE_LABEL); 3667 addptr(len, 4); 3668 jcc(Assembler::notZero, COMPARE_VECTORS); 3669 3670 // Compare trailing char (final 2 bytes), if any 3671 bind(COMPARE_CHAR); 3672 testl(result, 0x2); // tail char 3673 jccb(Assembler::zero, COMPARE_BYTE); 3674 load_unsigned_short(tmp1, Address(ary1, 0)); 3675 andl(tmp1, 0x00008080); 3676 jccb(Assembler::notZero, TRUE_LABEL); 3677 subptr(result, 2); 3678 lea(ary1, Address(ary1, 2)); 3679 3680 bind(COMPARE_BYTE); 3681 testl(result, 0x1); // tail byte 3682 jccb(Assembler::zero, FALSE_LABEL); 3683 load_unsigned_byte(tmp1, Address(ary1, 0)); 3684 andl(tmp1, 0x00000080); 3685 jccb(Assembler::notEqual, TRUE_LABEL); 3686 jmpb(FALSE_LABEL); 3687 3688 bind(TRUE_LABEL); 3689 movl(result, 1); // return true 3690 jmpb(DONE); 3691 3692 bind(FALSE_LABEL); 3693 xorl(result, result); // return false 3694 3695 // That's it 3696 bind(DONE); 3697 if (UseAVX >= 2 && UseSSE >= 2) { 3698 // clean upper bits of YMM registers 3699 vpxor(vec1, vec1); 3700 vpxor(vec2, vec2); 3701 } 3702 } 3703 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings. 3704 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2, 3705 Register limit, Register result, Register chr, 3706 XMMRegister vec1, XMMRegister vec2, bool is_char, KRegister mask) { 3707 ShortBranchVerifier sbv(this); 3708 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE; 3709 3710 int length_offset = arrayOopDesc::length_offset_in_bytes(); 3711 int base_offset = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE); 3712 3713 if (is_array_equ) { 3714 // Check the input args 3715 cmpoop(ary1, ary2); 3716 jcc(Assembler::equal, TRUE_LABEL); 3717 3718 // Need additional checks for arrays_equals. 3719 testptr(ary1, ary1); 3720 jcc(Assembler::zero, FALSE_LABEL); 3721 testptr(ary2, ary2); 3722 jcc(Assembler::zero, FALSE_LABEL); 3723 3724 // Check the lengths 3725 movl(limit, Address(ary1, length_offset)); 3726 cmpl(limit, Address(ary2, length_offset)); 3727 jcc(Assembler::notEqual, FALSE_LABEL); 3728 } 3729 3730 // count == 0 3731 testl(limit, limit); 3732 jcc(Assembler::zero, TRUE_LABEL); 3733 3734 if (is_array_equ) { 3735 // Load array address 3736 lea(ary1, Address(ary1, base_offset)); 3737 lea(ary2, Address(ary2, base_offset)); 3738 } 3739 3740 if (is_array_equ && is_char) { 3741 // arrays_equals when used for char[]. 
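// limit currently holds the element count; convert it to a byte count (2 bytes per char).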
3742     shll(limit, 1);      // byte count != 0
3743   }
3744   movl(result, limit); // copy
3745
3746   if (UseAVX >= 2) {
3747     // With AVX2, use 32-byte vector compare
3748     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
3749
3750     // Compare 32-byte vectors
3751     andl(result, 0x0000001f); // tail count (in bytes)
3752     andl(limit, 0xffffffe0);  // vector count (in bytes)
3753     jcc(Assembler::zero, COMPARE_TAIL);
3754
3755     lea(ary1, Address(ary1, limit, Address::times_1));
3756     lea(ary2, Address(ary2, limit, Address::times_1));
3757     negptr(limit);
3758
3759 #ifdef _LP64
3760     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3761       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
3762
3763       cmpl(limit, -64);
3764       jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3765
3766       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3767
3768       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
3769       evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
3770       kortestql(mask, mask);
3771       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
3772       addptr(limit, 64);  // update since we already compared at this addr
3773       cmpl(limit, -64);
3774       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3775
3776       // At this point we may still need to compare -limit+result bytes.
3777       // We could execute the next two instructions and just continue via non-wide path:
3778       //  cmpl(limit, 0);
3779       //  jcc(Assembler::equal, COMPARE_TAIL);  // true
3780       // But since we stopped at the points ary{1,2}+limit, which are
3781       // no farther than 64 bytes from the array ends ary{1,2}+result
3782       // (|limit| <= 32 and result < 32),
3783       // we may just compare the last 64 bytes.
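// (this reads the 64 bytes ending exactly at the array ends, possibly overlapping bytes
// that were already compared, which is harmless)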
3784 // 3785 addptr(result, -64); // it is safe, bc we just came from this area 3786 evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit); 3787 evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit); 3788 kortestql(mask, mask); 3789 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 3790 3791 jmp(TRUE_LABEL); 3792 3793 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3794 3795 }//if (VM_Version::supports_avx512vlbw()) 3796 #endif //_LP64 3797 bind(COMPARE_WIDE_VECTORS); 3798 vmovdqu(vec1, Address(ary1, limit, Address::times_1)); 3799 vmovdqu(vec2, Address(ary2, limit, Address::times_1)); 3800 vpxor(vec1, vec2); 3801 3802 vptest(vec1, vec1); 3803 jcc(Assembler::notZero, FALSE_LABEL); 3804 addptr(limit, 32); 3805 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 3806 3807 testl(result, result); 3808 jcc(Assembler::zero, TRUE_LABEL); 3809 3810 vmovdqu(vec1, Address(ary1, result, Address::times_1, -32)); 3811 vmovdqu(vec2, Address(ary2, result, Address::times_1, -32)); 3812 vpxor(vec1, vec2); 3813 3814 vptest(vec1, vec1); 3815 jccb(Assembler::notZero, FALSE_LABEL); 3816 jmpb(TRUE_LABEL); 3817 3818 bind(COMPARE_TAIL); // limit is zero 3819 movl(limit, result); 3820 // Fallthru to tail compare 3821 } else if (UseSSE42Intrinsics) { 3822 // With SSE4.2, use double quad vector compare 3823 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 3824 3825 // Compare 16-byte vectors 3826 andl(result, 0x0000000f); // tail count (in bytes) 3827 andl(limit, 0xfffffff0); // vector count (in bytes) 3828 jcc(Assembler::zero, COMPARE_TAIL); 3829 3830 lea(ary1, Address(ary1, limit, Address::times_1)); 3831 lea(ary2, Address(ary2, limit, Address::times_1)); 3832 negptr(limit); 3833 3834 bind(COMPARE_WIDE_VECTORS); 3835 movdqu(vec1, Address(ary1, limit, Address::times_1)); 3836 movdqu(vec2, Address(ary2, limit, Address::times_1)); 3837 pxor(vec1, vec2); 3838 3839 ptest(vec1, vec1); 3840 jcc(Assembler::notZero, FALSE_LABEL); 3841 addptr(limit, 16); 3842 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 3843 3844 testl(result, result); 3845 jcc(Assembler::zero, TRUE_LABEL); 3846 3847 movdqu(vec1, Address(ary1, result, Address::times_1, -16)); 3848 movdqu(vec2, Address(ary2, result, Address::times_1, -16)); 3849 pxor(vec1, vec2); 3850 3851 ptest(vec1, vec1); 3852 jccb(Assembler::notZero, FALSE_LABEL); 3853 jmpb(TRUE_LABEL); 3854 3855 bind(COMPARE_TAIL); // limit is zero 3856 movl(limit, result); 3857 // Fallthru to tail compare 3858 } 3859 3860 // Compare 4-byte vectors 3861 andl(limit, 0xfffffffc); // vector count (in bytes) 3862 jccb(Assembler::zero, COMPARE_CHAR); 3863 3864 lea(ary1, Address(ary1, limit, Address::times_1)); 3865 lea(ary2, Address(ary2, limit, Address::times_1)); 3866 negptr(limit); 3867 3868 bind(COMPARE_VECTORS); 3869 movl(chr, Address(ary1, limit, Address::times_1)); 3870 cmpl(chr, Address(ary2, limit, Address::times_1)); 3871 jccb(Assembler::notEqual, FALSE_LABEL); 3872 addptr(limit, 4); 3873 jcc(Assembler::notZero, COMPARE_VECTORS); 3874 3875 // Compare trailing char (final 2 bytes), if any 3876 bind(COMPARE_CHAR); 3877 testl(result, 0x2); // tail char 3878 jccb(Assembler::zero, COMPARE_BYTE); 3879 load_unsigned_short(chr, Address(ary1, 0)); 3880 load_unsigned_short(limit, Address(ary2, 0)); 3881 cmpl(chr, limit); 3882 jccb(Assembler::notEqual, FALSE_LABEL); 3883 3884 if (is_array_equ && is_char) { 3885 bind(COMPARE_BYTE); 3886 } else { 3887 lea(ary1, Address(ary1, 2)); 3888 lea(ary2, Address(ary2, 2)); 3889 3890 bind(COMPARE_BYTE); 3891 testl(result, 0x1); 
// tail byte
3892     jccb(Assembler::zero, TRUE_LABEL);
3893     load_unsigned_byte(chr, Address(ary1, 0));
3894     load_unsigned_byte(limit, Address(ary2, 0));
3895     cmpl(chr, limit);
3896     jccb(Assembler::notEqual, FALSE_LABEL);
3897   }
3898   bind(TRUE_LABEL);
3899   movl(result, 1);   // return true
3900   jmpb(DONE);
3901
3902   bind(FALSE_LABEL);
3903   xorl(result, result); // return false
3904
3905   // That's it
3906   bind(DONE);
3907   if (UseAVX >= 2) {
3908     // clean upper bits of YMM registers
3909     vpxor(vec1, vec1);
3910     vpxor(vec2, vec2);
3911   }
3912 }
3913
3914 #ifdef _LP64
3915 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
3916                                               Register tmp, KRegister ktmp, int masklen, int vec_enc) {
3917   assert(VM_Version::supports_avx512vlbw(), "");
3918   vpxor(xtmp, xtmp, xtmp, vec_enc);
3919   vpsubb(xtmp, xtmp, mask, vec_enc);
3920   evpmovb2m(ktmp, xtmp, vec_enc);
3921   kmovql(tmp, ktmp);
3922   switch(opc) {
3923     case Op_VectorMaskTrueCount:
3924       popcntq(dst, tmp);
3925       break;
3926     case Op_VectorMaskLastTrue:
3927       mov64(dst, -1);
3928       bsrq(tmp, tmp);
3929       cmov(Assembler::notZero, dst, tmp);
3930       break;
3931     case Op_VectorMaskFirstTrue:
3932       mov64(dst, masklen);
3933       bsfq(tmp, tmp);
3934       cmov(Assembler::notZero, dst, tmp);
3935       break;
3936     default: assert(false, "Unhandled mask operation");
3937   }
3938 }
3939
3940 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
3941                                               XMMRegister xtmp1, Register tmp, int masklen, int vec_enc) {
3942   assert(VM_Version::supports_avx(), "");
3943   vpxor(xtmp, xtmp, xtmp, vec_enc);
3944   vpsubb(xtmp, xtmp, mask, vec_enc);
3945   vpmovmskb(tmp, xtmp, vec_enc);
3946   if (masklen < 64) {
3947     andq(tmp, (((jlong)1 << masklen) - 1));
3948   }
3949   switch(opc) {
3950     case Op_VectorMaskTrueCount:
3951       popcntq(dst, tmp);
3952       break;
3953     case Op_VectorMaskLastTrue:
3954       mov64(dst, -1);
3955       bsrq(tmp, tmp);
3956       cmov(Assembler::notZero, dst, tmp);
3957       break;
3958     case Op_VectorMaskFirstTrue:
3959       mov64(dst, masklen);
3960       bsfq(tmp, tmp);
3961       cmov(Assembler::notZero, dst, tmp);
3962       break;
3963     default: assert(false, "Unhandled mask operation");
3964   }
3965 }
3966 #endif
3967
3968 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
3969                                         XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
3970                                         int vlen_enc) {
3971   assert(VM_Version::supports_avx512bw(), "");
3972   // Byte shuffles are in-lane operations and indices are determined using
3973   // the lower 4 bits of each shuffle lane, thus all shuffle indices are
3974   // effectively normalized to the index range 0-15. This makes sure that
3975   // indices which are equal modulo 16 map to the same relative position in a
3976   // 128-bit lane, e.g. the elements corresponding to shuffle indices 16, 32 and 48
3977   // each select the first element of their respective 128-bit source lanes.
3978   movl(rtmp, 16);
3979   evpbroadcastb(xtmp1, rtmp, vlen_enc);
3980
3981   // Compute a mask for the shuffle vector by comparing indices with the expression INDEX < 16,
3982   // broadcast the first 128-bit lane across the entire vector, shuffle the vector lanes using
3983   // the original shuffle indices and move the shuffled lanes corresponding to true
3984   // mask bits to the destination vector.
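// Lanes whose shuffle index is in [0, 16) select their byte from the first 128-bit lane of src.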
3985 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc); 3986 evshufi64x2(xtmp2, src, src, 0x0, vlen_enc); 3987 evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc); 3988 3989 // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32 3990 // and broadcasting second 128 bit lane. 3991 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc); 3992 vpsllq(xtmp2, xtmp1, 0x1, vlen_enc); 3993 evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc); 3994 evshufi64x2(xtmp3, src, src, 0x55, vlen_enc); 3995 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc); 3996 3997 // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48 3998 // and broadcasting third 128 bit lane. 3999 evpcmpb(ktmp, k0, shuffle, xtmp2, Assembler::nlt, true, vlen_enc); 4000 vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc); 4001 evpcmpb(ktmp, ktmp, shuffle, xtmp1, Assembler::lt, true, vlen_enc); 4002 evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc); 4003 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc); 4004 4005 // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64 4006 // and broadcasting third 128 bit lane. 4007 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc); 4008 vpsllq(xtmp2, xtmp2, 0x1, vlen_enc); 4009 evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc); 4010 evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc); 4011 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc); 4012 }
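// For reference, a sketch of the intended semantics (not an actual JDK method; it assumes
// every shuffle index is in the range 0..63 for a 64-byte vector):
//
//   byte[] rearrange(byte[] src, byte[] shuffle) {
//     byte[] dst = new byte[src.length];
//     for (int i = 0; i < src.length; i++) {
//       // The code above derives the source 128-bit lane from bits 4-5 of the index
//       // and the in-lane byte position from bits 0-3.
//       dst[i] = src[shuffle[i]];
//     }
//     return dst;
//   }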