/*
 * Copyright (c) 2020, 2021, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "oops/methodData.hpp"
#include "opto/c2_CodeStubs.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/opcodes.hpp"
#include "opto/output.hpp"
#include "opto/subnode.hpp"
#include "runtime/biasedLocking.hpp"
#include "runtime/globals.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/globalDefinitions.hpp"
#include "utilities/powerOfTwo.hpp"
#include "utilities/sizes.hpp"

inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
  switch (vlen_in_bytes) {
    case  4: // fall-through
    case  8: // fall-through
    case 16: return Assembler::AVX_128bit;
    case 32: return Assembler::AVX_256bit;
    case 64: return Assembler::AVX_512bit;

    default: {
      ShouldNotReachHere();
      return Assembler::AVX_NoVec;
    }
  }
}

void C2_MacroAssembler::setvectmask(Register dst, Register src, KRegister mask) {
  guarantee(PostLoopMultiversioning, "must be");
  Assembler::movl(dst, 1);
  Assembler::shlxl(dst, dst, src);
  Assembler::decl(dst);
  Assembler::kmovdl(mask, dst);
  Assembler::movl(dst, src);
}

void C2_MacroAssembler::restorevectmask(KRegister mask) {
  guarantee(PostLoopMultiversioning, "must be");
  Assembler::knotwl(mask, k0);
}

#if INCLUDE_RTM_OPT

// Update rtm_counters based on abort status
// input: abort_status
//        rtm_counters (RTMLockingCounters*)
// flags are killed
void C2_MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {

  atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
  if (PrintPreciseRTMLockingStatistics) {
    for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
      Label check_abort;
      testl(abort_status, (1<<i));
      jccb(Assembler::equal, check_abort);
      atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
      bind(check_abort);
    }
  }
}

// Branch if (random & (count-1) != 0), count is 2^n
// tmp, scr and flags are killed
void C2_MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
  assert(tmp == rax, "");
  assert(scr == rdx, "");
  rdtsc(); // modifies EDX:EAX
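  // The low-order bits of the time-stamp counter serve as a cheap pseudo-random
  // value: we branch to brLabel unless (tsc & (count-1)) == 0, so the
  // fall-through (counter update) path runs roughly once every 'count' calls.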
  andptr(tmp, count-1);
  jccb(Assembler::notZero, brLabel);
}

// Perform abort ratio calculation, set no_rtm bit if high ratio
// input:  rtm_counters_Reg (RTMLockingCounters* address)
// tmpReg, rtm_counters_Reg and flags are killed
void C2_MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
                                                    Register rtm_counters_Reg,
                                                    RTMLockingCounters* rtm_counters,
                                                    Metadata* method_data) {
  Label L_done, L_check_always_rtm1, L_check_always_rtm2;

  if (RTMLockingCalculationDelay > 0) {
    // Delay calculation
    movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()), tmpReg);
    testptr(tmpReg, tmpReg);
    jccb(Assembler::equal, L_done);
  }
  // Abort ratio calculation only if abort_count > RTMAbortThreshold
  //   Aborted transactions = abort_count * 100
  //   All transactions = total_count *  RTMTotalCountIncrRate
  //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)

  movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
  cmpptr(tmpReg, RTMAbortThreshold);
  jccb(Assembler::below, L_check_always_rtm2);
  imulptr(tmpReg, tmpReg, 100);

  Register scrReg = rtm_counters_Reg;
  movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
  imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
  imulptr(scrReg, scrReg, RTMAbortRatio);
  cmpptr(tmpReg, scrReg);
  jccb(Assembler::below, L_check_always_rtm1);
  if (method_data != NULL) {
    // set rtm_state to "no rtm" in MDO
    mov_metadata(tmpReg, method_data);
    lock();
    orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), NoRTM);
  }
  jmpb(L_done);

  bind(L_check_always_rtm1);
  // Reload RTMLockingCounters* address
  lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
  bind(L_check_always_rtm2);
  movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
  cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
  jccb(Assembler::below, L_done);
  if (method_data != NULL) {
    // set rtm_state to "always rtm" in MDO
    mov_metadata(tmpReg, method_data);
    lock();
    orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), UseRTM);
  }
  bind(L_done);
}

// Update counters and perform abort ratio calculation
// input:  abort_status_Reg
// rtm_counters_Reg, flags are killed
void C2_MacroAssembler::rtm_profiling(Register abort_status_Reg,
                                      Register rtm_counters_Reg,
                                      RTMLockingCounters* rtm_counters,
                                      Metadata* method_data,
                                      bool profile_rtm) {

  assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
  // update rtm counters based on rax value at abort
  // reads abort_status_Reg, updates flags
  lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
  rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
  if (profile_rtm) {
    // Save abort status because abort_status_Reg is used by following code.
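    // The ratio calculation below effectively does:
    //   if (abort_count * 100 >= total_count * RTMTotalCountIncrRate * RTMAbortRatio)
    //     MDO->rtm_state = NoRTM;
    // e.g. with RTMAbortRatio=50, RTM is disabled for the site once at least
    // half of the estimated transactions have aborted.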
    if (RTMRetryCount > 0) {
      push(abort_status_Reg);
    }
    assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
    rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
    // restore abort status
    if (RTMRetryCount > 0) {
      pop(abort_status_Reg);
    }
  }
}

// Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
// inputs: retry_count_Reg
//       : abort_status_Reg
// output: retry_count_Reg decremented by 1
// flags are killed
void C2_MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
  Label doneRetry;
  assert(abort_status_Reg == rax, "");
  // The abort reason bits are in eax (see all states in rtmLocking.hpp)
  // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
  // if reason is in 0x6 and retry count != 0 then retry
  andptr(abort_status_Reg, 0x6);
  jccb(Assembler::zero, doneRetry);
  testl(retry_count_Reg, retry_count_Reg);
  jccb(Assembler::zero, doneRetry);
  pause();
  decrementl(retry_count_Reg);
  jmp(retryLabel);
  bind(doneRetry);
}

// Spin and retry if lock is busy,
// inputs: box_Reg (monitor address)
//       : retry_count_Reg
// output: retry_count_Reg decremented by 1
//       : clear z flag if retry count exceeded
// tmp_Reg, scr_Reg, flags are killed
void C2_MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
                                               Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
  Label SpinLoop, SpinExit, doneRetry;
  int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);

  testl(retry_count_Reg, retry_count_Reg);
  jccb(Assembler::zero, doneRetry);
  decrementl(retry_count_Reg);
  movptr(scr_Reg, RTMSpinLoopCount);

  bind(SpinLoop);
  pause();
  decrementl(scr_Reg);
  jccb(Assembler::lessEqual, SpinExit);
  movptr(tmp_Reg, Address(box_Reg, owner_offset));
  testptr(tmp_Reg, tmp_Reg);
  jccb(Assembler::notZero, SpinLoop);

  bind(SpinExit);
  jmp(retryLabel);
  bind(doneRetry);
  incrementl(retry_count_Reg); // clear z flag
}

// Use RTM for normal stack locks
// Input: objReg (object to lock)
void C2_MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
                                          Register retry_on_abort_count_Reg,
                                          RTMLockingCounters* stack_rtm_counters,
                                          Metadata* method_data, bool profile_rtm,
                                          Label& DONE_LABEL, Label& IsInflated) {
  assert(UseRTMForStackLocks, "why call this otherwise?");
  assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
  assert(tmpReg == rax, "");
  assert(scrReg == rdx, "");
  Label L_rtm_retry, L_decrement_retry, L_on_abort;

  if (RTMRetryCount > 0) {
    movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
    bind(L_rtm_retry);
  }
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
  testptr(tmpReg, markWord::monitor_value);  // inflated vs stack-locked|neutral|biased
  jcc(Assembler::notZero, IsInflated);

  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    Label L_noincrement;
    if (RTMTotalCountIncrRate > 1) {
      // tmpReg, scrReg and flags are killed
      branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
    }
    assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
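    // Bump the shared total_count (sampled at RTMTotalCountIncrRate); scrReg is
    // only a scratch register for the atomic increment.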
    atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
    bind(L_noincrement);
  }
  xbegin(L_on_abort);
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));  // fetch markword
  andptr(tmpReg, markWord::biased_lock_mask_in_place);               // look at 3 lock bits
  cmpptr(tmpReg, markWord::unlocked_value);                          // bits = 001 unlocked
  jcc(Assembler::equal, DONE_LABEL);                                 // all done if unlocked

  Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
  if (UseRTMXendForLockBusy) {
    xend();
    movptr(abort_status_Reg, 0x2);   // Set the abort status to 2 (so we can retry)
    jmp(L_decrement_retry);
  }
  else {
    xabort(0);
  }
  bind(L_on_abort);
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
  }
  bind(L_decrement_retry);
  if (RTMRetryCount > 0) {
    // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
  }
}

// Use RTM for inflating locks
// inputs: objReg (object to lock)
//         boxReg (on-stack box address (displaced header location) - KILLED)
//         tmpReg (ObjectMonitor address + markWord::monitor_value)
void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
                                             Register scrReg, Register retry_on_busy_count_Reg,
                                             Register retry_on_abort_count_Reg,
                                             RTMLockingCounters* rtm_counters,
                                             Metadata* method_data, bool profile_rtm,
                                             Label& DONE_LABEL) {
  assert(UseRTMLocking, "why call this otherwise?");
  assert(tmpReg == rax, "");
  assert(scrReg == rdx, "");
  Label L_rtm_retry, L_decrement_retry, L_on_abort;
  int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);

  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
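  // (The int32_t store is sign-extended in place, so no scratch register is
  // needed to materialize the unused_mark constant.)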
  movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
  movptr(boxReg, tmpReg); // Save ObjectMonitor address

  if (RTMRetryCount > 0) {
    movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
    movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
    bind(L_rtm_retry);
  }
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    Label L_noincrement;
    if (RTMTotalCountIncrRate > 1) {
      // tmpReg, scrReg and flags are killed
      branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
    }
    assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
    atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
    bind(L_noincrement);
  }
  xbegin(L_on_abort);
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
  movptr(tmpReg, Address(tmpReg, owner_offset));
  testptr(tmpReg, tmpReg);
  jcc(Assembler::zero, DONE_LABEL);
  if (UseRTMXendForLockBusy) {
    xend();
    jmp(L_decrement_retry);
  }
  else {
    xabort(0);
  }
  bind(L_on_abort);
  Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
  }
  if (RTMRetryCount > 0) {
    // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
  }

  movptr(tmpReg, Address(boxReg, owner_offset));
  testptr(tmpReg, tmpReg);
  jccb(Assembler::notZero, L_decrement_retry);

  // Appears unlocked - try to swing _owner from null to non-null.
  // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
#ifdef _LP64
  Register threadReg = r15_thread;
#else
  get_thread(scrReg);
  Register threadReg = scrReg;
#endif
  lock();
  cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg

  if (RTMRetryCount > 0) {
    // success done else retry
    jccb(Assembler::equal, DONE_LABEL);
    bind(L_decrement_retry);
    // Spin and retry if lock is busy.
    rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
  }
  else {
    bind(L_decrement_retry);
  }
}

#endif //  INCLUDE_RTM_OPT

// fast_lock and fast_unlock used by C2

// Because the transitions from emitted code to the runtime
// monitorenter/exit helper stubs are so slow it's critical that
// we inline both the stack-locking fast path and the inflated fast path.
//
// See also: cmpFastLock and cmpFastUnlock.
//
// What follows is a specialized inline transliteration of the code
// in enter() and exit(). If we're concerned about I$ bloat another
// option would be to emit TrySlowEnter and TrySlowExit methods
// at startup-time.  These methods would accept arguments as
// (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
// indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
// In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
//
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods.  That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
// to park() or unpark() threads.  We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) explicit barriers or fence operations.
//
// TODO:
//
// *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
//    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
//    Given TLAB allocation, Self is usually manifested in a register, so passing it into
//    the lock operators would typically be faster than reifying Self.
//
// *  Ideally I'd define the primitives as:
//       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
//       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
//    Unfortunately ADLC bugs prevent us from expressing the ideal form.
//    Instead, we're stuck with the rather awkward and brittle register assignments below.
//    Furthermore the register assignments are overconstrained, possibly resulting in
//    sub-optimal code near the synchronization site.
//
// *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
//    Alternately, use a better sp-proximity test.
//
// *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
//    Either one is sufficient to uniquely identify a thread.
//    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
//
// *  Intrinsify notify() and notifyAll() for the common cases where the
//    object is locked by the calling thread but the waitlist is empty.
//    Avoid the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
//
// *  Use jccb and jmpb instead of jcc and jmp to improve code density.
//    But beware of excessive branch density on AMD Opterons.
//
// *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
//    or failure of the fast path.  If the fast path fails then we pass
//    control to the slow path, typically in C.  In fast_lock and
//    fast_unlock we often branch to DONE_LABEL, just to find that C2
//    will emit a conditional branch immediately after the node.
//    So we have branches to branches and lots of ICC.ZF games.
//    Instead, it might be better to have C2 pass a "FailureLabel"
//    into fast_lock and fast_unlock.  In the case of success, control
//    will drop through the node.  ICC.ZF is undefined at exit.
//    In the case of failure, the node will branch directly to the
//    FailureLabel.


// obj: object to lock
// box: on-stack box address (displaced header location) - KILLED
// rax: tmp -- KILLED
// scr: tmp -- KILLED
void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
                                  Register scrReg, Register cx1Reg, Register cx2Reg, Register thread,
                                  BiasedLockingCounters* counters,
                                  RTMLockingCounters* rtm_counters,
                                  RTMLockingCounters* stack_rtm_counters,
                                  Metadata* method_data,
                                  bool use_rtm, bool profile_rtm) {
  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
  // Ensure the register assignments are disjoint
  assert(tmpReg == rax, "");

  if (use_rtm) {
    assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
  } else {
    assert(cx2Reg == noreg, "");
    assert_different_registers(objReg, boxReg, tmpReg, scrReg);
  }

  if (counters != NULL) {
    atomic_incl(ExternalAddress((address)counters->total_entry_count_addr()), scrReg);
  }

  // Possible cases that we'll encounter in fast_lock
  // ------------------------------------------------
  // * Inflated
  //    -- unlocked
  //    -- Locked
  //       = by self
  //       = by other
  // * biased
  //    -- by Self
  //    -- by other
  // * neutral
  // * stack-locked
  //    -- by self
  //       = sp-proximity test hits
  //       = sp-proximity test generates false-negative
  //    -- by other
  //

  Label IsInflated, DONE_LABEL;

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(tmpReg, objReg, cx1Reg);
    movl(tmpReg, Address(tmpReg, Klass::access_flags_offset()));
    testl(tmpReg, JVM_ACC_IS_VALUE_BASED_CLASS);
    jcc(Assembler::notZero, DONE_LABEL);
  }

  // it's stack-locked, biased or neutral
  // TODO: optimize away redundant LDs of obj->mark and improve the markword triage
  // order to reduce the number of conditional branches in the most common cases.
  // Beware -- there's a subtle invariant that fetch of the markword
  // at [FETCH], below, will never observe a biased encoding (*101b).
  // If this invariant is not held we risk exclusion (safety) failure.
  if (UseBiasedLocking && !UseOptoBiasInlining) {
    biased_locking_enter(boxReg, objReg, tmpReg, scrReg, cx1Reg, false, DONE_LABEL, NULL, counters);
  }

#if INCLUDE_RTM_OPT
  if (UseRTMForStackLocks && use_rtm) {
    rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
                      stack_rtm_counters, method_data, profile_rtm,
                      DONE_LABEL, IsInflated);
  }
#endif // INCLUDE_RTM_OPT

  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));  // [FETCH]
  testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral|biased
  jcc(Assembler::notZero, IsInflated);

  if (LockingMode == LM_MONITOR) {
    // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
    testptr(objReg, objReg);
  } else {
    assert(LockingMode == LM_LEGACY, "must be");
    // Attempt stack-locking ...
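    // In effect the fast path below performs:
    //   expected = mark | unlocked_value;        // markword as if unlocked
    //   box->displaced_header = expected;        // anticipate success
    //   CAS(&obj->mark, expected, box);          // ZF=1 on success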
    orptr (tmpReg, markWord::unlocked_value);
    movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
    lock();
    cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Updates tmpReg
    if (counters != NULL) {
      cond_inc32(Assembler::equal,
                 ExternalAddress((address)counters->fast_path_entry_count_addr()));
    }
    jcc(Assembler::equal, DONE_LABEL);           // Success

    // Recursive locking.
    // The object is stack-locked: markword contains stack pointer to BasicLock.
    // Locked by current thread if difference with current SP is less than one page.
    subptr(tmpReg, rsp);
    // Next instruction sets ZFlag == 1 (Success) if difference is less than one page.
    andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) );
    movptr(Address(boxReg, 0), tmpReg);
    if (counters != NULL) {
      cond_inc32(Assembler::equal,
                 ExternalAddress((address)counters->fast_path_entry_count_addr()));
    }
  }
  jmp(DONE_LABEL);

  bind(IsInflated);
  // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value

#if INCLUDE_RTM_OPT
  // Use the same RTM locking code in 32- and 64-bit VM.
  if (use_rtm) {
    rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
                         rtm_counters, method_data, profile_rtm, DONE_LABEL);
  } else {
#endif // INCLUDE_RTM_OPT

#ifndef _LP64
  // The object is inflated.

  // boxReg refers to the on-stack BasicLock in the current frame.
  // We'd like to write:
  //   set box->_displaced_header = markWord::unused_mark().  Any non-0 value suffices.
  // This is convenient but results in a ST-before-CAS penalty.  The following CAS suffers
  // additional latency as we have another ST in the store buffer that must drain.

  // avoid ST-before-CAS
  // register juggle because we need tmpReg for cmpxchgptr below
  movptr(scrReg, boxReg);
  movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]

  // Optimistic form: consider XORL tmpReg,tmpReg
  movptr(tmpReg, NULL_WORD);

  // Appears unlocked - try to swing _owner from null to non-null.
  // Ideally, I'd manifest "Self" with get_thread and then attempt
  // to CAS the register containing Self into m->Owner.
  // But we don't have enough registers, so instead we can either try to CAS
  // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
  // we later store "Self" into m->Owner.  Transiently storing a stack address
  // (rsp or the address of the box) into m->owner is harmless.
  // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
  lock();
  cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  movptr(Address(scrReg, 0), 3);            // box->_displaced_header = 3
  // If we weren't able to swing _owner from NULL to the BasicLock
  // then take the slow path.
  jccb  (Assembler::notZero, DONE_LABEL);
  // update _owner from BasicLock to thread
  get_thread (scrReg);                      // beware: clobbers ICCs
  movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
  xorptr(boxReg, boxReg);                   // set icc.ZFlag = 1 to indicate success

  // If the CAS fails we can either retry or pass control to the slow path.
  // We use the latter tactic.
  // Pass the CAS result in the icc.ZFlag into DONE_LABEL
  // If the CAS was successful ...
  //   Self has acquired the lock
  //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
  // Intentional fall-through into DONE_LABEL ...
#else // _LP64
  // It's inflated and we use scrReg for ObjectMonitor* in this section.
  movq(scrReg, tmpReg);
  xorq(tmpReg, tmpReg);
  lock();
  cmpxchgptr(r15_thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // Unconditionally set box->_displaced_header = markWord::unused_mark().
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
  // Propagate ICC.ZF from CAS above into DONE_LABEL.
  jcc(Assembler::equal, DONE_LABEL);    // CAS above succeeded; propagate ZF = 1 (success)

  cmpptr(r15_thread, rax);              // Check if we are already the owner (recursive lock)
  jcc(Assembler::notEqual, DONE_LABEL); // If not recursive, ZF = 0 at this point (fail)
  incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
#endif // _LP64
#if INCLUDE_RTM_OPT
  } // use_rtm()
#endif
  // DONE_LABEL is a hot target - we'd really like to place it at the
  // start of cache line by padding with NOPs.
  // See the AMD and Intel software optimization manuals for the
  // most efficient "long" NOP encodings.
  // Unfortunately none of our alignment mechanisms suffice.
  bind(DONE_LABEL);

  // At DONE_LABEL the icc ZFlag is set as follows ...
  // fast_unlock uses the same protocol.
  // ZFlag == 1 -> Success
  // ZFlag == 0 -> Failure - force control through the slow path
}

// obj: object to unlock
// box: box address (displaced header location), killed.  Must be EAX.
// tmp: killed, cannot be obj nor box.
//
// Some commentary on balanced locking:
//
// fast_lock and fast_unlock are emitted only for provably balanced lock sites.
// Methods that don't have provably balanced locking are forced to run in the
// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
// The interpreter provides two properties:
// I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired by the current activation (frame).  Recall that the
//      interpreter maintains an on-stack list of locks currently held by
//      a frame.
// I2:  If a method attempts to unlock an object that is not held by the
//      frame the interpreter throws IMSX.
//
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
// B() doesn't have provably balanced locking so it runs in the interpreter.
// Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
// is still locked by A().
//
// The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
// Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
// should not be unlocked by "normal" java-level locking and vice-versa.  The specification
// doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably given that the spec legislates the JNI case as undefined our implementation
// could reasonably *avoid* checking owner in fast_unlock().
// In the interest of performance we elide m->Owner==Self check in unlock.
// A perfectly viable alternative is to elide the owner check except when
// Xcheck:jni is enabled.

void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
  assert(boxReg == rax, "");
  assert_different_registers(objReg, boxReg, tmpReg);

  Label DONE_LABEL, Stacked, CheckSucc;

  // Critically, the biased locking test must have precedence over
  // and appear before the (box->dhw == 0) recursive stack-lock test.
  if (UseBiasedLocking && !UseOptoBiasInlining) {
    biased_locking_exit(objReg, tmpReg, DONE_LABEL);
  }

#if INCLUDE_RTM_OPT
  if (UseRTMForStackLocks && use_rtm) {
    assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
    Label L_regular_unlock;
    movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
    andptr(tmpReg, markWord::biased_lock_mask_in_place);              // look at 3 lock bits
    cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 001 unlocked
    jccb(Assembler::notEqual, L_regular_unlock);                      // if !HLE RegularLock
    xend();                                                           // otherwise end...
    jmp(DONE_LABEL);                                                  // ... and we're done
    bind(L_regular_unlock);
  }
#endif

  if (LockingMode == LM_LEGACY) {
    cmpptr(Address(boxReg, 0), (int32_t)NULL_WORD);                   // Examine the displaced header
    jcc   (Assembler::zero, DONE_LABEL);                              // 0 indicates recursive stack-lock
  }
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));   // Examine the object's markword
  if (LockingMode != LM_MONITOR) {
    testptr(tmpReg, markWord::monitor_value);                         // Inflated?
    jcc(Assembler::zero, Stacked);
  }

  // It's inflated.

#if INCLUDE_RTM_OPT
  if (use_rtm) {
    Label L_regular_inflated_unlock;
    int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
    movptr(boxReg, Address(tmpReg, owner_offset));
    testptr(boxReg, boxReg);
    jccb(Assembler::notZero, L_regular_inflated_unlock);
    xend();
    jmp(DONE_LABEL);
    bind(L_regular_inflated_unlock);
  }
#endif

  // Despite our balanced locking property we still check that m->_owner == Self
  // as java routines or native JNI code called by this thread might
  // have released the lock.
  // Refer to the comments in synchronizer.cpp for how we might encode extra
  // state in _succ so we can avoid fetching EntryList|cxq.
  //
  // If there's no contention try a 1-0 exit.  That is, exit without
  // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
  // we detect and recover from the race that the 1-0 exit admits.
  //
  // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
  // before it STs null into _owner, releasing the lock.  Updates
  // to data protected by the critical section must be visible before
  // we drop the lock (and thus before any other thread could acquire
  // the lock and observe the fields protected by the lock).
  // IA32's memory-model is SPO, so STs are ordered with respect to
  // each other and there's no need for an explicit barrier (fence).
  // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
#ifndef _LP64
  get_thread (boxReg);

  // Note that we could employ various encoding schemes to reduce
  // the number of loads below (currently 4) to just 2 or 3.
  // Refer to the comments in synchronizer.cpp.
  // In practice the chain of fetches doesn't seem to impact performance, however.
  xorptr(boxReg, boxReg);
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jccb  (Assembler::notZero, DONE_LABEL);
  movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
  jccb  (Assembler::notZero, DONE_LABEL);
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
  jmpb  (DONE_LABEL);

  // Intentional fall-thru into DONE_LABEL

  // DONE_LABEL is a hot target - we'd really like to place it at the
  // start of cache line by padding with NOPs.
  // See the AMD and Intel software optimization manuals for the
  // most efficient "long" NOP encodings.
  // Unfortunately none of our alignment mechanisms suffice.
  bind (CheckSucc);
#else // _LP64
  // It's inflated
  Label LNotRecursive, LSuccess, LGoSlowPath;

  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
  jccb(Assembler::equal, LNotRecursive);

  // Recursive inflated unlock
  decq(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jmpb(LSuccess);

  bind(LNotRecursive);
  movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
  jccb  (Assembler::notZero, CheckSucc);
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
  jmpb  (DONE_LABEL);

  // Try to avoid passing control into the slow_path ...
  bind (CheckSucc);

  // The following optional optimization can be elided if necessary
  // Effectively: if (succ == null) goto slow path
  // The code reduces the window for a race, however,
  // and thus benefits performance.
  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
  jccb  (Assembler::zero, LGoSlowPath);

  xorptr(boxReg, boxReg);
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);

  // Memory barrier/fence
  // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
  // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
  // This is faster on Nehalem and AMD Shanghai/Barcelona.
  // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
  // We might also restructure (ST Owner=0;barrier;LD _Succ) to
  // (mov box,0; xchgq box, &m->Owner; LD _succ) .
  lock(); addl(Address(rsp, 0), 0);

  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
  jccb  (Assembler::notZero, LSuccess);

  // Rare inopportune interleaving - race.
  // The successor vanished in the small window above.
  // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
  // We need to ensure progress and succession.
  // Try to reacquire the lock.
  // If that fails then the new owner is responsible for succession and this
  // thread needs to take no further action and can exit via the fast path (success).
  // If the re-acquire succeeds then pass control into the slow path.
  // As implemented, this latter mode is horrible because we generate more
  // coherence traffic on the lock *and* artificially extend the critical section
  // length by virtue of passing control into the slow path.

  // box is really RAX -- the following CMPXCHG depends on that binding
  // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
  lock();
  cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // There's no successor so we tried to regrab the lock.
  // If that didn't work, then another thread grabbed the
  // lock so we're done (and exit was a success).
  jccb  (Assembler::notEqual, LSuccess);
  // Intentional fall-through into slow path

  bind  (LGoSlowPath);
  orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
  jmpb  (DONE_LABEL);

  bind  (LSuccess);
  testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
  jmpb  (DONE_LABEL);

#endif
  if (LockingMode == LM_LEGACY) {
    bind  (Stacked);
    movptr(tmpReg, Address (boxReg, 0));      // re-fetch
    lock();
    cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
    // Intentional fall-thru into DONE_LABEL
  }

  bind(DONE_LABEL);
}

void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register rax_reg,
                                              Register t, Register thread) {
  assert(LockingMode == LM_LIGHTWEIGHT, "must be");
  assert(rax_reg == rax, "Used for CAS");
  assert_different_registers(obj, box, rax_reg, t, thread);

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. ZF value is irrelevant.
  Label locked;
  // Finish fast lock unsuccessfully. MUST jump with ZF == 0
  Label slow_path;

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(rax_reg, obj, t);
    movl(rax_reg, Address(rax_reg, Klass::access_flags_offset()));
    testl(rax_reg, JVM_ACC_IS_VALUE_BASED_CLASS);
    jcc(Assembler::notZero, slow_path);
  }

  const Register mark = t;

  { // Lightweight Lock

    Label push;

    const Register top = box;

    // Load the mark.
    movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));

    // Prefetch top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    // Check for monitor (0b10).
    testptr(mark, markWord::monitor_value);
    jcc(Assembler::notZero, inflated);

    // Check if lock-stack is full.
    cmpl(top, LockStack::end_offset() - 1);
    jcc(Assembler::greater, slow_path);

    // Check if recursive.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    jccb(Assembler::equal, push);

    // Try to lock. Transition lock bits 0b01 => 0b00
    movptr(rax_reg, mark);
    orptr(rax_reg, markWord::unlocked_value);
    andptr(mark, ~(int32_t)markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    jcc(Assembler::notEqual, slow_path);

    bind(push);
    // After successful lock, push object on lock-stack.
    movptr(Address(thread, top), obj);
    addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
    jmpb(locked);
  }

  { // Handle inflated monitor.
    bind(inflated);

    const Register tagged_monitor = mark;

    // CAS owner (null => current thread).
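    // rax is zeroed below so the implicit cmpxchg comparand is NULL; on success
    // the monitor's _owner field holds the current thread and ZF is set.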
    xorptr(rax_reg, rax_reg);
    lock(); cmpxchgptr(thread, Address(tagged_monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
    jccb(Assembler::equal, locked);

    // Check if recursive.
    cmpptr(thread, rax_reg);
    jccb(Assembler::notEqual, slow_path);

    // Recursive.
    increment(Address(tagged_monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  }

  bind(locked);
  // Set ZF = 1
  xorl(rax_reg, rax_reg);

#ifdef ASSERT
  // Check that locked label is reached with ZF set.
  Label zf_correct;
  jccb(Assembler::zero, zf_correct);
  stop("Fast Lock ZF != 1");
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with ZF not set.
  jccb(Assembler::notZero, zf_correct);
  stop("Fast Lock ZF != 0");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}

void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register reg_rax, Register t, Register thread) {
  assert(LockingMode == LM_LIGHTWEIGHT, "must be");
  assert(reg_rax == rax, "Used for CAS");
  assert_different_registers(obj, reg_rax, t);

  // Handle inflated monitor.
  Label inflated, inflated_check_lock_stack;
  // Finish fast unlock successfully. MUST jump with ZF == 1
  Label unlocked;

  const Register mark = t;
  const Register top = reg_rax;

  Label dummy;
  C2FastUnlockLightweightStub* stub = nullptr;

  if (!Compile::current()->output()->in_scratch_emit_size()) {
    stub = new (Compile::current()->comp_arena()) C2FastUnlockLightweightStub(obj, mark, reg_rax, thread);
    Compile::current()->output()->add_stub(stub);
  }

  Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();
  Label& check_successor = stub == nullptr ? dummy : stub->check_successor();

  { // Lightweight Unlock

    // Load top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    // Prefetch mark.
    movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));

    // Check if obj is top of lock-stack.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    // Top of lock stack was not obj. Must be monitor.
    jcc(Assembler::notEqual, inflated_check_lock_stack);

    // Pop lock-stack.
    DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
    subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);

    // Check if recursive.
    cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
    jcc(Assembler::equal, unlocked);

    // We elide the monitor check, let the CAS fail instead.

    // Try to unlock. Transition lock bits 0b00 => 0b01
    movptr(reg_rax, mark);
    andptr(reg_rax, ~(int32_t)markWord::lock_mask);
    orptr(mark, markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    jcc(Assembler::notEqual, push_and_slow_path);
    jmp(unlocked);
  }


  { // Handle inflated monitor.
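    // In debug builds, first walk the remaining lock-stack entries to verify
    // that obj is not stack-locked elsewhere, and that the mark really carries
    // the monitor bit, before handling the inflated monitor.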
    bind(inflated_check_lock_stack);
#ifdef ASSERT
    Label check_done;
    subl(top, oopSize);
    cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
    jcc(Assembler::below, check_done);
    cmpptr(obj, Address(thread, top));
    jccb(Assembler::notEqual, inflated_check_lock_stack);
    stop("Fast Unlock lock on stack");
    bind(check_done);
    testptr(mark, markWord::monitor_value);
    jccb(Assembler::notZero, inflated);
    stop("Fast Unlock not monitor");
#endif

    bind(inflated);

    // mark contains the tagged ObjectMonitor*.
    const Register monitor = mark;

#ifndef _LP64
    // Check if recursive.
    xorptr(reg_rax, reg_rax);
    orptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
    jcc(Assembler::notZero, check_successor);

    // Check if the entry lists are empty.
    movptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
    orptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
    jcc(Assembler::notZero, check_successor);

    // Release lock.
    movptr(Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
#else // _LP64
    Label recursive;

    // Check if recursive.
    cmpptr(Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
    jccb(Assembler::notEqual, recursive);

    // Check if the entry lists are empty.
    movptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
    orptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
    jcc(Assembler::notZero, check_successor);

    // Release lock.
    movptr(Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
    jmpb(unlocked);

    // Recursive unlock.
    bind(recursive);
    decrement(Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
    xorl(t, t);
#endif
  }

  bind(unlocked);
  if (stub != nullptr) {
    bind(stub->unlocked_continuation());
  }

#ifdef ASSERT
  // Check that unlocked label is reached with ZF set.
  Label zf_correct;
  jccb(Assembler::zero, zf_correct);
  stop("Fast Unlock ZF != 1");
#endif

  if (stub != nullptr) {
    bind(stub->slow_path_continuation());
  }
#ifdef ASSERT
  // Check that stub->continuation() label is reached with ZF not set.
  jccb(Assembler::notZero, zf_correct);
  stop("Fast Unlock ZF != 0");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}

//-------------------------------------------------------------------------------------------
// Generic instructions support for use in .ad files C2 code generation

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, Register scr) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVD) {
    andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), scr);
  } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), scr);
  }
}

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
  if (opcode == Op_AbsVD) {
    vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, scr);
  } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
    vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, scr);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, Register scr) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVF) {
    andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), scr);
  } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), scr);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
  if (opcode == Op_AbsVF) {
    vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, scr);
  } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
    vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, scr);
  }
}

void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
  assert(tmp == xnoreg || elem_bt == T_LONG, "unused");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      pminsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pminsw(dst, src);
    } else if (elem_bt == T_INT) {
      pminsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      movdqu(xmm0, dst);
      pcmpgtq(xmm0, src);
      blendvpd(dst, src);  // xmm0 as mask
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      pmaxsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pmaxsw(dst, src);
    } else if (elem_bt == T_INT) {
      pmaxsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      movdqu(xmm0, src);
      pcmpgtq(xmm0, dst);
      blendvpd(dst, src);  // xmm0 as mask
    }
  }
}

void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
                                 XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                 int vlen_enc) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      vpminsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpminsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpminsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpminsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src1, src2, dst, vlen_enc);
      }
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      vpmaxsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpmaxsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpmaxsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpmaxsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src2, src1, dst, vlen_enc);
      }
    }
  }
}

// Float/Double min max

void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
                                   XMMRegister dst, XMMRegister a, XMMRegister b,
                                   XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                   int vlen_enc) {
  assert(UseAVX > 0, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(a, b, tmp, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);

  if (!is_double_word && is_min) {
    vblendvps(atmp, a, b, a, vlen_enc);
    vblendvps(btmp, b, a, a, vlen_enc);
    vminps(tmp, atmp, btmp, vlen_enc);
    vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvps(dst, tmp, atmp, btmp, vlen_enc);
  } else if (!is_double_word && !is_min) {
    vblendvps(btmp, b, a, b, vlen_enc);
    vblendvps(atmp, a, b, b, vlen_enc);
    vmaxps(tmp, atmp, btmp, vlen_enc);
    vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvps(dst, tmp, atmp, btmp, vlen_enc);
  } else if (is_double_word && is_min) {
    vblendvpd(atmp, a, b, a, vlen_enc);
    vblendvpd(btmp, b, a, a, vlen_enc);
    vminpd(tmp, atmp, btmp, vlen_enc);
    vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
  } else {
    assert(is_double_word && !is_min, "sanity");
    vblendvpd(btmp, b, a, b, vlen_enc);
    vblendvpd(atmp, a, b, b, vlen_enc);
    vmaxpd(tmp, atmp, btmp, vlen_enc);
    vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
  }
}

void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
                                    XMMRegister dst, XMMRegister a, XMMRegister b,
                                    KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
                                    int vlen_enc) {
  assert(UseAVX > 2, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(dst, a, b, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);
  bool merge = true;

  if (!is_double_word && is_min) {
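    // Blend on the sign of 'a' so that -0.0/+0.0 are ordered correctly, take the
    // packed min, then use an unordered compare to propagate NaN lanes, giving
    // Java Math.min semantics (vminps alone returns the second operand for NaN
    // and ±0.0 ties, which does not match Java).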
    evpmovd2m(ktmp, a, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vminps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (!is_double_word && !is_min) {
    evpmovd2m(ktmp, b, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (is_double_word && is_min) {
    evpmovq2m(ktmp, a, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vminpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  } else {
    assert(is_double_word && !is_min, "sanity");
    evpmovq2m(ktmp, b, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  }
}

// Float/Double signum
void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst,
                                  XMMRegister zero, XMMRegister one,
                                  Register scratch) {
  assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");

  Label DONE_LABEL;

  if (opcode == Op_SignumF) {
    assert(UseSSE > 0, "required");
    ucomiss(dst, zero);
    jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
    jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
    movflt(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), scratch);
  } else if (opcode == Op_SignumD) {
    assert(UseSSE > 1, "required");
    ucomisd(dst, zero);
    jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
    jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
    movdbl(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), scratch);
  }

  bind(DONE_LABEL);
}

void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
  if (sign) {
    pmovsxbw(dst, src);
  } else {
    pmovzxbw(dst, src);
  }
}

void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbw(dst, src, vector_len);
  } else {
    vpmovzxbw(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbd(dst, src, vector_len);
  } else {
    vpmovzxbd(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxwd(dst, src, vector_len);
  } else {
    vpmovzxwd(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst,
                                     XMMRegister src, int shift, int vector_len) {
  if (opcode == Op_RotateLeftV) {
    if (etype == T_INT) {
      evprold(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprolq(dst, src, shift, vector_len);
    }
  } else {
    assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
    if (etype == T_INT) {
      evprord(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprorq(dst, src, shift, vector_len);
    }
  }
}

void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
                                     XMMRegister shift, int vector_len) {
  if (opcode == Op_RotateLeftV) {
    if (etype == T_INT) {
      evprolvd(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprolvq(dst, src, shift, vector_len);
    }
  } else {
    assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
    if (etype == T_INT) {
      evprorvd(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprorvq(dst, src, shift, vector_len);
    }
  }
}

void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
  if (opcode == Op_RShiftVI) {
    psrad(dst, shift);
  } else if (opcode == Op_LShiftVI) {
    pslld(dst, shift);
  } else {
    assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
    psrld(dst, shift);
  }
}

void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVI:  psrad(dst, shift); break;
    case Op_LShiftVI:  pslld(dst, shift); break;
    case Op_URShiftVI: psrld(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
  if (opcode == Op_RShiftVI) {
    vpsrad(dst, nds, shift, vector_len);
  } else if (opcode == Op_LShiftVI) {
    vpslld(dst, nds, shift, vector_len);
  } else {
    assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
    vpsrld(dst, nds, shift, vector_len);
  }
}

void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
    case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
    case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  psraw(dst, shift); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  psllw(dst, shift); break;

    case Op_URShiftVS: // fall-through
    case Op_URShiftVB: psrlw(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;
src, shift, vlen_enc); break;

    case Op_URShiftVS: // fall-through
    case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
    case Op_LShiftVL:  psllq(dst, shift); break;
    case Op_URShiftVL: psrlq(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
  if (opcode == Op_RShiftVL) {
    psrlq(dst, shift);  // using srl to implement sra on pre-avx512 systems
  } else if (opcode == Op_LShiftVL) {
    psllq(dst, shift);
  } else {
    assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
    psrlq(dst, shift);
  }
}

void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVL:  evpsraq(dst, src, shift, vlen_enc); break;
    case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
    case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
  if (opcode == Op_RShiftVL) {
    evpsraq(dst, nds, shift, vector_len);
  } else if (opcode == Op_LShiftVL) {
    vpsllq(dst, nds, shift, vector_len);
  } else {
    assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
    vpsrlq(dst, nds, shift, vector_len);
  }
}

void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  // fall-through
    case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  // fall-through
    case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;

    case Op_URShiftVB: // fall-through
    case Op_URShiftVS: // fall-through
    case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;

    case Op_URShiftVB: // fall-through
    case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
  assert(UseAVX >= 2, "required");
  switch (opcode) {
    case Op_RShiftVL: {
      if (UseAVX > 2) {
        assert(tmp == xnoreg, "not used");
        if (!VM_Version::supports_avx512vl()) {
          vlen_enc = Assembler::AVX_512bit;
        }
        evpsravq(dst, src, shift, vlen_enc);
      } else {
        vmovdqu(tmp,
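                // AVX2 has no variable arithmetic right shift for 64-bit lanes
                // (vpsravq is AVX-512 only), so emulate it: logically shift both
                // the operand and the sign-bit mask by the same shift counts,
                // then (x ^ m) - m sign-extends the bits that were shifted in.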
ExternalAddress(StubRoutines::x86::vector_long_sign_mask())); 1565 vpsrlvq(dst, src, shift, vlen_enc); 1566 vpsrlvq(tmp, tmp, shift, vlen_enc); 1567 vpxor(dst, dst, tmp, vlen_enc); 1568 vpsubq(dst, dst, tmp, vlen_enc); 1569 } 1570 break; 1571 } 1572 case Op_LShiftVL: { 1573 assert(tmp == xnoreg, "not used"); 1574 vpsllvq(dst, src, shift, vlen_enc); 1575 break; 1576 } 1577 case Op_URShiftVL: { 1578 assert(tmp == xnoreg, "not used"); 1579 vpsrlvq(dst, src, shift, vlen_enc); 1580 break; 1581 } 1582 default: assert(false, "%s", NodeClassNames[opcode]); 1583 } 1584 } 1585 1586 // Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst 1587 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch) { 1588 assert(opcode == Op_LShiftVB || 1589 opcode == Op_RShiftVB || 1590 opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]); 1591 bool sign = (opcode != Op_URShiftVB); 1592 assert(vector_len == 0, "required"); 1593 vextendbd(sign, dst, src, 1); 1594 vpmovzxbd(vtmp, shift, 1); 1595 varshiftd(opcode, dst, dst, vtmp, 1); 1596 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, scratch); 1597 vextracti128_high(vtmp, dst); 1598 vpackusdw(dst, dst, vtmp, 0); 1599 } 1600 1601 // Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst 1602 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch) { 1603 assert(opcode == Op_LShiftVB || 1604 opcode == Op_RShiftVB || 1605 opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]); 1606 bool sign = (opcode != Op_URShiftVB); 1607 int ext_vector_len = vector_len + 1; 1608 vextendbw(sign, dst, src, ext_vector_len); 1609 vpmovzxbw(vtmp, shift, ext_vector_len); 1610 varshiftw(opcode, dst, dst, vtmp, ext_vector_len); 1611 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, scratch); 1612 if (vector_len == 0) { 1613 vextracti128_high(vtmp, dst); 1614 vpackuswb(dst, dst, vtmp, vector_len); 1615 } else { 1616 vextracti64x4_high(vtmp, dst); 1617 vpackuswb(dst, dst, vtmp, vector_len); 1618 vpermq(dst, dst, 0xD8, vector_len); 1619 } 1620 } 1621 1622 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) { 1623 switch(typ) { 1624 case T_BYTE: 1625 pinsrb(dst, val, idx); 1626 break; 1627 case T_SHORT: 1628 pinsrw(dst, val, idx); 1629 break; 1630 case T_INT: 1631 pinsrd(dst, val, idx); 1632 break; 1633 case T_LONG: 1634 pinsrq(dst, val, idx); 1635 break; 1636 default: 1637 assert(false,"Should not reach here."); 1638 break; 1639 } 1640 } 1641 1642 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) { 1643 switch(typ) { 1644 case T_BYTE: 1645 vpinsrb(dst, src, val, idx); 1646 break; 1647 case T_SHORT: 1648 vpinsrw(dst, src, val, idx); 1649 break; 1650 case T_INT: 1651 vpinsrd(dst, src, val, idx); 1652 break; 1653 case T_LONG: 1654 vpinsrq(dst, src, val, idx); 1655 break; 1656 default: 1657 assert(false,"Should not reach here."); 1658 break; 1659 } 1660 } 1661 1662 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) { 1663 switch(typ) { 1664 case T_INT: 1665 vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len); 1666 break; 1667 case T_FLOAT: 1668 vgatherdps(dst, Address(base, idx, 
Address::times_4), mask, vector_len); 1669 break; 1670 case T_LONG: 1671 vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len); 1672 break; 1673 case T_DOUBLE: 1674 vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len); 1675 break; 1676 default: 1677 assert(false,"Should not reach here."); 1678 break; 1679 } 1680 } 1681 1682 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) { 1683 switch(typ) { 1684 case T_INT: 1685 evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len); 1686 break; 1687 case T_FLOAT: 1688 evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len); 1689 break; 1690 case T_LONG: 1691 evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len); 1692 break; 1693 case T_DOUBLE: 1694 evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len); 1695 break; 1696 default: 1697 assert(false,"Should not reach here."); 1698 break; 1699 } 1700 } 1701 1702 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) { 1703 switch(typ) { 1704 case T_INT: 1705 evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len); 1706 break; 1707 case T_FLOAT: 1708 evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len); 1709 break; 1710 case T_LONG: 1711 evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len); 1712 break; 1713 case T_DOUBLE: 1714 evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len); 1715 break; 1716 default: 1717 assert(false,"Should not reach here."); 1718 break; 1719 } 1720 } 1721 1722 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) { 1723 if (vlen_in_bytes <= 16) { 1724 pxor (dst, dst); 1725 psubb(dst, src); 1726 switch (elem_bt) { 1727 case T_BYTE: /* nothing to do */ break; 1728 case T_SHORT: pmovsxbw(dst, dst); break; 1729 case T_INT: pmovsxbd(dst, dst); break; 1730 case T_FLOAT: pmovsxbd(dst, dst); break; 1731 case T_LONG: pmovsxbq(dst, dst); break; 1732 case T_DOUBLE: pmovsxbq(dst, dst); break; 1733 1734 default: assert(false, "%s", type2name(elem_bt)); 1735 } 1736 } else { 1737 assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, ""); 1738 int vlen_enc = vector_length_encoding(vlen_in_bytes); 1739 1740 vpxor (dst, dst, dst, vlen_enc); 1741 vpsubb(dst, dst, src, is_legacy ? 
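               // src is expected to hold boolean lanes (0 or 1): 0 - b yields
               // 0x00 or 0xFF per byte, and the sign-extensions below widen
               // that into all-zeros / all-ones lanes of the element size.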
AVX_256bit : vlen_enc); 1742 1743 switch (elem_bt) { 1744 case T_BYTE: /* nothing to do */ break; 1745 case T_SHORT: vpmovsxbw(dst, dst, vlen_enc); break; 1746 case T_INT: vpmovsxbd(dst, dst, vlen_enc); break; 1747 case T_FLOAT: vpmovsxbd(dst, dst, vlen_enc); break; 1748 case T_LONG: vpmovsxbq(dst, dst, vlen_enc); break; 1749 case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break; 1750 1751 default: assert(false, "%s", type2name(elem_bt)); 1752 } 1753 } 1754 } 1755 1756 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, Register scratch, int vlen_in_bytes) { 1757 ExternalAddress addr(StubRoutines::x86::vector_iota_indices()); 1758 if (vlen_in_bytes == 4) { 1759 movdl(dst, addr); 1760 } else if (vlen_in_bytes == 8) { 1761 movq(dst, addr); 1762 } else if (vlen_in_bytes == 16) { 1763 movdqu(dst, addr, scratch); 1764 } else if (vlen_in_bytes == 32) { 1765 vmovdqu(dst, addr, scratch); 1766 } else { 1767 assert(vlen_in_bytes == 64, "%d", vlen_in_bytes); 1768 evmovdqub(dst, k0, addr, false /*merge*/, Assembler::AVX_512bit, scratch); 1769 } 1770 } 1771 1772 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles. 1773 1774 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) { 1775 int vector_len = Assembler::AVX_128bit; 1776 1777 switch (opcode) { 1778 case Op_AndReductionV: pand(dst, src); break; 1779 case Op_OrReductionV: por (dst, src); break; 1780 case Op_XorReductionV: pxor(dst, src); break; 1781 case Op_MinReductionV: 1782 switch (typ) { 1783 case T_BYTE: pminsb(dst, src); break; 1784 case T_SHORT: pminsw(dst, src); break; 1785 case T_INT: pminsd(dst, src); break; 1786 case T_LONG: assert(UseAVX > 2, "required"); 1787 vpminsq(dst, dst, src, Assembler::AVX_128bit); break; 1788 default: assert(false, "wrong type"); 1789 } 1790 break; 1791 case Op_MaxReductionV: 1792 switch (typ) { 1793 case T_BYTE: pmaxsb(dst, src); break; 1794 case T_SHORT: pmaxsw(dst, src); break; 1795 case T_INT: pmaxsd(dst, src); break; 1796 case T_LONG: assert(UseAVX > 2, "required"); 1797 vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break; 1798 default: assert(false, "wrong type"); 1799 } 1800 break; 1801 case Op_AddReductionVF: addss(dst, src); break; 1802 case Op_AddReductionVD: addsd(dst, src); break; 1803 case Op_AddReductionVI: 1804 switch (typ) { 1805 case T_BYTE: paddb(dst, src); break; 1806 case T_SHORT: paddw(dst, src); break; 1807 case T_INT: paddd(dst, src); break; 1808 default: assert(false, "wrong type"); 1809 } 1810 break; 1811 case Op_AddReductionVL: paddq(dst, src); break; 1812 case Op_MulReductionVF: mulss(dst, src); break; 1813 case Op_MulReductionVD: mulsd(dst, src); break; 1814 case Op_MulReductionVI: 1815 switch (typ) { 1816 case T_SHORT: pmullw(dst, src); break; 1817 case T_INT: pmulld(dst, src); break; 1818 default: assert(false, "wrong type"); 1819 } 1820 break; 1821 case Op_MulReductionVL: assert(UseAVX > 2, "required"); 1822 vpmullq(dst, dst, src, vector_len); break; 1823 default: assert(false, "wrong opcode"); 1824 } 1825 } 1826 1827 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 1828 int vector_len = Assembler::AVX_256bit; 1829 1830 switch (opcode) { 1831 case Op_AndReductionV: vpand(dst, src1, src2, vector_len); break; 1832 case Op_OrReductionV: vpor (dst, src1, src2, vector_len); break; 1833 case Op_XorReductionV: vpxor(dst, src1, src2, vector_len); break; 1834 case Op_MinReductionV: 1835 switch (typ) { 1836 case T_BYTE: 
vpminsb(dst, src1, src2, vector_len); break; 1837 case T_SHORT: vpminsw(dst, src1, src2, vector_len); break; 1838 case T_INT: vpminsd(dst, src1, src2, vector_len); break; 1839 case T_LONG: assert(UseAVX > 2, "required"); 1840 vpminsq(dst, src1, src2, vector_len); break; 1841 default: assert(false, "wrong type"); 1842 } 1843 break; 1844 case Op_MaxReductionV: 1845 switch (typ) { 1846 case T_BYTE: vpmaxsb(dst, src1, src2, vector_len); break; 1847 case T_SHORT: vpmaxsw(dst, src1, src2, vector_len); break; 1848 case T_INT: vpmaxsd(dst, src1, src2, vector_len); break; 1849 case T_LONG: assert(UseAVX > 2, "required"); 1850 vpmaxsq(dst, src1, src2, vector_len); break; 1851 default: assert(false, "wrong type"); 1852 } 1853 break; 1854 case Op_AddReductionVI: 1855 switch (typ) { 1856 case T_BYTE: vpaddb(dst, src1, src2, vector_len); break; 1857 case T_SHORT: vpaddw(dst, src1, src2, vector_len); break; 1858 case T_INT: vpaddd(dst, src1, src2, vector_len); break; 1859 default: assert(false, "wrong type"); 1860 } 1861 break; 1862 case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break; 1863 case Op_MulReductionVI: 1864 switch (typ) { 1865 case T_SHORT: vpmullw(dst, src1, src2, vector_len); break; 1866 case T_INT: vpmulld(dst, src1, src2, vector_len); break; 1867 default: assert(false, "wrong type"); 1868 } 1869 break; 1870 case Op_MulReductionVL: vpmullq(dst, src1, src2, vector_len); break; 1871 default: assert(false, "wrong opcode"); 1872 } 1873 } 1874 1875 void C2_MacroAssembler::reduce_fp(int opcode, int vlen, 1876 XMMRegister dst, XMMRegister src, 1877 XMMRegister vtmp1, XMMRegister vtmp2) { 1878 switch (opcode) { 1879 case Op_AddReductionVF: 1880 case Op_MulReductionVF: 1881 reduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 1882 break; 1883 1884 case Op_AddReductionVD: 1885 case Op_MulReductionVD: 1886 reduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 1887 break; 1888 1889 default: assert(false, "wrong opcode"); 1890 } 1891 } 1892 1893 void C2_MacroAssembler::reduceB(int opcode, int vlen, 1894 Register dst, Register src1, XMMRegister src2, 1895 XMMRegister vtmp1, XMMRegister vtmp2) { 1896 switch (vlen) { 1897 case 8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1898 case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1899 case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1900 case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1901 1902 default: assert(false, "wrong vector length"); 1903 } 1904 } 1905 1906 void C2_MacroAssembler::mulreduceB(int opcode, int vlen, 1907 Register dst, Register src1, XMMRegister src2, 1908 XMMRegister vtmp1, XMMRegister vtmp2) { 1909 switch (vlen) { 1910 case 8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1911 case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1912 case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1913 case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1914 1915 default: assert(false, "wrong vector length"); 1916 } 1917 } 1918 1919 void C2_MacroAssembler::reduceS(int opcode, int vlen, 1920 Register dst, Register src1, XMMRegister src2, 1921 XMMRegister vtmp1, XMMRegister vtmp2) { 1922 switch (vlen) { 1923 case 4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1924 case 8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1925 case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1926 case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1927 1928 default: assert(false, "wrong 
vector length"); 1929 } 1930 } 1931 1932 void C2_MacroAssembler::reduceI(int opcode, int vlen, 1933 Register dst, Register src1, XMMRegister src2, 1934 XMMRegister vtmp1, XMMRegister vtmp2) { 1935 switch (vlen) { 1936 case 2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1937 case 4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1938 case 8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1939 case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1940 1941 default: assert(false, "wrong vector length"); 1942 } 1943 } 1944 1945 #ifdef _LP64 1946 void C2_MacroAssembler::reduceL(int opcode, int vlen, 1947 Register dst, Register src1, XMMRegister src2, 1948 XMMRegister vtmp1, XMMRegister vtmp2) { 1949 switch (vlen) { 1950 case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1951 case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1952 case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1953 1954 default: assert(false, "wrong vector length"); 1955 } 1956 } 1957 #endif // _LP64 1958 1959 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 1960 switch (vlen) { 1961 case 2: 1962 assert(vtmp2 == xnoreg, ""); 1963 reduce2F(opcode, dst, src, vtmp1); 1964 break; 1965 case 4: 1966 assert(vtmp2 == xnoreg, ""); 1967 reduce4F(opcode, dst, src, vtmp1); 1968 break; 1969 case 8: 1970 reduce8F(opcode, dst, src, vtmp1, vtmp2); 1971 break; 1972 case 16: 1973 reduce16F(opcode, dst, src, vtmp1, vtmp2); 1974 break; 1975 default: assert(false, "wrong vector length"); 1976 } 1977 } 1978 1979 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 1980 switch (vlen) { 1981 case 2: 1982 assert(vtmp2 == xnoreg, ""); 1983 reduce2D(opcode, dst, src, vtmp1); 1984 break; 1985 case 4: 1986 reduce4D(opcode, dst, src, vtmp1, vtmp2); 1987 break; 1988 case 8: 1989 reduce8D(opcode, dst, src, vtmp1, vtmp2); 1990 break; 1991 default: assert(false, "wrong vector length"); 1992 } 1993 } 1994 1995 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1996 if (opcode == Op_AddReductionVI) { 1997 if (vtmp1 != src2) { 1998 movdqu(vtmp1, src2); 1999 } 2000 phaddd(vtmp1, vtmp1); 2001 } else { 2002 pshufd(vtmp1, src2, 0x1); 2003 reduce_operation_128(T_INT, opcode, vtmp1, src2); 2004 } 2005 movdl(vtmp2, src1); 2006 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2007 movdl(dst, vtmp1); 2008 } 2009 2010 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2011 if (opcode == Op_AddReductionVI) { 2012 if (vtmp1 != src2) { 2013 movdqu(vtmp1, src2); 2014 } 2015 phaddd(vtmp1, src2); 2016 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2017 } else { 2018 pshufd(vtmp2, src2, 0xE); 2019 reduce_operation_128(T_INT, opcode, vtmp2, src2); 2020 reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2021 } 2022 } 2023 2024 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2025 if (opcode == Op_AddReductionVI) { 2026 vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit); 2027 vextracti128_high(vtmp2, vtmp1); 2028 vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit); 2029 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2030 } else { 2031 vextracti128_high(vtmp1, src2); 2032 reduce_operation_128(T_INT, 
opcode, vtmp1, src2); 2033 reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2034 } 2035 } 2036 2037 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2038 vextracti64x4_high(vtmp2, src2); 2039 reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2); 2040 reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2041 } 2042 2043 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2044 pshufd(vtmp2, src2, 0x1); 2045 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 2046 movdqu(vtmp1, vtmp2); 2047 psrldq(vtmp1, 2); 2048 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 2049 movdqu(vtmp2, vtmp1); 2050 psrldq(vtmp2, 1); 2051 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 2052 movdl(vtmp2, src1); 2053 pmovsxbd(vtmp1, vtmp1); 2054 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2055 pextrb(dst, vtmp1, 0x0); 2056 movsbl(dst, dst); 2057 } 2058 2059 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2060 pshufd(vtmp1, src2, 0xE); 2061 reduce_operation_128(T_BYTE, opcode, vtmp1, src2); 2062 reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2063 } 2064 2065 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2066 vextracti128_high(vtmp2, src2); 2067 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 2068 reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2069 } 2070 2071 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2072 vextracti64x4_high(vtmp1, src2); 2073 reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2); 2074 reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2075 } 2076 2077 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2078 pmovsxbw(vtmp2, src2); 2079 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2080 } 2081 2082 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2083 if (UseAVX > 1) { 2084 int vector_len = Assembler::AVX_256bit; 2085 vpmovsxbw(vtmp1, src2, vector_len); 2086 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2087 } else { 2088 pmovsxbw(vtmp2, src2); 2089 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2090 pshufd(vtmp2, src2, 0x1); 2091 pmovsxbw(vtmp2, src2); 2092 reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2093 } 2094 } 2095 2096 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2097 if (UseAVX > 2 && VM_Version::supports_avx512bw()) { 2098 int vector_len = Assembler::AVX_512bit; 2099 vpmovsxbw(vtmp1, src2, vector_len); 2100 reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2101 } else { 2102 assert(UseAVX >= 2,"Should not reach here."); 2103 mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); 2104 vextracti128_high(vtmp2, src2); 2105 mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2106 } 2107 } 2108 2109 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2110 mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); 2111 vextracti64x4_high(vtmp2, src2); 2112 mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 
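  // The 64-byte multiply reduction is two 32-byte reductions: the second call
  // takes the first call's scalar result back in as src1, so both halves end
  // up combined into dst.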
2113 } 2114 2115 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2116 if (opcode == Op_AddReductionVI) { 2117 if (vtmp1 != src2) { 2118 movdqu(vtmp1, src2); 2119 } 2120 phaddw(vtmp1, vtmp1); 2121 phaddw(vtmp1, vtmp1); 2122 } else { 2123 pshufd(vtmp2, src2, 0x1); 2124 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2125 movdqu(vtmp1, vtmp2); 2126 psrldq(vtmp1, 2); 2127 reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2); 2128 } 2129 movdl(vtmp2, src1); 2130 pmovsxwd(vtmp1, vtmp1); 2131 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2132 pextrw(dst, vtmp1, 0x0); 2133 movswl(dst, dst); 2134 } 2135 2136 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2137 if (opcode == Op_AddReductionVI) { 2138 if (vtmp1 != src2) { 2139 movdqu(vtmp1, src2); 2140 } 2141 phaddw(vtmp1, src2); 2142 } else { 2143 pshufd(vtmp1, src2, 0xE); 2144 reduce_operation_128(T_SHORT, opcode, vtmp1, src2); 2145 } 2146 reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2147 } 2148 2149 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2150 if (opcode == Op_AddReductionVI) { 2151 int vector_len = Assembler::AVX_256bit; 2152 vphaddw(vtmp2, src2, src2, vector_len); 2153 vpermq(vtmp2, vtmp2, 0xD8, vector_len); 2154 } else { 2155 vextracti128_high(vtmp2, src2); 2156 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2157 } 2158 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2159 } 2160 2161 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2162 int vector_len = Assembler::AVX_256bit; 2163 vextracti64x4_high(vtmp1, src2); 2164 reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2); 2165 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2166 } 2167 2168 #ifdef _LP64 2169 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2170 pshufd(vtmp2, src2, 0xE); 2171 reduce_operation_128(T_LONG, opcode, vtmp2, src2); 2172 movdq(vtmp1, src1); 2173 reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2); 2174 movdq(dst, vtmp1); 2175 } 2176 2177 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2178 vextracti128_high(vtmp1, src2); 2179 reduce_operation_128(T_LONG, opcode, vtmp1, src2); 2180 reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2181 } 2182 2183 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2184 vextracti64x4_high(vtmp2, src2); 2185 reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2); 2186 reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2187 } 2188 2189 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) { 2190 assert(ArrayOperationPartialInlineSize > 0 && ArrayOperationPartialInlineSize <= 64, "invalid"); 2191 mov64(temp, -1L); 2192 bzhiq(temp, temp, len); 2193 kmovql(dst, temp); 2194 } 2195 #endif // _LP64 2196 2197 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2198 reduce_operation_128(T_FLOAT, opcode, dst, src); 2199 pshufd(vtmp, src, 0x1); 2200 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2201 } 2202 2203 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister 
dst, XMMRegister src, XMMRegister vtmp) { 2204 reduce2F(opcode, dst, src, vtmp); 2205 pshufd(vtmp, src, 0x2); 2206 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2207 pshufd(vtmp, src, 0x3); 2208 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2209 } 2210 2211 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2212 reduce4F(opcode, dst, src, vtmp2); 2213 vextractf128_high(vtmp2, src); 2214 reduce4F(opcode, dst, vtmp2, vtmp1); 2215 } 2216 2217 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2218 reduce8F(opcode, dst, src, vtmp1, vtmp2); 2219 vextracti64x4_high(vtmp1, src); 2220 reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2); 2221 } 2222 2223 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2224 reduce_operation_128(T_DOUBLE, opcode, dst, src); 2225 pshufd(vtmp, src, 0xE); 2226 reduce_operation_128(T_DOUBLE, opcode, dst, vtmp); 2227 } 2228 2229 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2230 reduce2D(opcode, dst, src, vtmp2); 2231 vextractf128_high(vtmp2, src); 2232 reduce2D(opcode, dst, vtmp2, vtmp1); 2233 } 2234 2235 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2236 reduce4D(opcode, dst, src, vtmp1, vtmp2); 2237 vextracti64x4_high(vtmp1, src); 2238 reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2); 2239 } 2240 2241 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, int vector_len) { 2242 MacroAssembler::evmovdqu(type, kmask, dst, src, vector_len); 2243 } 2244 2245 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, int vector_len) { 2246 MacroAssembler::evmovdqu(type, kmask, dst, src, vector_len); 2247 } 2248 2249 2250 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid, 2251 XMMRegister dst, XMMRegister src, 2252 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2253 XMMRegister xmm_0, XMMRegister xmm_1) { 2254 int permconst[] = {1, 14}; 2255 XMMRegister wsrc = src; 2256 XMMRegister wdst = xmm_0; 2257 XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1; 2258 2259 int vlen_enc = Assembler::AVX_128bit; 2260 if (vlen == 16) { 2261 vlen_enc = Assembler::AVX_256bit; 2262 } 2263 2264 for (int i = log2(vlen) - 1; i >=0; i--) { 2265 if (i == 0 && !is_dst_valid) { 2266 wdst = dst; 2267 } 2268 if (i == 3) { 2269 vextracti64x4_high(wtmp, wsrc); 2270 } else if (i == 2) { 2271 vextracti128_high(wtmp, wsrc); 2272 } else { // i = [0,1] 2273 vpermilps(wtmp, wsrc, permconst[i], vlen_enc); 2274 } 2275 vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2276 wsrc = wdst; 2277 vlen_enc = Assembler::AVX_128bit; 2278 } 2279 if (is_dst_valid) { 2280 vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2281 } 2282 } 2283 2284 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src, 2285 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2286 XMMRegister xmm_0, XMMRegister xmm_1) { 2287 XMMRegister wsrc = src; 2288 XMMRegister wdst = xmm_0; 2289 XMMRegister wtmp = (xmm_1 == xnoreg) ? 
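                    // Each loop iteration below halves the live width: extract
                    // the upper half (512 -> 256 -> 128 bits) or, at the last
                    // step, swap the two doubles within the 128-bit lane, then
                    // combine with vminmax_fp.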
xmm_0: xmm_1; 2290 int vlen_enc = Assembler::AVX_128bit; 2291 if (vlen == 8) { 2292 vlen_enc = Assembler::AVX_256bit; 2293 } 2294 for (int i = log2(vlen) - 1; i >=0; i--) { 2295 if (i == 0 && !is_dst_valid) { 2296 wdst = dst; 2297 } 2298 if (i == 1) { 2299 vextracti128_high(wtmp, wsrc); 2300 } else if (i == 2) { 2301 vextracti64x4_high(wtmp, wsrc); 2302 } else { 2303 assert(i == 0, "%d", i); 2304 vpermilpd(wtmp, wsrc, 1, vlen_enc); 2305 } 2306 vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2307 wsrc = wdst; 2308 vlen_enc = Assembler::AVX_128bit; 2309 } 2310 if (is_dst_valid) { 2311 vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2312 } 2313 } 2314 2315 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) { 2316 switch (bt) { 2317 case T_BYTE: pextrb(dst, src, idx); break; 2318 case T_SHORT: pextrw(dst, src, idx); break; 2319 case T_INT: pextrd(dst, src, idx); break; 2320 case T_LONG: pextrq(dst, src, idx); break; 2321 2322 default: 2323 assert(false,"Should not reach here."); 2324 break; 2325 } 2326 } 2327 2328 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) { 2329 int esize = type2aelembytes(typ); 2330 int elem_per_lane = 16/esize; 2331 int lane = elemindex / elem_per_lane; 2332 int eindex = elemindex % elem_per_lane; 2333 2334 if (lane >= 2) { 2335 assert(UseAVX > 2, "required"); 2336 vextractf32x4(dst, src, lane & 3); 2337 return dst; 2338 } else if (lane > 0) { 2339 assert(UseAVX > 0, "required"); 2340 vextractf128(dst, src, lane); 2341 return dst; 2342 } else { 2343 return src; 2344 } 2345 } 2346 2347 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) { 2348 if (typ == T_BYTE) { 2349 movsbl(dst, dst); 2350 } else if (typ == T_SHORT) { 2351 movswl(dst, dst); 2352 } 2353 } 2354 2355 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) { 2356 int esize = type2aelembytes(typ); 2357 int elem_per_lane = 16/esize; 2358 int eindex = elemindex % elem_per_lane; 2359 assert(is_integral_type(typ),"required"); 2360 2361 if (eindex == 0) { 2362 if (typ == T_LONG) { 2363 movq(dst, src); 2364 } else { 2365 movdl(dst, src); 2366 movsxl(typ, dst); 2367 } 2368 } else { 2369 extract(typ, dst, src, eindex); 2370 movsxl(typ, dst); 2371 } 2372 } 2373 2374 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, Register tmp, XMMRegister vtmp) { 2375 int esize = type2aelembytes(typ); 2376 int elem_per_lane = 16/esize; 2377 int eindex = elemindex % elem_per_lane; 2378 assert((typ == T_FLOAT || typ == T_DOUBLE),"required"); 2379 2380 if (eindex == 0) { 2381 movq(dst, src); 2382 } else { 2383 if (typ == T_FLOAT) { 2384 if (UseAVX == 0) { 2385 movdqu(dst, src); 2386 pshufps(dst, dst, eindex); 2387 } else { 2388 vpshufps(dst, src, src, eindex, Assembler::AVX_128bit); 2389 } 2390 } else { 2391 if (UseAVX == 0) { 2392 movdqu(dst, src); 2393 psrldq(dst, eindex*esize); 2394 } else { 2395 vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit); 2396 } 2397 movq(dst, dst); 2398 } 2399 } 2400 // Zero upper bits 2401 if (typ == T_FLOAT) { 2402 if (UseAVX == 0) { 2403 assert((vtmp != xnoreg) && (tmp != noreg), "required."); 2404 movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), tmp); 2405 pand(dst, vtmp); 2406 } else { 2407 assert((tmp != noreg), "required."); 2408 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, 
tmp); 2409 } 2410 } 2411 } 2412 2413 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) { 2414 switch(typ) { 2415 case T_BYTE: 2416 case T_BOOLEAN: 2417 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2418 break; 2419 case T_SHORT: 2420 case T_CHAR: 2421 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2422 break; 2423 case T_INT: 2424 case T_FLOAT: 2425 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2426 break; 2427 case T_LONG: 2428 case T_DOUBLE: 2429 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2430 break; 2431 default: 2432 assert(false,"Should not reach here."); 2433 break; 2434 } 2435 } 2436 2437 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral adr, int comparison, int vector_len, Register scratch) { 2438 switch(typ) { 2439 case T_BOOLEAN: 2440 case T_BYTE: 2441 evpcmpb(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch); 2442 break; 2443 case T_CHAR: 2444 case T_SHORT: 2445 evpcmpw(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch); 2446 break; 2447 case T_INT: 2448 case T_FLOAT: 2449 evpcmpd(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch); 2450 break; 2451 case T_LONG: 2452 case T_DOUBLE: 2453 evpcmpq(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch); 2454 break; 2455 default: 2456 assert(false,"Should not reach here."); 2457 break; 2458 } 2459 } 2460 2461 void C2_MacroAssembler::vpcmpu(BasicType typ, XMMRegister dst, XMMRegister src1, XMMRegister src2, ComparisonPredicate comparison, 2462 int vlen_in_bytes, XMMRegister vtmp1, XMMRegister vtmp2, Register scratch) { 2463 int vlen_enc = vector_length_encoding(vlen_in_bytes*2); 2464 switch (typ) { 2465 case T_BYTE: 2466 vpmovzxbw(vtmp1, src1, vlen_enc); 2467 vpmovzxbw(vtmp2, src2, vlen_enc); 2468 vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::W, vlen_enc, scratch); 2469 vpacksswb(dst, dst, dst, vlen_enc); 2470 break; 2471 case T_SHORT: 2472 vpmovzxwd(vtmp1, src1, vlen_enc); 2473 vpmovzxwd(vtmp2, src2, vlen_enc); 2474 vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::D, vlen_enc, scratch); 2475 vpackssdw(dst, dst, dst, vlen_enc); 2476 break; 2477 case T_INT: 2478 vpmovzxdq(vtmp1, src1, vlen_enc); 2479 vpmovzxdq(vtmp2, src2, vlen_enc); 2480 vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::Q, vlen_enc, scratch); 2481 vpermilps(dst, dst, 8, vlen_enc); 2482 break; 2483 default: 2484 assert(false, "Should not reach here"); 2485 } 2486 if (vlen_in_bytes == 16) { 2487 vpermpd(dst, dst, 0x8, vlen_enc); 2488 } 2489 } 2490 2491 void C2_MacroAssembler::vpcmpu32(BasicType typ, XMMRegister dst, XMMRegister src1, XMMRegister src2, ComparisonPredicate comparison, int vlen_in_bytes, 2492 XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3, Register scratch) { 2493 int vlen_enc = vector_length_encoding(vlen_in_bytes); 2494 switch (typ) { 2495 case T_BYTE: 2496 vpmovzxbw(vtmp1, src1, vlen_enc); 2497 vpmovzxbw(vtmp2, src2, vlen_enc); 2498 vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::W, vlen_enc, scratch); 2499 vextracti128(vtmp1, src1, 1); 2500 vextracti128(vtmp2, src2, 1); 2501 vpmovzxbw(vtmp1, vtmp1, vlen_enc); 2502 vpmovzxbw(vtmp2, vtmp2, vlen_enc); 2503 vpcmpCCW(vtmp3, vtmp1, vtmp2, comparison, Assembler::W, vlen_enc, scratch); 2504 
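      // The byte inputs were zero-extended to words and compared one 128-bit
      // half at a time; vpacksswb below packs the two word-sized results back
      // to bytes and vpermpd(0xd8) undoes the per-lane interleaving of the pack.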
vpacksswb(dst, dst, vtmp3, vlen_enc); 2505 vpermpd(dst, dst, 0xd8, vlen_enc); 2506 break; 2507 case T_SHORT: 2508 vpmovzxwd(vtmp1, src1, vlen_enc); 2509 vpmovzxwd(vtmp2, src2, vlen_enc); 2510 vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::D, vlen_enc, scratch); 2511 vextracti128(vtmp1, src1, 1); 2512 vextracti128(vtmp2, src2, 1); 2513 vpmovzxwd(vtmp1, vtmp1, vlen_enc); 2514 vpmovzxwd(vtmp2, vtmp2, vlen_enc); 2515 vpcmpCCW(vtmp3, vtmp1, vtmp2, comparison, Assembler::D, vlen_enc, scratch); 2516 vpackssdw(dst, dst, vtmp3, vlen_enc); 2517 vpermpd(dst, dst, 0xd8, vlen_enc); 2518 break; 2519 case T_INT: 2520 vpmovzxdq(vtmp1, src1, vlen_enc); 2521 vpmovzxdq(vtmp2, src2, vlen_enc); 2522 vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::Q, vlen_enc, scratch); 2523 vpshufd(dst, dst, 8, vlen_enc); 2524 vpermq(dst, dst, 8, vlen_enc); 2525 vextracti128(vtmp1, src1, 1); 2526 vextracti128(vtmp2, src2, 1); 2527 vpmovzxdq(vtmp1, vtmp1, vlen_enc); 2528 vpmovzxdq(vtmp2, vtmp2, vlen_enc); 2529 vpcmpCCW(vtmp3, vtmp1, vtmp2, comparison, Assembler::Q, vlen_enc, scratch); 2530 vpshufd(vtmp3, vtmp3, 8, vlen_enc); 2531 vpermq(vtmp3, vtmp3, 0x80, vlen_enc); 2532 vpblendd(dst, dst, vtmp3, 0xf0, vlen_enc); 2533 break; 2534 default: 2535 assert(false, "Should not reach here"); 2536 } 2537 } 2538 2539 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) { 2540 switch(typ) { 2541 case T_BYTE: 2542 evpblendmb(dst, kmask, src1, src2, merge, vector_len); 2543 break; 2544 case T_SHORT: 2545 evpblendmw(dst, kmask, src1, src2, merge, vector_len); 2546 break; 2547 case T_INT: 2548 case T_FLOAT: 2549 evpblendmd(dst, kmask, src1, src2, merge, vector_len); 2550 break; 2551 case T_LONG: 2552 case T_DOUBLE: 2553 evpblendmq(dst, kmask, src1, src2, merge, vector_len); 2554 break; 2555 default: 2556 assert(false,"Should not reach here."); 2557 break; 2558 } 2559 } 2560 2561 void C2_MacroAssembler::vectortest(int bt, int vlen, XMMRegister src1, XMMRegister src2, 2562 XMMRegister vtmp1, XMMRegister vtmp2, KRegister mask) { 2563 switch(vlen) { 2564 case 4: 2565 assert(vtmp1 != xnoreg, "required."); 2566 // Broadcast lower 32 bits to 128 bits before ptest 2567 pshufd(vtmp1, src1, 0x0); 2568 if (bt == BoolTest::overflow) { 2569 assert(vtmp2 != xnoreg, "required."); 2570 pshufd(vtmp2, src2, 0x0); 2571 } else { 2572 assert(vtmp2 == xnoreg, "required."); 2573 vtmp2 = src2; 2574 } 2575 ptest(vtmp1, vtmp2); 2576 break; 2577 case 8: 2578 assert(vtmp1 != xnoreg, "required."); 2579 // Broadcast lower 64 bits to 128 bits before ptest 2580 pshufd(vtmp1, src1, 0x4); 2581 if (bt == BoolTest::overflow) { 2582 assert(vtmp2 != xnoreg, "required."); 2583 pshufd(vtmp2, src2, 0x4); 2584 } else { 2585 assert(vtmp2 == xnoreg, "required."); 2586 vtmp2 = src2; 2587 } 2588 ptest(vtmp1, vtmp2); 2589 break; 2590 case 16: 2591 assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required."); 2592 ptest(src1, src2); 2593 break; 2594 case 32: 2595 assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required."); 2596 vptest(src1, src2, Assembler::AVX_256bit); 2597 break; 2598 case 64: 2599 { 2600 assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required."); 2601 evpcmpeqb(mask, src1, src2, Assembler::AVX_512bit); 2602 if (bt == BoolTest::ne) { 2603 ktestql(mask, mask); 2604 } else { 2605 assert(bt == BoolTest::overflow, "required"); 2606 kortestql(mask, mask); 2607 } 2608 } 2609 break; 2610 default: 2611 assert(false,"Should not reach here."); 2612 break; 2613 } 2614 } 2615 2616 
//------------------------------------------------------------------------------------------- 2617 2618 // IndexOf for constant substrings with size >= 8 chars 2619 // which don't need to be loaded through stack. 2620 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2, 2621 Register cnt1, Register cnt2, 2622 int int_cnt2, Register result, 2623 XMMRegister vec, Register tmp, 2624 int ae) { 2625 ShortBranchVerifier sbv(this); 2626 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2627 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2628 2629 // This method uses the pcmpestri instruction with bound registers 2630 // inputs: 2631 // xmm - substring 2632 // rax - substring length (elements count) 2633 // mem - scanned string 2634 // rdx - string length (elements count) 2635 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2636 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2637 // outputs: 2638 // rcx - matched index in string 2639 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2640 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2641 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2642 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2643 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1; 2644 2645 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, 2646 RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR, 2647 MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE; 2648 2649 // Note, inline_string_indexOf() generates checks: 2650 // if (substr.count > string.count) return -1; 2651 // if (substr.count == 0) return 0; 2652 assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars"); 2653 2654 // Load substring. 2655 if (ae == StrIntrinsicNode::UL) { 2656 pmovzxbw(vec, Address(str2, 0)); 2657 } else { 2658 movdqu(vec, Address(str2, 0)); 2659 } 2660 movl(cnt2, int_cnt2); 2661 movptr(result, str1); // string addr 2662 2663 if (int_cnt2 > stride) { 2664 jmpb(SCAN_TO_SUBSTR); 2665 2666 // Reload substr for rescan, this code 2667 // is executed only for large substrings (> 8 chars) 2668 bind(RELOAD_SUBSTR); 2669 if (ae == StrIntrinsicNode::UL) { 2670 pmovzxbw(vec, Address(str2, 0)); 2671 } else { 2672 movdqu(vec, Address(str2, 0)); 2673 } 2674 negptr(cnt2); // Jumped here with negative cnt2, convert to positive 2675 2676 bind(RELOAD_STR); 2677 // We came here after the beginning of the substring was 2678 // matched but the rest of it was not so we need to search 2679 // again. Start from the next element after the previous match. 2680 2681 // cnt2 is number of substring reminding elements and 2682 // cnt1 is number of string reminding elements when cmp failed. 
2683 // Restored cnt1 = cnt1 - cnt2 + int_cnt2 2684 subl(cnt1, cnt2); 2685 addl(cnt1, int_cnt2); 2686 movl(cnt2, int_cnt2); // Now restore cnt2 2687 2688 decrementl(cnt1); // Shift to next element 2689 cmpl(cnt1, cnt2); 2690 jcc(Assembler::negative, RET_NOT_FOUND); // Left less then substring 2691 2692 addptr(result, (1<<scale1)); 2693 2694 } // (int_cnt2 > 8) 2695 2696 // Scan string for start of substr in 16-byte vectors 2697 bind(SCAN_TO_SUBSTR); 2698 pcmpestri(vec, Address(result, 0), mode); 2699 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 2700 subl(cnt1, stride); 2701 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 2702 cmpl(cnt1, cnt2); 2703 jccb(Assembler::negative, RET_NOT_FOUND); // Left less then substring 2704 addptr(result, 16); 2705 jmpb(SCAN_TO_SUBSTR); 2706 2707 // Found a potential substr 2708 bind(FOUND_CANDIDATE); 2709 // Matched whole vector if first element matched (tmp(rcx) == 0). 2710 if (int_cnt2 == stride) { 2711 jccb(Assembler::overflow, RET_FOUND); // OF == 1 2712 } else { // int_cnt2 > 8 2713 jccb(Assembler::overflow, FOUND_SUBSTR); 2714 } 2715 // After pcmpestri tmp(rcx) contains matched element index 2716 // Compute start addr of substr 2717 lea(result, Address(result, tmp, scale1)); 2718 2719 // Make sure string is still long enough 2720 subl(cnt1, tmp); 2721 cmpl(cnt1, cnt2); 2722 if (int_cnt2 == stride) { 2723 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 2724 } else { // int_cnt2 > 8 2725 jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD); 2726 } 2727 // Left less then substring. 2728 2729 bind(RET_NOT_FOUND); 2730 movl(result, -1); 2731 jmp(EXIT); 2732 2733 if (int_cnt2 > stride) { 2734 // This code is optimized for the case when whole substring 2735 // is matched if its head is matched. 2736 bind(MATCH_SUBSTR_HEAD); 2737 pcmpestri(vec, Address(result, 0), mode); 2738 // Reload only string if does not match 2739 jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0 2740 2741 Label CONT_SCAN_SUBSTR; 2742 // Compare the rest of substring (> 8 chars). 2743 bind(FOUND_SUBSTR); 2744 // First 8 chars are already matched. 
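    // Scan the substring tail using cnt2 as a negative index that counts up
    // toward zero; the tail_off displacement below rebases it so the final
    // 16-byte load ends exactly at the end of the substring.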
2745 negptr(cnt2); 2746 addptr(cnt2, stride); 2747 2748 bind(SCAN_SUBSTR); 2749 subl(cnt1, stride); 2750 cmpl(cnt2, -stride); // Do not read beyond substring 2751 jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR); 2752 // Back-up strings to avoid reading beyond substring: 2753 // cnt1 = cnt1 - cnt2 + 8 2754 addl(cnt1, cnt2); // cnt2 is negative 2755 addl(cnt1, stride); 2756 movl(cnt2, stride); negptr(cnt2); 2757 bind(CONT_SCAN_SUBSTR); 2758 if (int_cnt2 < (int)G) { 2759 int tail_off1 = int_cnt2<<scale1; 2760 int tail_off2 = int_cnt2<<scale2; 2761 if (ae == StrIntrinsicNode::UL) { 2762 pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2)); 2763 } else { 2764 movdqu(vec, Address(str2, cnt2, scale2, tail_off2)); 2765 } 2766 pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode); 2767 } else { 2768 // calculate index in register to avoid integer overflow (int_cnt2*2) 2769 movl(tmp, int_cnt2); 2770 addptr(tmp, cnt2); 2771 if (ae == StrIntrinsicNode::UL) { 2772 pmovzxbw(vec, Address(str2, tmp, scale2, 0)); 2773 } else { 2774 movdqu(vec, Address(str2, tmp, scale2, 0)); 2775 } 2776 pcmpestri(vec, Address(result, tmp, scale1, 0), mode); 2777 } 2778 // Need to reload strings pointers if not matched whole vector 2779 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 2780 addptr(cnt2, stride); 2781 jcc(Assembler::negative, SCAN_SUBSTR); 2782 // Fall through if found full substring 2783 2784 } // (int_cnt2 > 8) 2785 2786 bind(RET_FOUND); 2787 // Found result if we matched full small substring. 2788 // Compute substr offset 2789 subptr(result, str1); 2790 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 2791 shrl(result, 1); // index 2792 } 2793 bind(EXIT); 2794 2795 } // string_indexofC8 2796 2797 // Small strings are loaded through stack if they cross page boundary. 2798 void C2_MacroAssembler::string_indexof(Register str1, Register str2, 2799 Register cnt1, Register cnt2, 2800 int int_cnt2, Register result, 2801 XMMRegister vec, Register tmp, 2802 int ae) { 2803 ShortBranchVerifier sbv(this); 2804 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2805 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2806 2807 // 2808 // int_cnt2 is length of small (< 8 chars) constant substring 2809 // or (-1) for non constant substring in which case its length 2810 // is in cnt2 register. 2811 // 2812 // Note, inline_string_indexOf() generates checks: 2813 // if (substr.count > string.count) return -1; 2814 // if (substr.count == 0) return 0; 2815 // 2816 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2817 assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0"); 2818 // This method uses the pcmpestri instruction with bound registers 2819 // inputs: 2820 // xmm - substring 2821 // rax - substring length (elements count) 2822 // mem - scanned string 2823 // rdx - string length (elements count) 2824 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2825 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2826 // outputs: 2827 // rcx - matched index in string 2828 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2829 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2830 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2831 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? 
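                                // scale1 is the element size of the scanned string, scale2 that of
                                // the pattern; they differ only for UL, where a Latin-1 pattern is
                                // widened to chars (pmovzxbw) as it is loaded.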
Address::times_1 : scale1; 2832 2833 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR, 2834 RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR, 2835 FOUND_CANDIDATE; 2836 2837 { //======================================================== 2838 // We don't know where these strings are located 2839 // and we can't read beyond them. Load them through stack. 2840 Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR; 2841 2842 movptr(tmp, rsp); // save old SP 2843 2844 if (int_cnt2 > 0) { // small (< 8 chars) constant substring 2845 if (int_cnt2 == (1>>scale2)) { // One byte 2846 assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding"); 2847 load_unsigned_byte(result, Address(str2, 0)); 2848 movdl(vec, result); // move 32 bits 2849 } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) { // Three bytes 2850 // Not enough header space in 32-bit VM: 12+3 = 15. 2851 movl(result, Address(str2, -1)); 2852 shrl(result, 8); 2853 movdl(vec, result); // move 32 bits 2854 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) { // One char 2855 load_unsigned_short(result, Address(str2, 0)); 2856 movdl(vec, result); // move 32 bits 2857 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars 2858 movdl(vec, Address(str2, 0)); // move 32 bits 2859 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars 2860 movq(vec, Address(str2, 0)); // move 64 bits 2861 } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7}) 2862 // Array header size is 12 bytes in 32-bit VM 2863 // + 6 bytes for 3 chars == 18 bytes, 2864 // enough space to load vec and shift. 2865 assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity"); 2866 if (ae == StrIntrinsicNode::UL) { 2867 int tail_off = int_cnt2-8; 2868 pmovzxbw(vec, Address(str2, tail_off)); 2869 psrldq(vec, -2*tail_off); 2870 } 2871 else { 2872 int tail_off = int_cnt2*(1<<scale2); 2873 movdqu(vec, Address(str2, tail_off-16)); 2874 psrldq(vec, 16-tail_off); 2875 } 2876 } 2877 } else { // not constant substring 2878 cmpl(cnt2, stride); 2879 jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough 2880 2881 // We can read beyond string if srt+16 does not cross page boundary 2882 // since heaps are aligned and mapped by pages. 2883 assert(os::vm_page_size() < (int)G, "default page should be small"); 2884 movl(result, str2); // We need only low 32 bits 2885 andl(result, (os::vm_page_size()-1)); 2886 cmpl(result, (os::vm_page_size()-16)); 2887 jccb(Assembler::belowEqual, CHECK_STR); 2888 2889 // Move small strings to stack to allow load 16 bytes into vec. 2890 subptr(rsp, 16); 2891 int stk_offset = wordSize-(1<<scale2); 2892 push(cnt2); 2893 2894 bind(COPY_SUBSTR); 2895 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) { 2896 load_unsigned_byte(result, Address(str2, cnt2, scale2, -1)); 2897 movb(Address(rsp, cnt2, scale2, stk_offset), result); 2898 } else if (ae == StrIntrinsicNode::UU) { 2899 load_unsigned_short(result, Address(str2, cnt2, scale2, -2)); 2900 movw(Address(rsp, cnt2, scale2, stk_offset), result); 2901 } 2902 decrement(cnt2); 2903 jccb(Assembler::notZero, COPY_SUBSTR); 2904 2905 pop(cnt2); 2906 movptr(str2, rsp); // New substring address 2907 } // non constant 2908 2909 bind(CHECK_STR); 2910 cmpl(cnt1, stride); 2911 jccb(Assembler::aboveEqual, BIG_STRINGS); 2912 2913 // Check cross page boundary. 
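    // Same page-crossing test as for the substring above: if
    // (addr & (page_size - 1)) <= page_size - 16, a 16-byte load starting at
    // addr cannot touch the next page, so the string can be read in place.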
2914 movl(result, str1); // We need only low 32 bits 2915 andl(result, (os::vm_page_size()-1)); 2916 cmpl(result, (os::vm_page_size()-16)); 2917 jccb(Assembler::belowEqual, BIG_STRINGS); 2918 2919 subptr(rsp, 16); 2920 int stk_offset = -(1<<scale1); 2921 if (int_cnt2 < 0) { // not constant 2922 push(cnt2); 2923 stk_offset += wordSize; 2924 } 2925 movl(cnt2, cnt1); 2926 2927 bind(COPY_STR); 2928 if (ae == StrIntrinsicNode::LL) { 2929 load_unsigned_byte(result, Address(str1, cnt2, scale1, -1)); 2930 movb(Address(rsp, cnt2, scale1, stk_offset), result); 2931 } else { 2932 load_unsigned_short(result, Address(str1, cnt2, scale1, -2)); 2933 movw(Address(rsp, cnt2, scale1, stk_offset), result); 2934 } 2935 decrement(cnt2); 2936 jccb(Assembler::notZero, COPY_STR); 2937 2938 if (int_cnt2 < 0) { // not constant 2939 pop(cnt2); 2940 } 2941 movptr(str1, rsp); // New string address 2942 2943 bind(BIG_STRINGS); 2944 // Load substring. 2945 if (int_cnt2 < 0) { // -1 2946 if (ae == StrIntrinsicNode::UL) { 2947 pmovzxbw(vec, Address(str2, 0)); 2948 } else { 2949 movdqu(vec, Address(str2, 0)); 2950 } 2951 push(cnt2); // substr count 2952 push(str2); // substr addr 2953 push(str1); // string addr 2954 } else { 2955 // Small (< 8 chars) constant substrings are loaded already. 2956 movl(cnt2, int_cnt2); 2957 } 2958 push(tmp); // original SP 2959 2960 } // Finished loading 2961 2962 //======================================================== 2963 // Start search 2964 // 2965 2966 movptr(result, str1); // string addr 2967 2968 if (int_cnt2 < 0) { // Only for non constant substring 2969 jmpb(SCAN_TO_SUBSTR); 2970 2971 // SP saved at sp+0 2972 // String saved at sp+1*wordSize 2973 // Substr saved at sp+2*wordSize 2974 // Substr count saved at sp+3*wordSize 2975 2976 // Reload substr for rescan, this code 2977 // is executed only for large substrings (> 8 chars) 2978 bind(RELOAD_SUBSTR); 2979 movptr(str2, Address(rsp, 2*wordSize)); 2980 movl(cnt2, Address(rsp, 3*wordSize)); 2981 if (ae == StrIntrinsicNode::UL) { 2982 pmovzxbw(vec, Address(str2, 0)); 2983 } else { 2984 movdqu(vec, Address(str2, 0)); 2985 } 2986 // We came here after the beginning of the substring was 2987 // matched but the rest of it was not so we need to search 2988 // again. Start from the next element after the previous match. 2989 subptr(str1, result); // Restore counter 2990 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 2991 shrl(str1, 1); 2992 } 2993 addl(cnt1, str1); 2994 decrementl(cnt1); // Shift to next element 2995 cmpl(cnt1, cnt2); 2996 jcc(Assembler::negative, RET_NOT_FOUND); // Left less then substring 2997 2998 addptr(result, (1<<scale1)); 2999 } // non constant 3000 3001 // Scan string for start of substr in 16-byte vectors 3002 bind(SCAN_TO_SUBSTR); 3003 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 3004 pcmpestri(vec, Address(result, 0), mode); 3005 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 3006 subl(cnt1, stride); 3007 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 3008 cmpl(cnt1, cnt2); 3009 jccb(Assembler::negative, RET_NOT_FOUND); // Left less then substring 3010 addptr(result, 16); 3011 3012 bind(ADJUST_STR); 3013 cmpl(cnt1, stride); // Do not read beyond string 3014 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 3015 // Back-up string to avoid reading beyond string. 
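  // Reposition so the final 16-byte read ends exactly at the end of the
  // string (result + cnt1*elem - 16) and scan one full stride from there.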
3016 lea(result, Address(result, cnt1, scale1, -16)); 3017 movl(cnt1, stride); 3018 jmpb(SCAN_TO_SUBSTR); 3019 3020 // Found a potential substr 3021 bind(FOUND_CANDIDATE); 3022 // After pcmpestri tmp(rcx) contains matched element index 3023 3024 // Make sure string is still long enough 3025 subl(cnt1, tmp); 3026 cmpl(cnt1, cnt2); 3027 jccb(Assembler::greaterEqual, FOUND_SUBSTR); 3028 // Left less then substring. 3029 3030 bind(RET_NOT_FOUND); 3031 movl(result, -1); 3032 jmp(CLEANUP); 3033 3034 bind(FOUND_SUBSTR); 3035 // Compute start addr of substr 3036 lea(result, Address(result, tmp, scale1)); 3037 if (int_cnt2 > 0) { // Constant substring 3038 // Repeat search for small substring (< 8 chars) 3039 // from new point without reloading substring. 3040 // Have to check that we don't read beyond string. 3041 cmpl(tmp, stride-int_cnt2); 3042 jccb(Assembler::greater, ADJUST_STR); 3043 // Fall through if matched whole substring. 3044 } else { // non constant 3045 assert(int_cnt2 == -1, "should be != 0"); 3046 3047 addl(tmp, cnt2); 3048 // Found result if we matched whole substring. 3049 cmpl(tmp, stride); 3050 jcc(Assembler::lessEqual, RET_FOUND); 3051 3052 // Repeat search for small substring (<= 8 chars) 3053 // from new point 'str1' without reloading substring. 3054 cmpl(cnt2, stride); 3055 // Have to check that we don't read beyond string. 3056 jccb(Assembler::lessEqual, ADJUST_STR); 3057 3058 Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG; 3059 // Compare the rest of substring (> 8 chars). 3060 movptr(str1, result); 3061 3062 cmpl(tmp, cnt2); 3063 // First 8 chars are already matched. 3064 jccb(Assembler::equal, CHECK_NEXT); 3065 3066 bind(SCAN_SUBSTR); 3067 pcmpestri(vec, Address(str1, 0), mode); 3068 // Need to reload strings pointers if not matched whole vector 3069 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 3070 3071 bind(CHECK_NEXT); 3072 subl(cnt2, stride); 3073 jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring 3074 addptr(str1, 16); 3075 if (ae == StrIntrinsicNode::UL) { 3076 addptr(str2, 8); 3077 } else { 3078 addptr(str2, 16); 3079 } 3080 subl(cnt1, stride); 3081 cmpl(cnt2, stride); // Do not read beyond substring 3082 jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR); 3083 // Back-up strings to avoid reading beyond substring. 
3084 3085 if (ae == StrIntrinsicNode::UL) { 3086 lea(str2, Address(str2, cnt2, scale2, -8)); 3087 lea(str1, Address(str1, cnt2, scale1, -16)); 3088 } else { 3089 lea(str2, Address(str2, cnt2, scale2, -16)); 3090 lea(str1, Address(str1, cnt2, scale1, -16)); 3091 } 3092 subl(cnt1, cnt2); 3093 movl(cnt2, stride); 3094 addl(cnt1, stride); 3095 bind(CONT_SCAN_SUBSTR); 3096 if (ae == StrIntrinsicNode::UL) { 3097 pmovzxbw(vec, Address(str2, 0)); 3098 } else { 3099 movdqu(vec, Address(str2, 0)); 3100 } 3101 jmp(SCAN_SUBSTR); 3102 3103 bind(RET_FOUND_LONG); 3104 movptr(str1, Address(rsp, wordSize)); 3105 } // non constant 3106 3107 bind(RET_FOUND); 3108 // Compute substr offset 3109 subptr(result, str1); 3110 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3111 shrl(result, 1); // index 3112 } 3113 bind(CLEANUP); 3114 pop(rsp); // restore SP 3115 3116 } // string_indexof 3117 3118 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3119 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3120 ShortBranchVerifier sbv(this); 3121 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3122 3123 int stride = 8; 3124 3125 Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP, 3126 SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP, 3127 RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT, 3128 FOUND_SEQ_CHAR, DONE_LABEL; 3129 3130 movptr(result, str1); 3131 if (UseAVX >= 2) { 3132 cmpl(cnt1, stride); 3133 jcc(Assembler::less, SCAN_TO_CHAR); 3134 cmpl(cnt1, 2*stride); 3135 jcc(Assembler::less, SCAN_TO_8_CHAR_INIT); 3136 movdl(vec1, ch); 3137 vpbroadcastw(vec1, vec1, Assembler::AVX_256bit); 3138 vpxor(vec2, vec2); 3139 movl(tmp, cnt1); 3140 andl(tmp, 0xFFFFFFF0); //vector count (in chars) 3141 andl(cnt1,0x0000000F); //tail count (in chars) 3142 3143 bind(SCAN_TO_16_CHAR_LOOP); 3144 vmovdqu(vec3, Address(result, 0)); 3145 vpcmpeqw(vec3, vec3, vec1, 1); 3146 vptest(vec2, vec3); 3147 jcc(Assembler::carryClear, FOUND_CHAR); 3148 addptr(result, 32); 3149 subl(tmp, 2*stride); 3150 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); 3151 jmp(SCAN_TO_8_CHAR); 3152 bind(SCAN_TO_8_CHAR_INIT); 3153 movdl(vec1, ch); 3154 pshuflw(vec1, vec1, 0x00); 3155 pshufd(vec1, vec1, 0); 3156 pxor(vec2, vec2); 3157 } 3158 bind(SCAN_TO_8_CHAR); 3159 cmpl(cnt1, stride); 3160 jcc(Assembler::less, SCAN_TO_CHAR); 3161 if (UseAVX < 2) { 3162 movdl(vec1, ch); 3163 pshuflw(vec1, vec1, 0x00); 3164 pshufd(vec1, vec1, 0); 3165 pxor(vec2, vec2); 3166 } 3167 movl(tmp, cnt1); 3168 andl(tmp, 0xFFFFFFF8); //vector count (in chars) 3169 andl(cnt1,0x00000007); //tail count (in chars) 3170 3171 bind(SCAN_TO_8_CHAR_LOOP); 3172 movdqu(vec3, Address(result, 0)); 3173 pcmpeqw(vec3, vec1); 3174 ptest(vec2, vec3); 3175 jcc(Assembler::carryClear, FOUND_CHAR); 3176 addptr(result, 16); 3177 subl(tmp, stride); 3178 jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP); 3179 bind(SCAN_TO_CHAR); 3180 testl(cnt1, cnt1); 3181 jcc(Assembler::zero, RET_NOT_FOUND); 3182 bind(SCAN_TO_CHAR_LOOP); 3183 load_unsigned_short(tmp, Address(result, 0)); 3184 cmpl(ch, tmp); 3185 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3186 addptr(result, 2); 3187 subl(cnt1, 1); 3188 jccb(Assembler::zero, RET_NOT_FOUND); 3189 jmp(SCAN_TO_CHAR_LOOP); 3190 3191 bind(RET_NOT_FOUND); 3192 movl(result, -1); 3193 jmpb(DONE_LABEL); 3194 3195 bind(FOUND_CHAR); 3196 if (UseAVX >= 2) { 3197 vpmovmskb(tmp, vec3); 3198 } else { 3199 pmovmskb(tmp, vec3); 3200 } 3201 bsfl(ch, tmp); 3202 addptr(result, ch); 3203 3204 bind(FOUND_SEQ_CHAR); 3205 
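  // 'result' points at the matching character here; the subtraction below
  // turns the address difference into a char index (chars are 2 bytes wide,
  // hence the shift right by one).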
  subptr(result, str1);
  shrl(result, 1);

  bind(DONE_LABEL);
} // string_indexof_char

void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");

  int stride = 16;

  Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
        SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
        RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
        FOUND_SEQ_CHAR, DONE_LABEL;

  movptr(result, str1);
  if (UseAVX >= 2) {
    cmpl(cnt1, stride);
    jcc(Assembler::less, SCAN_TO_CHAR_INIT);
    cmpl(cnt1, stride*2);
    jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
    movdl(vec1, ch);
    vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
    vpxor(vec2, vec2);
    movl(tmp, cnt1);
    andl(tmp, 0xFFFFFFE0);  // vector count (in chars)
    andl(cnt1, 0x0000001F); // tail count (in chars)

    bind(SCAN_TO_32_CHAR_LOOP);
    vmovdqu(vec3, Address(result, 0));
    vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
    vptest(vec2, vec3);
    jcc(Assembler::carryClear, FOUND_CHAR);
    addptr(result, 32);
    subl(tmp, stride*2);
    jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
    jmp(SCAN_TO_16_CHAR);

    bind(SCAN_TO_16_CHAR_INIT);
    movdl(vec1, ch);
    pxor(vec2, vec2);
    pshufb(vec1, vec2);
  }

  bind(SCAN_TO_16_CHAR);
  cmpl(cnt1, stride);
  jcc(Assembler::less, SCAN_TO_CHAR_INIT); // less than 16 entries left
  if (UseAVX < 2) {
    movdl(vec1, ch);
    pxor(vec2, vec2);
    pshufb(vec1, vec2);
  }
  movl(tmp, cnt1);
  andl(tmp, 0xFFFFFFF0);  // vector count (in bytes)
  andl(cnt1, 0x0000000F); // tail count (in bytes)

  bind(SCAN_TO_16_CHAR_LOOP);
  movdqu(vec3, Address(result, 0));
  pcmpeqb(vec3, vec1);
  ptest(vec2, vec3);
  jcc(Assembler::carryClear, FOUND_CHAR);
  addptr(result, 16);
  subl(tmp, stride);
  jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); // last 16 items...
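  // Reminder on the (v)ptest checks above: with vec2 == 0, CF is set only
  // when vec3 (the byte-compare result) is all zeroes, so carryClear means
  // at least one position matched the broadcast character.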
3272 3273 bind(SCAN_TO_CHAR_INIT); 3274 testl(cnt1, cnt1); 3275 jcc(Assembler::zero, RET_NOT_FOUND); 3276 bind(SCAN_TO_CHAR_LOOP); 3277 load_unsigned_byte(tmp, Address(result, 0)); 3278 cmpl(ch, tmp); 3279 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3280 addptr(result, 1); 3281 subl(cnt1, 1); 3282 jccb(Assembler::zero, RET_NOT_FOUND); 3283 jmp(SCAN_TO_CHAR_LOOP); 3284 3285 bind(RET_NOT_FOUND); 3286 movl(result, -1); 3287 jmpb(DONE_LABEL); 3288 3289 bind(FOUND_CHAR); 3290 if (UseAVX >= 2) { 3291 vpmovmskb(tmp, vec3); 3292 } else { 3293 pmovmskb(tmp, vec3); 3294 } 3295 bsfl(ch, tmp); 3296 addptr(result, ch); 3297 3298 bind(FOUND_SEQ_CHAR); 3299 subptr(result, str1); 3300 3301 bind(DONE_LABEL); 3302 } // stringL_indexof_char 3303 3304 // helper function for string_compare 3305 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2, 3306 Address::ScaleFactor scale, Address::ScaleFactor scale1, 3307 Address::ScaleFactor scale2, Register index, int ae) { 3308 if (ae == StrIntrinsicNode::LL) { 3309 load_unsigned_byte(elem1, Address(str1, index, scale, 0)); 3310 load_unsigned_byte(elem2, Address(str2, index, scale, 0)); 3311 } else if (ae == StrIntrinsicNode::UU) { 3312 load_unsigned_short(elem1, Address(str1, index, scale, 0)); 3313 load_unsigned_short(elem2, Address(str2, index, scale, 0)); 3314 } else { 3315 load_unsigned_byte(elem1, Address(str1, index, scale1, 0)); 3316 load_unsigned_short(elem2, Address(str2, index, scale2, 0)); 3317 } 3318 } 3319 3320 // Compare strings, used for char[] and byte[]. 3321 void C2_MacroAssembler::string_compare(Register str1, Register str2, 3322 Register cnt1, Register cnt2, Register result, 3323 XMMRegister vec1, int ae, KRegister mask) { 3324 ShortBranchVerifier sbv(this); 3325 Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL; 3326 Label COMPARE_WIDE_VECTORS_LOOP_FAILED; // used only _LP64 && AVX3 3327 int stride, stride2, adr_stride, adr_stride1, adr_stride2; 3328 int stride2x2 = 0x40; 3329 Address::ScaleFactor scale = Address::no_scale; 3330 Address::ScaleFactor scale1 = Address::no_scale; 3331 Address::ScaleFactor scale2 = Address::no_scale; 3332 3333 if (ae != StrIntrinsicNode::LL) { 3334 stride2x2 = 0x20; 3335 } 3336 3337 if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) { 3338 shrl(cnt2, 1); 3339 } 3340 // Compute the minimum of the string lengths and the 3341 // difference of the string lengths (stack). 3342 // Do the conditional move stuff 3343 movl(result, cnt1); 3344 subl(cnt1, cnt2); 3345 push(cnt1); 3346 cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2) 3347 3348 // Is the minimum length zero? 
3349 testl(cnt2, cnt2); 3350 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3351 if (ae == StrIntrinsicNode::LL) { 3352 // Load first bytes 3353 load_unsigned_byte(result, Address(str1, 0)); // result = str1[0] 3354 load_unsigned_byte(cnt1, Address(str2, 0)); // cnt1 = str2[0] 3355 } else if (ae == StrIntrinsicNode::UU) { 3356 // Load first characters 3357 load_unsigned_short(result, Address(str1, 0)); 3358 load_unsigned_short(cnt1, Address(str2, 0)); 3359 } else { 3360 load_unsigned_byte(result, Address(str1, 0)); 3361 load_unsigned_short(cnt1, Address(str2, 0)); 3362 } 3363 subl(result, cnt1); 3364 jcc(Assembler::notZero, POP_LABEL); 3365 3366 if (ae == StrIntrinsicNode::UU) { 3367 // Divide length by 2 to get number of chars 3368 shrl(cnt2, 1); 3369 } 3370 cmpl(cnt2, 1); 3371 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3372 3373 // Check if the strings start at the same location and setup scale and stride 3374 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3375 cmpptr(str1, str2); 3376 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3377 if (ae == StrIntrinsicNode::LL) { 3378 scale = Address::times_1; 3379 stride = 16; 3380 } else { 3381 scale = Address::times_2; 3382 stride = 8; 3383 } 3384 } else { 3385 scale1 = Address::times_1; 3386 scale2 = Address::times_2; 3387 // scale not used 3388 stride = 8; 3389 } 3390 3391 if (UseAVX >= 2 && UseSSE42Intrinsics) { 3392 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR; 3393 Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR; 3394 Label COMPARE_WIDE_VECTORS_LOOP_AVX2; 3395 Label COMPARE_TAIL_LONG; 3396 Label COMPARE_WIDE_VECTORS_LOOP_AVX3; // used only _LP64 && AVX3 3397 3398 int pcmpmask = 0x19; 3399 if (ae == StrIntrinsicNode::LL) { 3400 pcmpmask &= ~0x01; 3401 } 3402 3403 // Setup to compare 16-chars (32-bytes) vectors, 3404 // start from first character again because it has aligned address. 3405 if (ae == StrIntrinsicNode::LL) { 3406 stride2 = 32; 3407 } else { 3408 stride2 = 16; 3409 } 3410 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3411 adr_stride = stride << scale; 3412 } else { 3413 adr_stride1 = 8; //stride << scale1; 3414 adr_stride2 = 16; //stride << scale2; 3415 } 3416 3417 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3418 // rax and rdx are used by pcmpestri as elements counters 3419 movl(result, cnt2); 3420 andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count 3421 jcc(Assembler::zero, COMPARE_TAIL_LONG); 3422 3423 // fast path : compare first 2 8-char vectors. 
3424 bind(COMPARE_16_CHARS); 3425 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3426 movdqu(vec1, Address(str1, 0)); 3427 } else { 3428 pmovzxbw(vec1, Address(str1, 0)); 3429 } 3430 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3431 jccb(Assembler::below, COMPARE_INDEX_CHAR); 3432 3433 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3434 movdqu(vec1, Address(str1, adr_stride)); 3435 pcmpestri(vec1, Address(str2, adr_stride), pcmpmask); 3436 } else { 3437 pmovzxbw(vec1, Address(str1, adr_stride1)); 3438 pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask); 3439 } 3440 jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS); 3441 addl(cnt1, stride); 3442 3443 // Compare the characters at index in cnt1 3444 bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character 3445 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 3446 subl(result, cnt2); 3447 jmp(POP_LABEL); 3448 3449 // Setup the registers to start vector comparison loop 3450 bind(COMPARE_WIDE_VECTORS); 3451 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3452 lea(str1, Address(str1, result, scale)); 3453 lea(str2, Address(str2, result, scale)); 3454 } else { 3455 lea(str1, Address(str1, result, scale1)); 3456 lea(str2, Address(str2, result, scale2)); 3457 } 3458 subl(result, stride2); 3459 subl(cnt2, stride2); 3460 jcc(Assembler::zero, COMPARE_WIDE_TAIL); 3461 negptr(result); 3462 3463 // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest) 3464 bind(COMPARE_WIDE_VECTORS_LOOP); 3465 3466 #ifdef _LP64 3467 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 3468 cmpl(cnt2, stride2x2); 3469 jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2); 3470 testl(cnt2, stride2x2-1); // cnt2 holds the vector count 3471 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2); // means we cannot subtract by 0x40 3472 3473 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 3474 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3475 evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit); 3476 evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3477 } else { 3478 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit); 3479 evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3480 } 3481 kortestql(mask, mask); 3482 jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED); // miscompare 3483 addptr(result, stride2x2); // update since we already compared at this addr 3484 subl(cnt2, stride2x2); // and sub the size too 3485 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3); 3486 3487 vpxor(vec1, vec1); 3488 jmpb(COMPARE_WIDE_TAIL); 3489 }//if (VM_Version::supports_avx512vlbw()) 3490 #endif // _LP64 3491 3492 3493 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3494 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3495 vmovdqu(vec1, Address(str1, result, scale)); 3496 vpxor(vec1, Address(str2, result, scale)); 3497 } else { 3498 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit); 3499 vpxor(vec1, Address(str2, result, scale2)); 3500 } 3501 vptest(vec1, vec1); 3502 jcc(Assembler::notZero, VECTOR_NOT_EQUAL); 3503 addptr(result, stride2); 3504 subl(cnt2, stride2); 3505 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP); 3506 // clean upper bits of YMM registers 
    vpxor(vec1, vec1);

    // compare wide vectors tail
    bind(COMPARE_WIDE_TAIL);
    testptr(result, result);
    jcc(Assembler::zero, LENGTH_DIFF_LABEL);

    movl(result, stride2);
    movl(cnt2, result);
    negptr(result);
    jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);

    // Identifies the mismatching (higher or lower) 16-bytes in the 32-byte vectors.
    bind(VECTOR_NOT_EQUAL);
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    jmp(COMPARE_16_CHARS);

    // Compare tail chars, length between 1 and 15 chars
    bind(COMPARE_TAIL_LONG);
    movl(cnt2, result);
    cmpl(cnt2, stride);
    jcc(Assembler::less, COMPARE_SMALL_STR);

    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, 0));
    } else {
      pmovzxbw(vec1, Address(str1, 0));
    }
    pcmpestri(vec1, Address(str2, 0), pcmpmask);
    jcc(Assembler::below, COMPARE_INDEX_CHAR);
    subptr(cnt2, stride);
    jcc(Assembler::zero, LENGTH_DIFF_LABEL);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    negptr(cnt2);
    jmpb(WHILE_HEAD_LABEL);

    bind(COMPARE_SMALL_STR);
  } else if (UseSSE42Intrinsics) {
    Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
    int pcmpmask = 0x19;
    // Setup to compare 8-char (16-byte) vectors,
    // start from first character again because it has aligned address.
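    // The imm8 0x19 used with pcmpestri below encodes: unsigned word elements
    // (bit 0 is cleared just below for LL to select unsigned bytes),
    // "equal each" aggregation and negated polarity, i.e. an element-wise
    // compare whose first mismatch index is returned in rcx.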
3563 movl(result, cnt2); 3564 andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count 3565 if (ae == StrIntrinsicNode::LL) { 3566 pcmpmask &= ~0x01; 3567 } 3568 jcc(Assembler::zero, COMPARE_TAIL); 3569 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3570 lea(str1, Address(str1, result, scale)); 3571 lea(str2, Address(str2, result, scale)); 3572 } else { 3573 lea(str1, Address(str1, result, scale1)); 3574 lea(str2, Address(str2, result, scale2)); 3575 } 3576 negptr(result); 3577 3578 // pcmpestri 3579 // inputs: 3580 // vec1- substring 3581 // rax - negative string length (elements count) 3582 // mem - scanned string 3583 // rdx - string length (elements count) 3584 // pcmpmask - cmp mode: 11000 (string compare with negated result) 3585 // + 00 (unsigned bytes) or + 01 (unsigned shorts) 3586 // outputs: 3587 // rcx - first mismatched element index 3588 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3589 3590 bind(COMPARE_WIDE_VECTORS); 3591 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3592 movdqu(vec1, Address(str1, result, scale)); 3593 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 3594 } else { 3595 pmovzxbw(vec1, Address(str1, result, scale1)); 3596 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 3597 } 3598 // After pcmpestri cnt1(rcx) contains mismatched element index 3599 3600 jccb(Assembler::below, VECTOR_NOT_EQUAL); // CF==1 3601 addptr(result, stride); 3602 subptr(cnt2, stride); 3603 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 3604 3605 // compare wide vectors tail 3606 testptr(result, result); 3607 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3608 3609 movl(cnt2, stride); 3610 movl(result, stride); 3611 negptr(result); 3612 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3613 movdqu(vec1, Address(str1, result, scale)); 3614 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 3615 } else { 3616 pmovzxbw(vec1, Address(str1, result, scale1)); 3617 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 3618 } 3619 jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL); 3620 3621 // Mismatched characters in the vectors 3622 bind(VECTOR_NOT_EQUAL); 3623 addptr(cnt1, result); 3624 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 3625 subl(result, cnt2); 3626 jmpb(POP_LABEL); 3627 3628 bind(COMPARE_TAIL); // limit is zero 3629 movl(cnt2, result); 3630 // Fallthru to tail compare 3631 } 3632 // Shift str2 and str1 to the end of the arrays, negate min 3633 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3634 lea(str1, Address(str1, cnt2, scale)); 3635 lea(str2, Address(str2, cnt2, scale)); 3636 } else { 3637 lea(str1, Address(str1, cnt2, scale1)); 3638 lea(str2, Address(str2, cnt2, scale2)); 3639 } 3640 decrementl(cnt2); // first character was compared already 3641 negptr(cnt2); 3642 3643 // Compare the rest of the elements 3644 bind(WHILE_HEAD_LABEL); 3645 load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae); 3646 subl(result, cnt1); 3647 jccb(Assembler::notZero, POP_LABEL); 3648 increment(cnt2); 3649 jccb(Assembler::notZero, WHILE_HEAD_LABEL); 3650 3651 // Strings are equal up to min length. Return the length difference. 
3652 bind(LENGTH_DIFF_LABEL); 3653 pop(result); 3654 if (ae == StrIntrinsicNode::UU) { 3655 // Divide diff by 2 to get number of chars 3656 sarl(result, 1); 3657 } 3658 jmpb(DONE_LABEL); 3659 3660 #ifdef _LP64 3661 if (VM_Version::supports_avx512vlbw()) { 3662 3663 bind(COMPARE_WIDE_VECTORS_LOOP_FAILED); 3664 3665 kmovql(cnt1, mask); 3666 notq(cnt1); 3667 bsfq(cnt2, cnt1); 3668 if (ae != StrIntrinsicNode::LL) { 3669 // Divide diff by 2 to get number of chars 3670 sarl(cnt2, 1); 3671 } 3672 addq(result, cnt2); 3673 if (ae == StrIntrinsicNode::LL) { 3674 load_unsigned_byte(cnt1, Address(str2, result)); 3675 load_unsigned_byte(result, Address(str1, result)); 3676 } else if (ae == StrIntrinsicNode::UU) { 3677 load_unsigned_short(cnt1, Address(str2, result, scale)); 3678 load_unsigned_short(result, Address(str1, result, scale)); 3679 } else { 3680 load_unsigned_short(cnt1, Address(str2, result, scale2)); 3681 load_unsigned_byte(result, Address(str1, result, scale1)); 3682 } 3683 subl(result, cnt1); 3684 jmpb(POP_LABEL); 3685 }//if (VM_Version::supports_avx512vlbw()) 3686 #endif // _LP64 3687 3688 // Discard the stored length difference 3689 bind(POP_LABEL); 3690 pop(cnt1); 3691 3692 // That's it 3693 bind(DONE_LABEL); 3694 if(ae == StrIntrinsicNode::UL) { 3695 negl(result); 3696 } 3697 3698 } 3699 3700 // Search for Non-ASCII character (Negative byte value) in a byte array, 3701 // return true if it has any and false otherwise. 3702 // ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java 3703 // @IntrinsicCandidate 3704 // private static boolean hasNegatives(byte[] ba, int off, int len) { 3705 // for (int i = off; i < off + len; i++) { 3706 // if (ba[i] < 0) { 3707 // return true; 3708 // } 3709 // } 3710 // return false; 3711 // } 3712 void C2_MacroAssembler::has_negatives(Register ary1, Register len, 3713 Register result, Register tmp1, 3714 XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) { 3715 // rsi: byte array 3716 // rcx: len 3717 // rax: result 3718 ShortBranchVerifier sbv(this); 3719 assert_different_registers(ary1, len, result, tmp1); 3720 assert_different_registers(vec1, vec2); 3721 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE; 3722 3723 // len == 0 3724 testl(len, len); 3725 jcc(Assembler::zero, FALSE_LABEL); 3726 3727 if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512 3728 VM_Version::supports_avx512vlbw() && 3729 VM_Version::supports_bmi2()) { 3730 3731 Label test_64_loop, test_tail; 3732 Register tmp3_aliased = len; 3733 3734 movl(tmp1, len); 3735 vpxor(vec2, vec2, vec2, Assembler::AVX_512bit); 3736 3737 andl(tmp1, 64 - 1); // tail count (in chars) 0x3F 3738 andl(len, ~(64 - 1)); // vector count (in chars) 3739 jccb(Assembler::zero, test_tail); 3740 3741 lea(ary1, Address(ary1, len, Address::times_1)); 3742 negptr(len); 3743 3744 bind(test_64_loop); 3745 // Check whether our 64 elements of size byte contain negatives 3746 evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit); 3747 kortestql(mask1, mask1); 3748 jcc(Assembler::notZero, TRUE_LABEL); 3749 3750 addptr(len, 64); 3751 jccb(Assembler::notZero, test_64_loop); 3752 3753 3754 bind(test_tail); 3755 // bail out when there is nothing to be done 3756 testl(tmp1, -1); 3757 jcc(Assembler::zero, FALSE_LABEL); 3758 3759 // ~(~0 << len) applied up to two times (for 32-bit scenario) 3760 #ifdef _LP64 3761 mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF); 3762 shlxq(tmp3_aliased, tmp3_aliased, tmp1); 3763 notq(tmp3_aliased); 3764 kmovql(mask2, 
               tmp3_aliased);
#else
    Label k_init;
    jmp(k_init);

    // We cannot read 64 bits from a general purpose register, thus we move
    // the data required to compose 64 1's to the instruction stream.
    // We emit a 64 byte wide series of elements from 0..63 which later on would
    // be used as compare targets with the tail count contained in the tmp1 register.
    // The result will be a k register having tmp1 consecutive 1's
    // counting from the least significant bit.
    address tmp = pc();
    emit_int64(0x0706050403020100);
    emit_int64(0x0F0E0D0C0B0A0908);
    emit_int64(0x1716151413121110);
    emit_int64(0x1F1E1D1C1B1A1918);
    emit_int64(0x2726252423222120);
    emit_int64(0x2F2E2D2C2B2A2928);
    emit_int64(0x3736353433323130);
    emit_int64(0x3F3E3D3C3B3A3938);

    bind(k_init);
    lea(len, InternalAddress(tmp));
    // create mask to test for negative byte inside a vector
    evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
    evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit);

#endif
    evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
    ktestq(mask1, mask2);
    jcc(Assembler::notZero, TRUE_LABEL);

    jmp(FALSE_LABEL);
  } else {
    movl(result, len); // copy

    if (UseAVX >= 2 && UseSSE >= 2) {
      // With AVX2, use 32-byte vector compare
      Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;

      // Compare 32-byte vectors
      andl(result, 0x0000001f);  // tail count (in bytes)
      andl(len, 0xffffffe0);     // vector count (in bytes)
      jccb(Assembler::zero, COMPARE_TAIL);

      lea(ary1, Address(ary1, len, Address::times_1));
      negptr(len);

      movl(tmp1, 0x80808080); // create mask to test for Unicode chars in vector
      movdl(vec2, tmp1);
      vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);

      bind(COMPARE_WIDE_VECTORS);
      vmovdqu(vec1, Address(ary1, len, Address::times_1));
      vptest(vec1, vec2);
      jccb(Assembler::notZero, TRUE_LABEL);
      addptr(len, 32);
      jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);

      testl(result, result);
      jccb(Assembler::zero, FALSE_LABEL);

      vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
      vptest(vec1, vec2);
      jccb(Assembler::notZero, TRUE_LABEL);
      jmpb(FALSE_LABEL);

      bind(COMPARE_TAIL); // len is zero
      movl(len, result);
      // Fallthru to tail compare
    } else if (UseSSE42Intrinsics) {
      // With SSE4.2, use double quad vector compare
      Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;

      // Compare 16-byte vectors
      andl(result, 0x0000000f);  // tail count (in bytes)
      andl(len, 0xfffffff0);     // vector count (in bytes)
      jcc(Assembler::zero, COMPARE_TAIL);

      lea(ary1, Address(ary1, len, Address::times_1));
      negptr(len);

      movl(tmp1, 0x80808080);
      movdl(vec2, tmp1);
      pshufd(vec2, vec2, 0);

      bind(COMPARE_WIDE_VECTORS);
      movdqu(vec1, Address(ary1, len, Address::times_1));
      ptest(vec1, vec2);
      jcc(Assembler::notZero, TRUE_LABEL);
      addptr(len, 16);
      jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);

      testl(result, result);
      jcc(Assembler::zero, FALSE_LABEL);

      movdqu(vec1, Address(ary1, result, Address::times_1, -16));
      ptest(vec1, vec2);
      jccb(Assembler::notZero, TRUE_LABEL);
      jmpb(FALSE_LABEL);

      bind(COMPARE_TAIL); // len is zero
      movl(len, result);
      // Fallthru to tail compare
    }
  }
  // Compare 4-byte vectors
andl(len, 0xfffffffc); // vector count (in bytes) 3872 jccb(Assembler::zero, COMPARE_CHAR); 3873 3874 lea(ary1, Address(ary1, len, Address::times_1)); 3875 negptr(len); 3876 3877 bind(COMPARE_VECTORS); 3878 movl(tmp1, Address(ary1, len, Address::times_1)); 3879 andl(tmp1, 0x80808080); 3880 jccb(Assembler::notZero, TRUE_LABEL); 3881 addptr(len, 4); 3882 jcc(Assembler::notZero, COMPARE_VECTORS); 3883 3884 // Compare trailing char (final 2 bytes), if any 3885 bind(COMPARE_CHAR); 3886 testl(result, 0x2); // tail char 3887 jccb(Assembler::zero, COMPARE_BYTE); 3888 load_unsigned_short(tmp1, Address(ary1, 0)); 3889 andl(tmp1, 0x00008080); 3890 jccb(Assembler::notZero, TRUE_LABEL); 3891 subptr(result, 2); 3892 lea(ary1, Address(ary1, 2)); 3893 3894 bind(COMPARE_BYTE); 3895 testl(result, 0x1); // tail byte 3896 jccb(Assembler::zero, FALSE_LABEL); 3897 load_unsigned_byte(tmp1, Address(ary1, 0)); 3898 andl(tmp1, 0x00000080); 3899 jccb(Assembler::notEqual, TRUE_LABEL); 3900 jmpb(FALSE_LABEL); 3901 3902 bind(TRUE_LABEL); 3903 movl(result, 1); // return true 3904 jmpb(DONE); 3905 3906 bind(FALSE_LABEL); 3907 xorl(result, result); // return false 3908 3909 // That's it 3910 bind(DONE); 3911 if (UseAVX >= 2 && UseSSE >= 2) { 3912 // clean upper bits of YMM registers 3913 vpxor(vec1, vec1); 3914 vpxor(vec2, vec2); 3915 } 3916 } 3917 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings. 3918 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2, 3919 Register limit, Register result, Register chr, 3920 XMMRegister vec1, XMMRegister vec2, bool is_char, KRegister mask) { 3921 ShortBranchVerifier sbv(this); 3922 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE; 3923 3924 int length_offset = arrayOopDesc::length_offset_in_bytes(); 3925 int base_offset = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE); 3926 3927 if (is_array_equ) { 3928 // Check the input args 3929 cmpoop(ary1, ary2); 3930 jcc(Assembler::equal, TRUE_LABEL); 3931 3932 // Need additional checks for arrays_equals. 3933 testptr(ary1, ary1); 3934 jcc(Assembler::zero, FALSE_LABEL); 3935 testptr(ary2, ary2); 3936 jcc(Assembler::zero, FALSE_LABEL); 3937 3938 // Check the lengths 3939 movl(limit, Address(ary1, length_offset)); 3940 cmpl(limit, Address(ary2, length_offset)); 3941 jcc(Assembler::notEqual, FALSE_LABEL); 3942 } 3943 3944 // count == 0 3945 testl(limit, limit); 3946 jcc(Assembler::zero, TRUE_LABEL); 3947 3948 if (is_array_equ) { 3949 // Load array address 3950 lea(ary1, Address(ary1, base_offset)); 3951 lea(ary2, Address(ary2, base_offset)); 3952 } 3953 3954 if (is_array_equ && is_char) { 3955 // arrays_equals when used for char[]. 
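    // 'limit' currently holds the element count; char[] elements are two
    // bytes wide, so the shift below converts it into a byte count.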
    shll(limit, 1);   // byte count != 0
  }
  movl(result, limit); // copy

  if (UseAVX >= 2) {
    // With AVX2, use 32-byte vector compare
    Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;

    // Compare 32-byte vectors
    andl(result, 0x0000001f);  // tail count (in bytes)
    andl(limit, 0xffffffe0);   // vector count (in bytes)
    jcc(Assembler::zero, COMPARE_TAIL);

    lea(ary1, Address(ary1, limit, Address::times_1));
    lea(ary2, Address(ary2, limit, Address::times_1));
    negptr(limit);

#ifdef _LP64
    if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
      Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;

      cmpl(limit, -64);
      jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);

      bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop

      evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
      evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
      kortestql(mask, mask);
      jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
      addptr(limit, 64);  // update since we already compared at this addr
      cmpl(limit, -64);
      jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);

      // At this point we may still need to compare -limit+result bytes.
      // We could execute the next two instructions and just continue via non-wide path:
      //  cmpl(limit, 0);
      //  jcc(Assembler::equal, COMPARE_TAIL);  // true
      // But since we stopped at the points ary{1,2}+limit which are
      // not farther than 64 bytes from the ends of arrays ary{1,2}+result
      // (|limit| <= 32 and result < 32),
      // we may just compare the last 64 bytes.
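      // (In other words, at most -limit + result <= 32 + 31 < 64 bytes remain,
      //  at least 64 bytes lie below ary{1,2} + result, and any bytes the
      //  trailing 64-byte compare re-reads were already found equal, so one
      //  compare ending exactly at ary{1,2} + result finishes the job.)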
3998 // 3999 addptr(result, -64); // it is safe, bc we just came from this area 4000 evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit); 4001 evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit); 4002 kortestql(mask, mask); 4003 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4004 4005 jmp(TRUE_LABEL); 4006 4007 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 4008 4009 }//if (VM_Version::supports_avx512vlbw()) 4010 #endif //_LP64 4011 bind(COMPARE_WIDE_VECTORS); 4012 vmovdqu(vec1, Address(ary1, limit, Address::times_1)); 4013 vmovdqu(vec2, Address(ary2, limit, Address::times_1)); 4014 vpxor(vec1, vec2); 4015 4016 vptest(vec1, vec1); 4017 jcc(Assembler::notZero, FALSE_LABEL); 4018 addptr(limit, 32); 4019 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4020 4021 testl(result, result); 4022 jcc(Assembler::zero, TRUE_LABEL); 4023 4024 vmovdqu(vec1, Address(ary1, result, Address::times_1, -32)); 4025 vmovdqu(vec2, Address(ary2, result, Address::times_1, -32)); 4026 vpxor(vec1, vec2); 4027 4028 vptest(vec1, vec1); 4029 jccb(Assembler::notZero, FALSE_LABEL); 4030 jmpb(TRUE_LABEL); 4031 4032 bind(COMPARE_TAIL); // limit is zero 4033 movl(limit, result); 4034 // Fallthru to tail compare 4035 } else if (UseSSE42Intrinsics) { 4036 // With SSE4.2, use double quad vector compare 4037 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 4038 4039 // Compare 16-byte vectors 4040 andl(result, 0x0000000f); // tail count (in bytes) 4041 andl(limit, 0xfffffff0); // vector count (in bytes) 4042 jcc(Assembler::zero, COMPARE_TAIL); 4043 4044 lea(ary1, Address(ary1, limit, Address::times_1)); 4045 lea(ary2, Address(ary2, limit, Address::times_1)); 4046 negptr(limit); 4047 4048 bind(COMPARE_WIDE_VECTORS); 4049 movdqu(vec1, Address(ary1, limit, Address::times_1)); 4050 movdqu(vec2, Address(ary2, limit, Address::times_1)); 4051 pxor(vec1, vec2); 4052 4053 ptest(vec1, vec1); 4054 jcc(Assembler::notZero, FALSE_LABEL); 4055 addptr(limit, 16); 4056 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4057 4058 testl(result, result); 4059 jcc(Assembler::zero, TRUE_LABEL); 4060 4061 movdqu(vec1, Address(ary1, result, Address::times_1, -16)); 4062 movdqu(vec2, Address(ary2, result, Address::times_1, -16)); 4063 pxor(vec1, vec2); 4064 4065 ptest(vec1, vec1); 4066 jccb(Assembler::notZero, FALSE_LABEL); 4067 jmpb(TRUE_LABEL); 4068 4069 bind(COMPARE_TAIL); // limit is zero 4070 movl(limit, result); 4071 // Fallthru to tail compare 4072 } 4073 4074 // Compare 4-byte vectors 4075 andl(limit, 0xfffffffc); // vector count (in bytes) 4076 jccb(Assembler::zero, COMPARE_CHAR); 4077 4078 lea(ary1, Address(ary1, limit, Address::times_1)); 4079 lea(ary2, Address(ary2, limit, Address::times_1)); 4080 negptr(limit); 4081 4082 bind(COMPARE_VECTORS); 4083 movl(chr, Address(ary1, limit, Address::times_1)); 4084 cmpl(chr, Address(ary2, limit, Address::times_1)); 4085 jccb(Assembler::notEqual, FALSE_LABEL); 4086 addptr(limit, 4); 4087 jcc(Assembler::notZero, COMPARE_VECTORS); 4088 4089 // Compare trailing char (final 2 bytes), if any 4090 bind(COMPARE_CHAR); 4091 testl(result, 0x2); // tail char 4092 jccb(Assembler::zero, COMPARE_BYTE); 4093 load_unsigned_short(chr, Address(ary1, 0)); 4094 load_unsigned_short(limit, Address(ary2, 0)); 4095 cmpl(chr, limit); 4096 jccb(Assembler::notEqual, FALSE_LABEL); 4097 4098 if (is_array_equ && is_char) { 4099 bind(COMPARE_BYTE); 4100 } else { 4101 lea(ary1, Address(ary1, 2)); 4102 lea(ary2, Address(ary2, 2)); 4103 4104 bind(COMPARE_BYTE); 4105 testl(result, 0x1); 
    // tail byte
    jccb(Assembler::zero, TRUE_LABEL);
    load_unsigned_byte(chr, Address(ary1, 0));
    load_unsigned_byte(limit, Address(ary2, 0));
    cmpl(chr, limit);
    jccb(Assembler::notEqual, FALSE_LABEL);
  }
  bind(TRUE_LABEL);
  movl(result, 1);   // return true
  jmpb(DONE);

  bind(FALSE_LABEL);
  xorl(result, result); // return false

  // That's it
  bind(DONE);
  if (UseAVX >= 2) {
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    vpxor(vec2, vec2);
  }
}

#ifdef _LP64
void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
                                              Register tmp, KRegister ktmp, int masklen, int vec_enc) {
  assert(VM_Version::supports_avx512vlbw(), "");
  vpxor(xtmp, xtmp, xtmp, vec_enc);
  vpsubb(xtmp, xtmp, mask, vec_enc);
  evpmovb2m(ktmp, xtmp, vec_enc);
  kmovql(tmp, ktmp);
  switch(opc) {
    case Op_VectorMaskTrueCount:
      popcntq(dst, tmp);
      break;
    case Op_VectorMaskLastTrue:
      mov64(dst, -1);
      bsrq(tmp, tmp);
      cmov(Assembler::notZero, dst, tmp);
      break;
    case Op_VectorMaskFirstTrue:
      mov64(dst, masklen);
      bsfq(tmp, tmp);
      cmov(Assembler::notZero, dst, tmp);
      break;
    default: assert(false, "Unhandled mask operation");
  }
}

void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
                                              XMMRegister xtmp1, Register tmp, int masklen, int vec_enc) {
  assert(VM_Version::supports_avx(), "");
  vpxor(xtmp, xtmp, xtmp, vec_enc);
  vpsubb(xtmp, xtmp, mask, vec_enc);
  vpmovmskb(tmp, xtmp, vec_enc);
  if (masklen < 64) {
    andq(tmp, (((jlong)1 << masklen) - 1));
  }
  switch(opc) {
    case Op_VectorMaskTrueCount:
      popcntq(dst, tmp);
      break;
    case Op_VectorMaskLastTrue:
      mov64(dst, -1);
      bsrq(tmp, tmp);
      cmov(Assembler::notZero, dst, tmp);
      break;
    case Op_VectorMaskFirstTrue:
      mov64(dst, masklen);
      bsfq(tmp, tmp);
      cmov(Assembler::notZero, dst, tmp);
      break;
    default: assert(false, "Unhandled mask operation");
  }
}
#endif

void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
                                        XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
                                        int vlen_enc) {
  assert(VM_Version::supports_avx512bw(), "");
  // Byte shuffles are in-lane operations and indices are determined using
  // the lower 4 bits of each shuffle lane, thus all shuffle indices are
  // normalized to the index range 0-15. This makes sure that all the multiples
  // of an index value are placed at the same relative position in a 128 bit
  // lane, i.e. elements corresponding to shuffle indices 16, 32 and 64
  // will be the 16th element in their respective 128 bit lanes.
  movl(rtmp, 16);
  evpbroadcastb(xtmp1, rtmp, vlen_enc);

  // Compute a mask for shuffle vector by comparing indices with expression INDEX < 16,
  // Broadcast first 128 bit lane across entire vector, shuffle the vector lanes using
  // original shuffle indices and move the shuffled lanes corresponding to true
  // mask to destination vector.
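  // For example, a shuffle index of 37 (0x25) has its upper bits selecting
  // the third 128 bit lane (source bytes 32..47) and its lower 4 bits (5)
  // selecting the byte within that lane; the four compare/shuffle rounds
  // below each broadcast one source lane and merge their results under the
  // computed masks.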
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
  evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);

  // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
  // and broadcasting second 128 bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
  vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
  // and broadcasting third 128 bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp2, Assembler::nlt, true, vlen_enc);
  vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
  // and broadcasting fourth 128 bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
  vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
}

#ifdef _LP64
void C2_MacroAssembler::load_nklass_compact_c2(Register dst, Register obj, Register index, Address::ScaleFactor scale, int disp) {
  C2LoadNKlassStub* stub = new (Compile::current()->comp_arena()) C2LoadNKlassStub(dst);
  Compile::current()->output()->add_stub(stub);

  // Note: Don't clobber obj anywhere in this method!

  // The incoming address is pointing into obj-start + klass_offset_in_bytes. We need to extract
  // obj-start, so that we can load from the object's mark-word instead. Usually the address
  // comes as obj-start in obj and klass_offset_in_bytes in disp. However, sometimes C2
  // emits code that pre-computes obj-start + klass_offset_in_bytes into a register, and
  // then passes that register as obj and 0 in disp. The following code extracts the base
  // and offset to load the mark-word.
  int offset = oopDesc::mark_offset_in_bytes() + disp - oopDesc::klass_offset_in_bytes();
  movq(dst, Address(obj, index, scale, offset));
  testb(dst, markWord::monitor_value);
  jcc(Assembler::notZero, stub->entry());
  bind(stub->continuation());
  shrq(dst, markWord::klass_shift);
}
#endif