1 /* 2 * Copyright (c) 2020, 2021, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 22 * 23 */ 24 25 #include "precompiled.hpp" 26 #include "asm/assembler.hpp" 27 #include "asm/assembler.inline.hpp" 28 #include "oops/methodData.hpp" 29 #include "opto/c2_CodeStubs.hpp" 30 #include "opto/c2_MacroAssembler.hpp" 31 #include "opto/intrinsicnode.hpp" 32 #include "opto/opcodes.hpp" 33 #include "opto/output.hpp" 34 #include "opto/subnode.hpp" 35 #include "runtime/biasedLocking.hpp" 36 #include "runtime/objectMonitor.hpp" 37 #include "runtime/stubRoutines.hpp" 38 39 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) { 40 switch (vlen_in_bytes) { 41 case 4: // fall-through 42 case 8: // fall-through 43 case 16: return Assembler::AVX_128bit; 44 case 32: return Assembler::AVX_256bit; 45 case 64: return Assembler::AVX_512bit; 46 47 default: { 48 ShouldNotReachHere(); 49 return Assembler::AVX_NoVec; 50 } 51 } 52 } 53 54 void C2_MacroAssembler::setvectmask(Register dst, Register src, KRegister mask) { 55 guarantee(PostLoopMultiversioning, "must be"); 56 Assembler::movl(dst, 1); 57 Assembler::shlxl(dst, dst, src); 58 Assembler::decl(dst); 59 Assembler::kmovdl(mask, dst); 60 Assembler::movl(dst, src); 61 } 62 63 void C2_MacroAssembler::restorevectmask(KRegister mask) { 64 guarantee(PostLoopMultiversioning, "must be"); 65 Assembler::knotwl(mask, k0); 66 } 67 68 #if INCLUDE_RTM_OPT 69 70 // Update rtm_counters based on abort status 71 // input: abort_status 72 // rtm_counters (RTMLockingCounters*) 73 // flags are killed 74 void C2_MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) { 75 76 atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset())); 77 if (PrintPreciseRTMLockingStatistics) { 78 for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) { 79 Label check_abort; 80 testl(abort_status, (1<<i)); 81 jccb(Assembler::equal, check_abort); 82 atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx)))); 83 bind(check_abort); 84 } 85 } 86 } 87 88 // Branch if (random & (count-1) != 0), count is 2^n 89 // tmp, scr and flags are killed 90 void C2_MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) { 91 assert(tmp == rax, ""); 92 assert(scr == rdx, ""); 93 rdtsc(); // modifies EDX:EAX 94 andptr(tmp, count-1); 95 jccb(Assembler::notZero, brLabel); 96 } 97 98 // Perform abort ratio calculation, set no_rtm bit if high ratio 99 // input: 
rtm_counters_Reg (RTMLockingCounters* address) 100 // tmpReg, rtm_counters_Reg and flags are killed 101 void C2_MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg, 102 Register rtm_counters_Reg, 103 RTMLockingCounters* rtm_counters, 104 Metadata* method_data) { 105 Label L_done, L_check_always_rtm1, L_check_always_rtm2; 106 107 if (RTMLockingCalculationDelay > 0) { 108 // Delay calculation 109 movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()), tmpReg); 110 testptr(tmpReg, tmpReg); 111 jccb(Assembler::equal, L_done); 112 } 113 // Abort ratio calculation only if abort_count > RTMAbortThreshold 114 // Aborted transactions = abort_count * 100 115 // All transactions = total_count * RTMTotalCountIncrRate 116 // Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio) 117 118 movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset())); 119 cmpptr(tmpReg, RTMAbortThreshold); 120 jccb(Assembler::below, L_check_always_rtm2); 121 imulptr(tmpReg, tmpReg, 100); 122 123 Register scrReg = rtm_counters_Reg; 124 movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset())); 125 imulptr(scrReg, scrReg, RTMTotalCountIncrRate); 126 imulptr(scrReg, scrReg, RTMAbortRatio); 127 cmpptr(tmpReg, scrReg); 128 jccb(Assembler::below, L_check_always_rtm1); 129 if (method_data != NULL) { 130 // set rtm_state to "no rtm" in MDO 131 mov_metadata(tmpReg, method_data); 132 lock(); 133 orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), NoRTM); 134 } 135 jmpb(L_done); 136 bind(L_check_always_rtm1); 137 // Reload RTMLockingCounters* address 138 lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters)); 139 bind(L_check_always_rtm2); 140 movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset())); 141 cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate); 142 jccb(Assembler::below, L_done); 143 if (method_data != NULL) { 144 // set rtm_state to "always rtm" in MDO 145 mov_metadata(tmpReg, method_data); 146 lock(); 147 orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), UseRTM); 148 } 149 bind(L_done); 150 } 151 152 // Update counters and perform abort ratio calculation 153 // input: abort_status_Reg 154 // rtm_counters_Reg, flags are killed 155 void C2_MacroAssembler::rtm_profiling(Register abort_status_Reg, 156 Register rtm_counters_Reg, 157 RTMLockingCounters* rtm_counters, 158 Metadata* method_data, 159 bool profile_rtm) { 160 161 assert(rtm_counters != NULL, "should not be NULL when profiling RTM"); 162 // update rtm counters based on rax value at abort 163 // reads abort_status_Reg, updates flags 164 lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters)); 165 rtm_counters_update(abort_status_Reg, rtm_counters_Reg); 166 if (profile_rtm) { 167 // Save abort status because abort_status_Reg is used by following code. 
168 if (RTMRetryCount > 0) { 169 push(abort_status_Reg); 170 } 171 assert(rtm_counters != NULL, "should not be NULL when profiling RTM"); 172 rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data); 173 // restore abort status 174 if (RTMRetryCount > 0) { 175 pop(abort_status_Reg); 176 } 177 } 178 } 179 180 // Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4) 181 // inputs: retry_count_Reg 182 // : abort_status_Reg 183 // output: retry_count_Reg decremented by 1 184 // flags are killed 185 void C2_MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) { 186 Label doneRetry; 187 assert(abort_status_Reg == rax, ""); 188 // The abort reason bits are in eax (see all states in rtmLocking.hpp) 189 // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4) 190 // if reason is in 0x6 and retry count != 0 then retry 191 andptr(abort_status_Reg, 0x6); 192 jccb(Assembler::zero, doneRetry); 193 testl(retry_count_Reg, retry_count_Reg); 194 jccb(Assembler::zero, doneRetry); 195 pause(); 196 decrementl(retry_count_Reg); 197 jmp(retryLabel); 198 bind(doneRetry); 199 } 200 201 // Spin and retry if lock is busy, 202 // inputs: box_Reg (monitor address) 203 // : retry_count_Reg 204 // output: retry_count_Reg decremented by 1 205 // : clear z flag if retry count exceeded 206 // tmp_Reg, scr_Reg, flags are killed 207 void C2_MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg, 208 Register tmp_Reg, Register scr_Reg, Label& retryLabel) { 209 Label SpinLoop, SpinExit, doneRetry; 210 int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner); 211 212 testl(retry_count_Reg, retry_count_Reg); 213 jccb(Assembler::zero, doneRetry); 214 decrementl(retry_count_Reg); 215 movptr(scr_Reg, RTMSpinLoopCount); 216 217 bind(SpinLoop); 218 pause(); 219 decrementl(scr_Reg); 220 jccb(Assembler::lessEqual, SpinExit); 221 movptr(tmp_Reg, Address(box_Reg, owner_offset)); 222 testptr(tmp_Reg, tmp_Reg); 223 jccb(Assembler::notZero, SpinLoop); 224 225 bind(SpinExit); 226 jmp(retryLabel); 227 bind(doneRetry); 228 incrementl(retry_count_Reg); // clear z flag 229 } 230 231 // Use RTM for normal stack locks 232 // Input: objReg (object to lock) 233 void C2_MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg, 234 Register retry_on_abort_count_Reg, 235 RTMLockingCounters* stack_rtm_counters, 236 Metadata* method_data, bool profile_rtm, 237 Label& DONE_LABEL, Label& IsInflated) { 238 assert(UseRTMForStackLocks, "why call this otherwise?"); 239 assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking"); 240 assert(tmpReg == rax, ""); 241 assert(scrReg == rdx, ""); 242 Label L_rtm_retry, L_decrement_retry, L_on_abort; 243 244 if (RTMRetryCount > 0) { 245 movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort 246 bind(L_rtm_retry); 247 } 248 movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); 249 testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral|biased 250 jcc(Assembler::notZero, IsInflated); 251 252 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 253 Label L_noincrement; 254 if (RTMTotalCountIncrRate > 1) { 255 // tmpReg, scrReg and flags are killed 256 branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement); 257 } 258 assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM"); 259 
atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg); 260 bind(L_noincrement); 261 } 262 xbegin(L_on_abort); 263 movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword 264 andptr(tmpReg, markWord::biased_lock_mask_in_place); // look at 3 lock bits 265 cmpptr(tmpReg, markWord::unlocked_value); // bits = 001 unlocked 266 jcc(Assembler::equal, DONE_LABEL); // all done if unlocked 267 268 Register abort_status_Reg = tmpReg; // status of abort is stored in RAX 269 if (UseRTMXendForLockBusy) { 270 xend(); 271 movptr(abort_status_Reg, 0x2); // Set the abort status to 2 (so we can retry) 272 jmp(L_decrement_retry); 273 } 274 else { 275 xabort(0); 276 } 277 bind(L_on_abort); 278 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 279 rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm); 280 } 281 bind(L_decrement_retry); 282 if (RTMRetryCount > 0) { 283 // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4) 284 rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry); 285 } 286 } 287 288 // Use RTM for inflating locks 289 // inputs: objReg (object to lock) 290 // boxReg (on-stack box address (displaced header location) - KILLED) 291 // tmpReg (ObjectMonitor address + markWord::monitor_value) 292 void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg, 293 Register scrReg, Register retry_on_busy_count_Reg, 294 Register retry_on_abort_count_Reg, 295 RTMLockingCounters* rtm_counters, 296 Metadata* method_data, bool profile_rtm, 297 Label& DONE_LABEL) { 298 assert(UseRTMLocking, "why call this otherwise?"); 299 assert(tmpReg == rax, ""); 300 assert(scrReg == rdx, ""); 301 Label L_rtm_retry, L_decrement_retry, L_on_abort; 302 int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner); 303 304 // Without cast to int32_t this style of movptr will destroy r10 which is typically obj. 
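  // (Presumably the int32_t cast lets the store be encoded with a sign-extended 32-bit
  //  immediate; a full 64-bit immediate would first have to be materialized in the
  //  scratch register, r10, which C2 typically has allocated to obj.)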
305 movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value())); 306 movptr(boxReg, tmpReg); // Save ObjectMonitor address 307 308 if (RTMRetryCount > 0) { 309 movl(retry_on_busy_count_Reg, RTMRetryCount); // Retry on lock busy 310 movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort 311 bind(L_rtm_retry); 312 } 313 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 314 Label L_noincrement; 315 if (RTMTotalCountIncrRate > 1) { 316 // tmpReg, scrReg and flags are killed 317 branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement); 318 } 319 assert(rtm_counters != NULL, "should not be NULL when profiling RTM"); 320 atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg); 321 bind(L_noincrement); 322 } 323 xbegin(L_on_abort); 324 movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); 325 movptr(tmpReg, Address(tmpReg, owner_offset)); 326 testptr(tmpReg, tmpReg); 327 jcc(Assembler::zero, DONE_LABEL); 328 if (UseRTMXendForLockBusy) { 329 xend(); 330 jmp(L_decrement_retry); 331 } 332 else { 333 xabort(0); 334 } 335 bind(L_on_abort); 336 Register abort_status_Reg = tmpReg; // status of abort is stored in RAX 337 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 338 rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm); 339 } 340 if (RTMRetryCount > 0) { 341 // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4) 342 rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry); 343 } 344 345 movptr(tmpReg, Address(boxReg, owner_offset)) ; 346 testptr(tmpReg, tmpReg) ; 347 jccb(Assembler::notZero, L_decrement_retry) ; 348 349 // Appears unlocked - try to swing _owner from null to non-null. 350 // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand. 351 #ifdef _LP64 352 Register threadReg = r15_thread; 353 #else 354 get_thread(scrReg); 355 Register threadReg = scrReg; 356 #endif 357 lock(); 358 cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg 359 360 if (RTMRetryCount > 0) { 361 // success done else retry 362 jccb(Assembler::equal, DONE_LABEL) ; 363 bind(L_decrement_retry); 364 // Spin and retry if lock is busy. 365 rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry); 366 } 367 else { 368 bind(L_decrement_retry); 369 } 370 } 371 372 #endif // INCLUDE_RTM_OPT 373 374 // fast_lock and fast_unlock used by C2 375 376 // Because the transitions from emitted code to the runtime 377 // monitorenter/exit helper stubs are so slow it's critical that 378 // we inline both the stack-locking fast path and the inflated fast path. 379 // 380 // See also: cmpFastLock and cmpFastUnlock. 381 // 382 // What follows is a specialized inline transliteration of the code 383 // in enter() and exit(). If we're concerned about I$ bloat another 384 // option would be to emit TrySlowEnter and TrySlowExit methods 385 // at startup-time. These methods would accept arguments as 386 // (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure 387 // indications in the icc.ZFlag. fast_lock and fast_unlock would simply 388 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit. 389 // In practice, however, the # of lock sites is bounded and is usually small. 
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
//
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods. That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
// to park() or unpark() threads. We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) provide explicit barriers or fence operations.
//
// TODO:
//
// * Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
//   This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
//   Given TLAB allocation, Self is usually manifested in a register, so passing it into
//   the lock operators would typically be faster than reifying Self.
//
// * Ideally I'd define the primitives as:
//     fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
//     fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED.
//   Unfortunately ADLC bugs prevent us from expressing the ideal form.
//   Instead, we're stuck with the rather awkward and brittle register assignments below.
//   Furthermore the register assignments are overconstrained, possibly resulting in
//   sub-optimal code near the synchronization site.
//
// * Eliminate the sp-proximity tests and just use "== Self" tests instead.
//   Alternately, use a better sp-proximity test.
//
// * Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
//   Either one is sufficient to uniquely identify a thread.
//   TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
//
// * Intrinsify notify() and notifyAll() for the common cases where the
//   object is locked by the calling thread but the waitlist is empty.
//   This avoids the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
//
// * Use jccb and jmpb instead of jcc and jmp to improve code density.
//   But beware of excessive branch density on AMD Opterons.
//
// * Both fast_lock and fast_unlock set the ICC.ZF to indicate success
//   or failure of the fast path. If the fast path fails then we pass
//   control to the slow path, typically in C. In fast_lock and
//   fast_unlock we often branch to DONE_LABEL, just to find that C2
//   will emit a conditional branch immediately after the node.
//   So we have branches to branches and lots of ICC.ZF games.
//   Instead, it might be better to have C2 pass a "FailureLabel"
//   into fast_lock and fast_unlock. In the case of success, control
//   will drop through the node. ICC.ZF is undefined at exit.
442 // In the case of failure, the node will branch directly to the 443 // FailureLabel 444 445 446 // obj: object to lock 447 // box: on-stack box address (displaced header location) - KILLED 448 // rax,: tmp -- KILLED 449 // scr: tmp -- KILLED 450 void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg, 451 Register scrReg, Register cx1Reg, Register cx2Reg, Register thread, 452 BiasedLockingCounters* counters, 453 RTMLockingCounters* rtm_counters, 454 RTMLockingCounters* stack_rtm_counters, 455 Metadata* method_data, 456 bool use_rtm, bool profile_rtm) { 457 // Ensure the register assignments are disjoint 458 assert(tmpReg == rax, ""); 459 460 if (use_rtm) { 461 assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg); 462 } else { 463 assert(cx2Reg == noreg, ""); 464 assert_different_registers(objReg, boxReg, tmpReg, scrReg); 465 } 466 467 if (counters != NULL) { 468 atomic_incl(ExternalAddress((address)counters->total_entry_count_addr()), scrReg); 469 } 470 471 // Possible cases that we'll encounter in fast_lock 472 // ------------------------------------------------ 473 // * Inflated 474 // -- unlocked 475 // -- Locked 476 // = by self 477 // = by other 478 // * biased 479 // -- by Self 480 // -- by other 481 // * neutral 482 // * stack-locked 483 // -- by self 484 // = sp-proximity test hits 485 // = sp-proximity test generates false-negative 486 // -- by other 487 // 488 489 Label IsInflated, DONE_LABEL; 490 491 if (DiagnoseSyncOnValueBasedClasses != 0) { 492 load_klass(tmpReg, objReg, cx1Reg); 493 movl(tmpReg, Address(tmpReg, Klass::access_flags_offset())); 494 testl(tmpReg, JVM_ACC_IS_VALUE_BASED_CLASS); 495 jcc(Assembler::notZero, DONE_LABEL); 496 } 497 498 // it's stack-locked, biased or neutral 499 // TODO: optimize away redundant LDs of obj->mark and improve the markword triage 500 // order to reduce the number of conditional branches in the most common cases. 501 // Beware -- there's a subtle invariant that fetch of the markword 502 // at [FETCH], below, will never observe a biased encoding (*101b). 503 // If this invariant is not held we risk exclusion (safety) failure. 504 if (UseBiasedLocking && !UseOptoBiasInlining) { 505 biased_locking_enter(boxReg, objReg, tmpReg, scrReg, cx1Reg, false, DONE_LABEL, NULL, counters); 506 } 507 508 #if INCLUDE_RTM_OPT 509 if (UseRTMForStackLocks && use_rtm) { 510 rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg, 511 stack_rtm_counters, method_data, profile_rtm, 512 DONE_LABEL, IsInflated); 513 } 514 #endif // INCLUDE_RTM_OPT 515 516 movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // [FETCH] 517 testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral|biased 518 jcc(Assembler::notZero, IsInflated); 519 520 if (LockingMode == LM_MONITOR) { 521 // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0. 522 testptr(objReg, objReg); 523 } else if (LockingMode == LM_LEGACY) { 524 // Attempt stack-locking ... 525 orptr (tmpReg, markWord::unlocked_value); 526 movptr(Address(boxReg, 0), tmpReg); // Anticipate successful CAS 527 lock(); 528 cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Updates tmpReg 529 if (counters != NULL) { 530 cond_inc32(Assembler::equal, 531 ExternalAddress((address)counters->fast_path_entry_count_addr())); 532 } 533 jcc(Assembler::equal, DONE_LABEL); // Success 534 535 // Recursive locking. 536 // The object is stack-locked: markword contains stack pointer to BasicLock. 
    // Locked by current thread if difference with current SP is less than one page.
    subptr(tmpReg, rsp);
    // The next instruction sets ZFlag == 1 (Success) if the difference is less than one page.
    andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) );
    movptr(Address(boxReg, 0), tmpReg);
    if (counters != NULL) {
      cond_inc32(Assembler::equal,
                 ExternalAddress((address)counters->fast_path_entry_count_addr()));
    }
  } else {
    assert(LockingMode == LM_LIGHTWEIGHT, "");
    fast_lock_impl(objReg, tmpReg, thread, scrReg, DONE_LABEL);
    xorl(tmpReg, tmpReg); // Set ZF=1 to indicate success
  }
  jmp(DONE_LABEL);

  bind(IsInflated);
  // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value

#if INCLUDE_RTM_OPT
  // Use the same RTM locking code in 32- and 64-bit VM.
  if (use_rtm) {
    rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
                         rtm_counters, method_data, profile_rtm, DONE_LABEL);
  } else {
#endif // INCLUDE_RTM_OPT

#ifndef _LP64
  // The object is inflated.

  // boxReg refers to the on-stack BasicLock in the current frame.
  // We'd like to write:
  //   set box->_displaced_header = markWord::unused_mark(). Any non-0 value suffices.
  // This is convenient but results in a ST-before-CAS penalty. The following CAS suffers
  // additional latency as we have another ST in the store buffer that must drain.

  // avoid ST-before-CAS
  // register juggle because we need tmpReg for cmpxchgptr below
  movptr(scrReg, boxReg);
  movptr(boxReg, tmpReg); // consider: LEA box, [tmp-2]

  // Optimistic form: consider XORL tmpReg,tmpReg
  movptr(tmpReg, NULL_WORD);

  // Appears unlocked - try to swing _owner from null to non-null.
  // Ideally, I'd manifest "Self" with get_thread and then attempt
  // to CAS the register containing Self into m->Owner.
  // But we don't have enough registers, so instead we can either try to CAS
  // rsp or the address of the box (in scr) into &m->owner. If the CAS succeeds
  // we later store "Self" into m->Owner. Transiently storing a stack address
  // (rsp or the address of the box) into m->owner is harmless.
  // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
  lock();
  cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  movptr(Address(scrReg, 0), 3); // box->_displaced_header = 3
  // If we weren't able to swing _owner from NULL to the BasicLock
  // then take the slow path.
  jccb  (Assembler::notZero, DONE_LABEL);
  // update _owner from BasicLock to thread
  get_thread (scrReg); // beware: clobbers ICCs
  movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
  xorptr(boxReg, boxReg); // set icc.ZFlag = 1 to indicate success

  // If the CAS fails we can either retry or pass control to the slow path.
  // We use the latter tactic.
  // Pass the CAS result in the icc.ZFlag into DONE_LABEL
  // If the CAS was successful ...
  //   Self has acquired the lock
  //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
  // Intentional fall-through into DONE_LABEL ...
#else // _LP64
  // It's inflated and we use scrReg for ObjectMonitor* in this section.
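  // What follows is the monitor-enter CAS: try to swing _owner from NULL to the
  // current thread, then handle the recursive case. As a sketch (hedged; casOwner is a
  // hypothetical helper and the field accesses are simplified stand-ins):
  //
  //   if (casOwner(m, NULL, r15_thread)) {        // CAS leaves ZF = 1 on success
  //     // we now own the monitor
  //   } else if (m->owner == r15_thread) {        // CAS failed: already owned by us?
  //     m->recursions++;                          // recursive enter; force ZF = 1
  //   } else {
  //     // owned by another thread: ZF = 0, fall through to the slow path
  //   }
  //
  // The box's displaced header is also set unconditionally to a non-zero sentinel so
  // that the matching fast_unlock does not mistake this for a recursive stack-lock.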
  movq(scrReg, tmpReg);
  xorq(tmpReg, tmpReg);
  lock();
  cmpxchgptr(r15_thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // Unconditionally set box->_displaced_header = markWord::unused_mark().
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
  // Propagate ICC.ZF from CAS above into DONE_LABEL.
  jcc(Assembler::equal, DONE_LABEL);          // CAS above succeeded; propagate ZF = 1 (success)

  cmpptr(r15_thread, rax);                    // Check if we are already the owner (recursive lock)
  jcc(Assembler::notEqual, DONE_LABEL);       // If not recursive, ZF = 0 at this point (fail)
  incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
#endif // _LP64
#if INCLUDE_RTM_OPT
  } // use_rtm()
#endif
  // DONE_LABEL is a hot target - we'd really like to place it at the
  // start of a cache line by padding with NOPs.
  // See the AMD and Intel software optimization manuals for the
  // most efficient "long" NOP encodings.
  // Unfortunately none of our alignment mechanisms suffice.
  bind(DONE_LABEL);

  // At DONE_LABEL the icc ZFlag is set as follows ...
  // fast_unlock uses the same protocol.
  //   ZFlag == 1 -> Success
  //   ZFlag == 0 -> Failure - force control through the slow path
}

// obj: object to unlock
// box: box address (displaced header location), killed. Must be EAX.
// tmp: killed, cannot be obj nor box.
//
// Some commentary on balanced locking:
//
// fast_lock and fast_unlock are emitted only for provably balanced lock sites.
// Methods that don't have provably balanced locking are forced to run in the
// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
// The interpreter provides two properties:
// I1: At return-time the interpreter automatically and quietly unlocks any
//     objects acquired by the current activation (frame). Recall that the
//     interpreter maintains an on-stack list of locks currently held by
//     a frame.
// I2: If a method attempts to unlock an object that is not held by the
//     frame the interpreter throws IMSX.
//
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
// B() doesn't have provably balanced locking so it runs in the interpreter.
// Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O
// is still locked by A().
//
// The only other source of unbalanced locking would be JNI. The "Java Native Interface:
// Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
// should not be unlocked by "normal" java-level locking and vice-versa. The specification
// doesn't say what will occur if a program engages in such mixed-mode locking, however.
// Arguably, given that the spec legislates the JNI case as undefined, our implementation
// could reasonably *avoid* checking the owner in fast_unlock().
// In the interest of performance we elide the m->Owner == Self check in unlock.
// A perfectly viable alternative is to elide the owner check except when
// Xcheck:jni is enabled.
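//
// For reference, the fast path emitted below is roughly equivalent to the following
// C++ sketch. It is illustrative only: markOf/casMark/monitor_from_mark and the direct
// field accesses are simplifications of the real oopDesc/ObjectMonitor interfaces, and
// the RTM, LM_LIGHTWEIGHT and _succ-handling variations are omitted. "return true"
// corresponds to ZFlag == 1 at DONE_LABEL, "return false" to ZFlag == 0 (slow path).
//
//   bool fast_unlock_sketch(oop obj, BasicLock* box) {
//     if (box->displaced_header() == 0) {
//       return true;                                   // recursive stack-lock; nothing to undo
//     }
//     intptr_t mark = markOf(obj);
//     if ((mark & markWord::monitor_value) == 0) {     // still stack-locked
//       // swing the displaced header back into the object, undoing fast_lock's CAS
//       return casMark(obj, /*expected*/ (intptr_t)box, /*new*/ box->displaced_header());
//     }
//     ObjectMonitor* m = monitor_from_mark(mark);      // strip markWord::monitor_value
//     if (m->recursions != 0) {
//       m->recursions--;                               // recursive inflated unlock
//       return true;
//     }
//     if (m->cxq == NULL && m->EntryList == NULL) {
//       m->owner = NULL;                               // uncontended 1-0 exit (plus fence)
//       return true;
//     }
//     return false;                                    // contended; defer to the slow path
//   }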
671 672 void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) { 673 assert(boxReg == rax, ""); 674 assert_different_registers(objReg, boxReg, tmpReg); 675 676 Label DONE_LABEL, Stacked, CheckSucc; 677 678 // Critically, the biased locking test must have precedence over 679 // and appear before the (box->dhw == 0) recursive stack-lock test. 680 if (UseBiasedLocking && !UseOptoBiasInlining) { 681 biased_locking_exit(objReg, tmpReg, DONE_LABEL); 682 } 683 684 #if INCLUDE_RTM_OPT 685 if (UseRTMForStackLocks && use_rtm) { 686 assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking"); 687 Label L_regular_unlock; 688 movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword 689 andptr(tmpReg, markWord::biased_lock_mask_in_place); // look at 3 lock bits 690 cmpptr(tmpReg, markWord::unlocked_value); // bits = 001 unlocked 691 jccb(Assembler::notEqual, L_regular_unlock); // if !HLE RegularLock 692 xend(); // otherwise end... 693 jmp(DONE_LABEL); // ... and we're done 694 bind(L_regular_unlock); 695 } 696 #endif 697 698 if (LockingMode == LM_LEGACY) { 699 cmpptr(Address(boxReg, 0), (int32_t)NULL_WORD); // Examine the displaced header 700 jcc (Assembler::zero, DONE_LABEL); // 0 indicates recursive stack-lock 701 } 702 movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Examine the object's markword 703 if (LockingMode != LM_MONITOR) { 704 testptr(tmpReg, markWord::monitor_value); // Inflated? 705 jcc(Assembler::zero, Stacked); 706 } 707 708 // It's inflated. 709 if (LockingMode == LM_LIGHTWEIGHT) { 710 // If the owner is ANONYMOUS, we need to fix it. 711 testb(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t) ObjectMonitor::ANONYMOUS_OWNER); 712 #ifdef _LP64 713 if (!Compile::current()->output()->in_scratch_emit_size()) { 714 C2HandleAnonOMOwnerStub* stub = new (Compile::current()->comp_arena()) C2HandleAnonOMOwnerStub(tmpReg, boxReg); 715 Compile::current()->output()->add_stub(stub); 716 jcc(Assembler::notEqual, stub->entry()); 717 bind(stub->continuation()); 718 } else 719 #endif 720 { 721 // We can't easily implement this optimization on 32 bit because we don't have a thread register. 722 // Call the slow-path instead. 723 jcc(Assembler::notEqual, DONE_LABEL); 724 } 725 } 726 727 #if INCLUDE_RTM_OPT 728 if (use_rtm) { 729 Label L_regular_inflated_unlock; 730 int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner); 731 movptr(boxReg, Address(tmpReg, owner_offset)); 732 testptr(boxReg, boxReg); 733 jccb(Assembler::notZero, L_regular_inflated_unlock); 734 xend(); 735 jmp(DONE_LABEL); 736 bind(L_regular_inflated_unlock); 737 } 738 #endif 739 740 // Despite our balanced locking property we still check that m->_owner == Self 741 // as java routines or native JNI code called by this thread might 742 // have released the lock. 743 // Refer to the comments in synchronizer.cpp for how we might encode extra 744 // state in _succ so we can avoid fetching EntryList|cxq. 745 // 746 // If there's no contention try a 1-0 exit. That is, exit without 747 // a costly MEMBAR or CAS. See synchronizer.cpp for details on how 748 // we detect and recover from the race that the 1-0 exit admits. 749 // 750 // Conceptually fast_unlock() must execute a STST|LDST "release" barrier 751 // before it STs null into _owner, releasing the lock. 
// Updates to data protected by the critical section must be visible before
// we drop the lock (and thus before any other thread could acquire
// the lock and observe the fields protected by the lock).
// IA32's memory-model is SPO, so STs are ordered with respect to
// each other and there's no need for an explicit barrier (fence).
// See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
#ifndef _LP64
  get_thread (boxReg);

  // Note that we could employ various encoding schemes to reduce
  // the number of loads below (currently 4) to just 2 or 3.
  // Refer to the comments in synchronizer.cpp.
  // In practice the chain of fetches doesn't seem to impact performance, however.
  xorptr(boxReg, boxReg);
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jccb  (Assembler::notZero, DONE_LABEL);
  movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
  jccb  (Assembler::notZero, DONE_LABEL);
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
  jmpb  (DONE_LABEL);

  // Intentional fall-through into DONE_LABEL

  // DONE_LABEL is a hot target - we'd really like to place it at the
  // start of a cache line by padding with NOPs.
  // See the AMD and Intel software optimization manuals for the
  // most efficient "long" NOP encodings.
  // Unfortunately none of our alignment mechanisms suffice.
  bind (CheckSucc);
#else // _LP64
  // It's inflated
  Label LNotRecursive, LSuccess, LGoSlowPath;

  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
  jccb(Assembler::equal, LNotRecursive);

  // Recursive inflated unlock
  decq(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jmpb(LSuccess);

  bind(LNotRecursive);
  movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
  jccb  (Assembler::notZero, CheckSucc);
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
  jmpb  (DONE_LABEL);

  // Try to avoid passing control into the slow_path ...
  bind (CheckSucc);

  // The following optional optimization can be elided if necessary
  // Effectively: if (succ == null) goto slow path
  // The code reduces the window for a race, however,
  // and thus benefits performance.
  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
  jccb  (Assembler::zero, LGoSlowPath);

  xorptr(boxReg, boxReg);
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);

  // Memory barrier/fence
  // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
  // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
  // This is faster on Nehalem and AMD Shanghai/Barcelona.
  // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
  // We might also restructure (ST Owner=0;barrier;LD _Succ) to
  // (mov box,0; xchgq box, &m->Owner; LD _succ) .
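  // The ST of NULL into _owner above and the LD of _succ below must not be reordered,
  // or the exiting thread and a newly arrived contender can each miss the other and no
  // successor gets woken. Any serializing instruction can serve as the pivot; the
  // LOCK-prefixed add of 0 to a dead stack slot below is simply a cheaper full fence
  // than MFENCE on common microarchitectures. Conceptually (a sketch, not emitted code):
  //
  //   m->owner = NULL;                        // ST: drop the lock
  //   fence();                                // lock addl [rsp], 0
  //   if (m->succ == NULL) goto slow_path;    // LD: is someone still poised to take it?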
  lock(); addl(Address(rsp, 0), 0);

  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
  jccb  (Assembler::notZero, LSuccess);

  // Rare inopportune interleaving - race.
  // The successor vanished in the small window above.
  // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
  // We need to ensure progress and succession.
  // Try to reacquire the lock.
  // If that fails then the new owner is responsible for succession and this
  // thread needs to take no further action and can exit via the fast path (success).
  // If the re-acquire succeeds then pass control into the slow path.
  // As implemented, this latter mode is horrible because we generate more
  // coherence traffic on the lock *and* artificially extend the critical section
  // length by virtue of passing control into the slow path.

  // box is really RAX -- the following CMPXCHG depends on that binding
  // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
  lock();
  cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // There's no successor so we tried to regrab the lock.
  // If that didn't work, then another thread grabbed the
  // lock so we're done (and exit was a success).
  jccb  (Assembler::notEqual, LSuccess);
  // Intentional fall-through into slow path

  bind  (LGoSlowPath);
  orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
  jmpb  (DONE_LABEL);

  bind  (LSuccess);
  testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
  jmpb  (DONE_LABEL);

#endif
  if (LockingMode != LM_MONITOR) {
    bind  (Stacked);
    if (LockingMode == LM_LIGHTWEIGHT) {
      mov(boxReg, tmpReg);
      fast_unlock_impl(objReg, boxReg, tmpReg, DONE_LABEL);
      xorl(tmpReg, tmpReg);
    } else if (LockingMode == LM_LEGACY) {
      movptr(tmpReg, Address (boxReg, 0)); // re-fetch
      lock();
      cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
    }
  }
  bind(DONE_LABEL);
}

//-------------------------------------------------------------------------------------------
// Generic instruction support for use in .ad files C2 code generation

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, Register scr) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVD) {
    andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), scr);
  } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegD");
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), scr);
  }
}

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
  if (opcode == Op_AbsVD) {
    vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, scr);
  } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegD");
    vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, scr);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, Register scr) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVF) {
    andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), scr);
  } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegF");
    xorps(dst,
ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), scr); 906 } 907 } 908 909 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) { 910 if (opcode == Op_AbsVF) { 911 vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, scr); 912 } else { 913 assert((opcode == Op_NegVF),"opcode should be Op_NegF"); 914 vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, scr); 915 } 916 } 917 918 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) { 919 assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity"); 920 assert(tmp == xnoreg || elem_bt == T_LONG, "unused"); 921 922 if (opcode == Op_MinV) { 923 if (elem_bt == T_BYTE) { 924 pminsb(dst, src); 925 } else if (elem_bt == T_SHORT) { 926 pminsw(dst, src); 927 } else if (elem_bt == T_INT) { 928 pminsd(dst, src); 929 } else { 930 assert(elem_bt == T_LONG, "required"); 931 assert(tmp == xmm0, "required"); 932 assert_different_registers(dst, src, tmp); 933 movdqu(xmm0, dst); 934 pcmpgtq(xmm0, src); 935 blendvpd(dst, src); // xmm0 as mask 936 } 937 } else { // opcode == Op_MaxV 938 if (elem_bt == T_BYTE) { 939 pmaxsb(dst, src); 940 } else if (elem_bt == T_SHORT) { 941 pmaxsw(dst, src); 942 } else if (elem_bt == T_INT) { 943 pmaxsd(dst, src); 944 } else { 945 assert(elem_bt == T_LONG, "required"); 946 assert(tmp == xmm0, "required"); 947 assert_different_registers(dst, src, tmp); 948 movdqu(xmm0, src); 949 pcmpgtq(xmm0, dst); 950 blendvpd(dst, src); // xmm0 as mask 951 } 952 } 953 } 954 955 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt, 956 XMMRegister dst, XMMRegister src1, XMMRegister src2, 957 int vlen_enc) { 958 assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity"); 959 960 if (opcode == Op_MinV) { 961 if (elem_bt == T_BYTE) { 962 vpminsb(dst, src1, src2, vlen_enc); 963 } else if (elem_bt == T_SHORT) { 964 vpminsw(dst, src1, src2, vlen_enc); 965 } else if (elem_bt == T_INT) { 966 vpminsd(dst, src1, src2, vlen_enc); 967 } else { 968 assert(elem_bt == T_LONG, "required"); 969 if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) { 970 vpminsq(dst, src1, src2, vlen_enc); 971 } else { 972 assert_different_registers(dst, src1, src2); 973 vpcmpgtq(dst, src1, src2, vlen_enc); 974 vblendvpd(dst, src1, src2, dst, vlen_enc); 975 } 976 } 977 } else { // opcode == Op_MaxV 978 if (elem_bt == T_BYTE) { 979 vpmaxsb(dst, src1, src2, vlen_enc); 980 } else if (elem_bt == T_SHORT) { 981 vpmaxsw(dst, src1, src2, vlen_enc); 982 } else if (elem_bt == T_INT) { 983 vpmaxsd(dst, src1, src2, vlen_enc); 984 } else { 985 assert(elem_bt == T_LONG, "required"); 986 if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) { 987 vpmaxsq(dst, src1, src2, vlen_enc); 988 } else { 989 assert_different_registers(dst, src1, src2); 990 vpcmpgtq(dst, src1, src2, vlen_enc); 991 vblendvpd(dst, src2, src1, dst, vlen_enc); 992 } 993 } 994 } 995 } 996 997 // Float/Double min max 998 999 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt, 1000 XMMRegister dst, XMMRegister a, XMMRegister b, 1001 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 1002 int vlen_enc) { 1003 assert(UseAVX > 0, "required"); 1004 assert(opcode == Op_MinV || opcode == Op_MinReductionV || 1005 opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity"); 1006 assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity"); 1007 
assert_different_registers(a, b, tmp, atmp, btmp); 1008 1009 bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV); 1010 bool is_double_word = is_double_word_type(elem_bt); 1011 1012 if (!is_double_word && is_min) { 1013 vblendvps(atmp, a, b, a, vlen_enc); 1014 vblendvps(btmp, b, a, a, vlen_enc); 1015 vminps(tmp, atmp, btmp, vlen_enc); 1016 vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1017 vblendvps(dst, tmp, atmp, btmp, vlen_enc); 1018 } else if (!is_double_word && !is_min) { 1019 vblendvps(btmp, b, a, b, vlen_enc); 1020 vblendvps(atmp, a, b, b, vlen_enc); 1021 vmaxps(tmp, atmp, btmp, vlen_enc); 1022 vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1023 vblendvps(dst, tmp, atmp, btmp, vlen_enc); 1024 } else if (is_double_word && is_min) { 1025 vblendvpd(atmp, a, b, a, vlen_enc); 1026 vblendvpd(btmp, b, a, a, vlen_enc); 1027 vminpd(tmp, atmp, btmp, vlen_enc); 1028 vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1029 vblendvpd(dst, tmp, atmp, btmp, vlen_enc); 1030 } else { 1031 assert(is_double_word && !is_min, "sanity"); 1032 vblendvpd(btmp, b, a, b, vlen_enc); 1033 vblendvpd(atmp, a, b, b, vlen_enc); 1034 vmaxpd(tmp, atmp, btmp, vlen_enc); 1035 vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1036 vblendvpd(dst, tmp, atmp, btmp, vlen_enc); 1037 } 1038 } 1039 1040 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt, 1041 XMMRegister dst, XMMRegister a, XMMRegister b, 1042 KRegister ktmp, XMMRegister atmp, XMMRegister btmp, 1043 int vlen_enc) { 1044 assert(UseAVX > 2, "required"); 1045 assert(opcode == Op_MinV || opcode == Op_MinReductionV || 1046 opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity"); 1047 assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity"); 1048 assert_different_registers(dst, a, b, atmp, btmp); 1049 1050 bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV); 1051 bool is_double_word = is_double_word_type(elem_bt); 1052 bool merge = true; 1053 1054 if (!is_double_word && is_min) { 1055 evpmovd2m(ktmp, a, vlen_enc); 1056 evblendmps(atmp, ktmp, a, b, merge, vlen_enc); 1057 evblendmps(btmp, ktmp, b, a, merge, vlen_enc); 1058 vminps(dst, atmp, btmp, vlen_enc); 1059 evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1060 evmovdqul(dst, ktmp, atmp, merge, vlen_enc); 1061 } else if (!is_double_word && !is_min) { 1062 evpmovd2m(ktmp, b, vlen_enc); 1063 evblendmps(atmp, ktmp, a, b, merge, vlen_enc); 1064 evblendmps(btmp, ktmp, b, a, merge, vlen_enc); 1065 vmaxps(dst, atmp, btmp, vlen_enc); 1066 evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1067 evmovdqul(dst, ktmp, atmp, merge, vlen_enc); 1068 } else if (is_double_word && is_min) { 1069 evpmovq2m(ktmp, a, vlen_enc); 1070 evblendmpd(atmp, ktmp, a, b, merge, vlen_enc); 1071 evblendmpd(btmp, ktmp, b, a, merge, vlen_enc); 1072 vminpd(dst, atmp, btmp, vlen_enc); 1073 evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1074 evmovdquq(dst, ktmp, atmp, merge, vlen_enc); 1075 } else { 1076 assert(is_double_word && !is_min, "sanity"); 1077 evpmovq2m(ktmp, b, vlen_enc); 1078 evblendmpd(atmp, ktmp, a, b, merge, vlen_enc); 1079 evblendmpd(btmp, ktmp, b, a, merge, vlen_enc); 1080 vmaxpd(dst, atmp, btmp, vlen_enc); 1081 evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1082 evmovdquq(dst, ktmp, atmp, merge, vlen_enc); 1083 } 1084 } 1085 1086 // Float/Double signum 1087 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, 1088 XMMRegister zero, XMMRegister one, 1089 Register scratch) { 1090 assert(opcode == 
Op_SignumF || opcode == Op_SignumD, "sanity"); 1091 1092 Label DONE_LABEL; 1093 1094 if (opcode == Op_SignumF) { 1095 assert(UseSSE > 0, "required"); 1096 ucomiss(dst, zero); 1097 jcc(Assembler::equal, DONE_LABEL); // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument 1098 jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN 1099 movflt(dst, one); 1100 jcc(Assembler::above, DONE_LABEL); 1101 xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), scratch); 1102 } else if (opcode == Op_SignumD) { 1103 assert(UseSSE > 1, "required"); 1104 ucomisd(dst, zero); 1105 jcc(Assembler::equal, DONE_LABEL); // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument 1106 jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN 1107 movdbl(dst, one); 1108 jcc(Assembler::above, DONE_LABEL); 1109 xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), scratch); 1110 } 1111 1112 bind(DONE_LABEL); 1113 } 1114 1115 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) { 1116 if (sign) { 1117 pmovsxbw(dst, src); 1118 } else { 1119 pmovzxbw(dst, src); 1120 } 1121 } 1122 1123 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1124 if (sign) { 1125 vpmovsxbw(dst, src, vector_len); 1126 } else { 1127 vpmovzxbw(dst, src, vector_len); 1128 } 1129 } 1130 1131 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1132 if (sign) { 1133 vpmovsxbd(dst, src, vector_len); 1134 } else { 1135 vpmovzxbd(dst, src, vector_len); 1136 } 1137 } 1138 1139 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1140 if (sign) { 1141 vpmovsxwd(dst, src, vector_len); 1142 } else { 1143 vpmovzxwd(dst, src, vector_len); 1144 } 1145 } 1146 1147 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src, 1148 int shift, int vector_len) { 1149 if (opcode == Op_RotateLeftV) { 1150 if (etype == T_INT) { 1151 evprold(dst, src, shift, vector_len); 1152 } else { 1153 assert(etype == T_LONG, "expected type T_LONG"); 1154 evprolq(dst, src, shift, vector_len); 1155 } 1156 } else { 1157 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV"); 1158 if (etype == T_INT) { 1159 evprord(dst, src, shift, vector_len); 1160 } else { 1161 assert(etype == T_LONG, "expected type T_LONG"); 1162 evprorq(dst, src, shift, vector_len); 1163 } 1164 } 1165 } 1166 1167 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src, 1168 XMMRegister shift, int vector_len) { 1169 if (opcode == Op_RotateLeftV) { 1170 if (etype == T_INT) { 1171 evprolvd(dst, src, shift, vector_len); 1172 } else { 1173 assert(etype == T_LONG, "expected type T_LONG"); 1174 evprolvq(dst, src, shift, vector_len); 1175 } 1176 } else { 1177 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV"); 1178 if (etype == T_INT) { 1179 evprorvd(dst, src, shift, vector_len); 1180 } else { 1181 assert(etype == T_LONG, "expected type T_LONG"); 1182 evprorvq(dst, src, shift, vector_len); 1183 } 1184 } 1185 } 1186 1187 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) { 1188 if (opcode == Op_RShiftVI) { 1189 psrad(dst, shift); 1190 } else if (opcode == Op_LShiftVI) { 1191 pslld(dst, shift); 1192 } else { 1193 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI"); 1194 
psrld(dst, shift); 1195 } 1196 } 1197 1198 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) { 1199 switch (opcode) { 1200 case Op_RShiftVI: psrad(dst, shift); break; 1201 case Op_LShiftVI: pslld(dst, shift); break; 1202 case Op_URShiftVI: psrld(dst, shift); break; 1203 1204 default: assert(false, "%s", NodeClassNames[opcode]); 1205 } 1206 } 1207 1208 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 1209 if (opcode == Op_RShiftVI) { 1210 vpsrad(dst, nds, shift, vector_len); 1211 } else if (opcode == Op_LShiftVI) { 1212 vpslld(dst, nds, shift, vector_len); 1213 } else { 1214 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI"); 1215 vpsrld(dst, nds, shift, vector_len); 1216 } 1217 } 1218 1219 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1220 switch (opcode) { 1221 case Op_RShiftVI: vpsrad(dst, src, shift, vlen_enc); break; 1222 case Op_LShiftVI: vpslld(dst, src, shift, vlen_enc); break; 1223 case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break; 1224 1225 default: assert(false, "%s", NodeClassNames[opcode]); 1226 } 1227 } 1228 1229 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) { 1230 switch (opcode) { 1231 case Op_RShiftVB: // fall-through 1232 case Op_RShiftVS: psraw(dst, shift); break; 1233 1234 case Op_LShiftVB: // fall-through 1235 case Op_LShiftVS: psllw(dst, shift); break; 1236 1237 case Op_URShiftVS: // fall-through 1238 case Op_URShiftVB: psrlw(dst, shift); break; 1239 1240 default: assert(false, "%s", NodeClassNames[opcode]); 1241 } 1242 } 1243 1244 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1245 switch (opcode) { 1246 case Op_RShiftVB: // fall-through 1247 case Op_RShiftVS: vpsraw(dst, src, shift, vlen_enc); break; 1248 1249 case Op_LShiftVB: // fall-through 1250 case Op_LShiftVS: vpsllw(dst, src, shift, vlen_enc); break; 1251 1252 case Op_URShiftVS: // fall-through 1253 case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break; 1254 1255 default: assert(false, "%s", NodeClassNames[opcode]); 1256 } 1257 } 1258 1259 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) { 1260 switch (opcode) { 1261 case Op_RShiftVL: psrlq(dst, shift); break; // using srl to implement sra on pre-avs512 systems 1262 case Op_LShiftVL: psllq(dst, shift); break; 1263 case Op_URShiftVL: psrlq(dst, shift); break; 1264 1265 default: assert(false, "%s", NodeClassNames[opcode]); 1266 } 1267 } 1268 1269 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) { 1270 if (opcode == Op_RShiftVL) { 1271 psrlq(dst, shift); // using srl to implement sra on pre-avs512 systems 1272 } else if (opcode == Op_LShiftVL) { 1273 psllq(dst, shift); 1274 } else { 1275 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL"); 1276 psrlq(dst, shift); 1277 } 1278 } 1279 1280 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1281 switch (opcode) { 1282 case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break; 1283 case Op_LShiftVL: vpsllq(dst, src, shift, vlen_enc); break; 1284 case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break; 1285 1286 default: assert(false, "%s", NodeClassNames[opcode]); 1287 } 1288 } 1289 1290 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 
1291 if (opcode == Op_RShiftVL) { 1292 evpsraq(dst, nds, shift, vector_len); 1293 } else if (opcode == Op_LShiftVL) { 1294 vpsllq(dst, nds, shift, vector_len); 1295 } else { 1296 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL"); 1297 vpsrlq(dst, nds, shift, vector_len); 1298 } 1299 } 1300 1301 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1302 switch (opcode) { 1303 case Op_RShiftVB: // fall-through 1304 case Op_RShiftVS: // fall-through 1305 case Op_RShiftVI: vpsravd(dst, src, shift, vlen_enc); break; 1306 1307 case Op_LShiftVB: // fall-through 1308 case Op_LShiftVS: // fall-through 1309 case Op_LShiftVI: vpsllvd(dst, src, shift, vlen_enc); break; 1310 1311 case Op_URShiftVB: // fall-through 1312 case Op_URShiftVS: // fall-through 1313 case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break; 1314 1315 default: assert(false, "%s", NodeClassNames[opcode]); 1316 } 1317 } 1318 1319 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1320 switch (opcode) { 1321 case Op_RShiftVB: // fall-through 1322 case Op_RShiftVS: evpsravw(dst, src, shift, vlen_enc); break; 1323 1324 case Op_LShiftVB: // fall-through 1325 case Op_LShiftVS: evpsllvw(dst, src, shift, vlen_enc); break; 1326 1327 case Op_URShiftVB: // fall-through 1328 case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break; 1329 1330 default: assert(false, "%s", NodeClassNames[opcode]); 1331 } 1332 } 1333 1334 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) { 1335 assert(UseAVX >= 2, "required"); 1336 switch (opcode) { 1337 case Op_RShiftVL: { 1338 if (UseAVX > 2) { 1339 assert(tmp == xnoreg, "not used"); 1340 if (!VM_Version::supports_avx512vl()) { 1341 vlen_enc = Assembler::AVX_512bit; 1342 } 1343 evpsravq(dst, src, shift, vlen_enc); 1344 } else { 1345 vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask())); 1346 vpsrlvq(dst, src, shift, vlen_enc); 1347 vpsrlvq(tmp, tmp, shift, vlen_enc); 1348 vpxor(dst, dst, tmp, vlen_enc); 1349 vpsubq(dst, dst, tmp, vlen_enc); 1350 } 1351 break; 1352 } 1353 case Op_LShiftVL: { 1354 assert(tmp == xnoreg, "not used"); 1355 vpsllvq(dst, src, shift, vlen_enc); 1356 break; 1357 } 1358 case Op_URShiftVL: { 1359 assert(tmp == xnoreg, "not used"); 1360 vpsrlvq(dst, src, shift, vlen_enc); 1361 break; 1362 } 1363 default: assert(false, "%s", NodeClassNames[opcode]); 1364 } 1365 } 1366 1367 // Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst 1368 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch) { 1369 assert(opcode == Op_LShiftVB || 1370 opcode == Op_RShiftVB || 1371 opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]); 1372 bool sign = (opcode != Op_URShiftVB); 1373 assert(vector_len == 0, "required"); 1374 vextendbd(sign, dst, src, 1); 1375 vpmovzxbd(vtmp, shift, 1); 1376 varshiftd(opcode, dst, dst, vtmp, 1); 1377 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, scratch); 1378 vextracti128_high(vtmp, dst); 1379 vpackusdw(dst, dst, vtmp, 0); 1380 } 1381 1382 // Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst 1383 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register 
scratch) { 1384 assert(opcode == Op_LShiftVB || 1385 opcode == Op_RShiftVB || 1386 opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]); 1387 bool sign = (opcode != Op_URShiftVB); 1388 int ext_vector_len = vector_len + 1; 1389 vextendbw(sign, dst, src, ext_vector_len); 1390 vpmovzxbw(vtmp, shift, ext_vector_len); 1391 varshiftw(opcode, dst, dst, vtmp, ext_vector_len); 1392 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, scratch); 1393 if (vector_len == 0) { 1394 vextracti128_high(vtmp, dst); 1395 vpackuswb(dst, dst, vtmp, vector_len); 1396 } else { 1397 vextracti64x4_high(vtmp, dst); 1398 vpackuswb(dst, dst, vtmp, vector_len); 1399 vpermq(dst, dst, 0xD8, vector_len); 1400 } 1401 } 1402 1403 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) { 1404 switch(typ) { 1405 case T_BYTE: 1406 pinsrb(dst, val, idx); 1407 break; 1408 case T_SHORT: 1409 pinsrw(dst, val, idx); 1410 break; 1411 case T_INT: 1412 pinsrd(dst, val, idx); 1413 break; 1414 case T_LONG: 1415 pinsrq(dst, val, idx); 1416 break; 1417 default: 1418 assert(false,"Should not reach here."); 1419 break; 1420 } 1421 } 1422 1423 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) { 1424 switch(typ) { 1425 case T_BYTE: 1426 vpinsrb(dst, src, val, idx); 1427 break; 1428 case T_SHORT: 1429 vpinsrw(dst, src, val, idx); 1430 break; 1431 case T_INT: 1432 vpinsrd(dst, src, val, idx); 1433 break; 1434 case T_LONG: 1435 vpinsrq(dst, src, val, idx); 1436 break; 1437 default: 1438 assert(false,"Should not reach here."); 1439 break; 1440 } 1441 } 1442 1443 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) { 1444 switch(typ) { 1445 case T_INT: 1446 vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len); 1447 break; 1448 case T_FLOAT: 1449 vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len); 1450 break; 1451 case T_LONG: 1452 vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len); 1453 break; 1454 case T_DOUBLE: 1455 vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len); 1456 break; 1457 default: 1458 assert(false,"Should not reach here."); 1459 break; 1460 } 1461 } 1462 1463 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) { 1464 switch(typ) { 1465 case T_INT: 1466 evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len); 1467 break; 1468 case T_FLOAT: 1469 evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len); 1470 break; 1471 case T_LONG: 1472 evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len); 1473 break; 1474 case T_DOUBLE: 1475 evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len); 1476 break; 1477 default: 1478 assert(false,"Should not reach here."); 1479 break; 1480 } 1481 } 1482 1483 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) { 1484 switch(typ) { 1485 case T_INT: 1486 evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len); 1487 break; 1488 case T_FLOAT: 1489 evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len); 1490 break; 1491 case T_LONG: 1492 evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len); 1493 break; 1494 case T_DOUBLE: 1495 
evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len); 1496 break; 1497 default: 1498 assert(false,"Should not reach here."); 1499 break; 1500 } 1501 } 1502 1503 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) { 1504 if (vlen_in_bytes <= 16) { 1505 pxor (dst, dst); 1506 psubb(dst, src); 1507 switch (elem_bt) { 1508 case T_BYTE: /* nothing to do */ break; 1509 case T_SHORT: pmovsxbw(dst, dst); break; 1510 case T_INT: pmovsxbd(dst, dst); break; 1511 case T_FLOAT: pmovsxbd(dst, dst); break; 1512 case T_LONG: pmovsxbq(dst, dst); break; 1513 case T_DOUBLE: pmovsxbq(dst, dst); break; 1514 1515 default: assert(false, "%s", type2name(elem_bt)); 1516 } 1517 } else { 1518 assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, ""); 1519 int vlen_enc = vector_length_encoding(vlen_in_bytes); 1520 1521 vpxor (dst, dst, dst, vlen_enc); 1522 vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc); 1523 1524 switch (elem_bt) { 1525 case T_BYTE: /* nothing to do */ break; 1526 case T_SHORT: vpmovsxbw(dst, dst, vlen_enc); break; 1527 case T_INT: vpmovsxbd(dst, dst, vlen_enc); break; 1528 case T_FLOAT: vpmovsxbd(dst, dst, vlen_enc); break; 1529 case T_LONG: vpmovsxbq(dst, dst, vlen_enc); break; 1530 case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break; 1531 1532 default: assert(false, "%s", type2name(elem_bt)); 1533 } 1534 } 1535 } 1536 1537 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, Register scratch, int vlen_in_bytes) { 1538 ExternalAddress addr(StubRoutines::x86::vector_iota_indices()); 1539 if (vlen_in_bytes == 4) { 1540 movdl(dst, addr); 1541 } else if (vlen_in_bytes == 8) { 1542 movq(dst, addr); 1543 } else if (vlen_in_bytes == 16) { 1544 movdqu(dst, addr, scratch); 1545 } else if (vlen_in_bytes == 32) { 1546 vmovdqu(dst, addr, scratch); 1547 } else { 1548 assert(vlen_in_bytes == 64, "%d", vlen_in_bytes); 1549 evmovdqub(dst, k0, addr, false /*merge*/, Assembler::AVX_512bit, scratch); 1550 } 1551 } 1552 1553 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles. 
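// The reduction helpers below share one scheme: reduce_operation_128/256 apply the operator lane-wise,
// and the size-specific entry points repeatedly fold the upper half of the vector onto the lower half
// (vextracti*_high / pshufd) until a single lane remains; the integer variants then fold in the scalar
// input register and move the result to the destination register.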
1554 1555 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) { 1556 int vector_len = Assembler::AVX_128bit; 1557 1558 switch (opcode) { 1559 case Op_AndReductionV: pand(dst, src); break; 1560 case Op_OrReductionV: por (dst, src); break; 1561 case Op_XorReductionV: pxor(dst, src); break; 1562 case Op_MinReductionV: 1563 switch (typ) { 1564 case T_BYTE: pminsb(dst, src); break; 1565 case T_SHORT: pminsw(dst, src); break; 1566 case T_INT: pminsd(dst, src); break; 1567 case T_LONG: assert(UseAVX > 2, "required"); 1568 vpminsq(dst, dst, src, Assembler::AVX_128bit); break; 1569 default: assert(false, "wrong type"); 1570 } 1571 break; 1572 case Op_MaxReductionV: 1573 switch (typ) { 1574 case T_BYTE: pmaxsb(dst, src); break; 1575 case T_SHORT: pmaxsw(dst, src); break; 1576 case T_INT: pmaxsd(dst, src); break; 1577 case T_LONG: assert(UseAVX > 2, "required"); 1578 vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break; 1579 default: assert(false, "wrong type"); 1580 } 1581 break; 1582 case Op_AddReductionVF: addss(dst, src); break; 1583 case Op_AddReductionVD: addsd(dst, src); break; 1584 case Op_AddReductionVI: 1585 switch (typ) { 1586 case T_BYTE: paddb(dst, src); break; 1587 case T_SHORT: paddw(dst, src); break; 1588 case T_INT: paddd(dst, src); break; 1589 default: assert(false, "wrong type"); 1590 } 1591 break; 1592 case Op_AddReductionVL: paddq(dst, src); break; 1593 case Op_MulReductionVF: mulss(dst, src); break; 1594 case Op_MulReductionVD: mulsd(dst, src); break; 1595 case Op_MulReductionVI: 1596 switch (typ) { 1597 case T_SHORT: pmullw(dst, src); break; 1598 case T_INT: pmulld(dst, src); break; 1599 default: assert(false, "wrong type"); 1600 } 1601 break; 1602 case Op_MulReductionVL: assert(UseAVX > 2, "required"); 1603 vpmullq(dst, dst, src, vector_len); break; 1604 default: assert(false, "wrong opcode"); 1605 } 1606 } 1607 1608 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 1609 int vector_len = Assembler::AVX_256bit; 1610 1611 switch (opcode) { 1612 case Op_AndReductionV: vpand(dst, src1, src2, vector_len); break; 1613 case Op_OrReductionV: vpor (dst, src1, src2, vector_len); break; 1614 case Op_XorReductionV: vpxor(dst, src1, src2, vector_len); break; 1615 case Op_MinReductionV: 1616 switch (typ) { 1617 case T_BYTE: vpminsb(dst, src1, src2, vector_len); break; 1618 case T_SHORT: vpminsw(dst, src1, src2, vector_len); break; 1619 case T_INT: vpminsd(dst, src1, src2, vector_len); break; 1620 case T_LONG: assert(UseAVX > 2, "required"); 1621 vpminsq(dst, src1, src2, vector_len); break; 1622 default: assert(false, "wrong type"); 1623 } 1624 break; 1625 case Op_MaxReductionV: 1626 switch (typ) { 1627 case T_BYTE: vpmaxsb(dst, src1, src2, vector_len); break; 1628 case T_SHORT: vpmaxsw(dst, src1, src2, vector_len); break; 1629 case T_INT: vpmaxsd(dst, src1, src2, vector_len); break; 1630 case T_LONG: assert(UseAVX > 2, "required"); 1631 vpmaxsq(dst, src1, src2, vector_len); break; 1632 default: assert(false, "wrong type"); 1633 } 1634 break; 1635 case Op_AddReductionVI: 1636 switch (typ) { 1637 case T_BYTE: vpaddb(dst, src1, src2, vector_len); break; 1638 case T_SHORT: vpaddw(dst, src1, src2, vector_len); break; 1639 case T_INT: vpaddd(dst, src1, src2, vector_len); break; 1640 default: assert(false, "wrong type"); 1641 } 1642 break; 1643 case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break; 1644 case Op_MulReductionVI: 1645 switch (typ) { 1646 case 
T_SHORT: vpmullw(dst, src1, src2, vector_len); break; 1647 case T_INT: vpmulld(dst, src1, src2, vector_len); break; 1648 default: assert(false, "wrong type"); 1649 } 1650 break; 1651 case Op_MulReductionVL: vpmullq(dst, src1, src2, vector_len); break; 1652 default: assert(false, "wrong opcode"); 1653 } 1654 } 1655 1656 void C2_MacroAssembler::reduce_fp(int opcode, int vlen, 1657 XMMRegister dst, XMMRegister src, 1658 XMMRegister vtmp1, XMMRegister vtmp2) { 1659 switch (opcode) { 1660 case Op_AddReductionVF: 1661 case Op_MulReductionVF: 1662 reduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 1663 break; 1664 1665 case Op_AddReductionVD: 1666 case Op_MulReductionVD: 1667 reduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 1668 break; 1669 1670 default: assert(false, "wrong opcode"); 1671 } 1672 } 1673 1674 void C2_MacroAssembler::reduceB(int opcode, int vlen, 1675 Register dst, Register src1, XMMRegister src2, 1676 XMMRegister vtmp1, XMMRegister vtmp2) { 1677 switch (vlen) { 1678 case 8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1679 case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1680 case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1681 case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1682 1683 default: assert(false, "wrong vector length"); 1684 } 1685 } 1686 1687 void C2_MacroAssembler::mulreduceB(int opcode, int vlen, 1688 Register dst, Register src1, XMMRegister src2, 1689 XMMRegister vtmp1, XMMRegister vtmp2) { 1690 switch (vlen) { 1691 case 8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1692 case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1693 case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1694 case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1695 1696 default: assert(false, "wrong vector length"); 1697 } 1698 } 1699 1700 void C2_MacroAssembler::reduceS(int opcode, int vlen, 1701 Register dst, Register src1, XMMRegister src2, 1702 XMMRegister vtmp1, XMMRegister vtmp2) { 1703 switch (vlen) { 1704 case 4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1705 case 8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1706 case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1707 case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1708 1709 default: assert(false, "wrong vector length"); 1710 } 1711 } 1712 1713 void C2_MacroAssembler::reduceI(int opcode, int vlen, 1714 Register dst, Register src1, XMMRegister src2, 1715 XMMRegister vtmp1, XMMRegister vtmp2) { 1716 switch (vlen) { 1717 case 2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1718 case 4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1719 case 8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1720 case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1721 1722 default: assert(false, "wrong vector length"); 1723 } 1724 } 1725 1726 #ifdef _LP64 1727 void C2_MacroAssembler::reduceL(int opcode, int vlen, 1728 Register dst, Register src1, XMMRegister src2, 1729 XMMRegister vtmp1, XMMRegister vtmp2) { 1730 switch (vlen) { 1731 case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1732 case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1733 case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1734 1735 default: assert(false, "wrong vector length"); 1736 } 1737 } 1738 #endif // _LP64 1739 1740 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, 
XMMRegister vtmp1, XMMRegister vtmp2) { 1741 switch (vlen) { 1742 case 2: 1743 assert(vtmp2 == xnoreg, ""); 1744 reduce2F(opcode, dst, src, vtmp1); 1745 break; 1746 case 4: 1747 assert(vtmp2 == xnoreg, ""); 1748 reduce4F(opcode, dst, src, vtmp1); 1749 break; 1750 case 8: 1751 reduce8F(opcode, dst, src, vtmp1, vtmp2); 1752 break; 1753 case 16: 1754 reduce16F(opcode, dst, src, vtmp1, vtmp2); 1755 break; 1756 default: assert(false, "wrong vector length"); 1757 } 1758 } 1759 1760 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 1761 switch (vlen) { 1762 case 2: 1763 assert(vtmp2 == xnoreg, ""); 1764 reduce2D(opcode, dst, src, vtmp1); 1765 break; 1766 case 4: 1767 reduce4D(opcode, dst, src, vtmp1, vtmp2); 1768 break; 1769 case 8: 1770 reduce8D(opcode, dst, src, vtmp1, vtmp2); 1771 break; 1772 default: assert(false, "wrong vector length"); 1773 } 1774 } 1775 1776 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1777 if (opcode == Op_AddReductionVI) { 1778 if (vtmp1 != src2) { 1779 movdqu(vtmp1, src2); 1780 } 1781 phaddd(vtmp1, vtmp1); 1782 } else { 1783 pshufd(vtmp1, src2, 0x1); 1784 reduce_operation_128(T_INT, opcode, vtmp1, src2); 1785 } 1786 movdl(vtmp2, src1); 1787 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 1788 movdl(dst, vtmp1); 1789 } 1790 1791 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1792 if (opcode == Op_AddReductionVI) { 1793 if (vtmp1 != src2) { 1794 movdqu(vtmp1, src2); 1795 } 1796 phaddd(vtmp1, src2); 1797 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1798 } else { 1799 pshufd(vtmp2, src2, 0xE); 1800 reduce_operation_128(T_INT, opcode, vtmp2, src2); 1801 reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1802 } 1803 } 1804 1805 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1806 if (opcode == Op_AddReductionVI) { 1807 vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit); 1808 vextracti128_high(vtmp2, vtmp1); 1809 vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit); 1810 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1811 } else { 1812 vextracti128_high(vtmp1, src2); 1813 reduce_operation_128(T_INT, opcode, vtmp1, src2); 1814 reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1815 } 1816 } 1817 1818 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1819 vextracti64x4_high(vtmp2, src2); 1820 reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2); 1821 reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1822 } 1823 1824 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1825 pshufd(vtmp2, src2, 0x1); 1826 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 1827 movdqu(vtmp1, vtmp2); 1828 psrldq(vtmp1, 2); 1829 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 1830 movdqu(vtmp2, vtmp1); 1831 psrldq(vtmp2, 1); 1832 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 1833 movdl(vtmp2, src1); 1834 pmovsxbd(vtmp1, vtmp1); 1835 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 1836 pextrb(dst, vtmp1, 0x0); 1837 movsbl(dst, dst); 1838 } 1839 1840 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1841 
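// Fold the upper eight bytes onto the lower eight (pshufd 0xE selects the high quadword), then finish with the 8-byte reduction.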
pshufd(vtmp1, src2, 0xE); 1842 reduce_operation_128(T_BYTE, opcode, vtmp1, src2); 1843 reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1844 } 1845 1846 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1847 vextracti128_high(vtmp2, src2); 1848 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 1849 reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1850 } 1851 1852 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1853 vextracti64x4_high(vtmp1, src2); 1854 reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2); 1855 reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1856 } 1857 1858 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1859 pmovsxbw(vtmp2, src2); 1860 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1861 } 1862 1863 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1864 if (UseAVX > 1) { 1865 int vector_len = Assembler::AVX_256bit; 1866 vpmovsxbw(vtmp1, src2, vector_len); 1867 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1868 } else { 1869 pmovsxbw(vtmp2, src2); 1870 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1871 pshufd(vtmp2, src2, 0x1); 1872 pmovsxbw(vtmp2, src2); 1873 reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 1874 } 1875 } 1876 1877 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1878 if (UseAVX > 2 && VM_Version::supports_avx512bw()) { 1879 int vector_len = Assembler::AVX_512bit; 1880 vpmovsxbw(vtmp1, src2, vector_len); 1881 reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1882 } else { 1883 assert(UseAVX >= 2,"Should not reach here."); 1884 mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); 1885 vextracti128_high(vtmp2, src2); 1886 mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 1887 } 1888 } 1889 1890 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1891 mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); 1892 vextracti64x4_high(vtmp2, src2); 1893 mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 1894 } 1895 1896 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1897 if (opcode == Op_AddReductionVI) { 1898 if (vtmp1 != src2) { 1899 movdqu(vtmp1, src2); 1900 } 1901 phaddw(vtmp1, vtmp1); 1902 phaddw(vtmp1, vtmp1); 1903 } else { 1904 pshufd(vtmp2, src2, 0x1); 1905 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 1906 movdqu(vtmp1, vtmp2); 1907 psrldq(vtmp1, 2); 1908 reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2); 1909 } 1910 movdl(vtmp2, src1); 1911 pmovsxwd(vtmp1, vtmp1); 1912 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 1913 pextrw(dst, vtmp1, 0x0); 1914 movswl(dst, dst); 1915 } 1916 1917 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1918 if (opcode == Op_AddReductionVI) { 1919 if (vtmp1 != src2) { 1920 movdqu(vtmp1, src2); 1921 } 1922 phaddw(vtmp1, src2); 1923 } else { 1924 pshufd(vtmp1, src2, 0xE); 1925 reduce_operation_128(T_SHORT, opcode, vtmp1, src2); 1926 } 1927 reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1928 } 1929 1930 void 
C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1931 if (opcode == Op_AddReductionVI) { 1932 int vector_len = Assembler::AVX_256bit; 1933 vphaddw(vtmp2, src2, src2, vector_len); 1934 vpermq(vtmp2, vtmp2, 0xD8, vector_len); 1935 } else { 1936 vextracti128_high(vtmp2, src2); 1937 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 1938 } 1939 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1940 } 1941 1942 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1943 int vector_len = Assembler::AVX_256bit; 1944 vextracti64x4_high(vtmp1, src2); 1945 reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2); 1946 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1947 } 1948 1949 #ifdef _LP64 1950 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1951 pshufd(vtmp2, src2, 0xE); 1952 reduce_operation_128(T_LONG, opcode, vtmp2, src2); 1953 movdq(vtmp1, src1); 1954 reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2); 1955 movdq(dst, vtmp1); 1956 } 1957 1958 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1959 vextracti128_high(vtmp1, src2); 1960 reduce_operation_128(T_LONG, opcode, vtmp1, src2); 1961 reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1962 } 1963 1964 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1965 vextracti64x4_high(vtmp2, src2); 1966 reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2); 1967 reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1968 } 1969 1970 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) { 1971 assert(ArrayOperationPartialInlineSize > 0 && ArrayOperationPartialInlineSize <= 64, "invalid"); 1972 mov64(temp, -1L); 1973 bzhiq(temp, temp, len); 1974 kmovql(dst, temp); 1975 } 1976 #endif // _LP64 1977 1978 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 1979 reduce_operation_128(T_FLOAT, opcode, dst, src); 1980 pshufd(vtmp, src, 0x1); 1981 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 1982 } 1983 1984 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 1985 reduce2F(opcode, dst, src, vtmp); 1986 pshufd(vtmp, src, 0x2); 1987 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 1988 pshufd(vtmp, src, 0x3); 1989 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 1990 } 1991 1992 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 1993 reduce4F(opcode, dst, src, vtmp2); 1994 vextractf128_high(vtmp2, src); 1995 reduce4F(opcode, dst, vtmp2, vtmp1); 1996 } 1997 1998 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 1999 reduce8F(opcode, dst, src, vtmp1, vtmp2); 2000 vextracti64x4_high(vtmp1, src); 2001 reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2); 2002 } 2003 2004 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2005 reduce_operation_128(T_DOUBLE, opcode, dst, src); 2006 pshufd(vtmp, src, 0xE); 2007 reduce_operation_128(T_DOUBLE, opcode, dst, vtmp); 2008 } 2009 2010 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, 
XMMRegister vtmp2) { 2011 reduce2D(opcode, dst, src, vtmp2); 2012 vextractf128_high(vtmp2, src); 2013 reduce2D(opcode, dst, vtmp2, vtmp1); 2014 } 2015 2016 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2017 reduce4D(opcode, dst, src, vtmp1, vtmp2); 2018 vextracti64x4_high(vtmp1, src); 2019 reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2); 2020 } 2021 2022 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, int vector_len) { 2023 MacroAssembler::evmovdqu(type, kmask, dst, src, vector_len); 2024 } 2025 2026 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, int vector_len) { 2027 MacroAssembler::evmovdqu(type, kmask, dst, src, vector_len); 2028 } 2029 2030 2031 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid, 2032 XMMRegister dst, XMMRegister src, 2033 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2034 XMMRegister xmm_0, XMMRegister xmm_1) { 2035 int permconst[] = {1, 14}; 2036 XMMRegister wsrc = src; 2037 XMMRegister wdst = xmm_0; 2038 XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1; 2039 2040 int vlen_enc = Assembler::AVX_128bit; 2041 if (vlen == 16) { 2042 vlen_enc = Assembler::AVX_256bit; 2043 } 2044 2045 for (int i = log2(vlen) - 1; i >=0; i--) { 2046 if (i == 0 && !is_dst_valid) { 2047 wdst = dst; 2048 } 2049 if (i == 3) { 2050 vextracti64x4_high(wtmp, wsrc); 2051 } else if (i == 2) { 2052 vextracti128_high(wtmp, wsrc); 2053 } else { // i = [0,1] 2054 vpermilps(wtmp, wsrc, permconst[i], vlen_enc); 2055 } 2056 vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2057 wsrc = wdst; 2058 vlen_enc = Assembler::AVX_128bit; 2059 } 2060 if (is_dst_valid) { 2061 vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2062 } 2063 } 2064 2065 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src, 2066 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2067 XMMRegister xmm_0, XMMRegister xmm_1) { 2068 XMMRegister wsrc = src; 2069 XMMRegister wdst = xmm_0; 2070 XMMRegister wtmp = (xmm_1 == xnoreg) ? 
xmm_0: xmm_1; 2071 int vlen_enc = Assembler::AVX_128bit; 2072 if (vlen == 8) { 2073 vlen_enc = Assembler::AVX_256bit; 2074 } 2075 for (int i = log2(vlen) - 1; i >=0; i--) { 2076 if (i == 0 && !is_dst_valid) { 2077 wdst = dst; 2078 } 2079 if (i == 1) { 2080 vextracti128_high(wtmp, wsrc); 2081 } else if (i == 2) { 2082 vextracti64x4_high(wtmp, wsrc); 2083 } else { 2084 assert(i == 0, "%d", i); 2085 vpermilpd(wtmp, wsrc, 1, vlen_enc); 2086 } 2087 vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2088 wsrc = wdst; 2089 vlen_enc = Assembler::AVX_128bit; 2090 } 2091 if (is_dst_valid) { 2092 vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2093 } 2094 } 2095 2096 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) { 2097 switch (bt) { 2098 case T_BYTE: pextrb(dst, src, idx); break; 2099 case T_SHORT: pextrw(dst, src, idx); break; 2100 case T_INT: pextrd(dst, src, idx); break; 2101 case T_LONG: pextrq(dst, src, idx); break; 2102 2103 default: 2104 assert(false,"Should not reach here."); 2105 break; 2106 } 2107 } 2108 2109 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) { 2110 int esize = type2aelembytes(typ); 2111 int elem_per_lane = 16/esize; 2112 int lane = elemindex / elem_per_lane; 2113 int eindex = elemindex % elem_per_lane; 2114 2115 if (lane >= 2) { 2116 assert(UseAVX > 2, "required"); 2117 vextractf32x4(dst, src, lane & 3); 2118 return dst; 2119 } else if (lane > 0) { 2120 assert(UseAVX > 0, "required"); 2121 vextractf128(dst, src, lane); 2122 return dst; 2123 } else { 2124 return src; 2125 } 2126 } 2127 2128 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) { 2129 if (typ == T_BYTE) { 2130 movsbl(dst, dst); 2131 } else if (typ == T_SHORT) { 2132 movswl(dst, dst); 2133 } 2134 } 2135 2136 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) { 2137 int esize = type2aelembytes(typ); 2138 int elem_per_lane = 16/esize; 2139 int eindex = elemindex % elem_per_lane; 2140 assert(is_integral_type(typ),"required"); 2141 2142 if (eindex == 0) { 2143 if (typ == T_LONG) { 2144 movq(dst, src); 2145 } else { 2146 movdl(dst, src); 2147 movsxl(typ, dst); 2148 } 2149 } else { 2150 extract(typ, dst, src, eindex); 2151 movsxl(typ, dst); 2152 } 2153 } 2154 2155 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, Register tmp, XMMRegister vtmp) { 2156 int esize = type2aelembytes(typ); 2157 int elem_per_lane = 16/esize; 2158 int eindex = elemindex % elem_per_lane; 2159 assert((typ == T_FLOAT || typ == T_DOUBLE),"required"); 2160 2161 if (eindex == 0) { 2162 movq(dst, src); 2163 } else { 2164 if (typ == T_FLOAT) { 2165 if (UseAVX == 0) { 2166 movdqu(dst, src); 2167 pshufps(dst, dst, eindex); 2168 } else { 2169 vpshufps(dst, src, src, eindex, Assembler::AVX_128bit); 2170 } 2171 } else { 2172 if (UseAVX == 0) { 2173 movdqu(dst, src); 2174 psrldq(dst, eindex*esize); 2175 } else { 2176 vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit); 2177 } 2178 movq(dst, dst); 2179 } 2180 } 2181 // Zero upper bits 2182 if (typ == T_FLOAT) { 2183 if (UseAVX == 0) { 2184 assert((vtmp != xnoreg) && (tmp != noreg), "required."); 2185 movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), tmp); 2186 pand(dst, vtmp); 2187 } else { 2188 assert((tmp != noreg), "required."); 2189 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, 
tmp); 2190 } 2191 } 2192 } 2193 2194 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) { 2195 switch(typ) { 2196 case T_BYTE: 2197 case T_BOOLEAN: 2198 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2199 break; 2200 case T_SHORT: 2201 case T_CHAR: 2202 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2203 break; 2204 case T_INT: 2205 case T_FLOAT: 2206 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2207 break; 2208 case T_LONG: 2209 case T_DOUBLE: 2210 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2211 break; 2212 default: 2213 assert(false,"Should not reach here."); 2214 break; 2215 } 2216 } 2217 2218 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral adr, int comparison, int vector_len, Register scratch) { 2219 switch(typ) { 2220 case T_BOOLEAN: 2221 case T_BYTE: 2222 evpcmpb(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch); 2223 break; 2224 case T_CHAR: 2225 case T_SHORT: 2226 evpcmpw(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch); 2227 break; 2228 case T_INT: 2229 case T_FLOAT: 2230 evpcmpd(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch); 2231 break; 2232 case T_LONG: 2233 case T_DOUBLE: 2234 evpcmpq(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch); 2235 break; 2236 default: 2237 assert(false,"Should not reach here."); 2238 break; 2239 } 2240 } 2241 2242 void C2_MacroAssembler::vpcmpu(BasicType typ, XMMRegister dst, XMMRegister src1, XMMRegister src2, ComparisonPredicate comparison, 2243 int vlen_in_bytes, XMMRegister vtmp1, XMMRegister vtmp2, Register scratch) { 2244 int vlen_enc = vector_length_encoding(vlen_in_bytes*2); 2245 switch (typ) { 2246 case T_BYTE: 2247 vpmovzxbw(vtmp1, src1, vlen_enc); 2248 vpmovzxbw(vtmp2, src2, vlen_enc); 2249 vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::W, vlen_enc, scratch); 2250 vpacksswb(dst, dst, dst, vlen_enc); 2251 break; 2252 case T_SHORT: 2253 vpmovzxwd(vtmp1, src1, vlen_enc); 2254 vpmovzxwd(vtmp2, src2, vlen_enc); 2255 vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::D, vlen_enc, scratch); 2256 vpackssdw(dst, dst, dst, vlen_enc); 2257 break; 2258 case T_INT: 2259 vpmovzxdq(vtmp1, src1, vlen_enc); 2260 vpmovzxdq(vtmp2, src2, vlen_enc); 2261 vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::Q, vlen_enc, scratch); 2262 vpermilps(dst, dst, 8, vlen_enc); 2263 break; 2264 default: 2265 assert(false, "Should not reach here"); 2266 } 2267 if (vlen_in_bytes == 16) { 2268 vpermpd(dst, dst, 0x8, vlen_enc); 2269 } 2270 } 2271 2272 void C2_MacroAssembler::vpcmpu32(BasicType typ, XMMRegister dst, XMMRegister src1, XMMRegister src2, ComparisonPredicate comparison, int vlen_in_bytes, 2273 XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3, Register scratch) { 2274 int vlen_enc = vector_length_encoding(vlen_in_bytes); 2275 switch (typ) { 2276 case T_BYTE: 2277 vpmovzxbw(vtmp1, src1, vlen_enc); 2278 vpmovzxbw(vtmp2, src2, vlen_enc); 2279 vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::W, vlen_enc, scratch); 2280 vextracti128(vtmp1, src1, 1); 2281 vextracti128(vtmp2, src2, 1); 2282 vpmovzxbw(vtmp1, vtmp1, vlen_enc); 2283 vpmovzxbw(vtmp2, vtmp2, vlen_enc); 2284 vpcmpCCW(vtmp3, vtmp1, vtmp2, comparison, Assembler::W, vlen_enc, scratch); 2285 
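// Pack the two widened compare results back down to bytes and restore 128-bit lane order below.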
vpacksswb(dst, dst, vtmp3, vlen_enc); 2286 vpermpd(dst, dst, 0xd8, vlen_enc); 2287 break; 2288 case T_SHORT: 2289 vpmovzxwd(vtmp1, src1, vlen_enc); 2290 vpmovzxwd(vtmp2, src2, vlen_enc); 2291 vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::D, vlen_enc, scratch); 2292 vextracti128(vtmp1, src1, 1); 2293 vextracti128(vtmp2, src2, 1); 2294 vpmovzxwd(vtmp1, vtmp1, vlen_enc); 2295 vpmovzxwd(vtmp2, vtmp2, vlen_enc); 2296 vpcmpCCW(vtmp3, vtmp1, vtmp2, comparison, Assembler::D, vlen_enc, scratch); 2297 vpackssdw(dst, dst, vtmp3, vlen_enc); 2298 vpermpd(dst, dst, 0xd8, vlen_enc); 2299 break; 2300 case T_INT: 2301 vpmovzxdq(vtmp1, src1, vlen_enc); 2302 vpmovzxdq(vtmp2, src2, vlen_enc); 2303 vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::Q, vlen_enc, scratch); 2304 vpshufd(dst, dst, 8, vlen_enc); 2305 vpermq(dst, dst, 8, vlen_enc); 2306 vextracti128(vtmp1, src1, 1); 2307 vextracti128(vtmp2, src2, 1); 2308 vpmovzxdq(vtmp1, vtmp1, vlen_enc); 2309 vpmovzxdq(vtmp2, vtmp2, vlen_enc); 2310 vpcmpCCW(vtmp3, vtmp1, vtmp2, comparison, Assembler::Q, vlen_enc, scratch); 2311 vpshufd(vtmp3, vtmp3, 8, vlen_enc); 2312 vpermq(vtmp3, vtmp3, 0x80, vlen_enc); 2313 vpblendd(dst, dst, vtmp3, 0xf0, vlen_enc); 2314 break; 2315 default: 2316 assert(false, "Should not reach here"); 2317 } 2318 } 2319 2320 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) { 2321 switch(typ) { 2322 case T_BYTE: 2323 evpblendmb(dst, kmask, src1, src2, merge, vector_len); 2324 break; 2325 case T_SHORT: 2326 evpblendmw(dst, kmask, src1, src2, merge, vector_len); 2327 break; 2328 case T_INT: 2329 case T_FLOAT: 2330 evpblendmd(dst, kmask, src1, src2, merge, vector_len); 2331 break; 2332 case T_LONG: 2333 case T_DOUBLE: 2334 evpblendmq(dst, kmask, src1, src2, merge, vector_len); 2335 break; 2336 default: 2337 assert(false,"Should not reach here."); 2338 break; 2339 } 2340 } 2341 2342 void C2_MacroAssembler::vectortest(int bt, int vlen, XMMRegister src1, XMMRegister src2, 2343 XMMRegister vtmp1, XMMRegister vtmp2, KRegister mask) { 2344 switch(vlen) { 2345 case 4: 2346 assert(vtmp1 != xnoreg, "required."); 2347 // Broadcast lower 32 bits to 128 bits before ptest 2348 pshufd(vtmp1, src1, 0x0); 2349 if (bt == BoolTest::overflow) { 2350 assert(vtmp2 != xnoreg, "required."); 2351 pshufd(vtmp2, src2, 0x0); 2352 } else { 2353 assert(vtmp2 == xnoreg, "required."); 2354 vtmp2 = src2; 2355 } 2356 ptest(vtmp1, vtmp2); 2357 break; 2358 case 8: 2359 assert(vtmp1 != xnoreg, "required."); 2360 // Broadcast lower 64 bits to 128 bits before ptest 2361 pshufd(vtmp1, src1, 0x4); 2362 if (bt == BoolTest::overflow) { 2363 assert(vtmp2 != xnoreg, "required."); 2364 pshufd(vtmp2, src2, 0x4); 2365 } else { 2366 assert(vtmp2 == xnoreg, "required."); 2367 vtmp2 = src2; 2368 } 2369 ptest(vtmp1, vtmp2); 2370 break; 2371 case 16: 2372 assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required."); 2373 ptest(src1, src2); 2374 break; 2375 case 32: 2376 assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required."); 2377 vptest(src1, src2, Assembler::AVX_256bit); 2378 break; 2379 case 64: 2380 { 2381 assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required."); 2382 evpcmpeqb(mask, src1, src2, Assembler::AVX_512bit); 2383 if (bt == BoolTest::ne) { 2384 ktestql(mask, mask); 2385 } else { 2386 assert(bt == BoolTest::overflow, "required"); 2387 kortestql(mask, mask); 2388 } 2389 } 2390 break; 2391 default: 2392 assert(false,"Should not reach here."); 2393 break; 2394 } 2395 } 2396 2397 
//------------------------------------------------------------------------------------------- 2398 2399 // IndexOf for constant substrings with size >= 8 chars 2400 // which don't need to be loaded through stack. 2401 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2, 2402 Register cnt1, Register cnt2, 2403 int int_cnt2, Register result, 2404 XMMRegister vec, Register tmp, 2405 int ae) { 2406 ShortBranchVerifier sbv(this); 2407 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2408 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2409 2410 // This method uses the pcmpestri instruction with bound registers 2411 // inputs: 2412 // xmm - substring 2413 // rax - substring length (elements count) 2414 // mem - scanned string 2415 // rdx - string length (elements count) 2416 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2417 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2418 // outputs: 2419 // rcx - matched index in string 2420 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2421 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2422 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2423 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2424 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1; 2425 2426 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, 2427 RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR, 2428 MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE; 2429 2430 // Note, inline_string_indexOf() generates checks: 2431 // if (substr.count > string.count) return -1; 2432 // if (substr.count == 0) return 0; 2433 assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars"); 2434 2435 // Load substring. 2436 if (ae == StrIntrinsicNode::UL) { 2437 pmovzxbw(vec, Address(str2, 0)); 2438 } else { 2439 movdqu(vec, Address(str2, 0)); 2440 } 2441 movl(cnt2, int_cnt2); 2442 movptr(result, str1); // string addr 2443 2444 if (int_cnt2 > stride) { 2445 jmpb(SCAN_TO_SUBSTR); 2446 2447 // Reload substr for rescan, this code 2448 // is executed only for large substrings (> 8 chars) 2449 bind(RELOAD_SUBSTR); 2450 if (ae == StrIntrinsicNode::UL) { 2451 pmovzxbw(vec, Address(str2, 0)); 2452 } else { 2453 movdqu(vec, Address(str2, 0)); 2454 } 2455 negptr(cnt2); // Jumped here with negative cnt2, convert to positive 2456 2457 bind(RELOAD_STR); 2458 // We came here after the beginning of the substring was 2459 // matched but the rest of it was not so we need to search 2460 // again. Start from the next element after the previous match. 2461 2462 // cnt2 is number of substring remaining elements and 2463 // cnt1 is number of string remaining elements when cmp failed.
2464 // Restored cnt1 = cnt1 - cnt2 + int_cnt2 2465 subl(cnt1, cnt2); 2466 addl(cnt1, int_cnt2); 2467 movl(cnt2, int_cnt2); // Now restore cnt2 2468 2469 decrementl(cnt1); // Shift to next element 2470 cmpl(cnt1, cnt2); 2471 jcc(Assembler::negative, RET_NOT_FOUND); // Left less than substring 2472 2473 addptr(result, (1<<scale1)); 2474 2475 } // (int_cnt2 > 8) 2476 2477 // Scan string for start of substr in 16-byte vectors 2478 bind(SCAN_TO_SUBSTR); 2479 pcmpestri(vec, Address(result, 0), mode); 2480 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 2481 subl(cnt1, stride); 2482 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 2483 cmpl(cnt1, cnt2); 2484 jccb(Assembler::negative, RET_NOT_FOUND); // Left less than substring 2485 addptr(result, 16); 2486 jmpb(SCAN_TO_SUBSTR); 2487 2488 // Found a potential substr 2489 bind(FOUND_CANDIDATE); 2490 // Matched whole vector if first element matched (tmp(rcx) == 0). 2491 if (int_cnt2 == stride) { 2492 jccb(Assembler::overflow, RET_FOUND); // OF == 1 2493 } else { // int_cnt2 > 8 2494 jccb(Assembler::overflow, FOUND_SUBSTR); 2495 } 2496 // After pcmpestri tmp(rcx) contains matched element index 2497 // Compute start addr of substr 2498 lea(result, Address(result, tmp, scale1)); 2499 2500 // Make sure string is still long enough 2501 subl(cnt1, tmp); 2502 cmpl(cnt1, cnt2); 2503 if (int_cnt2 == stride) { 2504 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 2505 } else { // int_cnt2 > 8 2506 jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD); 2507 } 2508 // Left less than substring. 2509 2510 bind(RET_NOT_FOUND); 2511 movl(result, -1); 2512 jmp(EXIT); 2513 2514 if (int_cnt2 > stride) { 2515 // This code is optimized for the case when whole substring 2516 // is matched if its head is matched. 2517 bind(MATCH_SUBSTR_HEAD); 2518 pcmpestri(vec, Address(result, 0), mode); 2519 // Reload only string if it does not match 2520 jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0 2521 2522 Label CONT_SCAN_SUBSTR; 2523 // Compare the rest of substring (> 8 chars). 2524 bind(FOUND_SUBSTR); 2525 // First 8 chars are already matched.
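// cnt2 is turned into a negative index below so the scan counts up toward zero; the SCAN_SUBSTR loop keeps iterating while it is still negative.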
2526 negptr(cnt2); 2527 addptr(cnt2, stride); 2528 2529 bind(SCAN_SUBSTR); 2530 subl(cnt1, stride); 2531 cmpl(cnt2, -stride); // Do not read beyond substring 2532 jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR); 2533 // Back-up strings to avoid reading beyond substring: 2534 // cnt1 = cnt1 - cnt2 + 8 2535 addl(cnt1, cnt2); // cnt2 is negative 2536 addl(cnt1, stride); 2537 movl(cnt2, stride); negptr(cnt2); 2538 bind(CONT_SCAN_SUBSTR); 2539 if (int_cnt2 < (int)G) { 2540 int tail_off1 = int_cnt2<<scale1; 2541 int tail_off2 = int_cnt2<<scale2; 2542 if (ae == StrIntrinsicNode::UL) { 2543 pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2)); 2544 } else { 2545 movdqu(vec, Address(str2, cnt2, scale2, tail_off2)); 2546 } 2547 pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode); 2548 } else { 2549 // calculate index in register to avoid integer overflow (int_cnt2*2) 2550 movl(tmp, int_cnt2); 2551 addptr(tmp, cnt2); 2552 if (ae == StrIntrinsicNode::UL) { 2553 pmovzxbw(vec, Address(str2, tmp, scale2, 0)); 2554 } else { 2555 movdqu(vec, Address(str2, tmp, scale2, 0)); 2556 } 2557 pcmpestri(vec, Address(result, tmp, scale1, 0), mode); 2558 } 2559 // Need to reload strings pointers if not matched whole vector 2560 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 2561 addptr(cnt2, stride); 2562 jcc(Assembler::negative, SCAN_SUBSTR); 2563 // Fall through if found full substring 2564 2565 } // (int_cnt2 > 8) 2566 2567 bind(RET_FOUND); 2568 // Found result if we matched full small substring. 2569 // Compute substr offset 2570 subptr(result, str1); 2571 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 2572 shrl(result, 1); // index 2573 } 2574 bind(EXIT); 2575 2576 } // string_indexofC8 2577 2578 // Small strings are loaded through stack if they cross page boundary. 2579 void C2_MacroAssembler::string_indexof(Register str1, Register str2, 2580 Register cnt1, Register cnt2, 2581 int int_cnt2, Register result, 2582 XMMRegister vec, Register tmp, 2583 int ae) { 2584 ShortBranchVerifier sbv(this); 2585 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2586 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2587 2588 // 2589 // int_cnt2 is length of small (< 8 chars) constant substring 2590 // or (-1) for non constant substring in which case its length 2591 // is in cnt2 register. 2592 // 2593 // Note, inline_string_indexOf() generates checks: 2594 // if (substr.count > string.count) return -1; 2595 // if (substr.count == 0) return 0; 2596 // 2597 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2598 assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0"); 2599 // This method uses the pcmpestri instruction with bound registers 2600 // inputs: 2601 // xmm - substring 2602 // rax - substring length (elements count) 2603 // mem - scanned string 2604 // rdx - string length (elements count) 2605 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2606 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2607 // outputs: 2608 // rcx - matched index in string 2609 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2610 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2611 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2612 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? 
Address::times_1 : scale1; 2613 2614 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR, 2615 RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR, 2616 FOUND_CANDIDATE; 2617 2618 { //======================================================== 2619 // We don't know where these strings are located 2620 // and we can't read beyond them. Load them through stack. 2621 Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR; 2622 2623 movptr(tmp, rsp); // save old SP 2624 2625 if (int_cnt2 > 0) { // small (< 8 chars) constant substring 2626 if (int_cnt2 == (1>>scale2)) { // One byte 2627 assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding"); 2628 load_unsigned_byte(result, Address(str2, 0)); 2629 movdl(vec, result); // move 32 bits 2630 } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) { // Three bytes 2631 // Not enough header space in 32-bit VM: 12+3 = 15. 2632 movl(result, Address(str2, -1)); 2633 shrl(result, 8); 2634 movdl(vec, result); // move 32 bits 2635 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) { // One char 2636 load_unsigned_short(result, Address(str2, 0)); 2637 movdl(vec, result); // move 32 bits 2638 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars 2639 movdl(vec, Address(str2, 0)); // move 32 bits 2640 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars 2641 movq(vec, Address(str2, 0)); // move 64 bits 2642 } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7}) 2643 // Array header size is 12 bytes in 32-bit VM 2644 // + 6 bytes for 3 chars == 18 bytes, 2645 // enough space to load vec and shift. 2646 assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity"); 2647 if (ae == StrIntrinsicNode::UL) { 2648 int tail_off = int_cnt2-8; 2649 pmovzxbw(vec, Address(str2, tail_off)); 2650 psrldq(vec, -2*tail_off); 2651 } 2652 else { 2653 int tail_off = int_cnt2*(1<<scale2); 2654 movdqu(vec, Address(str2, tail_off-16)); 2655 psrldq(vec, 16-tail_off); 2656 } 2657 } 2658 } else { // not constant substring 2659 cmpl(cnt2, stride); 2660 jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough 2661 2662 // We can read beyond string if str+16 does not cross page boundary 2663 // since heaps are aligned and mapped by pages. 2664 assert(os::vm_page_size() < (int)G, "default page should be small"); 2665 movl(result, str2); // We need only low 32 bits 2666 andl(result, (os::vm_page_size()-1)); 2667 cmpl(result, (os::vm_page_size()-16)); 2668 jccb(Assembler::belowEqual, CHECK_STR); 2669 2670 // Move small strings to stack to allow load 16 bytes into vec. 2671 subptr(rsp, 16); 2672 int stk_offset = wordSize-(1<<scale2); 2673 push(cnt2); 2674 2675 bind(COPY_SUBSTR); 2676 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) { 2677 load_unsigned_byte(result, Address(str2, cnt2, scale2, -1)); 2678 movb(Address(rsp, cnt2, scale2, stk_offset), result); 2679 } else if (ae == StrIntrinsicNode::UU) { 2680 load_unsigned_short(result, Address(str2, cnt2, scale2, -2)); 2681 movw(Address(rsp, cnt2, scale2, stk_offset), result); 2682 } 2683 decrement(cnt2); 2684 jccb(Assembler::notZero, COPY_SUBSTR); 2685 2686 pop(cnt2); 2687 movptr(str2, rsp); // New substring address 2688 } // non constant 2689 2690 bind(CHECK_STR); 2691 cmpl(cnt1, stride); 2692 jccb(Assembler::aboveEqual, BIG_STRINGS); 2693 2694 // Check cross page boundary.
2695 movl(result, str1); // We need only low 32 bits 2696 andl(result, (os::vm_page_size()-1)); 2697 cmpl(result, (os::vm_page_size()-16)); 2698 jccb(Assembler::belowEqual, BIG_STRINGS); 2699 2700 subptr(rsp, 16); 2701 int stk_offset = -(1<<scale1); 2702 if (int_cnt2 < 0) { // not constant 2703 push(cnt2); 2704 stk_offset += wordSize; 2705 } 2706 movl(cnt2, cnt1); 2707 2708 bind(COPY_STR); 2709 if (ae == StrIntrinsicNode::LL) { 2710 load_unsigned_byte(result, Address(str1, cnt2, scale1, -1)); 2711 movb(Address(rsp, cnt2, scale1, stk_offset), result); 2712 } else { 2713 load_unsigned_short(result, Address(str1, cnt2, scale1, -2)); 2714 movw(Address(rsp, cnt2, scale1, stk_offset), result); 2715 } 2716 decrement(cnt2); 2717 jccb(Assembler::notZero, COPY_STR); 2718 2719 if (int_cnt2 < 0) { // not constant 2720 pop(cnt2); 2721 } 2722 movptr(str1, rsp); // New string address 2723 2724 bind(BIG_STRINGS); 2725 // Load substring. 2726 if (int_cnt2 < 0) { // -1 2727 if (ae == StrIntrinsicNode::UL) { 2728 pmovzxbw(vec, Address(str2, 0)); 2729 } else { 2730 movdqu(vec, Address(str2, 0)); 2731 } 2732 push(cnt2); // substr count 2733 push(str2); // substr addr 2734 push(str1); // string addr 2735 } else { 2736 // Small (< 8 chars) constant substrings are loaded already. 2737 movl(cnt2, int_cnt2); 2738 } 2739 push(tmp); // original SP 2740 2741 } // Finished loading 2742 2743 //======================================================== 2744 // Start search 2745 // 2746 2747 movptr(result, str1); // string addr 2748 2749 if (int_cnt2 < 0) { // Only for non constant substring 2750 jmpb(SCAN_TO_SUBSTR); 2751 2752 // SP saved at sp+0 2753 // String saved at sp+1*wordSize 2754 // Substr saved at sp+2*wordSize 2755 // Substr count saved at sp+3*wordSize 2756 2757 // Reload substr for rescan, this code 2758 // is executed only for large substrings (> 8 chars) 2759 bind(RELOAD_SUBSTR); 2760 movptr(str2, Address(rsp, 2*wordSize)); 2761 movl(cnt2, Address(rsp, 3*wordSize)); 2762 if (ae == StrIntrinsicNode::UL) { 2763 pmovzxbw(vec, Address(str2, 0)); 2764 } else { 2765 movdqu(vec, Address(str2, 0)); 2766 } 2767 // We came here after the beginning of the substring was 2768 // matched but the rest of it was not so we need to search 2769 // again. Start from the next element after the previous match. 2770 subptr(str1, result); // Restore counter 2771 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 2772 shrl(str1, 1); 2773 } 2774 addl(cnt1, str1); 2775 decrementl(cnt1); // Shift to next element 2776 cmpl(cnt1, cnt2); 2777 jcc(Assembler::negative, RET_NOT_FOUND); // Left less than substring 2778 2779 addptr(result, (1<<scale1)); 2780 } // non constant 2781 2782 // Scan string for start of substr in 16-byte vectors 2783 bind(SCAN_TO_SUBSTR); 2784 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2785 pcmpestri(vec, Address(result, 0), mode); 2786 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 2787 subl(cnt1, stride); 2788 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 2789 cmpl(cnt1, cnt2); 2790 jccb(Assembler::negative, RET_NOT_FOUND); // Left less than substring 2791 addptr(result, 16); 2792 2793 bind(ADJUST_STR); 2794 cmpl(cnt1, stride); // Do not read beyond string 2795 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 2796 // Back-up string to avoid reading beyond string.
2797 lea(result, Address(result, cnt1, scale1, -16)); 2798 movl(cnt1, stride); 2799 jmpb(SCAN_TO_SUBSTR); 2800 2801 // Found a potential substr 2802 bind(FOUND_CANDIDATE); 2803 // After pcmpestri tmp(rcx) contains matched element index 2804 2805 // Make sure string is still long enough 2806 subl(cnt1, tmp); 2807 cmpl(cnt1, cnt2); 2808 jccb(Assembler::greaterEqual, FOUND_SUBSTR); 2809 // Left less than substring. 2810 2811 bind(RET_NOT_FOUND); 2812 movl(result, -1); 2813 jmp(CLEANUP); 2814 2815 bind(FOUND_SUBSTR); 2816 // Compute start addr of substr 2817 lea(result, Address(result, tmp, scale1)); 2818 if (int_cnt2 > 0) { // Constant substring 2819 // Repeat search for small substring (< 8 chars) 2820 // from new point without reloading substring. 2821 // Have to check that we don't read beyond string. 2822 cmpl(tmp, stride-int_cnt2); 2823 jccb(Assembler::greater, ADJUST_STR); 2824 // Fall through if matched whole substring. 2825 } else { // non constant 2826 assert(int_cnt2 == -1, "should be != 0"); 2827 2828 addl(tmp, cnt2); 2829 // Found result if we matched whole substring. 2830 cmpl(tmp, stride); 2831 jcc(Assembler::lessEqual, RET_FOUND); 2832 2833 // Repeat search for small substring (<= 8 chars) 2834 // from new point 'str1' without reloading substring. 2835 cmpl(cnt2, stride); 2836 // Have to check that we don't read beyond string. 2837 jccb(Assembler::lessEqual, ADJUST_STR); 2838 2839 Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG; 2840 // Compare the rest of substring (> 8 chars). 2841 movptr(str1, result); 2842 2843 cmpl(tmp, cnt2); 2844 // First 8 chars are already matched. 2845 jccb(Assembler::equal, CHECK_NEXT); 2846 2847 bind(SCAN_SUBSTR); 2848 pcmpestri(vec, Address(str1, 0), mode); 2849 // Need to reload strings pointers if not matched whole vector 2850 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 2851 2852 bind(CHECK_NEXT); 2853 subl(cnt2, stride); 2854 jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring 2855 addptr(str1, 16); 2856 if (ae == StrIntrinsicNode::UL) { 2857 addptr(str2, 8); 2858 } else { 2859 addptr(str2, 16); 2860 } 2861 subl(cnt1, stride); 2862 cmpl(cnt2, stride); // Do not read beyond substring 2863 jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR); 2864 // Back-up strings to avoid reading beyond substring.
2865 2866 if (ae == StrIntrinsicNode::UL) { 2867 lea(str2, Address(str2, cnt2, scale2, -8)); 2868 lea(str1, Address(str1, cnt2, scale1, -16)); 2869 } else { 2870 lea(str2, Address(str2, cnt2, scale2, -16)); 2871 lea(str1, Address(str1, cnt2, scale1, -16)); 2872 } 2873 subl(cnt1, cnt2); 2874 movl(cnt2, stride); 2875 addl(cnt1, stride); 2876 bind(CONT_SCAN_SUBSTR); 2877 if (ae == StrIntrinsicNode::UL) { 2878 pmovzxbw(vec, Address(str2, 0)); 2879 } else { 2880 movdqu(vec, Address(str2, 0)); 2881 } 2882 jmp(SCAN_SUBSTR); 2883 2884 bind(RET_FOUND_LONG); 2885 movptr(str1, Address(rsp, wordSize)); 2886 } // non constant 2887 2888 bind(RET_FOUND); 2889 // Compute substr offset 2890 subptr(result, str1); 2891 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 2892 shrl(result, 1); // index 2893 } 2894 bind(CLEANUP); 2895 pop(rsp); // restore SP 2896 2897 } // string_indexof 2898 2899 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result, 2900 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 2901 ShortBranchVerifier sbv(this); 2902 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2903 2904 int stride = 8; 2905 2906 Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP, 2907 SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP, 2908 RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT, 2909 FOUND_SEQ_CHAR, DONE_LABEL; 2910 2911 movptr(result, str1); 2912 if (UseAVX >= 2) { 2913 cmpl(cnt1, stride); 2914 jcc(Assembler::less, SCAN_TO_CHAR); 2915 cmpl(cnt1, 2*stride); 2916 jcc(Assembler::less, SCAN_TO_8_CHAR_INIT); 2917 movdl(vec1, ch); 2918 vpbroadcastw(vec1, vec1, Assembler::AVX_256bit); 2919 vpxor(vec2, vec2); 2920 movl(tmp, cnt1); 2921 andl(tmp, 0xFFFFFFF0); //vector count (in chars) 2922 andl(cnt1,0x0000000F); //tail count (in chars) 2923 2924 bind(SCAN_TO_16_CHAR_LOOP); 2925 vmovdqu(vec3, Address(result, 0)); 2926 vpcmpeqw(vec3, vec3, vec1, 1); 2927 vptest(vec2, vec3); 2928 jcc(Assembler::carryClear, FOUND_CHAR); 2929 addptr(result, 32); 2930 subl(tmp, 2*stride); 2931 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); 2932 jmp(SCAN_TO_8_CHAR); 2933 bind(SCAN_TO_8_CHAR_INIT); 2934 movdl(vec1, ch); 2935 pshuflw(vec1, vec1, 0x00); 2936 pshufd(vec1, vec1, 0); 2937 pxor(vec2, vec2); 2938 } 2939 bind(SCAN_TO_8_CHAR); 2940 cmpl(cnt1, stride); 2941 jcc(Assembler::less, SCAN_TO_CHAR); 2942 if (UseAVX < 2) { 2943 movdl(vec1, ch); 2944 pshuflw(vec1, vec1, 0x00); 2945 pshufd(vec1, vec1, 0); 2946 pxor(vec2, vec2); 2947 } 2948 movl(tmp, cnt1); 2949 andl(tmp, 0xFFFFFFF8); //vector count (in chars) 2950 andl(cnt1,0x00000007); //tail count (in chars) 2951 2952 bind(SCAN_TO_8_CHAR_LOOP); 2953 movdqu(vec3, Address(result, 0)); 2954 pcmpeqw(vec3, vec1); 2955 ptest(vec2, vec3); 2956 jcc(Assembler::carryClear, FOUND_CHAR); 2957 addptr(result, 16); 2958 subl(tmp, stride); 2959 jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP); 2960 bind(SCAN_TO_CHAR); 2961 testl(cnt1, cnt1); 2962 jcc(Assembler::zero, RET_NOT_FOUND); 2963 bind(SCAN_TO_CHAR_LOOP); 2964 load_unsigned_short(tmp, Address(result, 0)); 2965 cmpl(ch, tmp); 2966 jccb(Assembler::equal, FOUND_SEQ_CHAR); 2967 addptr(result, 2); 2968 subl(cnt1, 1); 2969 jccb(Assembler::zero, RET_NOT_FOUND); 2970 jmp(SCAN_TO_CHAR_LOOP); 2971 2972 bind(RET_NOT_FOUND); 2973 movl(result, -1); 2974 jmpb(DONE_LABEL); 2975 2976 bind(FOUND_CHAR); 2977 if (UseAVX >= 2) { 2978 vpmovmskb(tmp, vec3); 2979 } else { 2980 pmovmskb(tmp, vec3); 2981 } 2982 bsfl(ch, tmp); 2983 addptr(result, ch); 2984 2985 bind(FOUND_SEQ_CHAR); 2986 
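// Convert the address of the match back into a char index relative to the start of the string.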
subptr(result, str1); 2987 shrl(result, 1); 2988 2989 bind(DONE_LABEL); 2990 } // string_indexof_char 2991 2992 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result, 2993 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 2994 ShortBranchVerifier sbv(this); 2995 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2996 2997 int stride = 16; 2998 2999 Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP, 3000 SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP, 3001 RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT, 3002 FOUND_SEQ_CHAR, DONE_LABEL; 3003 3004 movptr(result, str1); 3005 if (UseAVX >= 2) { 3006 cmpl(cnt1, stride); 3007 jcc(Assembler::less, SCAN_TO_CHAR_INIT); 3008 cmpl(cnt1, stride*2); 3009 jcc(Assembler::less, SCAN_TO_16_CHAR_INIT); 3010 movdl(vec1, ch); 3011 vpbroadcastb(vec1, vec1, Assembler::AVX_256bit); 3012 vpxor(vec2, vec2); 3013 movl(tmp, cnt1); 3014 andl(tmp, 0xFFFFFFE0); //vector count (in chars) 3015 andl(cnt1,0x0000001F); //tail count (in chars) 3016 3017 bind(SCAN_TO_32_CHAR_LOOP); 3018 vmovdqu(vec3, Address(result, 0)); 3019 vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit); 3020 vptest(vec2, vec3); 3021 jcc(Assembler::carryClear, FOUND_CHAR); 3022 addptr(result, 32); 3023 subl(tmp, stride*2); 3024 jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP); 3025 jmp(SCAN_TO_16_CHAR); 3026 3027 bind(SCAN_TO_16_CHAR_INIT); 3028 movdl(vec1, ch); 3029 pxor(vec2, vec2); 3030 pshufb(vec1, vec2); 3031 } 3032 3033 bind(SCAN_TO_16_CHAR); 3034 cmpl(cnt1, stride); 3035 jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left 3036 if (UseAVX < 2) { 3037 movdl(vec1, ch); 3038 pxor(vec2, vec2); 3039 pshufb(vec1, vec2); 3040 } 3041 movl(tmp, cnt1); 3042 andl(tmp, 0xFFFFFFF0); //vector count (in bytes) 3043 andl(cnt1,0x0000000F); //tail count (in bytes) 3044 3045 bind(SCAN_TO_16_CHAR_LOOP); 3046 movdqu(vec3, Address(result, 0)); 3047 pcmpeqb(vec3, vec1); 3048 ptest(vec2, vec3); 3049 jcc(Assembler::carryClear, FOUND_CHAR); 3050 addptr(result, 16); 3051 subl(tmp, stride); 3052 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items...
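// Scalar tail: any remaining (< 16) bytes are compared one at a time below.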
3053 3054 bind(SCAN_TO_CHAR_INIT); 3055 testl(cnt1, cnt1); 3056 jcc(Assembler::zero, RET_NOT_FOUND); 3057 bind(SCAN_TO_CHAR_LOOP); 3058 load_unsigned_byte(tmp, Address(result, 0)); 3059 cmpl(ch, tmp); 3060 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3061 addptr(result, 1); 3062 subl(cnt1, 1); 3063 jccb(Assembler::zero, RET_NOT_FOUND); 3064 jmp(SCAN_TO_CHAR_LOOP); 3065 3066 bind(RET_NOT_FOUND); 3067 movl(result, -1); 3068 jmpb(DONE_LABEL); 3069 3070 bind(FOUND_CHAR); 3071 if (UseAVX >= 2) { 3072 vpmovmskb(tmp, vec3); 3073 } else { 3074 pmovmskb(tmp, vec3); 3075 } 3076 bsfl(ch, tmp); 3077 addptr(result, ch); 3078 3079 bind(FOUND_SEQ_CHAR); 3080 subptr(result, str1); 3081 3082 bind(DONE_LABEL); 3083 } // stringL_indexof_char 3084 3085 // helper function for string_compare 3086 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2, 3087 Address::ScaleFactor scale, Address::ScaleFactor scale1, 3088 Address::ScaleFactor scale2, Register index, int ae) { 3089 if (ae == StrIntrinsicNode::LL) { 3090 load_unsigned_byte(elem1, Address(str1, index, scale, 0)); 3091 load_unsigned_byte(elem2, Address(str2, index, scale, 0)); 3092 } else if (ae == StrIntrinsicNode::UU) { 3093 load_unsigned_short(elem1, Address(str1, index, scale, 0)); 3094 load_unsigned_short(elem2, Address(str2, index, scale, 0)); 3095 } else { 3096 load_unsigned_byte(elem1, Address(str1, index, scale1, 0)); 3097 load_unsigned_short(elem2, Address(str2, index, scale2, 0)); 3098 } 3099 } 3100 3101 // Compare strings, used for char[] and byte[]. 3102 void C2_MacroAssembler::string_compare(Register str1, Register str2, 3103 Register cnt1, Register cnt2, Register result, 3104 XMMRegister vec1, int ae, KRegister mask) { 3105 ShortBranchVerifier sbv(this); 3106 Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL; 3107 Label COMPARE_WIDE_VECTORS_LOOP_FAILED; // used only _LP64 && AVX3 3108 int stride, stride2, adr_stride, adr_stride1, adr_stride2; 3109 int stride2x2 = 0x40; 3110 Address::ScaleFactor scale = Address::no_scale; 3111 Address::ScaleFactor scale1 = Address::no_scale; 3112 Address::ScaleFactor scale2 = Address::no_scale; 3113 3114 if (ae != StrIntrinsicNode::LL) { 3115 stride2x2 = 0x20; 3116 } 3117 3118 if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) { 3119 shrl(cnt2, 1); 3120 } 3121 // Compute the minimum of the string lengths and the 3122 // difference of the string lengths (stack). 3123 // Do the conditional move stuff 3124 movl(result, cnt1); 3125 subl(cnt1, cnt2); 3126 push(cnt1); 3127 cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2) 3128 3129 // Is the minimum length zero? 
3130 testl(cnt2, cnt2); 3131 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3132 if (ae == StrIntrinsicNode::LL) { 3133 // Load first bytes 3134 load_unsigned_byte(result, Address(str1, 0)); // result = str1[0] 3135 load_unsigned_byte(cnt1, Address(str2, 0)); // cnt1 = str2[0] 3136 } else if (ae == StrIntrinsicNode::UU) { 3137 // Load first characters 3138 load_unsigned_short(result, Address(str1, 0)); 3139 load_unsigned_short(cnt1, Address(str2, 0)); 3140 } else { 3141 load_unsigned_byte(result, Address(str1, 0)); 3142 load_unsigned_short(cnt1, Address(str2, 0)); 3143 } 3144 subl(result, cnt1); 3145 jcc(Assembler::notZero, POP_LABEL); 3146 3147 if (ae == StrIntrinsicNode::UU) { 3148 // Divide length by 2 to get number of chars 3149 shrl(cnt2, 1); 3150 } 3151 cmpl(cnt2, 1); 3152 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3153 3154 // Check if the strings start at the same location and setup scale and stride 3155 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3156 cmpptr(str1, str2); 3157 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3158 if (ae == StrIntrinsicNode::LL) { 3159 scale = Address::times_1; 3160 stride = 16; 3161 } else { 3162 scale = Address::times_2; 3163 stride = 8; 3164 } 3165 } else { 3166 scale1 = Address::times_1; 3167 scale2 = Address::times_2; 3168 // scale not used 3169 stride = 8; 3170 } 3171 3172 if (UseAVX >= 2 && UseSSE42Intrinsics) { 3173 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR; 3174 Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR; 3175 Label COMPARE_WIDE_VECTORS_LOOP_AVX2; 3176 Label COMPARE_TAIL_LONG; 3177 Label COMPARE_WIDE_VECTORS_LOOP_AVX3; // used only _LP64 && AVX3 3178 3179 int pcmpmask = 0x19; 3180 if (ae == StrIntrinsicNode::LL) { 3181 pcmpmask &= ~0x01; 3182 } 3183 3184 // Setup to compare 16-chars (32-bytes) vectors, 3185 // start from first character again because it has aligned address. 3186 if (ae == StrIntrinsicNode::LL) { 3187 stride2 = 32; 3188 } else { 3189 stride2 = 16; 3190 } 3191 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3192 adr_stride = stride << scale; 3193 } else { 3194 adr_stride1 = 8; //stride << scale1; 3195 adr_stride2 = 16; //stride << scale2; 3196 } 3197 3198 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3199 // rax and rdx are used by pcmpestri as elements counters 3200 movl(result, cnt2); 3201 andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count 3202 jcc(Assembler::zero, COMPARE_TAIL_LONG); 3203 3204 // fast path : compare first 2 8-char vectors. 
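// Note on the immediate: pcmpmask 0x19 selects the "equal each" string compare with
// negated result on unsigned words; clearing bit 0 (done above for LL) switches the
// element size to unsigned bytes. The same encoding is spelled out at the pcmpestri
// comment in the SSE4.2-only path further down.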
3205 bind(COMPARE_16_CHARS); 3206 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3207 movdqu(vec1, Address(str1, 0)); 3208 } else { 3209 pmovzxbw(vec1, Address(str1, 0)); 3210 } 3211 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3212 jccb(Assembler::below, COMPARE_INDEX_CHAR); 3213 3214 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3215 movdqu(vec1, Address(str1, adr_stride)); 3216 pcmpestri(vec1, Address(str2, adr_stride), pcmpmask); 3217 } else { 3218 pmovzxbw(vec1, Address(str1, adr_stride1)); 3219 pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask); 3220 } 3221 jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS); 3222 addl(cnt1, stride); 3223 3224 // Compare the characters at index in cnt1 3225 bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character 3226 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 3227 subl(result, cnt2); 3228 jmp(POP_LABEL); 3229 3230 // Setup the registers to start vector comparison loop 3231 bind(COMPARE_WIDE_VECTORS); 3232 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3233 lea(str1, Address(str1, result, scale)); 3234 lea(str2, Address(str2, result, scale)); 3235 } else { 3236 lea(str1, Address(str1, result, scale1)); 3237 lea(str2, Address(str2, result, scale2)); 3238 } 3239 subl(result, stride2); 3240 subl(cnt2, stride2); 3241 jcc(Assembler::zero, COMPARE_WIDE_TAIL); 3242 negptr(result); 3243 3244 // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest) 3245 bind(COMPARE_WIDE_VECTORS_LOOP); 3246 3247 #ifdef _LP64 3248 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 3249 cmpl(cnt2, stride2x2); 3250 jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2); 3251 testl(cnt2, stride2x2-1); // cnt2 holds the vector count 3252 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2); // means we cannot subtract by 0x40 3253 3254 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 3255 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3256 evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit); 3257 evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3258 } else { 3259 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit); 3260 evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3261 } 3262 kortestql(mask, mask); 3263 jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED); // miscompare 3264 addptr(result, stride2x2); // update since we already compared at this addr 3265 subl(cnt2, stride2x2); // and sub the size too 3266 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3); 3267 3268 vpxor(vec1, vec1); 3269 jmpb(COMPARE_WIDE_TAIL); 3270 }//if (VM_Version::supports_avx512vlbw()) 3271 #endif // _LP64 3272 3273 3274 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3275 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3276 vmovdqu(vec1, Address(str1, result, scale)); 3277 vpxor(vec1, Address(str2, result, scale)); 3278 } else { 3279 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit); 3280 vpxor(vec1, Address(str2, result, scale2)); 3281 } 3282 vptest(vec1, vec1); 3283 jcc(Assembler::notZero, VECTOR_NOT_EQUAL); 3284 addptr(result, stride2); 3285 subl(cnt2, stride2); 3286 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP); 3287 // clean upper bits of YMM registers 
3288 vpxor(vec1, vec1); 3289 3290 // compare wide vectors tail 3291 bind(COMPARE_WIDE_TAIL); 3292 testptr(result, result); 3293 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3294 3295 movl(result, stride2); 3296 movl(cnt2, result); 3297 negptr(result); 3298 jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3299 3300 // Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors. 3301 bind(VECTOR_NOT_EQUAL); 3302 // clean upper bits of YMM registers 3303 vpxor(vec1, vec1); 3304 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3305 lea(str1, Address(str1, result, scale)); 3306 lea(str2, Address(str2, result, scale)); 3307 } else { 3308 lea(str1, Address(str1, result, scale1)); 3309 lea(str2, Address(str2, result, scale2)); 3310 } 3311 jmp(COMPARE_16_CHARS); 3312 3313 // Compare tail chars, length between 1 to 15 chars 3314 bind(COMPARE_TAIL_LONG); 3315 movl(cnt2, result); 3316 cmpl(cnt2, stride); 3317 jcc(Assembler::less, COMPARE_SMALL_STR); 3318 3319 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3320 movdqu(vec1, Address(str1, 0)); 3321 } else { 3322 pmovzxbw(vec1, Address(str1, 0)); 3323 } 3324 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3325 jcc(Assembler::below, COMPARE_INDEX_CHAR); 3326 subptr(cnt2, stride); 3327 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3328 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3329 lea(str1, Address(str1, result, scale)); 3330 lea(str2, Address(str2, result, scale)); 3331 } else { 3332 lea(str1, Address(str1, result, scale1)); 3333 lea(str2, Address(str2, result, scale2)); 3334 } 3335 negptr(cnt2); 3336 jmpb(WHILE_HEAD_LABEL); 3337 3338 bind(COMPARE_SMALL_STR); 3339 } else if (UseSSE42Intrinsics) { 3340 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL; 3341 int pcmpmask = 0x19; 3342 // Setup to compare 8-char (16-byte) vectors, 3343 // start from first character again because it has aligned address. 
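// Note: in this SSE4.2-only path each 16-byte load covers 'stride' elements,
// 16 bytes for LL and 8 chars for UU, while for LU/UL eight bytes of the
// byte-encoded string (str1) are zero-extended to chars with pmovzxbw before the compare.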
3344 movl(result, cnt2); 3345 andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count 3346 if (ae == StrIntrinsicNode::LL) { 3347 pcmpmask &= ~0x01; 3348 } 3349 jcc(Assembler::zero, COMPARE_TAIL); 3350 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3351 lea(str1, Address(str1, result, scale)); 3352 lea(str2, Address(str2, result, scale)); 3353 } else { 3354 lea(str1, Address(str1, result, scale1)); 3355 lea(str2, Address(str2, result, scale2)); 3356 } 3357 negptr(result); 3358 3359 // pcmpestri 3360 // inputs: 3361 // vec1- substring 3362 // rax - negative string length (elements count) 3363 // mem - scanned string 3364 // rdx - string length (elements count) 3365 // pcmpmask - cmp mode: 11000 (string compare with negated result) 3366 // + 00 (unsigned bytes) or + 01 (unsigned shorts) 3367 // outputs: 3368 // rcx - first mismatched element index 3369 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3370 3371 bind(COMPARE_WIDE_VECTORS); 3372 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3373 movdqu(vec1, Address(str1, result, scale)); 3374 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 3375 } else { 3376 pmovzxbw(vec1, Address(str1, result, scale1)); 3377 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 3378 } 3379 // After pcmpestri cnt1(rcx) contains mismatched element index 3380 3381 jccb(Assembler::below, VECTOR_NOT_EQUAL); // CF==1 3382 addptr(result, stride); 3383 subptr(cnt2, stride); 3384 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 3385 3386 // compare wide vectors tail 3387 testptr(result, result); 3388 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3389 3390 movl(cnt2, stride); 3391 movl(result, stride); 3392 negptr(result); 3393 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3394 movdqu(vec1, Address(str1, result, scale)); 3395 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 3396 } else { 3397 pmovzxbw(vec1, Address(str1, result, scale1)); 3398 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 3399 } 3400 jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL); 3401 3402 // Mismatched characters in the vectors 3403 bind(VECTOR_NOT_EQUAL); 3404 addptr(cnt1, result); 3405 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 3406 subl(result, cnt2); 3407 jmpb(POP_LABEL); 3408 3409 bind(COMPARE_TAIL); // limit is zero 3410 movl(cnt2, result); 3411 // Fallthru to tail compare 3412 } 3413 // Shift str2 and str1 to the end of the arrays, negate min 3414 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3415 lea(str1, Address(str1, cnt2, scale)); 3416 lea(str2, Address(str2, cnt2, scale)); 3417 } else { 3418 lea(str1, Address(str1, cnt2, scale1)); 3419 lea(str2, Address(str2, cnt2, scale2)); 3420 } 3421 decrementl(cnt2); // first character was compared already 3422 negptr(cnt2); 3423 3424 // Compare the rest of the elements 3425 bind(WHILE_HEAD_LABEL); 3426 load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae); 3427 subl(result, cnt1); 3428 jccb(Assembler::notZero, POP_LABEL); 3429 increment(cnt2); 3430 jccb(Assembler::notZero, WHILE_HEAD_LABEL); 3431 3432 // Strings are equal up to min length. Return the length difference. 
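// Note: the pop below retrieves the cnt1 - cnt2 value pushed at entry; for UU it is
// still a byte difference at this point, hence the extra shift that turns it into a
// char count (for example, equal-prefix 5-char and 3-char strings give 4 bytes -> +2 chars).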
3433 bind(LENGTH_DIFF_LABEL); 3434 pop(result); 3435 if (ae == StrIntrinsicNode::UU) { 3436 // Divide diff by 2 to get number of chars 3437 sarl(result, 1); 3438 } 3439 jmpb(DONE_LABEL); 3440 3441 #ifdef _LP64 3442 if (VM_Version::supports_avx512vlbw()) { 3443 3444 bind(COMPARE_WIDE_VECTORS_LOOP_FAILED); 3445 3446 kmovql(cnt1, mask); 3447 notq(cnt1); 3448 bsfq(cnt2, cnt1); 3449 if (ae != StrIntrinsicNode::LL) { 3450 // Divide diff by 2 to get number of chars 3451 sarl(cnt2, 1); 3452 } 3453 addq(result, cnt2); 3454 if (ae == StrIntrinsicNode::LL) { 3455 load_unsigned_byte(cnt1, Address(str2, result)); 3456 load_unsigned_byte(result, Address(str1, result)); 3457 } else if (ae == StrIntrinsicNode::UU) { 3458 load_unsigned_short(cnt1, Address(str2, result, scale)); 3459 load_unsigned_short(result, Address(str1, result, scale)); 3460 } else { 3461 load_unsigned_short(cnt1, Address(str2, result, scale2)); 3462 load_unsigned_byte(result, Address(str1, result, scale1)); 3463 } 3464 subl(result, cnt1); 3465 jmpb(POP_LABEL); 3466 }//if (VM_Version::supports_avx512vlbw()) 3467 #endif // _LP64 3468 3469 // Discard the stored length difference 3470 bind(POP_LABEL); 3471 pop(cnt1); 3472 3473 // That's it 3474 bind(DONE_LABEL); 3475 if(ae == StrIntrinsicNode::UL) { 3476 negl(result); 3477 } 3478 3479 } 3480 3481 // Search for Non-ASCII character (Negative byte value) in a byte array, 3482 // return true if it has any and false otherwise. 3483 // ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java 3484 // @IntrinsicCandidate 3485 // private static boolean hasNegatives(byte[] ba, int off, int len) { 3486 // for (int i = off; i < off + len; i++) { 3487 // if (ba[i] < 0) { 3488 // return true; 3489 // } 3490 // } 3491 // return false; 3492 // } 3493 void C2_MacroAssembler::has_negatives(Register ary1, Register len, 3494 Register result, Register tmp1, 3495 XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) { 3496 // rsi: byte array 3497 // rcx: len 3498 // rax: result 3499 ShortBranchVerifier sbv(this); 3500 assert_different_registers(ary1, len, result, tmp1); 3501 assert_different_registers(vec1, vec2); 3502 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE; 3503 3504 // len == 0 3505 testl(len, len); 3506 jcc(Assembler::zero, FALSE_LABEL); 3507 3508 if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512 3509 VM_Version::supports_avx512vlbw() && 3510 VM_Version::supports_bmi2()) { 3511 3512 Label test_64_loop, test_tail; 3513 Register tmp3_aliased = len; 3514 3515 movl(tmp1, len); 3516 vpxor(vec2, vec2, vec2, Assembler::AVX_512bit); 3517 3518 andl(tmp1, 64 - 1); // tail count (in chars) 0x3F 3519 andl(len, ~(64 - 1)); // vector count (in chars) 3520 jccb(Assembler::zero, test_tail); 3521 3522 lea(ary1, Address(ary1, len, Address::times_1)); 3523 negptr(len); 3524 3525 bind(test_64_loop); 3526 // Check whether our 64 elements of size byte contain negatives 3527 evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit); 3528 kortestql(mask1, mask1); 3529 jcc(Assembler::notZero, TRUE_LABEL); 3530 3531 addptr(len, 64); 3532 jccb(Assembler::notZero, test_64_loop); 3533 3534 3535 bind(test_tail); 3536 // bail out when there is nothing to be done 3537 testl(tmp1, -1); 3538 jcc(Assembler::zero, FALSE_LABEL); 3539 3540 // ~(~0 << len) applied up to two times (for 32-bit scenario) 3541 #ifdef _LP64 3542 mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF); 3543 shlxq(tmp3_aliased, tmp3_aliased, tmp1); 3544 notq(tmp3_aliased); 3545 kmovql(mask2, 
tmp3_aliased); 3546 #else 3547 Label k_init; 3548 jmp(k_init); 3549 3550 // We cannot read 64 bits from a general purpose register, thus we move 3551 // the data required to compose 64 1's into the instruction stream. 3552 // We emit a 64-byte-wide series of elements from 0..63 which later on will 3553 // be used as compare targets with the tail count contained in the tmp1 register. 3554 // The result is a k register having tmp1 consecutive 1s, 3555 // counting from the least significant bit. 3556 address tmp = pc(); 3557 emit_int64(0x0706050403020100); 3558 emit_int64(0x0F0E0D0C0B0A0908); 3559 emit_int64(0x1716151413121110); 3560 emit_int64(0x1F1E1D1C1B1A1918); 3561 emit_int64(0x2726252423222120); 3562 emit_int64(0x2F2E2D2C2B2A2928); 3563 emit_int64(0x3736353433323130); 3564 emit_int64(0x3F3E3D3C3B3A3938); 3565 3566 bind(k_init); 3567 lea(len, InternalAddress(tmp)); 3568 // create mask to test for negative byte inside a vector 3569 evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit); 3570 evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit); 3571 3572 #endif 3573 evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit); 3574 ktestq(mask1, mask2); 3575 jcc(Assembler::notZero, TRUE_LABEL); 3576 3577 jmp(FALSE_LABEL); 3578 } else { 3579 movl(result, len); // copy 3580 3581 if (UseAVX >= 2 && UseSSE >= 2) { 3582 // With AVX2, use 32-byte vector compare 3583 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 3584 3585 // Compare 32-byte vectors 3586 andl(result, 0x0000001f); // tail count (in bytes) 3587 andl(len, 0xffffffe0); // vector count (in bytes) 3588 jccb(Assembler::zero, COMPARE_TAIL); 3589 3590 lea(ary1, Address(ary1, len, Address::times_1)); 3591 negptr(len); 3592 3593 movl(tmp1, 0x80808080); // create mask to test for non-ASCII (negative) bytes in the vector 3594 movdl(vec2, tmp1); 3595 vpbroadcastd(vec2, vec2, Assembler::AVX_256bit); 3596 3597 bind(COMPARE_WIDE_VECTORS); 3598 vmovdqu(vec1, Address(ary1, len, Address::times_1)); 3599 vptest(vec1, vec2); 3600 jccb(Assembler::notZero, TRUE_LABEL); 3601 addptr(len, 32); 3602 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 3603 3604 testl(result, result); 3605 jccb(Assembler::zero, FALSE_LABEL); 3606 3607 vmovdqu(vec1, Address(ary1, result, Address::times_1, -32)); 3608 vptest(vec1, vec2); 3609 jccb(Assembler::notZero, TRUE_LABEL); 3610 jmpb(FALSE_LABEL); 3611 3612 bind(COMPARE_TAIL); // len is zero 3613 movl(len, result); 3614 // Fallthru to tail compare 3615 } else if (UseSSE42Intrinsics) { 3616 // With SSE4.2, use double quad vector compare 3617 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 3618 3619 // Compare 16-byte vectors 3620 andl(result, 0x0000000f); // tail count (in bytes) 3621 andl(len, 0xfffffff0); // vector count (in bytes) 3622 jcc(Assembler::zero, COMPARE_TAIL); 3623 3624 lea(ary1, Address(ary1, len, Address::times_1)); 3625 negptr(len); 3626 3627 movl(tmp1, 0x80808080); 3628 movdl(vec2, tmp1); 3629 pshufd(vec2, vec2, 0); 3630 3631 bind(COMPARE_WIDE_VECTORS); 3632 movdqu(vec1, Address(ary1, len, Address::times_1)); 3633 ptest(vec1, vec2); 3634 jcc(Assembler::notZero, TRUE_LABEL); 3635 addptr(len, 16); 3636 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 3637 3638 testl(result, result); 3639 jcc(Assembler::zero, FALSE_LABEL); 3640 3641 movdqu(vec1, Address(ary1, result, Address::times_1, -16)); 3642 ptest(vec1, vec2); 3643 jccb(Assembler::notZero, TRUE_LABEL); 3644 jmpb(FALSE_LABEL); 3645 3646 bind(COMPARE_TAIL); // len is zero 3647 movl(len, result); 3648 // Fallthru to tail compare 3649 } 3650 } 3651 // Compare 4-byte vectors 3652
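// Note: the scalar loop below checks the sign bits of four bytes at a time; AND-ing a
// 32-bit word with 0x80808080 is non-zero exactly when at least one of the four bytes
// has its top bit set, i.e. is a negative (non-ASCII) byte.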
andl(len, 0xfffffffc); // vector count (in bytes) 3653 jccb(Assembler::zero, COMPARE_CHAR); 3654 3655 lea(ary1, Address(ary1, len, Address::times_1)); 3656 negptr(len); 3657 3658 bind(COMPARE_VECTORS); 3659 movl(tmp1, Address(ary1, len, Address::times_1)); 3660 andl(tmp1, 0x80808080); 3661 jccb(Assembler::notZero, TRUE_LABEL); 3662 addptr(len, 4); 3663 jcc(Assembler::notZero, COMPARE_VECTORS); 3664 3665 // Compare trailing char (final 2 bytes), if any 3666 bind(COMPARE_CHAR); 3667 testl(result, 0x2); // tail char 3668 jccb(Assembler::zero, COMPARE_BYTE); 3669 load_unsigned_short(tmp1, Address(ary1, 0)); 3670 andl(tmp1, 0x00008080); 3671 jccb(Assembler::notZero, TRUE_LABEL); 3672 subptr(result, 2); 3673 lea(ary1, Address(ary1, 2)); 3674 3675 bind(COMPARE_BYTE); 3676 testl(result, 0x1); // tail byte 3677 jccb(Assembler::zero, FALSE_LABEL); 3678 load_unsigned_byte(tmp1, Address(ary1, 0)); 3679 andl(tmp1, 0x00000080); 3680 jccb(Assembler::notEqual, TRUE_LABEL); 3681 jmpb(FALSE_LABEL); 3682 3683 bind(TRUE_LABEL); 3684 movl(result, 1); // return true 3685 jmpb(DONE); 3686 3687 bind(FALSE_LABEL); 3688 xorl(result, result); // return false 3689 3690 // That's it 3691 bind(DONE); 3692 if (UseAVX >= 2 && UseSSE >= 2) { 3693 // clean upper bits of YMM registers 3694 vpxor(vec1, vec1); 3695 vpxor(vec2, vec2); 3696 } 3697 } 3698 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings. 3699 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2, 3700 Register limit, Register result, Register chr, 3701 XMMRegister vec1, XMMRegister vec2, bool is_char, KRegister mask) { 3702 ShortBranchVerifier sbv(this); 3703 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE; 3704 3705 int length_offset = arrayOopDesc::length_offset_in_bytes(); 3706 int base_offset = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE); 3707 3708 if (is_array_equ) { 3709 // Check the input args 3710 cmpoop(ary1, ary2); 3711 jcc(Assembler::equal, TRUE_LABEL); 3712 3713 // Need additional checks for arrays_equals. 3714 testptr(ary1, ary1); 3715 jcc(Assembler::zero, FALSE_LABEL); 3716 testptr(ary2, ary2); 3717 jcc(Assembler::zero, FALSE_LABEL); 3718 3719 // Check the lengths 3720 movl(limit, Address(ary1, length_offset)); 3721 cmpl(limit, Address(ary2, length_offset)); 3722 jcc(Assembler::notEqual, FALSE_LABEL); 3723 } 3724 3725 // count == 0 3726 testl(limit, limit); 3727 jcc(Assembler::zero, TRUE_LABEL); 3728 3729 if (is_array_equ) { 3730 // Load array address 3731 lea(ary1, Address(ary1, base_offset)); 3732 lea(ary2, Address(ary2, base_offset)); 3733 } 3734 3735 if (is_array_equ && is_char) { 3736 // arrays_equals when used for char[]. 
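// Note: limit arrives as an element count and each char element is two bytes, so the
// shift below converts it to a byte count (for example, 5 chars -> 10 bytes).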
3737 shll(limit, 1); // byte count != 0 3738 } 3739 movl(result, limit); // copy 3740 3741 if (UseAVX >= 2) { 3742 // With AVX2, use 32-byte vector compare 3743 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 3744 3745 // Compare 32-byte vectors 3746 andl(result, 0x0000001f); // tail count (in bytes) 3747 andl(limit, 0xffffffe0); // vector count (in bytes) 3748 jcc(Assembler::zero, COMPARE_TAIL); 3749 3750 lea(ary1, Address(ary1, limit, Address::times_1)); 3751 lea(ary2, Address(ary2, limit, Address::times_1)); 3752 negptr(limit); 3753 3754 #ifdef _LP64 3755 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 3756 Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3; 3757 3758 cmpl(limit, -64); 3759 jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2); 3760 3761 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 3762 3763 evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit); 3764 evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit); 3765 kortestql(mask, mask); 3766 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 3767 addptr(limit, 64); // update since we already compared at this addr 3768 cmpl(limit, -64); 3769 jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3); 3770 3771 // At this point we may still need to compare -limit+result bytes. 3772 // We could execute the next two instruction and just continue via non-wide path: 3773 // cmpl(limit, 0); 3774 // jcc(Assembler::equal, COMPARE_TAIL); // true 3775 // But since we stopped at the points ary{1,2}+limit which are 3776 // not farther than 64 bytes from the ends of arrays ary{1,2}+result 3777 // (|limit| <= 32 and result < 32), 3778 // we may just compare the last 64 bytes. 
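// Worked example (illustrative): with 70 equal bytes, the AVX3 loop starts with
// limit == -64 and result == 6; it compares bytes [0, 64) and exits with limit == 0,
// and the 64-byte compare below then covers bytes [6, 70), whose first 58 bytes are
// already known to be equal and whose last 6 are the ones not yet checked.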
3779 // 3780 addptr(result, -64); // it is safe, bc we just came from this area 3781 evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit); 3782 evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit); 3783 kortestql(mask, mask); 3784 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 3785 3786 jmp(TRUE_LABEL); 3787 3788 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3789 3790 }//if (VM_Version::supports_avx512vlbw()) 3791 #endif //_LP64 3792 bind(COMPARE_WIDE_VECTORS); 3793 vmovdqu(vec1, Address(ary1, limit, Address::times_1)); 3794 vmovdqu(vec2, Address(ary2, limit, Address::times_1)); 3795 vpxor(vec1, vec2); 3796 3797 vptest(vec1, vec1); 3798 jcc(Assembler::notZero, FALSE_LABEL); 3799 addptr(limit, 32); 3800 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 3801 3802 testl(result, result); 3803 jcc(Assembler::zero, TRUE_LABEL); 3804 3805 vmovdqu(vec1, Address(ary1, result, Address::times_1, -32)); 3806 vmovdqu(vec2, Address(ary2, result, Address::times_1, -32)); 3807 vpxor(vec1, vec2); 3808 3809 vptest(vec1, vec1); 3810 jccb(Assembler::notZero, FALSE_LABEL); 3811 jmpb(TRUE_LABEL); 3812 3813 bind(COMPARE_TAIL); // limit is zero 3814 movl(limit, result); 3815 // Fallthru to tail compare 3816 } else if (UseSSE42Intrinsics) { 3817 // With SSE4.2, use double quad vector compare 3818 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 3819 3820 // Compare 16-byte vectors 3821 andl(result, 0x0000000f); // tail count (in bytes) 3822 andl(limit, 0xfffffff0); // vector count (in bytes) 3823 jcc(Assembler::zero, COMPARE_TAIL); 3824 3825 lea(ary1, Address(ary1, limit, Address::times_1)); 3826 lea(ary2, Address(ary2, limit, Address::times_1)); 3827 negptr(limit); 3828 3829 bind(COMPARE_WIDE_VECTORS); 3830 movdqu(vec1, Address(ary1, limit, Address::times_1)); 3831 movdqu(vec2, Address(ary2, limit, Address::times_1)); 3832 pxor(vec1, vec2); 3833 3834 ptest(vec1, vec1); 3835 jcc(Assembler::notZero, FALSE_LABEL); 3836 addptr(limit, 16); 3837 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 3838 3839 testl(result, result); 3840 jcc(Assembler::zero, TRUE_LABEL); 3841 3842 movdqu(vec1, Address(ary1, result, Address::times_1, -16)); 3843 movdqu(vec2, Address(ary2, result, Address::times_1, -16)); 3844 pxor(vec1, vec2); 3845 3846 ptest(vec1, vec1); 3847 jccb(Assembler::notZero, FALSE_LABEL); 3848 jmpb(TRUE_LABEL); 3849 3850 bind(COMPARE_TAIL); // limit is zero 3851 movl(limit, result); 3852 // Fallthru to tail compare 3853 } 3854 3855 // Compare 4-byte vectors 3856 andl(limit, 0xfffffffc); // vector count (in bytes) 3857 jccb(Assembler::zero, COMPARE_CHAR); 3858 3859 lea(ary1, Address(ary1, limit, Address::times_1)); 3860 lea(ary2, Address(ary2, limit, Address::times_1)); 3861 negptr(limit); 3862 3863 bind(COMPARE_VECTORS); 3864 movl(chr, Address(ary1, limit, Address::times_1)); 3865 cmpl(chr, Address(ary2, limit, Address::times_1)); 3866 jccb(Assembler::notEqual, FALSE_LABEL); 3867 addptr(limit, 4); 3868 jcc(Assembler::notZero, COMPARE_VECTORS); 3869 3870 // Compare trailing char (final 2 bytes), if any 3871 bind(COMPARE_CHAR); 3872 testl(result, 0x2); // tail char 3873 jccb(Assembler::zero, COMPARE_BYTE); 3874 load_unsigned_short(chr, Address(ary1, 0)); 3875 load_unsigned_short(limit, Address(ary2, 0)); 3876 cmpl(chr, limit); 3877 jccb(Assembler::notEqual, FALSE_LABEL); 3878 3879 if (is_array_equ && is_char) { 3880 bind(COMPARE_BYTE); 3881 } else { 3882 lea(ary1, Address(ary1, 2)); 3883 lea(ary2, Address(ary2, 2)); 3884 3885 bind(COMPARE_BYTE); 3886 testl(result, 0x1); 
// tail byte 3887 jccb(Assembler::zero, TRUE_LABEL); 3888 load_unsigned_byte(chr, Address(ary1, 0)); 3889 load_unsigned_byte(limit, Address(ary2, 0)); 3890 cmpl(chr, limit); 3891 jccb(Assembler::notEqual, FALSE_LABEL); 3892 } 3893 bind(TRUE_LABEL); 3894 movl(result, 1); // return true 3895 jmpb(DONE); 3896 3897 bind(FALSE_LABEL); 3898 xorl(result, result); // return false 3899 3900 // That's it 3901 bind(DONE); 3902 if (UseAVX >= 2) { 3903 // clean upper bits of YMM registers 3904 vpxor(vec1, vec1); 3905 vpxor(vec2, vec2); 3906 } 3907 } 3908 3909 #ifdef _LP64 3910 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp, 3911 Register tmp, KRegister ktmp, int masklen, int vec_enc) { 3912 assert(VM_Version::supports_avx512vlbw(), ""); 3913 vpxor(xtmp, xtmp, xtmp, vec_enc); 3914 vpsubb(xtmp, xtmp, mask, vec_enc); 3915 evpmovb2m(ktmp, xtmp, vec_enc); 3916 kmovql(tmp, ktmp); 3917 switch(opc) { 3918 case Op_VectorMaskTrueCount: 3919 popcntq(dst, tmp); 3920 break; 3921 case Op_VectorMaskLastTrue: 3922 mov64(dst, -1); 3923 bsrq(tmp, tmp); 3924 cmov(Assembler::notZero, dst, tmp); 3925 break; 3926 case Op_VectorMaskFirstTrue: 3927 mov64(dst, masklen); 3928 bsfq(tmp, tmp); 3929 cmov(Assembler::notZero, dst, tmp); 3930 break; 3931 default: assert(false, "Unhandled mask operation"); 3932 } 3933 } 3934 3935 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp, 3936 XMMRegister xtmp1, Register tmp, int masklen, int vec_enc) { 3937 assert(VM_Version::supports_avx(), ""); 3938 vpxor(xtmp, xtmp, xtmp, vec_enc); 3939 vpsubb(xtmp, xtmp, mask, vec_enc); 3940 vpmovmskb(tmp, xtmp, vec_enc); 3941 if (masklen < 64) { 3942 andq(tmp, (((jlong)1 << masklen) - 1)); 3943 } 3944 switch(opc) { 3945 case Op_VectorMaskTrueCount: 3946 popcntq(dst, tmp); 3947 break; 3948 case Op_VectorMaskLastTrue: 3949 mov64(dst, -1); 3950 bsrq(tmp, tmp); 3951 cmov(Assembler::notZero, dst, tmp); 3952 break; 3953 case Op_VectorMaskFirstTrue: 3954 mov64(dst, masklen); 3955 bsfq(tmp, tmp); 3956 cmov(Assembler::notZero, dst, tmp); 3957 break; 3958 default: assert(false, "Unhandled mask operation"); 3959 } 3960 } 3961 #endif 3962 3963 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1, 3964 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp, 3965 int vlen_enc) { 3966 assert(VM_Version::supports_avx512bw(), ""); 3967 // Byte shuffles are inlane operations and indices are determined using 3968 // lower 4 bit of each shuffle lane, thus all shuffle indices are 3969 // normalized to index range 0-15. This makes sure that all the multiples 3970 // of an index value are placed at same relative position in 128 bit 3971 // lane i.e. elements corresponding to shuffle indices 16, 32 and 64 3972 // will be 16th element in their respective 128 bit lanes. 3973 movl(rtmp, 16); 3974 evpbroadcastb(xtmp1, rtmp, vlen_enc); 3975 3976 // Compute a mask for shuffle vector by comparing indices with expression INDEX < 16, 3977 // Broadcast first 128 bit lane across entire vector, shuffle the vector lanes using 3978 // original shuffle indices and move the shuffled lanes corresponding to true 3979 // mask to destination vector. 
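// Concrete example: a shuffle index of 37 has lane-relative index 37 & 0xF == 5 and
// lies in the 32..47 range, so it is handled by the third pass below, which broadcasts
// src lane 2 and lets the masked byte shuffle pick byte 5 of it, i.e. src byte 37 overall.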
3980 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc); 3981 evshufi64x2(xtmp2, src, src, 0x0, vlen_enc); 3982 evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc); 3983 3984 // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32 3985 // and broadcasting second 128 bit lane. 3986 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc); 3987 vpsllq(xtmp2, xtmp1, 0x1, vlen_enc); 3988 evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc); 3989 evshufi64x2(xtmp3, src, src, 0x55, vlen_enc); 3990 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc); 3991 3992 // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48 3993 // and broadcasting third 128 bit lane. 3994 evpcmpb(ktmp, k0, shuffle, xtmp2, Assembler::nlt, true, vlen_enc); 3995 vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc); 3996 evpcmpb(ktmp, ktmp, shuffle, xtmp1, Assembler::lt, true, vlen_enc); 3997 evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc); 3998 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc); 3999 4000 // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64 4001 // and broadcasting fourth 128 bit lane. 4002 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc); 4003 vpsllq(xtmp2, xtmp2, 0x1, vlen_enc); 4004 evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc); 4005 evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc); 4006 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc); 4007 }
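// Note (summary sketch): assuming every shuffle index lies in 0..63, the net effect of
// rearrange_bytes is dst[i] = src[shuffle[i]] for each byte position i of the 512-bit
// vector; behavior for out-of-range indices is not described by this sketch.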