1 /* 2 * Copyright (c) 2020, 2021, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 22 * 23 */ 24 25 #include "precompiled.hpp" 26 #include "asm/assembler.hpp" 27 #include "asm/assembler.inline.hpp" 28 #include "oops/methodData.hpp" 29 #include "opto/c2_MacroAssembler.hpp" 30 #include "opto/intrinsicnode.hpp" 31 #include "opto/opcodes.hpp" 32 #include "opto/subnode.hpp" 33 #include "runtime/biasedLocking.hpp" 34 #include "runtime/objectMonitor.hpp" 35 #include "runtime/stubRoutines.hpp" 36 37 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) { 38 switch (vlen_in_bytes) { 39 case 4: // fall-through 40 case 8: // fall-through 41 case 16: return Assembler::AVX_128bit; 42 case 32: return Assembler::AVX_256bit; 43 case 64: return Assembler::AVX_512bit; 44 45 default: { 46 ShouldNotReachHere(); 47 return Assembler::AVX_NoVec; 48 } 49 } 50 } 51 52 void C2_MacroAssembler::setvectmask(Register dst, Register src, KRegister mask) { 53 guarantee(PostLoopMultiversioning, "must be"); 54 Assembler::movl(dst, 1); 55 Assembler::shlxl(dst, dst, src); 56 Assembler::decl(dst); 57 Assembler::kmovdl(mask, dst); 58 Assembler::movl(dst, src); 59 } 60 61 void C2_MacroAssembler::restorevectmask(KRegister mask) { 62 guarantee(PostLoopMultiversioning, "must be"); 63 Assembler::knotwl(mask, k0); 64 } 65 66 #if INCLUDE_RTM_OPT 67 68 // Update rtm_counters based on abort status 69 // input: abort_status 70 // rtm_counters (RTMLockingCounters*) 71 // flags are killed 72 void C2_MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) { 73 74 atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset())); 75 if (PrintPreciseRTMLockingStatistics) { 76 for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) { 77 Label check_abort; 78 testl(abort_status, (1<<i)); 79 jccb(Assembler::equal, check_abort); 80 atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx)))); 81 bind(check_abort); 82 } 83 } 84 } 85 86 // Branch if (random & (count-1) != 0), count is 2^n 87 // tmp, scr and flags are killed 88 void C2_MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) { 89 assert(tmp == rax, ""); 90 assert(scr == rdx, ""); 91 rdtsc(); // modifies EDX:EAX 92 andptr(tmp, count-1); 93 jccb(Assembler::notZero, brLabel); 94 } 95 96 // Perform abort ratio calculation, set no_rtm bit if high ratio 97 // input: rtm_counters_Reg (RTMLockingCounters* address) 98 // tmpReg, 
rtm_counters_Reg and flags are killed 99 void C2_MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg, 100 Register rtm_counters_Reg, 101 RTMLockingCounters* rtm_counters, 102 Metadata* method_data) { 103 Label L_done, L_check_always_rtm1, L_check_always_rtm2; 104 105 if (RTMLockingCalculationDelay > 0) { 106 // Delay calculation 107 movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()), tmpReg); 108 testptr(tmpReg, tmpReg); 109 jccb(Assembler::equal, L_done); 110 } 111 // Abort ratio calculation only if abort_count > RTMAbortThreshold 112 // Aborted transactions = abort_count * 100 113 // All transactions = total_count * RTMTotalCountIncrRate 114 // Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio) 115 116 movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset())); 117 cmpptr(tmpReg, RTMAbortThreshold); 118 jccb(Assembler::below, L_check_always_rtm2); 119 imulptr(tmpReg, tmpReg, 100); 120 121 Register scrReg = rtm_counters_Reg; 122 movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset())); 123 imulptr(scrReg, scrReg, RTMTotalCountIncrRate); 124 imulptr(scrReg, scrReg, RTMAbortRatio); 125 cmpptr(tmpReg, scrReg); 126 jccb(Assembler::below, L_check_always_rtm1); 127 if (method_data != NULL) { 128 // set rtm_state to "no rtm" in MDO 129 mov_metadata(tmpReg, method_data); 130 lock(); 131 orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), NoRTM); 132 } 133 jmpb(L_done); 134 bind(L_check_always_rtm1); 135 // Reload RTMLockingCounters* address 136 lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters)); 137 bind(L_check_always_rtm2); 138 movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset())); 139 cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate); 140 jccb(Assembler::below, L_done); 141 if (method_data != NULL) { 142 // set rtm_state to "always rtm" in MDO 143 mov_metadata(tmpReg, method_data); 144 lock(); 145 orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), UseRTM); 146 } 147 bind(L_done); 148 } 149 150 // Update counters and perform abort ratio calculation 151 // input: abort_status_Reg 152 // rtm_counters_Reg, flags are killed 153 void C2_MacroAssembler::rtm_profiling(Register abort_status_Reg, 154 Register rtm_counters_Reg, 155 RTMLockingCounters* rtm_counters, 156 Metadata* method_data, 157 bool profile_rtm) { 158 159 assert(rtm_counters != NULL, "should not be NULL when profiling RTM"); 160 // update rtm counters based on rax value at abort 161 // reads abort_status_Reg, updates flags 162 lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters)); 163 rtm_counters_update(abort_status_Reg, rtm_counters_Reg); 164 if (profile_rtm) { 165 // Save abort status because abort_status_Reg is used by following code. 
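  // Illustrative arithmetic for the ratio test performed by rtm_abort_ratio_calculation()
  // below (hypothetical counter values, assuming RTMTotalCountIncrRate == 64 and
  // RTMAbortRatio == 50):
  //   Aborted transactions = abort_count * 100                  = 1000 * 100 = 100000
  //   All transactions     = total_count * RTMTotalCountIncrRate = 8000 * 64 = 512000
  //   100000 >= 512000 * 50 is false, so the no_rtm bit would be left clear in this case.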
166 if (RTMRetryCount > 0) { 167 push(abort_status_Reg); 168 } 169 assert(rtm_counters != NULL, "should not be NULL when profiling RTM"); 170 rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data); 171 // restore abort status 172 if (RTMRetryCount > 0) { 173 pop(abort_status_Reg); 174 } 175 } 176 } 177 178 // Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4) 179 // inputs: retry_count_Reg 180 // : abort_status_Reg 181 // output: retry_count_Reg decremented by 1 182 // flags are killed 183 void C2_MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) { 184 Label doneRetry; 185 assert(abort_status_Reg == rax, ""); 186 // The abort reason bits are in eax (see all states in rtmLocking.hpp) 187 // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4) 188 // if reason is in 0x6 and retry count != 0 then retry 189 andptr(abort_status_Reg, 0x6); 190 jccb(Assembler::zero, doneRetry); 191 testl(retry_count_Reg, retry_count_Reg); 192 jccb(Assembler::zero, doneRetry); 193 pause(); 194 decrementl(retry_count_Reg); 195 jmp(retryLabel); 196 bind(doneRetry); 197 } 198 199 // Spin and retry if lock is busy, 200 // inputs: box_Reg (monitor address) 201 // : retry_count_Reg 202 // output: retry_count_Reg decremented by 1 203 // : clear z flag if retry count exceeded 204 // tmp_Reg, scr_Reg, flags are killed 205 void C2_MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg, 206 Register tmp_Reg, Register scr_Reg, Label& retryLabel) { 207 Label SpinLoop, SpinExit, doneRetry; 208 int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner); 209 210 testl(retry_count_Reg, retry_count_Reg); 211 jccb(Assembler::zero, doneRetry); 212 decrementl(retry_count_Reg); 213 movptr(scr_Reg, RTMSpinLoopCount); 214 215 bind(SpinLoop); 216 pause(); 217 decrementl(scr_Reg); 218 jccb(Assembler::lessEqual, SpinExit); 219 movptr(tmp_Reg, Address(box_Reg, owner_offset)); 220 testptr(tmp_Reg, tmp_Reg); 221 jccb(Assembler::notZero, SpinLoop); 222 223 bind(SpinExit); 224 jmp(retryLabel); 225 bind(doneRetry); 226 incrementl(retry_count_Reg); // clear z flag 227 } 228 229 // Use RTM for normal stack locks 230 // Input: objReg (object to lock) 231 void C2_MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg, 232 Register retry_on_abort_count_Reg, 233 RTMLockingCounters* stack_rtm_counters, 234 Metadata* method_data, bool profile_rtm, 235 Label& DONE_LABEL, Label& IsInflated) { 236 assert(UseRTMForStackLocks, "why call this otherwise?"); 237 assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking"); 238 assert(tmpReg == rax, ""); 239 assert(scrReg == rdx, ""); 240 Label L_rtm_retry, L_decrement_retry, L_on_abort; 241 242 if (RTMRetryCount > 0) { 243 movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort 244 bind(L_rtm_retry); 245 } 246 movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); 247 testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral|biased 248 jcc(Assembler::notZero, IsInflated); 249 250 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 251 Label L_noincrement; 252 if (RTMTotalCountIncrRate > 1) { 253 // tmpReg, scrReg and flags are killed 254 branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement); 255 } 256 assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM"); 257 
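  // Note on the sampled counting above: when RTMTotalCountIncrRate > 1,
  // branch_on_random_using_rdtsc() skips the increment below unless the low bits of the
  // TSC happen to be zero, i.e. for roughly 1 in RTMTotalCountIncrRate executions.
  // total_count is therefore a sampled count, which is why rtm_abort_ratio_calculation()
  // scales it back up by RTMTotalCountIncrRate.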
atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg); 258 bind(L_noincrement); 259 } 260 xbegin(L_on_abort); 261 movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword 262 andptr(tmpReg, markWord::biased_lock_mask_in_place); // look at 3 lock bits 263 cmpptr(tmpReg, markWord::unlocked_value); // bits = 001 unlocked 264 jcc(Assembler::equal, DONE_LABEL); // all done if unlocked 265 266 Register abort_status_Reg = tmpReg; // status of abort is stored in RAX 267 if (UseRTMXendForLockBusy) { 268 xend(); 269 movptr(abort_status_Reg, 0x2); // Set the abort status to 2 (so we can retry) 270 jmp(L_decrement_retry); 271 } 272 else { 273 xabort(0); 274 } 275 bind(L_on_abort); 276 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 277 rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm); 278 } 279 bind(L_decrement_retry); 280 if (RTMRetryCount > 0) { 281 // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4) 282 rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry); 283 } 284 } 285 286 // Use RTM for inflating locks 287 // inputs: objReg (object to lock) 288 // boxReg (on-stack box address (displaced header location) - KILLED) 289 // tmpReg (ObjectMonitor address + markWord::monitor_value) 290 void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg, 291 Register scrReg, Register retry_on_busy_count_Reg, 292 Register retry_on_abort_count_Reg, 293 RTMLockingCounters* rtm_counters, 294 Metadata* method_data, bool profile_rtm, 295 Label& DONE_LABEL) { 296 assert(UseRTMLocking, "why call this otherwise?"); 297 assert(tmpReg == rax, ""); 298 assert(scrReg == rdx, ""); 299 Label L_rtm_retry, L_decrement_retry, L_on_abort; 300 int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner); 301 302 // Without cast to int32_t this style of movptr will destroy r10 which is typically obj. 
303 movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value())); 304 movptr(boxReg, tmpReg); // Save ObjectMonitor address 305 306 if (RTMRetryCount > 0) { 307 movl(retry_on_busy_count_Reg, RTMRetryCount); // Retry on lock busy 308 movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort 309 bind(L_rtm_retry); 310 } 311 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 312 Label L_noincrement; 313 if (RTMTotalCountIncrRate > 1) { 314 // tmpReg, scrReg and flags are killed 315 branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement); 316 } 317 assert(rtm_counters != NULL, "should not be NULL when profiling RTM"); 318 atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg); 319 bind(L_noincrement); 320 } 321 xbegin(L_on_abort); 322 movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); 323 movptr(tmpReg, Address(tmpReg, owner_offset)); 324 testptr(tmpReg, tmpReg); 325 jcc(Assembler::zero, DONE_LABEL); 326 if (UseRTMXendForLockBusy) { 327 xend(); 328 jmp(L_decrement_retry); 329 } 330 else { 331 xabort(0); 332 } 333 bind(L_on_abort); 334 Register abort_status_Reg = tmpReg; // status of abort is stored in RAX 335 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 336 rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm); 337 } 338 if (RTMRetryCount > 0) { 339 // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4) 340 rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry); 341 } 342 343 movptr(tmpReg, Address(boxReg, owner_offset)) ; 344 testptr(tmpReg, tmpReg) ; 345 jccb(Assembler::notZero, L_decrement_retry) ; 346 347 // Appears unlocked - try to swing _owner from null to non-null. 348 // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand. 349 #ifdef _LP64 350 Register threadReg = r15_thread; 351 #else 352 get_thread(scrReg); 353 Register threadReg = scrReg; 354 #endif 355 lock(); 356 cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg 357 358 if (RTMRetryCount > 0) { 359 // success done else retry 360 jccb(Assembler::equal, DONE_LABEL) ; 361 bind(L_decrement_retry); 362 // Spin and retry if lock is busy. 363 rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry); 364 } 365 else { 366 bind(L_decrement_retry); 367 } 368 } 369 370 #endif // INCLUDE_RTM_OPT 371 372 // fast_lock and fast_unlock used by C2 373 374 // Because the transitions from emitted code to the runtime 375 // monitorenter/exit helper stubs are so slow it's critical that 376 // we inline both the stack-locking fast path and the inflated fast path. 377 // 378 // See also: cmpFastLock and cmpFastUnlock. 379 // 380 // What follows is a specialized inline transliteration of the code 381 // in enter() and exit(). If we're concerned about I$ bloat another 382 // option would be to emit TrySlowEnter and TrySlowExit methods 383 // at startup-time. These methods would accept arguments as 384 // (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure 385 // indications in the icc.ZFlag. fast_lock and fast_unlock would simply 386 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit. 387 // In practice, however, the # of lock sites is bounded and is usually small. 
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
//
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods. That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
// to park() or unpark() threads. We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) explicit barriers or fence operations.
//
// TODO:
//
// * Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
//   This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
//   Given TLAB allocation, Self is usually manifested in a register, so passing it into
//   the lock operators would typically be faster than reifying Self.
//
// * Ideally I'd define the primitives as:
//     fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
//     fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
//   Unfortunately ADLC bugs prevent us from expressing the ideal form.
//   Instead, we're stuck with the rather awkward and brittle register assignments below.
//   Furthermore the register assignments are overconstrained, possibly resulting in
//   sub-optimal code near the synchronization site.
//
// * Eliminate the sp-proximity tests and just use "== Self" tests instead.
//   Alternately, use a better sp-proximity test.
//
// * Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
//   Either one is sufficient to uniquely identify a thread.
//   TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
//
// * Intrinsify notify() and notifyAll() for the common cases where the
//   object is locked by the calling thread but the waitlist is empty.
//   Avoid the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
//
// * Use jccb and jmpb instead of jcc and jmp to improve code density.
//   But beware of excessive branch density on AMD Opterons.
//
// * Both fast_lock and fast_unlock set the ICC.ZF to indicate success
//   or failure of the fast path. If the fast path fails then we pass
//   control to the slow path, typically in C. In fast_lock and
//   fast_unlock we often branch to DONE_LABEL, just to find that C2
//   will emit a conditional branch immediately after the node.
//   So we have branches to branches and lots of ICC.ZF games.
//   Instead, it might be better to have C2 pass a "FailureLabel"
//   into fast_lock and fast_unlock. In the case of success, control
//   will drop through the node. ICC.ZF is undefined at exit.
// In the case of failure, the node will branch directly to the
// FailureLabel.


// obj: object to lock
// box: on-stack box address (displaced header location) - KILLED
// rax: tmp -- KILLED
// scr: tmp -- KILLED
void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
                                  Register scrReg, Register cx1Reg, Register cx2Reg,
                                  BiasedLockingCounters* counters,
                                  RTMLockingCounters* rtm_counters,
                                  RTMLockingCounters* stack_rtm_counters,
                                  Metadata* method_data,
                                  bool use_rtm, bool profile_rtm) {
  // Ensure the register assignments are disjoint
  assert(tmpReg == rax, "");

  if (use_rtm) {
    assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
  } else {
    assert(cx2Reg == noreg, "");
    assert_different_registers(objReg, boxReg, tmpReg, scrReg);
  }

  if (counters != NULL) {
    atomic_incl(ExternalAddress((address)counters->total_entry_count_addr()), scrReg);
  }

  // Possible cases that we'll encounter in fast_lock
  // ------------------------------------------------
  // * Inflated
  //    -- unlocked
  //    -- Locked
  //       = by self
  //       = by other
  // * biased
  //    -- by Self
  //    -- by other
  // * neutral
  // * stack-locked
  //    -- by self
  //       = sp-proximity test hits
  //       = sp-proximity test generates false-negative
  //    -- by other
  //

  Label IsInflated, DONE_LABEL;

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(tmpReg, objReg, cx1Reg);
    movl(tmpReg, Address(tmpReg, Klass::access_flags_offset()));
    testl(tmpReg, JVM_ACC_IS_VALUE_BASED_CLASS);
    jcc(Assembler::notZero, DONE_LABEL);
  }

  // it's stack-locked, biased or neutral
  // TODO: optimize away redundant LDs of obj->mark and improve the markword triage
  // order to reduce the number of conditional branches in the most common cases.
  // Beware -- there's a subtle invariant that the fetch of the markword
  // at [FETCH], below, will never observe a biased encoding (*101b).
  // If this invariant is not held we risk exclusion (safety) failure.
  if (UseBiasedLocking && !UseOptoBiasInlining) {
    biased_locking_enter(boxReg, objReg, tmpReg, scrReg, cx1Reg, false, DONE_LABEL, NULL, counters);
  }

#if INCLUDE_RTM_OPT
  if (UseRTMForStackLocks && use_rtm) {
    rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
                      stack_rtm_counters, method_data, profile_rtm,
                      DONE_LABEL, IsInflated);
  }
#endif // INCLUDE_RTM_OPT

  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));   // [FETCH]
  testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral|biased
  jccb(Assembler::notZero, IsInflated);

  // Attempt stack-locking ...
  orptr (tmpReg, markWord::unlocked_value);
  movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
  lock();
  cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Updates tmpReg
  if (counters != NULL) {
    cond_inc32(Assembler::equal,
               ExternalAddress((address)counters->fast_path_entry_count_addr()));
  }
  jcc(Assembler::equal, DONE_LABEL);           // Success

  // Recursive locking.
  // The object is stack-locked: markword contains stack pointer to BasicLock.
  // Locked by current thread if difference with current SP is less than one page.
  subptr(tmpReg, rsp);
  // Next instruction sets ZFlag == 1 (Success) if the difference is less than one page.
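  // For example, with a (hypothetical) 4K page the 64-bit mask is
  // 7 - 4096 == 0xFFFFFFFFFFFFF007: it keeps the low three bits and every bit at or
  // above the page size, so the AND leaves zero (ZF == 1) exactly when the difference
  // is a small, 8-byte-aligned, non-negative offset within one page of rsp.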
  andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) );
  movptr(Address(boxReg, 0), tmpReg);
  if (counters != NULL) {
    cond_inc32(Assembler::equal,
               ExternalAddress((address)counters->fast_path_entry_count_addr()));
  }
  jmp(DONE_LABEL);

  bind(IsInflated);
  // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value

#if INCLUDE_RTM_OPT
  // Use the same RTM locking code in 32- and 64-bit VM.
  if (use_rtm) {
    rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
                         rtm_counters, method_data, profile_rtm, DONE_LABEL);
  } else {
#endif // INCLUDE_RTM_OPT

#ifndef _LP64
  // The object is inflated.

  // boxReg refers to the on-stack BasicLock in the current frame.
  // We'd like to write:
  //   set box->_displaced_header = markWord::unused_mark(). Any non-0 value suffices.
  // This is convenient but results in a ST-before-CAS penalty. The following CAS suffers
  // additional latency as we have another ST in the store buffer that must drain.

  // avoid ST-before-CAS
  // register juggle because we need tmpReg for cmpxchgptr below
  movptr(scrReg, boxReg);
  movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]

  // Optimistic form: consider XORL tmpReg,tmpReg
  movptr(tmpReg, NULL_WORD);

  // Appears unlocked - try to swing _owner from null to non-null.
  // Ideally, I'd manifest "Self" with get_thread and then attempt
  // to CAS the register containing Self into m->Owner.
  // But we don't have enough registers, so instead we can either try to CAS
  // rsp or the address of the box (in scr) into &m->owner. If the CAS succeeds
  // we later store "Self" into m->Owner. Transiently storing a stack address
  // (rsp or the address of the box) into m->owner is harmless.
  // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
  lock();
  cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  movptr(Address(scrReg, 0), 3);            // box->_displaced_header = 3
  // If we weren't able to swing _owner from NULL to the BasicLock
  // then take the slow path.
  jccb  (Assembler::notZero, DONE_LABEL);
  // update _owner from BasicLock to thread
  get_thread (scrReg);                      // beware: clobbers ICCs
  movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
  xorptr(boxReg, boxReg);                   // set icc.ZFlag = 1 to indicate success

  // If the CAS fails we can either retry or pass control to the slow path.
  // We use the latter tactic.
  // Pass the CAS result in the icc.ZFlag into DONE_LABEL
  // If the CAS was successful ...
  //   Self has acquired the lock
  //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
  // Intentional fall-through into DONE_LABEL ...
#else // _LP64
  // It's inflated and we use scrReg for ObjectMonitor* in this section.
  movq(scrReg, tmpReg);
  xorq(tmpReg, tmpReg);
  lock();
  cmpxchgptr(r15_thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // Unconditionally set box->_displaced_header = markWord::unused_mark().
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
  // Propagate ICC.ZF from CAS above into DONE_LABEL.
  jcc(Assembler::equal, DONE_LABEL);           // CAS above succeeded; propagate ZF = 1 (success)

  cmpptr(r15_thread, rax);                     // Check if we are already the owner (recursive lock)
  jcc(Assembler::notEqual, DONE_LABEL);        // If not recursive, ZF = 0 at this point (fail)
  incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
#endif // _LP64
#if INCLUDE_RTM_OPT
  } // use_rtm()
#endif
  // DONE_LABEL is a hot target - we'd really like to place it at the
  // start of a cache line by padding with NOPs.
  // See the AMD and Intel software optimization manuals for the
  // most efficient "long" NOP encodings.
  // Unfortunately none of our alignment mechanisms suffice.
  bind(DONE_LABEL);

  // At DONE_LABEL the icc ZFlag is set as follows ...
  // fast_unlock uses the same protocol.
  // ZFlag == 1 -> Success
  // ZFlag == 0 -> Failure - force control through the slow path
}

// obj: object to unlock
// box: box address (displaced header location), killed. Must be EAX.
// tmp: killed, cannot be obj nor box.
//
// Some commentary on balanced locking:
//
// fast_lock and fast_unlock are emitted only for provably balanced lock sites.
// Methods that don't have provably balanced locking are forced to run in the
// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
// The interpreter provides two properties:
// I1: At return-time the interpreter automatically and quietly unlocks any
//     objects acquired by the current activation (frame). Recall that the
//     interpreter maintains an on-stack list of locks currently held by
//     a frame.
// I2: If a method attempts to unlock an object that is not held by the
//     frame, the interpreter throws IMSX.
//
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
// B() doesn't have provably balanced locking so it runs in the interpreter.
// Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O
// is still locked by A().
//
// The only other source of unbalanced locking would be JNI. The "Java Native Interface:
// Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
// should not be unlocked by "normal" java-level locking and vice-versa. The specification
// doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably, given that the spec legislates the JNI case as undefined, our implementation
// could reasonably *avoid* checking the owner in fast_unlock().
// In the interest of performance we elide the m->Owner==Self check in unlock.
// A perfectly viable alternative is to elide the owner check except when
// Xcheck:jni is enabled.

void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
  assert(boxReg == rax, "");
  assert_different_registers(objReg, boxReg, tmpReg);

  Label DONE_LABEL, Stacked, CheckSucc;

  // Critically, the biased locking test must have precedence over
  // and appear before the (box->dhw == 0) recursive stack-lock test.
669 if (UseBiasedLocking && !UseOptoBiasInlining) { 670 biased_locking_exit(objReg, tmpReg, DONE_LABEL); 671 } 672 673 #if INCLUDE_RTM_OPT 674 if (UseRTMForStackLocks && use_rtm) { 675 assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking"); 676 Label L_regular_unlock; 677 movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword 678 andptr(tmpReg, markWord::biased_lock_mask_in_place); // look at 3 lock bits 679 cmpptr(tmpReg, markWord::unlocked_value); // bits = 001 unlocked 680 jccb(Assembler::notEqual, L_regular_unlock); // if !HLE RegularLock 681 xend(); // otherwise end... 682 jmp(DONE_LABEL); // ... and we're done 683 bind(L_regular_unlock); 684 } 685 #endif 686 687 cmpptr(Address(boxReg, 0), (int32_t)NULL_WORD); // Examine the displaced header 688 jcc (Assembler::zero, DONE_LABEL); // 0 indicates recursive stack-lock 689 movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Examine the object's markword 690 testptr(tmpReg, markWord::monitor_value); // Inflated? 691 jccb (Assembler::zero, Stacked); 692 693 // It's inflated. 694 #if INCLUDE_RTM_OPT 695 if (use_rtm) { 696 Label L_regular_inflated_unlock; 697 int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner); 698 movptr(boxReg, Address(tmpReg, owner_offset)); 699 testptr(boxReg, boxReg); 700 jccb(Assembler::notZero, L_regular_inflated_unlock); 701 xend(); 702 jmpb(DONE_LABEL); 703 bind(L_regular_inflated_unlock); 704 } 705 #endif 706 707 // Despite our balanced locking property we still check that m->_owner == Self 708 // as java routines or native JNI code called by this thread might 709 // have released the lock. 710 // Refer to the comments in synchronizer.cpp for how we might encode extra 711 // state in _succ so we can avoid fetching EntryList|cxq. 712 // 713 // If there's no contention try a 1-0 exit. That is, exit without 714 // a costly MEMBAR or CAS. See synchronizer.cpp for details on how 715 // we detect and recover from the race that the 1-0 exit admits. 716 // 717 // Conceptually fast_unlock() must execute a STST|LDST "release" barrier 718 // before it STs null into _owner, releasing the lock. Updates 719 // to data protected by the critical section must be visible before 720 // we drop the lock (and thus before any other thread could acquire 721 // the lock and observe the fields protected by the lock). 722 // IA32's memory-model is SPO, so STs are ordered with respect to 723 // each other and there's no need for an explicit barrier (fence). 724 // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html. 725 #ifndef _LP64 726 get_thread (boxReg); 727 728 // Note that we could employ various encoding schemes to reduce 729 // the number of loads below (currently 4) to just 2 or 3. 730 // Refer to the comments in synchronizer.cpp. 731 // In practice the chain of fetches doesn't seem to impact performance, however. 732 xorptr(boxReg, boxReg); 733 orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions))); 734 jccb (Assembler::notZero, DONE_LABEL); 735 movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList))); 736 orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq))); 737 jccb (Assembler::notZero, CheckSucc); 738 movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD); 739 jmpb (DONE_LABEL); 740 741 bind (Stacked); 742 // It's not inflated and it's not recursively stack-locked and it's not biased. 743 // It must be stack-locked. 744 // Try to reset the header to displaced header. 
  // The "box" value on the stack is stable, so we can reload
  // and be assured we observe the same value as above.
  movptr(tmpReg, Address(boxReg, 0));
  lock();
  cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
  // Intentional fall-thru into DONE_LABEL

  // DONE_LABEL is a hot target - we'd really like to place it at the
  // start of a cache line by padding with NOPs.
  // See the AMD and Intel software optimization manuals for the
  // most efficient "long" NOP encodings.
  // Unfortunately none of our alignment mechanisms suffice.
  bind (CheckSucc);
#else // _LP64
  // It's inflated
  Label LNotRecursive, LSuccess, LGoSlowPath;

  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
  jccb(Assembler::equal, LNotRecursive);

  // Recursive inflated unlock
  decq(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jmpb(LSuccess);

  bind(LNotRecursive);
  movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
  jccb  (Assembler::notZero, CheckSucc);
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
  jmpb  (DONE_LABEL);

  // Try to avoid passing control into the slow_path ...
  bind  (CheckSucc);

  // The following optional optimization can be elided if necessary.
  // Effectively: if (succ == null) goto slow path
  // The code reduces the window for a race, however,
  // and thus benefits performance.
  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
  jccb  (Assembler::zero, LGoSlowPath);

  xorptr(boxReg, boxReg);
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);

  // Memory barrier/fence
  // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
  // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
  // This is faster on Nehalem and AMD Shanghai/Barcelona.
  // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
  // We might also restructure (ST Owner=0;barrier;LD _Succ) to
  // (mov box,0; xchgq box, &m->Owner; LD _succ) .
  lock(); addl(Address(rsp, 0), 0);

  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
  jccb  (Assembler::notZero, LSuccess);

  // Rare inopportune interleaving - race.
  // The successor vanished in the small window above.
  // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
  // We need to ensure progress and succession.
  // Try to reacquire the lock.
  // If that fails then the new owner is responsible for succession and this
  // thread needs to take no further action and can exit via the fast path (success).
  // If the re-acquire succeeds then pass control into the slow path.
  // As implemented, this latter mode is horrible because we generated more
  // coherence traffic on the lock *and* artificially extended the critical section
  // length by virtue of passing control into the slow path.
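  // In outline, the re-acquire attempt below behaves like the following
  // illustrative pseudo-code (not additional emitted code):
  //   if (CAS(&m->_owner, NULL, Self) != NULL) goto LSuccess;   // someone else took the lock;
  //                                                             // they handle succession
  //   else goto LGoSlowPath;                                    // we re-acquired it; let the
  //                                                             // slow path pick a successor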
814 815 // box is really RAX -- the following CMPXCHG depends on that binding 816 // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R) 817 lock(); 818 cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner))); 819 // There's no successor so we tried to regrab the lock. 820 // If that didn't work, then another thread grabbed the 821 // lock so we're done (and exit was a success). 822 jccb (Assembler::notEqual, LSuccess); 823 // Intentional fall-through into slow path 824 825 bind (LGoSlowPath); 826 orl (boxReg, 1); // set ICC.ZF=0 to indicate failure 827 jmpb (DONE_LABEL); 828 829 bind (LSuccess); 830 testl (boxReg, 0); // set ICC.ZF=1 to indicate success 831 jmpb (DONE_LABEL); 832 833 bind (Stacked); 834 movptr(tmpReg, Address (boxReg, 0)); // re-fetch 835 lock(); 836 cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box 837 838 #endif 839 bind(DONE_LABEL); 840 } 841 842 //------------------------------------------------------------------------------------------- 843 // Generic instructions support for use in .ad files C2 code generation 844 845 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, Register scr) { 846 if (dst != src) { 847 movdqu(dst, src); 848 } 849 if (opcode == Op_AbsVD) { 850 andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), scr); 851 } else { 852 assert((opcode == Op_NegVD),"opcode should be Op_NegD"); 853 xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), scr); 854 } 855 } 856 857 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) { 858 if (opcode == Op_AbsVD) { 859 vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, scr); 860 } else { 861 assert((opcode == Op_NegVD),"opcode should be Op_NegD"); 862 vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, scr); 863 } 864 } 865 866 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, Register scr) { 867 if (dst != src) { 868 movdqu(dst, src); 869 } 870 if (opcode == Op_AbsVF) { 871 andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), scr); 872 } else { 873 assert((opcode == Op_NegVF),"opcode should be Op_NegF"); 874 xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), scr); 875 } 876 } 877 878 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) { 879 if (opcode == Op_AbsVF) { 880 vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, scr); 881 } else { 882 assert((opcode == Op_NegVF),"opcode should be Op_NegF"); 883 vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, scr); 884 } 885 } 886 887 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) { 888 assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity"); 889 assert(tmp == xnoreg || elem_bt == T_LONG, "unused"); 890 891 if (opcode == Op_MinV) { 892 if (elem_bt == T_BYTE) { 893 pminsb(dst, src); 894 } else if (elem_bt == T_SHORT) { 895 pminsw(dst, src); 896 } else if (elem_bt == T_INT) { 897 pminsd(dst, src); 898 } else { 899 assert(elem_bt == T_LONG, "required"); 900 assert(tmp == xmm0, "required"); 901 assert_different_registers(dst, src, tmp); 902 movdqu(xmm0, dst); 903 pcmpgtq(xmm0, src); 904 blendvpd(dst, src); // xmm0 as mask 905 } 906 } else { 
// opcode == Op_MaxV 907 if (elem_bt == T_BYTE) { 908 pmaxsb(dst, src); 909 } else if (elem_bt == T_SHORT) { 910 pmaxsw(dst, src); 911 } else if (elem_bt == T_INT) { 912 pmaxsd(dst, src); 913 } else { 914 assert(elem_bt == T_LONG, "required"); 915 assert(tmp == xmm0, "required"); 916 assert_different_registers(dst, src, tmp); 917 movdqu(xmm0, src); 918 pcmpgtq(xmm0, dst); 919 blendvpd(dst, src); // xmm0 as mask 920 } 921 } 922 } 923 924 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt, 925 XMMRegister dst, XMMRegister src1, XMMRegister src2, 926 int vlen_enc) { 927 assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity"); 928 929 if (opcode == Op_MinV) { 930 if (elem_bt == T_BYTE) { 931 vpminsb(dst, src1, src2, vlen_enc); 932 } else if (elem_bt == T_SHORT) { 933 vpminsw(dst, src1, src2, vlen_enc); 934 } else if (elem_bt == T_INT) { 935 vpminsd(dst, src1, src2, vlen_enc); 936 } else { 937 assert(elem_bt == T_LONG, "required"); 938 if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) { 939 vpminsq(dst, src1, src2, vlen_enc); 940 } else { 941 assert_different_registers(dst, src1, src2); 942 vpcmpgtq(dst, src1, src2, vlen_enc); 943 vblendvpd(dst, src1, src2, dst, vlen_enc); 944 } 945 } 946 } else { // opcode == Op_MaxV 947 if (elem_bt == T_BYTE) { 948 vpmaxsb(dst, src1, src2, vlen_enc); 949 } else if (elem_bt == T_SHORT) { 950 vpmaxsw(dst, src1, src2, vlen_enc); 951 } else if (elem_bt == T_INT) { 952 vpmaxsd(dst, src1, src2, vlen_enc); 953 } else { 954 assert(elem_bt == T_LONG, "required"); 955 if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) { 956 vpmaxsq(dst, src1, src2, vlen_enc); 957 } else { 958 assert_different_registers(dst, src1, src2); 959 vpcmpgtq(dst, src1, src2, vlen_enc); 960 vblendvpd(dst, src2, src1, dst, vlen_enc); 961 } 962 } 963 } 964 } 965 966 // Float/Double min max 967 968 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt, 969 XMMRegister dst, XMMRegister a, XMMRegister b, 970 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 971 int vlen_enc) { 972 assert(UseAVX > 0, "required"); 973 assert(opcode == Op_MinV || opcode == Op_MinReductionV || 974 opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity"); 975 assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity"); 976 assert_different_registers(a, b, tmp, atmp, btmp); 977 978 bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV); 979 bool is_double_word = is_double_word_type(elem_bt); 980 981 if (!is_double_word && is_min) { 982 vblendvps(atmp, a, b, a, vlen_enc); 983 vblendvps(btmp, b, a, a, vlen_enc); 984 vminps(tmp, atmp, btmp, vlen_enc); 985 vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 986 vblendvps(dst, tmp, atmp, btmp, vlen_enc); 987 } else if (!is_double_word && !is_min) { 988 vblendvps(btmp, b, a, b, vlen_enc); 989 vblendvps(atmp, a, b, b, vlen_enc); 990 vmaxps(tmp, atmp, btmp, vlen_enc); 991 vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 992 vblendvps(dst, tmp, atmp, btmp, vlen_enc); 993 } else if (is_double_word && is_min) { 994 vblendvpd(atmp, a, b, a, vlen_enc); 995 vblendvpd(btmp, b, a, a, vlen_enc); 996 vminpd(tmp, atmp, btmp, vlen_enc); 997 vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 998 vblendvpd(dst, tmp, atmp, btmp, vlen_enc); 999 } else { 1000 assert(is_double_word && !is_min, "sanity"); 1001 vblendvpd(btmp, b, a, b, vlen_enc); 1002 vblendvpd(atmp, a, b, b, vlen_enc); 1003 vmaxpd(tmp, atmp, btmp, vlen_enc); 1004 vcmppd(btmp, atmp, atmp, 
Assembler::UNORD_Q, vlen_enc); 1005 vblendvpd(dst, tmp, atmp, btmp, vlen_enc); 1006 } 1007 } 1008 1009 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt, 1010 XMMRegister dst, XMMRegister a, XMMRegister b, 1011 KRegister ktmp, XMMRegister atmp, XMMRegister btmp, 1012 int vlen_enc) { 1013 assert(UseAVX > 2, "required"); 1014 assert(opcode == Op_MinV || opcode == Op_MinReductionV || 1015 opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity"); 1016 assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity"); 1017 assert_different_registers(dst, a, b, atmp, btmp); 1018 1019 bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV); 1020 bool is_double_word = is_double_word_type(elem_bt); 1021 bool merge = true; 1022 1023 if (!is_double_word && is_min) { 1024 evpmovd2m(ktmp, a, vlen_enc); 1025 evblendmps(atmp, ktmp, a, b, merge, vlen_enc); 1026 evblendmps(btmp, ktmp, b, a, merge, vlen_enc); 1027 vminps(dst, atmp, btmp, vlen_enc); 1028 evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1029 evmovdqul(dst, ktmp, atmp, merge, vlen_enc); 1030 } else if (!is_double_word && !is_min) { 1031 evpmovd2m(ktmp, b, vlen_enc); 1032 evblendmps(atmp, ktmp, a, b, merge, vlen_enc); 1033 evblendmps(btmp, ktmp, b, a, merge, vlen_enc); 1034 vmaxps(dst, atmp, btmp, vlen_enc); 1035 evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1036 evmovdqul(dst, ktmp, atmp, merge, vlen_enc); 1037 } else if (is_double_word && is_min) { 1038 evpmovq2m(ktmp, a, vlen_enc); 1039 evblendmpd(atmp, ktmp, a, b, merge, vlen_enc); 1040 evblendmpd(btmp, ktmp, b, a, merge, vlen_enc); 1041 vminpd(dst, atmp, btmp, vlen_enc); 1042 evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1043 evmovdquq(dst, ktmp, atmp, merge, vlen_enc); 1044 } else { 1045 assert(is_double_word && !is_min, "sanity"); 1046 evpmovq2m(ktmp, b, vlen_enc); 1047 evblendmpd(atmp, ktmp, a, b, merge, vlen_enc); 1048 evblendmpd(btmp, ktmp, b, a, merge, vlen_enc); 1049 vmaxpd(dst, atmp, btmp, vlen_enc); 1050 evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1051 evmovdquq(dst, ktmp, atmp, merge, vlen_enc); 1052 } 1053 } 1054 1055 // Float/Double signum 1056 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, 1057 XMMRegister zero, XMMRegister one, 1058 Register scratch) { 1059 assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity"); 1060 1061 Label DONE_LABEL; 1062 1063 if (opcode == Op_SignumF) { 1064 assert(UseSSE > 0, "required"); 1065 ucomiss(dst, zero); 1066 jcc(Assembler::equal, DONE_LABEL); // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument 1067 jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN 1068 movflt(dst, one); 1069 jcc(Assembler::above, DONE_LABEL); 1070 xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), scratch); 1071 } else if (opcode == Op_SignumD) { 1072 assert(UseSSE > 1, "required"); 1073 ucomisd(dst, zero); 1074 jcc(Assembler::equal, DONE_LABEL); // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument 1075 jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN 1076 movdbl(dst, one); 1077 jcc(Assembler::above, DONE_LABEL); 1078 xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), scratch); 1079 } 1080 1081 bind(DONE_LABEL); 1082 } 1083 1084 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) { 1085 if (sign) { 1086 pmovsxbw(dst, src); 1087 } else { 1088 pmovzxbw(dst, src); 1089 } 1090 
} 1091 1092 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1093 if (sign) { 1094 vpmovsxbw(dst, src, vector_len); 1095 } else { 1096 vpmovzxbw(dst, src, vector_len); 1097 } 1098 } 1099 1100 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1101 if (sign) { 1102 vpmovsxbd(dst, src, vector_len); 1103 } else { 1104 vpmovzxbd(dst, src, vector_len); 1105 } 1106 } 1107 1108 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1109 if (sign) { 1110 vpmovsxwd(dst, src, vector_len); 1111 } else { 1112 vpmovzxwd(dst, src, vector_len); 1113 } 1114 } 1115 1116 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src, 1117 int shift, int vector_len) { 1118 if (opcode == Op_RotateLeftV) { 1119 if (etype == T_INT) { 1120 evprold(dst, src, shift, vector_len); 1121 } else { 1122 assert(etype == T_LONG, "expected type T_LONG"); 1123 evprolq(dst, src, shift, vector_len); 1124 } 1125 } else { 1126 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV"); 1127 if (etype == T_INT) { 1128 evprord(dst, src, shift, vector_len); 1129 } else { 1130 assert(etype == T_LONG, "expected type T_LONG"); 1131 evprorq(dst, src, shift, vector_len); 1132 } 1133 } 1134 } 1135 1136 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src, 1137 XMMRegister shift, int vector_len) { 1138 if (opcode == Op_RotateLeftV) { 1139 if (etype == T_INT) { 1140 evprolvd(dst, src, shift, vector_len); 1141 } else { 1142 assert(etype == T_LONG, "expected type T_LONG"); 1143 evprolvq(dst, src, shift, vector_len); 1144 } 1145 } else { 1146 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV"); 1147 if (etype == T_INT) { 1148 evprorvd(dst, src, shift, vector_len); 1149 } else { 1150 assert(etype == T_LONG, "expected type T_LONG"); 1151 evprorvq(dst, src, shift, vector_len); 1152 } 1153 } 1154 } 1155 1156 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) { 1157 if (opcode == Op_RShiftVI) { 1158 psrad(dst, shift); 1159 } else if (opcode == Op_LShiftVI) { 1160 pslld(dst, shift); 1161 } else { 1162 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI"); 1163 psrld(dst, shift); 1164 } 1165 } 1166 1167 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) { 1168 switch (opcode) { 1169 case Op_RShiftVI: psrad(dst, shift); break; 1170 case Op_LShiftVI: pslld(dst, shift); break; 1171 case Op_URShiftVI: psrld(dst, shift); break; 1172 1173 default: assert(false, "%s", NodeClassNames[opcode]); 1174 } 1175 } 1176 1177 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 1178 if (opcode == Op_RShiftVI) { 1179 vpsrad(dst, nds, shift, vector_len); 1180 } else if (opcode == Op_LShiftVI) { 1181 vpslld(dst, nds, shift, vector_len); 1182 } else { 1183 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI"); 1184 vpsrld(dst, nds, shift, vector_len); 1185 } 1186 } 1187 1188 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1189 switch (opcode) { 1190 case Op_RShiftVI: vpsrad(dst, src, shift, vlen_enc); break; 1191 case Op_LShiftVI: vpslld(dst, src, shift, vlen_enc); break; 1192 case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break; 1193 1194 default: assert(false, "%s", NodeClassNames[opcode]); 1195 } 1196 } 1197 1198 void 
C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  psraw(dst, shift); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  psllw(dst, shift); break;

    case Op_URShiftVS: // fall-through
    case Op_URShiftVB: psrlw(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;

    case Op_URShiftVS: // fall-through
    case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
    case Op_LShiftVL:  psllq(dst, shift); break;
    case Op_URShiftVL: psrlq(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
  if (opcode == Op_RShiftVL) {
    psrlq(dst, shift);  // using srl to implement sra on pre-avx512 systems
  } else if (opcode == Op_LShiftVL) {
    psllq(dst, shift);
  } else {
    assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
    psrlq(dst, shift);
  }
}

void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVL:  evpsraq(dst, src, shift, vlen_enc); break;
    case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
    case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
  if (opcode == Op_RShiftVL) {
    evpsraq(dst, nds, shift, vector_len);
  } else if (opcode == Op_LShiftVL) {
    vpsllq(dst, nds, shift, vector_len);
  } else {
    assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
    vpsrlq(dst, nds, shift, vector_len);
  }
}

void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  // fall-through
    case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  // fall-through
    case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;

    case Op_URShiftVB: // fall-through
    case Op_URShiftVS: // fall-through
    case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;
1293 case Op_LShiftVB: // fall-through 1294 case Op_LShiftVS: evpsllvw(dst, src, shift, vlen_enc); break; 1295 1296 case Op_URShiftVB: // fall-through 1297 case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break; 1298 1299 default: assert(false, "%s", NodeClassNames[opcode]); 1300 } 1301 } 1302 1303 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) { 1304 assert(UseAVX >= 2, "required"); 1305 switch (opcode) { 1306 case Op_RShiftVL: { 1307 if (UseAVX > 2) { 1308 assert(tmp == xnoreg, "not used"); 1309 if (!VM_Version::supports_avx512vl()) { 1310 vlen_enc = Assembler::AVX_512bit; 1311 } 1312 evpsravq(dst, src, shift, vlen_enc); 1313 } else { 1314 vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask())); 1315 vpsrlvq(dst, src, shift, vlen_enc); 1316 vpsrlvq(tmp, tmp, shift, vlen_enc); 1317 vpxor(dst, dst, tmp, vlen_enc); 1318 vpsubq(dst, dst, tmp, vlen_enc); 1319 } 1320 break; 1321 } 1322 case Op_LShiftVL: { 1323 assert(tmp == xnoreg, "not used"); 1324 vpsllvq(dst, src, shift, vlen_enc); 1325 break; 1326 } 1327 case Op_URShiftVL: { 1328 assert(tmp == xnoreg, "not used"); 1329 vpsrlvq(dst, src, shift, vlen_enc); 1330 break; 1331 } 1332 default: assert(false, "%s", NodeClassNames[opcode]); 1333 } 1334 } 1335 1336 // Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst 1337 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch) { 1338 assert(opcode == Op_LShiftVB || 1339 opcode == Op_RShiftVB || 1340 opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]); 1341 bool sign = (opcode != Op_URShiftVB); 1342 assert(vector_len == 0, "required"); 1343 vextendbd(sign, dst, src, 1); 1344 vpmovzxbd(vtmp, shift, 1); 1345 varshiftd(opcode, dst, dst, vtmp, 1); 1346 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, scratch); 1347 vextracti128_high(vtmp, dst); 1348 vpackusdw(dst, dst, vtmp, 0); 1349 } 1350 1351 // Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst 1352 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch) { 1353 assert(opcode == Op_LShiftVB || 1354 opcode == Op_RShiftVB || 1355 opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]); 1356 bool sign = (opcode != Op_URShiftVB); 1357 int ext_vector_len = vector_len + 1; 1358 vextendbw(sign, dst, src, ext_vector_len); 1359 vpmovzxbw(vtmp, shift, ext_vector_len); 1360 varshiftw(opcode, dst, dst, vtmp, ext_vector_len); 1361 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, scratch); 1362 if (vector_len == 0) { 1363 vextracti128_high(vtmp, dst); 1364 vpackuswb(dst, dst, vtmp, vector_len); 1365 } else { 1366 vextracti64x4_high(vtmp, dst); 1367 vpackuswb(dst, dst, vtmp, vector_len); 1368 vpermq(dst, dst, 0xD8, vector_len); 1369 } 1370 } 1371 1372 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) { 1373 switch(typ) { 1374 case T_BYTE: 1375 pinsrb(dst, val, idx); 1376 break; 1377 case T_SHORT: 1378 pinsrw(dst, val, idx); 1379 break; 1380 case T_INT: 1381 pinsrd(dst, val, idx); 1382 break; 1383 case T_LONG: 1384 pinsrq(dst, val, idx); 1385 break; 1386 default: 1387 assert(false,"Should not reach here."); 1388 break; 1389 } 1390 } 1391 1392 void C2_MacroAssembler::vinsert(BasicType typ, 
XMMRegister dst, XMMRegister src, Register val, int idx) { 1393 switch(typ) { 1394 case T_BYTE: 1395 vpinsrb(dst, src, val, idx); 1396 break; 1397 case T_SHORT: 1398 vpinsrw(dst, src, val, idx); 1399 break; 1400 case T_INT: 1401 vpinsrd(dst, src, val, idx); 1402 break; 1403 case T_LONG: 1404 vpinsrq(dst, src, val, idx); 1405 break; 1406 default: 1407 assert(false,"Should not reach here."); 1408 break; 1409 } 1410 } 1411 1412 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) { 1413 switch(typ) { 1414 case T_INT: 1415 vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len); 1416 break; 1417 case T_FLOAT: 1418 vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len); 1419 break; 1420 case T_LONG: 1421 vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len); 1422 break; 1423 case T_DOUBLE: 1424 vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len); 1425 break; 1426 default: 1427 assert(false,"Should not reach here."); 1428 break; 1429 } 1430 } 1431 1432 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) { 1433 switch(typ) { 1434 case T_INT: 1435 evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len); 1436 break; 1437 case T_FLOAT: 1438 evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len); 1439 break; 1440 case T_LONG: 1441 evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len); 1442 break; 1443 case T_DOUBLE: 1444 evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len); 1445 break; 1446 default: 1447 assert(false,"Should not reach here."); 1448 break; 1449 } 1450 } 1451 1452 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) { 1453 switch(typ) { 1454 case T_INT: 1455 evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len); 1456 break; 1457 case T_FLOAT: 1458 evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len); 1459 break; 1460 case T_LONG: 1461 evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len); 1462 break; 1463 case T_DOUBLE: 1464 evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len); 1465 break; 1466 default: 1467 assert(false,"Should not reach here."); 1468 break; 1469 } 1470 } 1471 1472 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) { 1473 if (vlen_in_bytes <= 16) { 1474 pxor (dst, dst); 1475 psubb(dst, src); 1476 switch (elem_bt) { 1477 case T_BYTE: /* nothing to do */ break; 1478 case T_SHORT: pmovsxbw(dst, dst); break; 1479 case T_INT: pmovsxbd(dst, dst); break; 1480 case T_FLOAT: pmovsxbd(dst, dst); break; 1481 case T_LONG: pmovsxbq(dst, dst); break; 1482 case T_DOUBLE: pmovsxbq(dst, dst); break; 1483 1484 default: assert(false, "%s", type2name(elem_bt)); 1485 } 1486 } else { 1487 assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, ""); 1488 int vlen_enc = vector_length_encoding(vlen_in_bytes); 1489 1490 vpxor (dst, dst, dst, vlen_enc); 1491 vpsubb(dst, dst, src, is_legacy ? 
AVX_256bit : vlen_enc); 1492 1493 switch (elem_bt) { 1494 case T_BYTE: /* nothing to do */ break; 1495 case T_SHORT: vpmovsxbw(dst, dst, vlen_enc); break; 1496 case T_INT: vpmovsxbd(dst, dst, vlen_enc); break; 1497 case T_FLOAT: vpmovsxbd(dst, dst, vlen_enc); break; 1498 case T_LONG: vpmovsxbq(dst, dst, vlen_enc); break; 1499 case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break; 1500 1501 default: assert(false, "%s", type2name(elem_bt)); 1502 } 1503 } 1504 } 1505 1506 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, Register scratch, int vlen_in_bytes) { 1507 ExternalAddress addr(StubRoutines::x86::vector_iota_indices()); 1508 if (vlen_in_bytes == 4) { 1509 movdl(dst, addr); 1510 } else if (vlen_in_bytes == 8) { 1511 movq(dst, addr); 1512 } else if (vlen_in_bytes == 16) { 1513 movdqu(dst, addr, scratch); 1514 } else if (vlen_in_bytes == 32) { 1515 vmovdqu(dst, addr, scratch); 1516 } else { 1517 assert(vlen_in_bytes == 64, "%d", vlen_in_bytes); 1518 evmovdqub(dst, k0, addr, false /*merge*/, Assembler::AVX_512bit, scratch); 1519 } 1520 } 1521 1522 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles. 1523 1524 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) { 1525 int vector_len = Assembler::AVX_128bit; 1526 1527 switch (opcode) { 1528 case Op_AndReductionV: pand(dst, src); break; 1529 case Op_OrReductionV: por (dst, src); break; 1530 case Op_XorReductionV: pxor(dst, src); break; 1531 case Op_MinReductionV: 1532 switch (typ) { 1533 case T_BYTE: pminsb(dst, src); break; 1534 case T_SHORT: pminsw(dst, src); break; 1535 case T_INT: pminsd(dst, src); break; 1536 case T_LONG: assert(UseAVX > 2, "required"); 1537 vpminsq(dst, dst, src, Assembler::AVX_128bit); break; 1538 default: assert(false, "wrong type"); 1539 } 1540 break; 1541 case Op_MaxReductionV: 1542 switch (typ) { 1543 case T_BYTE: pmaxsb(dst, src); break; 1544 case T_SHORT: pmaxsw(dst, src); break; 1545 case T_INT: pmaxsd(dst, src); break; 1546 case T_LONG: assert(UseAVX > 2, "required"); 1547 vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break; 1548 default: assert(false, "wrong type"); 1549 } 1550 break; 1551 case Op_AddReductionVF: addss(dst, src); break; 1552 case Op_AddReductionVD: addsd(dst, src); break; 1553 case Op_AddReductionVI: 1554 switch (typ) { 1555 case T_BYTE: paddb(dst, src); break; 1556 case T_SHORT: paddw(dst, src); break; 1557 case T_INT: paddd(dst, src); break; 1558 default: assert(false, "wrong type"); 1559 } 1560 break; 1561 case Op_AddReductionVL: paddq(dst, src); break; 1562 case Op_MulReductionVF: mulss(dst, src); break; 1563 case Op_MulReductionVD: mulsd(dst, src); break; 1564 case Op_MulReductionVI: 1565 switch (typ) { 1566 case T_SHORT: pmullw(dst, src); break; 1567 case T_INT: pmulld(dst, src); break; 1568 default: assert(false, "wrong type"); 1569 } 1570 break; 1571 case Op_MulReductionVL: assert(UseAVX > 2, "required"); 1572 vpmullq(dst, dst, src, vector_len); break; 1573 default: assert(false, "wrong opcode"); 1574 } 1575 } 1576 1577 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 1578 int vector_len = Assembler::AVX_256bit; 1579 1580 switch (opcode) { 1581 case Op_AndReductionV: vpand(dst, src1, src2, vector_len); break; 1582 case Op_OrReductionV: vpor (dst, src1, src2, vector_len); break; 1583 case Op_XorReductionV: vpxor(dst, src1, src2, vector_len); break; 1584 case Op_MinReductionV: 1585 switch (typ) { 1586 case T_BYTE: 
vpminsb(dst, src1, src2, vector_len); break; 1587 case T_SHORT: vpminsw(dst, src1, src2, vector_len); break; 1588 case T_INT: vpminsd(dst, src1, src2, vector_len); break; 1589 case T_LONG: assert(UseAVX > 2, "required"); 1590 vpminsq(dst, src1, src2, vector_len); break; 1591 default: assert(false, "wrong type"); 1592 } 1593 break; 1594 case Op_MaxReductionV: 1595 switch (typ) { 1596 case T_BYTE: vpmaxsb(dst, src1, src2, vector_len); break; 1597 case T_SHORT: vpmaxsw(dst, src1, src2, vector_len); break; 1598 case T_INT: vpmaxsd(dst, src1, src2, vector_len); break; 1599 case T_LONG: assert(UseAVX > 2, "required"); 1600 vpmaxsq(dst, src1, src2, vector_len); break; 1601 default: assert(false, "wrong type"); 1602 } 1603 break; 1604 case Op_AddReductionVI: 1605 switch (typ) { 1606 case T_BYTE: vpaddb(dst, src1, src2, vector_len); break; 1607 case T_SHORT: vpaddw(dst, src1, src2, vector_len); break; 1608 case T_INT: vpaddd(dst, src1, src2, vector_len); break; 1609 default: assert(false, "wrong type"); 1610 } 1611 break; 1612 case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break; 1613 case Op_MulReductionVI: 1614 switch (typ) { 1615 case T_SHORT: vpmullw(dst, src1, src2, vector_len); break; 1616 case T_INT: vpmulld(dst, src1, src2, vector_len); break; 1617 default: assert(false, "wrong type"); 1618 } 1619 break; 1620 case Op_MulReductionVL: vpmullq(dst, src1, src2, vector_len); break; 1621 default: assert(false, "wrong opcode"); 1622 } 1623 } 1624 1625 void C2_MacroAssembler::reduce_fp(int opcode, int vlen, 1626 XMMRegister dst, XMMRegister src, 1627 XMMRegister vtmp1, XMMRegister vtmp2) { 1628 switch (opcode) { 1629 case Op_AddReductionVF: 1630 case Op_MulReductionVF: 1631 reduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 1632 break; 1633 1634 case Op_AddReductionVD: 1635 case Op_MulReductionVD: 1636 reduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 1637 break; 1638 1639 default: assert(false, "wrong opcode"); 1640 } 1641 } 1642 1643 void C2_MacroAssembler::reduceB(int opcode, int vlen, 1644 Register dst, Register src1, XMMRegister src2, 1645 XMMRegister vtmp1, XMMRegister vtmp2) { 1646 switch (vlen) { 1647 case 8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1648 case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1649 case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1650 case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1651 1652 default: assert(false, "wrong vector length"); 1653 } 1654 } 1655 1656 void C2_MacroAssembler::mulreduceB(int opcode, int vlen, 1657 Register dst, Register src1, XMMRegister src2, 1658 XMMRegister vtmp1, XMMRegister vtmp2) { 1659 switch (vlen) { 1660 case 8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1661 case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1662 case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1663 case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1664 1665 default: assert(false, "wrong vector length"); 1666 } 1667 } 1668 1669 void C2_MacroAssembler::reduceS(int opcode, int vlen, 1670 Register dst, Register src1, XMMRegister src2, 1671 XMMRegister vtmp1, XMMRegister vtmp2) { 1672 switch (vlen) { 1673 case 4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1674 case 8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1675 case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1676 case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1677 1678 default: assert(false, "wrong 
vector length"); 1679 } 1680 } 1681 1682 void C2_MacroAssembler::reduceI(int opcode, int vlen, 1683 Register dst, Register src1, XMMRegister src2, 1684 XMMRegister vtmp1, XMMRegister vtmp2) { 1685 switch (vlen) { 1686 case 2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1687 case 4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1688 case 8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1689 case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1690 1691 default: assert(false, "wrong vector length"); 1692 } 1693 } 1694 1695 #ifdef _LP64 1696 void C2_MacroAssembler::reduceL(int opcode, int vlen, 1697 Register dst, Register src1, XMMRegister src2, 1698 XMMRegister vtmp1, XMMRegister vtmp2) { 1699 switch (vlen) { 1700 case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1701 case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1702 case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1703 1704 default: assert(false, "wrong vector length"); 1705 } 1706 } 1707 #endif // _LP64 1708 1709 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 1710 switch (vlen) { 1711 case 2: 1712 assert(vtmp2 == xnoreg, ""); 1713 reduce2F(opcode, dst, src, vtmp1); 1714 break; 1715 case 4: 1716 assert(vtmp2 == xnoreg, ""); 1717 reduce4F(opcode, dst, src, vtmp1); 1718 break; 1719 case 8: 1720 reduce8F(opcode, dst, src, vtmp1, vtmp2); 1721 break; 1722 case 16: 1723 reduce16F(opcode, dst, src, vtmp1, vtmp2); 1724 break; 1725 default: assert(false, "wrong vector length"); 1726 } 1727 } 1728 1729 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 1730 switch (vlen) { 1731 case 2: 1732 assert(vtmp2 == xnoreg, ""); 1733 reduce2D(opcode, dst, src, vtmp1); 1734 break; 1735 case 4: 1736 reduce4D(opcode, dst, src, vtmp1, vtmp2); 1737 break; 1738 case 8: 1739 reduce8D(opcode, dst, src, vtmp1, vtmp2); 1740 break; 1741 default: assert(false, "wrong vector length"); 1742 } 1743 } 1744 1745 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1746 if (opcode == Op_AddReductionVI) { 1747 if (vtmp1 != src2) { 1748 movdqu(vtmp1, src2); 1749 } 1750 phaddd(vtmp1, vtmp1); 1751 } else { 1752 pshufd(vtmp1, src2, 0x1); 1753 reduce_operation_128(T_INT, opcode, vtmp1, src2); 1754 } 1755 movdl(vtmp2, src1); 1756 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 1757 movdl(dst, vtmp1); 1758 } 1759 1760 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1761 if (opcode == Op_AddReductionVI) { 1762 if (vtmp1 != src2) { 1763 movdqu(vtmp1, src2); 1764 } 1765 phaddd(vtmp1, src2); 1766 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1767 } else { 1768 pshufd(vtmp2, src2, 0xE); 1769 reduce_operation_128(T_INT, opcode, vtmp2, src2); 1770 reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1771 } 1772 } 1773 1774 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1775 if (opcode == Op_AddReductionVI) { 1776 vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit); 1777 vextracti128_high(vtmp2, vtmp1); 1778 vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit); 1779 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1780 } else { 1781 vextracti128_high(vtmp1, src2); 1782 reduce_operation_128(T_INT, 
opcode, vtmp1, src2); 1783 reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1784 } 1785 } 1786 1787 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1788 vextracti64x4_high(vtmp2, src2); 1789 reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2); 1790 reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1791 } 1792 1793 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1794 pshufd(vtmp2, src2, 0x1); 1795 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 1796 movdqu(vtmp1, vtmp2); 1797 psrldq(vtmp1, 2); 1798 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 1799 movdqu(vtmp2, vtmp1); 1800 psrldq(vtmp2, 1); 1801 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 1802 movdl(vtmp2, src1); 1803 pmovsxbd(vtmp1, vtmp1); 1804 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 1805 pextrb(dst, vtmp1, 0x0); 1806 movsbl(dst, dst); 1807 } 1808 1809 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1810 pshufd(vtmp1, src2, 0xE); 1811 reduce_operation_128(T_BYTE, opcode, vtmp1, src2); 1812 reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1813 } 1814 1815 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1816 vextracti128_high(vtmp2, src2); 1817 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 1818 reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1819 } 1820 1821 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1822 vextracti64x4_high(vtmp1, src2); 1823 reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2); 1824 reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1825 } 1826 1827 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1828 pmovsxbw(vtmp2, src2); 1829 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1830 } 1831 1832 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1833 if (UseAVX > 1) { 1834 int vector_len = Assembler::AVX_256bit; 1835 vpmovsxbw(vtmp1, src2, vector_len); 1836 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1837 } else { 1838 pmovsxbw(vtmp2, src2); 1839 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1840 pshufd(vtmp2, src2, 0x1); 1841 pmovsxbw(vtmp2, src2); 1842 reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 1843 } 1844 } 1845 1846 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1847 if (UseAVX > 2 && VM_Version::supports_avx512bw()) { 1848 int vector_len = Assembler::AVX_512bit; 1849 vpmovsxbw(vtmp1, src2, vector_len); 1850 reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1851 } else { 1852 assert(UseAVX >= 2,"Should not reach here."); 1853 mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); 1854 vextracti128_high(vtmp2, src2); 1855 mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 1856 } 1857 } 1858 1859 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1860 mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); 1861 vextracti64x4_high(vtmp2, src2); 1862 mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 
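  // Note: the 64-byte multiply reduction above is done as two 32-byte halves: the low half is
  // reduced first (folding in the scalar input src1), then the high 32 bytes are extracted and
  // reduced with the intermediate result in dst serving as the new scalar input.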
1863 } 1864 1865 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1866 if (opcode == Op_AddReductionVI) { 1867 if (vtmp1 != src2) { 1868 movdqu(vtmp1, src2); 1869 } 1870 phaddw(vtmp1, vtmp1); 1871 phaddw(vtmp1, vtmp1); 1872 } else { 1873 pshufd(vtmp2, src2, 0x1); 1874 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 1875 movdqu(vtmp1, vtmp2); 1876 psrldq(vtmp1, 2); 1877 reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2); 1878 } 1879 movdl(vtmp2, src1); 1880 pmovsxwd(vtmp1, vtmp1); 1881 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 1882 pextrw(dst, vtmp1, 0x0); 1883 movswl(dst, dst); 1884 } 1885 1886 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1887 if (opcode == Op_AddReductionVI) { 1888 if (vtmp1 != src2) { 1889 movdqu(vtmp1, src2); 1890 } 1891 phaddw(vtmp1, src2); 1892 } else { 1893 pshufd(vtmp1, src2, 0xE); 1894 reduce_operation_128(T_SHORT, opcode, vtmp1, src2); 1895 } 1896 reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1897 } 1898 1899 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1900 if (opcode == Op_AddReductionVI) { 1901 int vector_len = Assembler::AVX_256bit; 1902 vphaddw(vtmp2, src2, src2, vector_len); 1903 vpermq(vtmp2, vtmp2, 0xD8, vector_len); 1904 } else { 1905 vextracti128_high(vtmp2, src2); 1906 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 1907 } 1908 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1909 } 1910 1911 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1912 int vector_len = Assembler::AVX_256bit; 1913 vextracti64x4_high(vtmp1, src2); 1914 reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2); 1915 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1916 } 1917 1918 #ifdef _LP64 1919 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1920 pshufd(vtmp2, src2, 0xE); 1921 reduce_operation_128(T_LONG, opcode, vtmp2, src2); 1922 movdq(vtmp1, src1); 1923 reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2); 1924 movdq(dst, vtmp1); 1925 } 1926 1927 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1928 vextracti128_high(vtmp1, src2); 1929 reduce_operation_128(T_LONG, opcode, vtmp1, src2); 1930 reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1931 } 1932 1933 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1934 vextracti64x4_high(vtmp2, src2); 1935 reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2); 1936 reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1937 } 1938 1939 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) { 1940 assert(ArrayOperationPartialInlineSize > 0 && ArrayOperationPartialInlineSize <= 64, "invalid"); 1941 mov64(temp, -1L); 1942 bzhiq(temp, temp, len); 1943 kmovql(dst, temp); 1944 } 1945 #endif // _LP64 1946 1947 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 1948 reduce_operation_128(T_FLOAT, opcode, dst, src); 1949 pshufd(vtmp, src, 0x1); 1950 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 1951 } 1952 1953 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister 
dst, XMMRegister src, XMMRegister vtmp) { 1954 reduce2F(opcode, dst, src, vtmp); 1955 pshufd(vtmp, src, 0x2); 1956 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 1957 pshufd(vtmp, src, 0x3); 1958 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 1959 } 1960 1961 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 1962 reduce4F(opcode, dst, src, vtmp2); 1963 vextractf128_high(vtmp2, src); 1964 reduce4F(opcode, dst, vtmp2, vtmp1); 1965 } 1966 1967 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 1968 reduce8F(opcode, dst, src, vtmp1, vtmp2); 1969 vextracti64x4_high(vtmp1, src); 1970 reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2); 1971 } 1972 1973 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 1974 reduce_operation_128(T_DOUBLE, opcode, dst, src); 1975 pshufd(vtmp, src, 0xE); 1976 reduce_operation_128(T_DOUBLE, opcode, dst, vtmp); 1977 } 1978 1979 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 1980 reduce2D(opcode, dst, src, vtmp2); 1981 vextractf128_high(vtmp2, src); 1982 reduce2D(opcode, dst, vtmp2, vtmp1); 1983 } 1984 1985 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 1986 reduce4D(opcode, dst, src, vtmp1, vtmp2); 1987 vextracti64x4_high(vtmp1, src); 1988 reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2); 1989 } 1990 1991 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, int vector_len) { 1992 MacroAssembler::evmovdqu(type, kmask, dst, src, vector_len); 1993 } 1994 1995 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, int vector_len) { 1996 MacroAssembler::evmovdqu(type, kmask, dst, src, vector_len); 1997 } 1998 1999 2000 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid, 2001 XMMRegister dst, XMMRegister src, 2002 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2003 XMMRegister xmm_0, XMMRegister xmm_1) { 2004 int permconst[] = {1, 14}; 2005 XMMRegister wsrc = src; 2006 XMMRegister wdst = xmm_0; 2007 XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1; 2008 2009 int vlen_enc = Assembler::AVX_128bit; 2010 if (vlen == 16) { 2011 vlen_enc = Assembler::AVX_256bit; 2012 } 2013 2014 for (int i = log2(vlen) - 1; i >=0; i--) { 2015 if (i == 0 && !is_dst_valid) { 2016 wdst = dst; 2017 } 2018 if (i == 3) { 2019 vextracti64x4_high(wtmp, wsrc); 2020 } else if (i == 2) { 2021 vextracti128_high(wtmp, wsrc); 2022 } else { // i = [0,1] 2023 vpermilps(wtmp, wsrc, permconst[i], vlen_enc); 2024 } 2025 vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2026 wsrc = wdst; 2027 vlen_enc = Assembler::AVX_128bit; 2028 } 2029 if (is_dst_valid) { 2030 vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2031 } 2032 } 2033 2034 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src, 2035 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2036 XMMRegister xmm_0, XMMRegister xmm_1) { 2037 XMMRegister wsrc = src; 2038 XMMRegister wdst = xmm_0; 2039 XMMRegister wtmp = (xmm_1 == xnoreg) ? 
xmm_0: xmm_1; 2040 int vlen_enc = Assembler::AVX_128bit; 2041 if (vlen == 8) { 2042 vlen_enc = Assembler::AVX_256bit; 2043 } 2044 for (int i = log2(vlen) - 1; i >=0; i--) { 2045 if (i == 0 && !is_dst_valid) { 2046 wdst = dst; 2047 } 2048 if (i == 1) { 2049 vextracti128_high(wtmp, wsrc); 2050 } else if (i == 2) { 2051 vextracti64x4_high(wtmp, wsrc); 2052 } else { 2053 assert(i == 0, "%d", i); 2054 vpermilpd(wtmp, wsrc, 1, vlen_enc); 2055 } 2056 vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2057 wsrc = wdst; 2058 vlen_enc = Assembler::AVX_128bit; 2059 } 2060 if (is_dst_valid) { 2061 vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2062 } 2063 } 2064 2065 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) { 2066 switch (bt) { 2067 case T_BYTE: pextrb(dst, src, idx); break; 2068 case T_SHORT: pextrw(dst, src, idx); break; 2069 case T_INT: pextrd(dst, src, idx); break; 2070 case T_LONG: pextrq(dst, src, idx); break; 2071 2072 default: 2073 assert(false,"Should not reach here."); 2074 break; 2075 } 2076 } 2077 2078 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) { 2079 int esize = type2aelembytes(typ); 2080 int elem_per_lane = 16/esize; 2081 int lane = elemindex / elem_per_lane; 2082 int eindex = elemindex % elem_per_lane; 2083 2084 if (lane >= 2) { 2085 assert(UseAVX > 2, "required"); 2086 vextractf32x4(dst, src, lane & 3); 2087 return dst; 2088 } else if (lane > 0) { 2089 assert(UseAVX > 0, "required"); 2090 vextractf128(dst, src, lane); 2091 return dst; 2092 } else { 2093 return src; 2094 } 2095 } 2096 2097 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) { 2098 int esize = type2aelembytes(typ); 2099 int elem_per_lane = 16/esize; 2100 int eindex = elemindex % elem_per_lane; 2101 assert(is_integral_type(typ),"required"); 2102 2103 if (eindex == 0) { 2104 if (typ == T_LONG) { 2105 movq(dst, src); 2106 } else { 2107 movdl(dst, src); 2108 if (typ == T_BYTE) 2109 movsbl(dst, dst); 2110 else if (typ == T_SHORT) 2111 movswl(dst, dst); 2112 } 2113 } else { 2114 extract(typ, dst, src, eindex); 2115 } 2116 } 2117 2118 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, Register tmp, XMMRegister vtmp) { 2119 int esize = type2aelembytes(typ); 2120 int elem_per_lane = 16/esize; 2121 int eindex = elemindex % elem_per_lane; 2122 assert((typ == T_FLOAT || typ == T_DOUBLE),"required"); 2123 2124 if (eindex == 0) { 2125 movq(dst, src); 2126 } else { 2127 if (typ == T_FLOAT) { 2128 if (UseAVX == 0) { 2129 movdqu(dst, src); 2130 pshufps(dst, dst, eindex); 2131 } else { 2132 vpshufps(dst, src, src, eindex, Assembler::AVX_128bit); 2133 } 2134 } else { 2135 if (UseAVX == 0) { 2136 movdqu(dst, src); 2137 psrldq(dst, eindex*esize); 2138 } else { 2139 vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit); 2140 } 2141 movq(dst, dst); 2142 } 2143 } 2144 // Zero upper bits 2145 if (typ == T_FLOAT) { 2146 if (UseAVX == 0) { 2147 assert((vtmp != xnoreg) && (tmp != noreg), "required."); 2148 movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), tmp); 2149 pand(dst, vtmp); 2150 } else { 2151 assert((tmp != noreg), "required."); 2152 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, tmp); 2153 } 2154 } 2155 } 2156 2157 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, 
XMMRegister src2, int comparison, int vector_len) { 2158 switch(typ) { 2159 case T_BYTE: 2160 case T_BOOLEAN: 2161 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2162 break; 2163 case T_SHORT: 2164 case T_CHAR: 2165 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2166 break; 2167 case T_INT: 2168 case T_FLOAT: 2169 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2170 break; 2171 case T_LONG: 2172 case T_DOUBLE: 2173 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2174 break; 2175 default: 2176 assert(false,"Should not reach here."); 2177 break; 2178 } 2179 } 2180 2181 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral adr, int comparison, int vector_len, Register scratch) { 2182 switch(typ) { 2183 case T_BOOLEAN: 2184 case T_BYTE: 2185 evpcmpb(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch); 2186 break; 2187 case T_CHAR: 2188 case T_SHORT: 2189 evpcmpw(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch); 2190 break; 2191 case T_INT: 2192 case T_FLOAT: 2193 evpcmpd(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch); 2194 break; 2195 case T_LONG: 2196 case T_DOUBLE: 2197 evpcmpq(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch); 2198 break; 2199 default: 2200 assert(false,"Should not reach here."); 2201 break; 2202 } 2203 } 2204 2205 void C2_MacroAssembler::vpcmpu(BasicType typ, XMMRegister dst, XMMRegister src1, XMMRegister src2, ComparisonPredicate comparison, 2206 int vlen_in_bytes, XMMRegister vtmp1, XMMRegister vtmp2, Register scratch) { 2207 int vlen_enc = vector_length_encoding(vlen_in_bytes*2); 2208 switch (typ) { 2209 case T_BYTE: 2210 vpmovzxbw(vtmp1, src1, vlen_enc); 2211 vpmovzxbw(vtmp2, src2, vlen_enc); 2212 vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::W, vlen_enc, scratch); 2213 vpacksswb(dst, dst, dst, vlen_enc); 2214 break; 2215 case T_SHORT: 2216 vpmovzxwd(vtmp1, src1, vlen_enc); 2217 vpmovzxwd(vtmp2, src2, vlen_enc); 2218 vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::D, vlen_enc, scratch); 2219 vpackssdw(dst, dst, dst, vlen_enc); 2220 break; 2221 case T_INT: 2222 vpmovzxdq(vtmp1, src1, vlen_enc); 2223 vpmovzxdq(vtmp2, src2, vlen_enc); 2224 vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::Q, vlen_enc, scratch); 2225 vpermilps(dst, dst, 8, vlen_enc); 2226 break; 2227 default: 2228 assert(false, "Should not reach here"); 2229 } 2230 if (vlen_in_bytes == 16) { 2231 vpermpd(dst, dst, 0x8, vlen_enc); 2232 } 2233 } 2234 2235 void C2_MacroAssembler::vpcmpu32(BasicType typ, XMMRegister dst, XMMRegister src1, XMMRegister src2, ComparisonPredicate comparison, int vlen_in_bytes, 2236 XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3, Register scratch) { 2237 int vlen_enc = vector_length_encoding(vlen_in_bytes); 2238 switch (typ) { 2239 case T_BYTE: 2240 vpmovzxbw(vtmp1, src1, vlen_enc); 2241 vpmovzxbw(vtmp2, src2, vlen_enc); 2242 vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::W, vlen_enc, scratch); 2243 vextracti128(vtmp1, src1, 1); 2244 vextracti128(vtmp2, src2, 1); 2245 vpmovzxbw(vtmp1, vtmp1, vlen_enc); 2246 vpmovzxbw(vtmp2, vtmp2, vlen_enc); 2247 vpcmpCCW(vtmp3, vtmp1, vtmp2, comparison, Assembler::W, vlen_enc, scratch); 2248 vpacksswb(dst, dst, vtmp3, vlen_enc); 2249 vpermpd(dst, dst, 0xd8, vlen_enc); 2250 break; 2251 case T_SHORT: 2252 vpmovzxwd(vtmp1, src1, vlen_enc); 
2253 vpmovzxwd(vtmp2, src2, vlen_enc); 2254 vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::D, vlen_enc, scratch); 2255 vextracti128(vtmp1, src1, 1); 2256 vextracti128(vtmp2, src2, 1); 2257 vpmovzxwd(vtmp1, vtmp1, vlen_enc); 2258 vpmovzxwd(vtmp2, vtmp2, vlen_enc); 2259 vpcmpCCW(vtmp3, vtmp1, vtmp2, comparison, Assembler::D, vlen_enc, scratch); 2260 vpackssdw(dst, dst, vtmp3, vlen_enc); 2261 vpermpd(dst, dst, 0xd8, vlen_enc); 2262 break; 2263 case T_INT: 2264 vpmovzxdq(vtmp1, src1, vlen_enc); 2265 vpmovzxdq(vtmp2, src2, vlen_enc); 2266 vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::Q, vlen_enc, scratch); 2267 vpshufd(dst, dst, 8, vlen_enc); 2268 vpermq(dst, dst, 8, vlen_enc); 2269 vextracti128(vtmp1, src1, 1); 2270 vextracti128(vtmp2, src2, 1); 2271 vpmovzxdq(vtmp1, vtmp1, vlen_enc); 2272 vpmovzxdq(vtmp2, vtmp2, vlen_enc); 2273 vpcmpCCW(vtmp3, vtmp1, vtmp2, comparison, Assembler::Q, vlen_enc, scratch); 2274 vpshufd(vtmp3, vtmp3, 8, vlen_enc); 2275 vpermq(vtmp3, vtmp3, 0x80, vlen_enc); 2276 vpblendd(dst, dst, vtmp3, 0xf0, vlen_enc); 2277 break; 2278 default: 2279 assert(false, "Should not reach here"); 2280 } 2281 } 2282 2283 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) { 2284 switch(typ) { 2285 case T_BYTE: 2286 evpblendmb(dst, kmask, src1, src2, merge, vector_len); 2287 break; 2288 case T_SHORT: 2289 evpblendmw(dst, kmask, src1, src2, merge, vector_len); 2290 break; 2291 case T_INT: 2292 case T_FLOAT: 2293 evpblendmd(dst, kmask, src1, src2, merge, vector_len); 2294 break; 2295 case T_LONG: 2296 case T_DOUBLE: 2297 evpblendmq(dst, kmask, src1, src2, merge, vector_len); 2298 break; 2299 default: 2300 assert(false,"Should not reach here."); 2301 break; 2302 } 2303 } 2304 2305 void C2_MacroAssembler::vectortest(int bt, int vlen, XMMRegister src1, XMMRegister src2, 2306 XMMRegister vtmp1, XMMRegister vtmp2, KRegister mask) { 2307 switch(vlen) { 2308 case 4: 2309 assert(vtmp1 != xnoreg, "required."); 2310 // Broadcast lower 32 bits to 128 bits before ptest 2311 pshufd(vtmp1, src1, 0x0); 2312 if (bt == BoolTest::overflow) { 2313 assert(vtmp2 != xnoreg, "required."); 2314 pshufd(vtmp2, src2, 0x0); 2315 } else { 2316 assert(vtmp2 == xnoreg, "required."); 2317 vtmp2 = src2; 2318 } 2319 ptest(vtmp1, vtmp2); 2320 break; 2321 case 8: 2322 assert(vtmp1 != xnoreg, "required."); 2323 // Broadcast lower 64 bits to 128 bits before ptest 2324 pshufd(vtmp1, src1, 0x4); 2325 if (bt == BoolTest::overflow) { 2326 assert(vtmp2 != xnoreg, "required."); 2327 pshufd(vtmp2, src2, 0x4); 2328 } else { 2329 assert(vtmp2 == xnoreg, "required."); 2330 vtmp2 = src2; 2331 } 2332 ptest(vtmp1, vtmp2); 2333 break; 2334 case 16: 2335 assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required."); 2336 ptest(src1, src2); 2337 break; 2338 case 32: 2339 assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required."); 2340 vptest(src1, src2, Assembler::AVX_256bit); 2341 break; 2342 case 64: 2343 { 2344 assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required."); 2345 evpcmpeqb(mask, src1, src2, Assembler::AVX_512bit); 2346 if (bt == BoolTest::ne) { 2347 ktestql(mask, mask); 2348 } else { 2349 assert(bt == BoolTest::overflow, "required"); 2350 kortestql(mask, mask); 2351 } 2352 } 2353 break; 2354 default: 2355 assert(false,"Should not reach here."); 2356 break; 2357 } 2358 } 2359 2360 //------------------------------------------------------------------------------------------- 2361 2362 // IndexOf for constant substrings with size >= 
8 chars 2363 // which don't need to be loaded through stack. 2364 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2, 2365 Register cnt1, Register cnt2, 2366 int int_cnt2, Register result, 2367 XMMRegister vec, Register tmp, 2368 int ae) { 2369 ShortBranchVerifier sbv(this); 2370 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2371 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2372 2373 // This method uses the pcmpestri instruction with bound registers 2374 // inputs: 2375 // xmm - substring 2376 // rax - substring length (elements count) 2377 // mem - scanned string 2378 // rdx - string length (elements count) 2379 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2380 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2381 // outputs: 2382 // rcx - matched index in string 2383 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2384 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2385 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2386 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2387 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1; 2388 2389 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, 2390 RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR, 2391 MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE; 2392 2393 // Note, inline_string_indexOf() generates checks: 2394 // if (substr.count > string.count) return -1; 2395 // if (substr.count == 0) return 0; 2396 assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars"); 2397 2398 // Load substring. 2399 if (ae == StrIntrinsicNode::UL) { 2400 pmovzxbw(vec, Address(str2, 0)); 2401 } else { 2402 movdqu(vec, Address(str2, 0)); 2403 } 2404 movl(cnt2, int_cnt2); 2405 movptr(result, str1); // string addr 2406 2407 if (int_cnt2 > stride) { 2408 jmpb(SCAN_TO_SUBSTR); 2409 2410 // Reload substr for rescan, this code 2411 // is executed only for large substrings (> 8 chars) 2412 bind(RELOAD_SUBSTR); 2413 if (ae == StrIntrinsicNode::UL) { 2414 pmovzxbw(vec, Address(str2, 0)); 2415 } else { 2416 movdqu(vec, Address(str2, 0)); 2417 } 2418 negptr(cnt2); // Jumped here with negative cnt2, convert to positive 2419 2420 bind(RELOAD_STR); 2421 // We came here after the beginning of the substring was 2422 // matched but the rest of it was not so we need to search 2423 // again. Start from the next element after the previous match. 2424 2425 // cnt2 is number of substring reminding elements and 2426 // cnt1 is number of string reminding elements when cmp failed. 
2427 // Restored cnt1 = cnt1 - cnt2 + int_cnt2 2428 subl(cnt1, cnt2); 2429 addl(cnt1, int_cnt2); 2430 movl(cnt2, int_cnt2); // Now restore cnt2 2431 2432 decrementl(cnt1); // Shift to next element 2433 cmpl(cnt1, cnt2); 2434 jcc(Assembler::negative, RET_NOT_FOUND); // Left less than substring 2435 2436 addptr(result, (1<<scale1)); 2437 2438 } // (int_cnt2 > 8) 2439 2440 // Scan string for start of substr in 16-byte vectors 2441 bind(SCAN_TO_SUBSTR); 2442 pcmpestri(vec, Address(result, 0), mode); 2443 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 2444 subl(cnt1, stride); 2445 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 2446 cmpl(cnt1, cnt2); 2447 jccb(Assembler::negative, RET_NOT_FOUND); // Left less than substring 2448 addptr(result, 16); 2449 jmpb(SCAN_TO_SUBSTR); 2450 2451 // Found a potential substr 2452 bind(FOUND_CANDIDATE); 2453 // Matched whole vector if first element matched (tmp(rcx) == 0). 2454 if (int_cnt2 == stride) { 2455 jccb(Assembler::overflow, RET_FOUND); // OF == 1 2456 } else { // int_cnt2 > 8 2457 jccb(Assembler::overflow, FOUND_SUBSTR); 2458 } 2459 // After pcmpestri tmp(rcx) contains matched element index 2460 // Compute start addr of substr 2461 lea(result, Address(result, tmp, scale1)); 2462 2463 // Make sure string is still long enough 2464 subl(cnt1, tmp); 2465 cmpl(cnt1, cnt2); 2466 if (int_cnt2 == stride) { 2467 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 2468 } else { // int_cnt2 > 8 2469 jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD); 2470 } 2471 // Left less than substring. 2472 2473 bind(RET_NOT_FOUND); 2474 movl(result, -1); 2475 jmp(EXIT); 2476 2477 if (int_cnt2 > stride) { 2478 // This code is optimized for the case when whole substring 2479 // is matched if its head is matched. 2480 bind(MATCH_SUBSTR_HEAD); 2481 pcmpestri(vec, Address(result, 0), mode); 2482 // Reload only the string if it does not match 2483 jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0 2484 2485 Label CONT_SCAN_SUBSTR; 2486 // Compare the rest of substring (> 8 chars). 2487 bind(FOUND_SUBSTR); 2488 // First 8 chars are already matched.
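  // The remaining tail of the (constant, > 8 element) substring is scanned below using a negative
  // element offset: cnt2 becomes stride - int_cnt2, so (int_cnt2 + cnt2) indexes the next chunk to
  // compare, starting right after the elements already matched. Each pass adds 'stride' to cnt2
  // until it reaches zero; the last partial chunk is re-aligned (cnt2 = -stride) so that nothing
  // beyond the end of the substring is read.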
2489 negptr(cnt2); 2490 addptr(cnt2, stride); 2491 2492 bind(SCAN_SUBSTR); 2493 subl(cnt1, stride); 2494 cmpl(cnt2, -stride); // Do not read beyond substring 2495 jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR); 2496 // Back-up strings to avoid reading beyond substring: 2497 // cnt1 = cnt1 - cnt2 + 8 2498 addl(cnt1, cnt2); // cnt2 is negative 2499 addl(cnt1, stride); 2500 movl(cnt2, stride); negptr(cnt2); 2501 bind(CONT_SCAN_SUBSTR); 2502 if (int_cnt2 < (int)G) { 2503 int tail_off1 = int_cnt2<<scale1; 2504 int tail_off2 = int_cnt2<<scale2; 2505 if (ae == StrIntrinsicNode::UL) { 2506 pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2)); 2507 } else { 2508 movdqu(vec, Address(str2, cnt2, scale2, tail_off2)); 2509 } 2510 pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode); 2511 } else { 2512 // calculate index in register to avoid integer overflow (int_cnt2*2) 2513 movl(tmp, int_cnt2); 2514 addptr(tmp, cnt2); 2515 if (ae == StrIntrinsicNode::UL) { 2516 pmovzxbw(vec, Address(str2, tmp, scale2, 0)); 2517 } else { 2518 movdqu(vec, Address(str2, tmp, scale2, 0)); 2519 } 2520 pcmpestri(vec, Address(result, tmp, scale1, 0), mode); 2521 } 2522 // Need to reload strings pointers if not matched whole vector 2523 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 2524 addptr(cnt2, stride); 2525 jcc(Assembler::negative, SCAN_SUBSTR); 2526 // Fall through if found full substring 2527 2528 } // (int_cnt2 > 8) 2529 2530 bind(RET_FOUND); 2531 // Found result if we matched full small substring. 2532 // Compute substr offset 2533 subptr(result, str1); 2534 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 2535 shrl(result, 1); // index 2536 } 2537 bind(EXIT); 2538 2539 } // string_indexofC8 2540 2541 // Small strings are loaded through stack if they cross page boundary. 2542 void C2_MacroAssembler::string_indexof(Register str1, Register str2, 2543 Register cnt1, Register cnt2, 2544 int int_cnt2, Register result, 2545 XMMRegister vec, Register tmp, 2546 int ae) { 2547 ShortBranchVerifier sbv(this); 2548 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2549 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2550 2551 // 2552 // int_cnt2 is length of small (< 8 chars) constant substring 2553 // or (-1) for non constant substring in which case its length 2554 // is in cnt2 register. 2555 // 2556 // Note, inline_string_indexOf() generates checks: 2557 // if (substr.count > string.count) return -1; 2558 // if (substr.count == 0) return 0; 2559 // 2560 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2561 assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0"); 2562 // This method uses the pcmpestri instruction with bound registers 2563 // inputs: 2564 // xmm - substring 2565 // rax - substring length (elements count) 2566 // mem - scanned string 2567 // rdx - string length (elements count) 2568 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2569 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2570 // outputs: 2571 // rcx - matched index in string 2572 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2573 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2574 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2575 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? 
Address::times_1 : scale1; 2576 2577 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR, 2578 RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR, 2579 FOUND_CANDIDATE; 2580 2581 { //======================================================== 2582 // We don't know where these strings are located 2583 // and we can't read beyond them. Load them through stack. 2584 Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR; 2585 2586 movptr(tmp, rsp); // save old SP 2587 2588 if (int_cnt2 > 0) { // small (< 8 chars) constant substring 2589 if (int_cnt2 == (1>>scale2)) { // One byte 2590 assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding"); 2591 load_unsigned_byte(result, Address(str2, 0)); 2592 movdl(vec, result); // move 32 bits 2593 } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) { // Three bytes 2594 // Not enough header space in 32-bit VM: 12+3 = 15. 2595 movl(result, Address(str2, -1)); 2596 shrl(result, 8); 2597 movdl(vec, result); // move 32 bits 2598 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) { // One char 2599 load_unsigned_short(result, Address(str2, 0)); 2600 movdl(vec, result); // move 32 bits 2601 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars 2602 movdl(vec, Address(str2, 0)); // move 32 bits 2603 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars 2604 movq(vec, Address(str2, 0)); // move 64 bits 2605 } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7}) 2606 // Array header size is 12 bytes in 32-bit VM 2607 // + 6 bytes for 3 chars == 18 bytes, 2608 // enough space to load vec and shift. 2609 assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity"); 2610 if (ae == StrIntrinsicNode::UL) { 2611 int tail_off = int_cnt2-8; 2612 pmovzxbw(vec, Address(str2, tail_off)); 2613 psrldq(vec, -2*tail_off); 2614 } 2615 else { 2616 int tail_off = int_cnt2*(1<<scale2); 2617 movdqu(vec, Address(str2, tail_off-16)); 2618 psrldq(vec, 16-tail_off); 2619 } 2620 } 2621 } else { // not constant substring 2622 cmpl(cnt2, stride); 2623 jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough 2624 2625 // We can read beyond string if srt+16 does not cross page boundary 2626 // since heaps are aligned and mapped by pages. 2627 assert(os::vm_page_size() < (int)G, "default page should be small"); 2628 movl(result, str2); // We need only low 32 bits 2629 andl(result, (os::vm_page_size()-1)); 2630 cmpl(result, (os::vm_page_size()-16)); 2631 jccb(Assembler::belowEqual, CHECK_STR); 2632 2633 // Move small strings to stack to allow load 16 bytes into vec. 2634 subptr(rsp, 16); 2635 int stk_offset = wordSize-(1<<scale2); 2636 push(cnt2); 2637 2638 bind(COPY_SUBSTR); 2639 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) { 2640 load_unsigned_byte(result, Address(str2, cnt2, scale2, -1)); 2641 movb(Address(rsp, cnt2, scale2, stk_offset), result); 2642 } else if (ae == StrIntrinsicNode::UU) { 2643 load_unsigned_short(result, Address(str2, cnt2, scale2, -2)); 2644 movw(Address(rsp, cnt2, scale2, stk_offset), result); 2645 } 2646 decrement(cnt2); 2647 jccb(Assembler::notZero, COPY_SUBSTR); 2648 2649 pop(cnt2); 2650 movptr(str2, rsp); // New substring address 2651 } // non constant 2652 2653 bind(CHECK_STR); 2654 cmpl(cnt1, stride); 2655 jccb(Assembler::aboveEqual, BIG_STRINGS); 2656 2657 // Check cross page boundary. 
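  // A 16-byte load starting at str1 stays within one page only if its page offset allows it,
  // roughly: ((uintptr_t)str1 & (os::vm_page_size() - 1)) <= (uintptr_t)(os::vm_page_size() - 16).
  // Short strings that fail this test are copied to the stack below so a full 16-byte load is safe.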
2658 movl(result, str1); // We need only low 32 bits 2659 andl(result, (os::vm_page_size()-1)); 2660 cmpl(result, (os::vm_page_size()-16)); 2661 jccb(Assembler::belowEqual, BIG_STRINGS); 2662 2663 subptr(rsp, 16); 2664 int stk_offset = -(1<<scale1); 2665 if (int_cnt2 < 0) { // not constant 2666 push(cnt2); 2667 stk_offset += wordSize; 2668 } 2669 movl(cnt2, cnt1); 2670 2671 bind(COPY_STR); 2672 if (ae == StrIntrinsicNode::LL) { 2673 load_unsigned_byte(result, Address(str1, cnt2, scale1, -1)); 2674 movb(Address(rsp, cnt2, scale1, stk_offset), result); 2675 } else { 2676 load_unsigned_short(result, Address(str1, cnt2, scale1, -2)); 2677 movw(Address(rsp, cnt2, scale1, stk_offset), result); 2678 } 2679 decrement(cnt2); 2680 jccb(Assembler::notZero, COPY_STR); 2681 2682 if (int_cnt2 < 0) { // not constant 2683 pop(cnt2); 2684 } 2685 movptr(str1, rsp); // New string address 2686 2687 bind(BIG_STRINGS); 2688 // Load substring. 2689 if (int_cnt2 < 0) { // -1 2690 if (ae == StrIntrinsicNode::UL) { 2691 pmovzxbw(vec, Address(str2, 0)); 2692 } else { 2693 movdqu(vec, Address(str2, 0)); 2694 } 2695 push(cnt2); // substr count 2696 push(str2); // substr addr 2697 push(str1); // string addr 2698 } else { 2699 // Small (< 8 chars) constant substrings are loaded already. 2700 movl(cnt2, int_cnt2); 2701 } 2702 push(tmp); // original SP 2703 2704 } // Finished loading 2705 2706 //======================================================== 2707 // Start search 2708 // 2709 2710 movptr(result, str1); // string addr 2711 2712 if (int_cnt2 < 0) { // Only for non constant substring 2713 jmpb(SCAN_TO_SUBSTR); 2714 2715 // SP saved at sp+0 2716 // String saved at sp+1*wordSize 2717 // Substr saved at sp+2*wordSize 2718 // Substr count saved at sp+3*wordSize 2719 2720 // Reload substr for rescan, this code 2721 // is executed only for large substrings (> 8 chars) 2722 bind(RELOAD_SUBSTR); 2723 movptr(str2, Address(rsp, 2*wordSize)); 2724 movl(cnt2, Address(rsp, 3*wordSize)); 2725 if (ae == StrIntrinsicNode::UL) { 2726 pmovzxbw(vec, Address(str2, 0)); 2727 } else { 2728 movdqu(vec, Address(str2, 0)); 2729 } 2730 // We came here after the beginning of the substring was 2731 // matched but the rest of it was not so we need to search 2732 // again. Start from the next element after the previous match. 2733 subptr(str1, result); // Restore counter 2734 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 2735 shrl(str1, 1); 2736 } 2737 addl(cnt1, str1); 2738 decrementl(cnt1); // Shift to next element 2739 cmpl(cnt1, cnt2); 2740 jcc(Assembler::negative, RET_NOT_FOUND); // Left less then substring 2741 2742 addptr(result, (1<<scale1)); 2743 } // non constant 2744 2745 // Scan string for start of substr in 16-byte vectors 2746 bind(SCAN_TO_SUBSTR); 2747 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2748 pcmpestri(vec, Address(result, 0), mode); 2749 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 2750 subl(cnt1, stride); 2751 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 2752 cmpl(cnt1, cnt2); 2753 jccb(Assembler::negative, RET_NOT_FOUND); // Left less then substring 2754 addptr(result, 16); 2755 2756 bind(ADJUST_STR); 2757 cmpl(cnt1, stride); // Do not read beyond string 2758 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 2759 // Back-up string to avoid reading beyond string. 
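  // Fewer than 'stride' elements remain, so re-position 'result' such that the final 16-byte chunk
  // ends exactly at the last element (result += cnt1 * element_size - 16) and then treat it as one
  // full stride (cnt1 = stride).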
2760 lea(result, Address(result, cnt1, scale1, -16)); 2761 movl(cnt1, stride); 2762 jmpb(SCAN_TO_SUBSTR); 2763 2764 // Found a potential substr 2765 bind(FOUND_CANDIDATE); 2766 // After pcmpestri tmp(rcx) contains matched element index 2767 2768 // Make sure string is still long enough 2769 subl(cnt1, tmp); 2770 cmpl(cnt1, cnt2); 2771 jccb(Assembler::greaterEqual, FOUND_SUBSTR); 2772 // Left less then substring. 2773 2774 bind(RET_NOT_FOUND); 2775 movl(result, -1); 2776 jmp(CLEANUP); 2777 2778 bind(FOUND_SUBSTR); 2779 // Compute start addr of substr 2780 lea(result, Address(result, tmp, scale1)); 2781 if (int_cnt2 > 0) { // Constant substring 2782 // Repeat search for small substring (< 8 chars) 2783 // from new point without reloading substring. 2784 // Have to check that we don't read beyond string. 2785 cmpl(tmp, stride-int_cnt2); 2786 jccb(Assembler::greater, ADJUST_STR); 2787 // Fall through if matched whole substring. 2788 } else { // non constant 2789 assert(int_cnt2 == -1, "should be != 0"); 2790 2791 addl(tmp, cnt2); 2792 // Found result if we matched whole substring. 2793 cmpl(tmp, stride); 2794 jcc(Assembler::lessEqual, RET_FOUND); 2795 2796 // Repeat search for small substring (<= 8 chars) 2797 // from new point 'str1' without reloading substring. 2798 cmpl(cnt2, stride); 2799 // Have to check that we don't read beyond string. 2800 jccb(Assembler::lessEqual, ADJUST_STR); 2801 2802 Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG; 2803 // Compare the rest of substring (> 8 chars). 2804 movptr(str1, result); 2805 2806 cmpl(tmp, cnt2); 2807 // First 8 chars are already matched. 2808 jccb(Assembler::equal, CHECK_NEXT); 2809 2810 bind(SCAN_SUBSTR); 2811 pcmpestri(vec, Address(str1, 0), mode); 2812 // Need to reload strings pointers if not matched whole vector 2813 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 2814 2815 bind(CHECK_NEXT); 2816 subl(cnt2, stride); 2817 jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring 2818 addptr(str1, 16); 2819 if (ae == StrIntrinsicNode::UL) { 2820 addptr(str2, 8); 2821 } else { 2822 addptr(str2, 16); 2823 } 2824 subl(cnt1, stride); 2825 cmpl(cnt2, stride); // Do not read beyond substring 2826 jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR); 2827 // Back-up strings to avoid reading beyond substring. 
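  // Same idea applied to both strings: re-position str1 and str2 so that the last chunk (16 bytes
  // of str1; 16 bytes of str2, or 8 bytes when the substring is Latin1 in the UL case) ends at the
  // final substring element, then account for the skipped elements:
  // cnt1 = cnt1 - cnt2 + stride, cnt2 = stride.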
2828 2829 if (ae == StrIntrinsicNode::UL) { 2830 lea(str2, Address(str2, cnt2, scale2, -8)); 2831 lea(str1, Address(str1, cnt2, scale1, -16)); 2832 } else { 2833 lea(str2, Address(str2, cnt2, scale2, -16)); 2834 lea(str1, Address(str1, cnt2, scale1, -16)); 2835 } 2836 subl(cnt1, cnt2); 2837 movl(cnt2, stride); 2838 addl(cnt1, stride); 2839 bind(CONT_SCAN_SUBSTR); 2840 if (ae == StrIntrinsicNode::UL) { 2841 pmovzxbw(vec, Address(str2, 0)); 2842 } else { 2843 movdqu(vec, Address(str2, 0)); 2844 } 2845 jmp(SCAN_SUBSTR); 2846 2847 bind(RET_FOUND_LONG); 2848 movptr(str1, Address(rsp, wordSize)); 2849 } // non constant 2850 2851 bind(RET_FOUND); 2852 // Compute substr offset 2853 subptr(result, str1); 2854 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 2855 shrl(result, 1); // index 2856 } 2857 bind(CLEANUP); 2858 pop(rsp); // restore SP 2859 2860 } // string_indexof 2861 2862 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result, 2863 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 2864 ShortBranchVerifier sbv(this); 2865 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2866 2867 int stride = 8; 2868 2869 Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP, 2870 SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP, 2871 RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT, 2872 FOUND_SEQ_CHAR, DONE_LABEL; 2873 2874 movptr(result, str1); 2875 if (UseAVX >= 2) { 2876 cmpl(cnt1, stride); 2877 jcc(Assembler::less, SCAN_TO_CHAR); 2878 cmpl(cnt1, 2*stride); 2879 jcc(Assembler::less, SCAN_TO_8_CHAR_INIT); 2880 movdl(vec1, ch); 2881 vpbroadcastw(vec1, vec1, Assembler::AVX_256bit); 2882 vpxor(vec2, vec2); 2883 movl(tmp, cnt1); 2884 andl(tmp, 0xFFFFFFF0); //vector count (in chars) 2885 andl(cnt1,0x0000000F); //tail count (in chars) 2886 2887 bind(SCAN_TO_16_CHAR_LOOP); 2888 vmovdqu(vec3, Address(result, 0)); 2889 vpcmpeqw(vec3, vec3, vec1, 1); 2890 vptest(vec2, vec3); 2891 jcc(Assembler::carryClear, FOUND_CHAR); 2892 addptr(result, 32); 2893 subl(tmp, 2*stride); 2894 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); 2895 jmp(SCAN_TO_8_CHAR); 2896 bind(SCAN_TO_8_CHAR_INIT); 2897 movdl(vec1, ch); 2898 pshuflw(vec1, vec1, 0x00); 2899 pshufd(vec1, vec1, 0); 2900 pxor(vec2, vec2); 2901 } 2902 bind(SCAN_TO_8_CHAR); 2903 cmpl(cnt1, stride); 2904 jcc(Assembler::less, SCAN_TO_CHAR); 2905 if (UseAVX < 2) { 2906 movdl(vec1, ch); 2907 pshuflw(vec1, vec1, 0x00); 2908 pshufd(vec1, vec1, 0); 2909 pxor(vec2, vec2); 2910 } 2911 movl(tmp, cnt1); 2912 andl(tmp, 0xFFFFFFF8); //vector count (in chars) 2913 andl(cnt1,0x00000007); //tail count (in chars) 2914 2915 bind(SCAN_TO_8_CHAR_LOOP); 2916 movdqu(vec3, Address(result, 0)); 2917 pcmpeqw(vec3, vec1); 2918 ptest(vec2, vec3); 2919 jcc(Assembler::carryClear, FOUND_CHAR); 2920 addptr(result, 16); 2921 subl(tmp, stride); 2922 jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP); 2923 bind(SCAN_TO_CHAR); 2924 testl(cnt1, cnt1); 2925 jcc(Assembler::zero, RET_NOT_FOUND); 2926 bind(SCAN_TO_CHAR_LOOP); 2927 load_unsigned_short(tmp, Address(result, 0)); 2928 cmpl(ch, tmp); 2929 jccb(Assembler::equal, FOUND_SEQ_CHAR); 2930 addptr(result, 2); 2931 subl(cnt1, 1); 2932 jccb(Assembler::zero, RET_NOT_FOUND); 2933 jmp(SCAN_TO_CHAR_LOOP); 2934 2935 bind(RET_NOT_FOUND); 2936 movl(result, -1); 2937 jmpb(DONE_LABEL); 2938 2939 bind(FOUND_CHAR); 2940 if (UseAVX >= 2) { 2941 vpmovmskb(tmp, vec3); 2942 } else { 2943 pmovmskb(tmp, vec3); 2944 } 2945 bsfl(ch, tmp); 2946 addptr(result, ch); 2947 2948 bind(FOUND_SEQ_CHAR); 2949 
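  // 'result' holds the address of the matching char; the two instructions below convert it into a
  // zero-based char index in the UTF-16 string: result = (result - str1) / 2.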
subptr(result, str1); 2950 shrl(result, 1); 2951 2952 bind(DONE_LABEL); 2953 } // string_indexof_char 2954 2955 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result, 2956 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 2957 ShortBranchVerifier sbv(this); 2958 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2959 2960 int stride = 16; 2961 2962 Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP, 2963 SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP, 2964 RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT, 2965 FOUND_SEQ_CHAR, DONE_LABEL; 2966 2967 movptr(result, str1); 2968 if (UseAVX >= 2) { 2969 cmpl(cnt1, stride); 2970 jcc(Assembler::less, SCAN_TO_CHAR_INIT); 2971 cmpl(cnt1, stride*2); 2972 jcc(Assembler::less, SCAN_TO_16_CHAR_INIT); 2973 movdl(vec1, ch); 2974 vpbroadcastb(vec1, vec1, Assembler::AVX_256bit); 2975 vpxor(vec2, vec2); 2976 movl(tmp, cnt1); 2977 andl(tmp, 0xFFFFFFE0); //vector count (in chars) 2978 andl(cnt1,0x0000001F); //tail count (in chars) 2979 2980 bind(SCAN_TO_32_CHAR_LOOP); 2981 vmovdqu(vec3, Address(result, 0)); 2982 vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit); 2983 vptest(vec2, vec3); 2984 jcc(Assembler::carryClear, FOUND_CHAR); 2985 addptr(result, 32); 2986 subl(tmp, stride*2); 2987 jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP); 2988 jmp(SCAN_TO_16_CHAR); 2989 2990 bind(SCAN_TO_16_CHAR_INIT); 2991 movdl(vec1, ch); 2992 pxor(vec2, vec2); 2993 pshufb(vec1, vec2); 2994 } 2995 2996 bind(SCAN_TO_16_CHAR); 2997 cmpl(cnt1, stride); 2998 jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entires left 2999 if (UseAVX < 2) { 3000 movdl(vec1, ch); 3001 pxor(vec2, vec2); 3002 pshufb(vec1, vec2); 3003 } 3004 movl(tmp, cnt1); 3005 andl(tmp, 0xFFFFFFF0); //vector count (in bytes) 3006 andl(cnt1,0x0000000F); //tail count (in bytes) 3007 3008 bind(SCAN_TO_16_CHAR_LOOP); 3009 movdqu(vec3, Address(result, 0)); 3010 pcmpeqb(vec3, vec1); 3011 ptest(vec2, vec3); 3012 jcc(Assembler::carryClear, FOUND_CHAR); 3013 addptr(result, 16); 3014 subl(tmp, stride); 3015 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items... 
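  // Scalar tail: fewer than 16 bytes remain (cnt1 was masked to the tail count above), so the
  // remaining bytes are compared one at a time below.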
3016 3017 bind(SCAN_TO_CHAR_INIT); 3018 testl(cnt1, cnt1); 3019 jcc(Assembler::zero, RET_NOT_FOUND); 3020 bind(SCAN_TO_CHAR_LOOP); 3021 load_unsigned_byte(tmp, Address(result, 0)); 3022 cmpl(ch, tmp); 3023 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3024 addptr(result, 1); 3025 subl(cnt1, 1); 3026 jccb(Assembler::zero, RET_NOT_FOUND); 3027 jmp(SCAN_TO_CHAR_LOOP); 3028 3029 bind(RET_NOT_FOUND); 3030 movl(result, -1); 3031 jmpb(DONE_LABEL); 3032 3033 bind(FOUND_CHAR); 3034 if (UseAVX >= 2) { 3035 vpmovmskb(tmp, vec3); 3036 } else { 3037 pmovmskb(tmp, vec3); 3038 } 3039 bsfl(ch, tmp); 3040 addptr(result, ch); 3041 3042 bind(FOUND_SEQ_CHAR); 3043 subptr(result, str1); 3044 3045 bind(DONE_LABEL); 3046 } // stringL_indexof_char 3047 3048 // helper function for string_compare 3049 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2, 3050 Address::ScaleFactor scale, Address::ScaleFactor scale1, 3051 Address::ScaleFactor scale2, Register index, int ae) { 3052 if (ae == StrIntrinsicNode::LL) { 3053 load_unsigned_byte(elem1, Address(str1, index, scale, 0)); 3054 load_unsigned_byte(elem2, Address(str2, index, scale, 0)); 3055 } else if (ae == StrIntrinsicNode::UU) { 3056 load_unsigned_short(elem1, Address(str1, index, scale, 0)); 3057 load_unsigned_short(elem2, Address(str2, index, scale, 0)); 3058 } else { 3059 load_unsigned_byte(elem1, Address(str1, index, scale1, 0)); 3060 load_unsigned_short(elem2, Address(str2, index, scale2, 0)); 3061 } 3062 } 3063 3064 // Compare strings, used for char[] and byte[]. 3065 void C2_MacroAssembler::string_compare(Register str1, Register str2, 3066 Register cnt1, Register cnt2, Register result, 3067 XMMRegister vec1, int ae, KRegister mask) { 3068 ShortBranchVerifier sbv(this); 3069 Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL; 3070 Label COMPARE_WIDE_VECTORS_LOOP_FAILED; // used only _LP64 && AVX3 3071 int stride, stride2, adr_stride, adr_stride1, adr_stride2; 3072 int stride2x2 = 0x40; 3073 Address::ScaleFactor scale = Address::no_scale; 3074 Address::ScaleFactor scale1 = Address::no_scale; 3075 Address::ScaleFactor scale2 = Address::no_scale; 3076 3077 if (ae != StrIntrinsicNode::LL) { 3078 stride2x2 = 0x20; 3079 } 3080 3081 if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) { 3082 shrl(cnt2, 1); 3083 } 3084 // Compute the minimum of the string lengths and the 3085 // difference of the string lengths (stack). 3086 // Do the conditional move stuff 3087 movl(result, cnt1); 3088 subl(cnt1, cnt2); 3089 push(cnt1); 3090 cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2) 3091 3092 // Is the minimum length zero? 
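  // If it is, the strings trivially match over the common prefix and the
  // answer is just the length difference that was pushed on the stack above.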
3093 testl(cnt2, cnt2); 3094 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3095 if (ae == StrIntrinsicNode::LL) { 3096 // Load first bytes 3097 load_unsigned_byte(result, Address(str1, 0)); // result = str1[0] 3098 load_unsigned_byte(cnt1, Address(str2, 0)); // cnt1 = str2[0] 3099 } else if (ae == StrIntrinsicNode::UU) { 3100 // Load first characters 3101 load_unsigned_short(result, Address(str1, 0)); 3102 load_unsigned_short(cnt1, Address(str2, 0)); 3103 } else { 3104 load_unsigned_byte(result, Address(str1, 0)); 3105 load_unsigned_short(cnt1, Address(str2, 0)); 3106 } 3107 subl(result, cnt1); 3108 jcc(Assembler::notZero, POP_LABEL); 3109 3110 if (ae == StrIntrinsicNode::UU) { 3111 // Divide length by 2 to get number of chars 3112 shrl(cnt2, 1); 3113 } 3114 cmpl(cnt2, 1); 3115 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3116 3117 // Check if the strings start at the same location and setup scale and stride 3118 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3119 cmpptr(str1, str2); 3120 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3121 if (ae == StrIntrinsicNode::LL) { 3122 scale = Address::times_1; 3123 stride = 16; 3124 } else { 3125 scale = Address::times_2; 3126 stride = 8; 3127 } 3128 } else { 3129 scale1 = Address::times_1; 3130 scale2 = Address::times_2; 3131 // scale not used 3132 stride = 8; 3133 } 3134 3135 if (UseAVX >= 2 && UseSSE42Intrinsics) { 3136 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR; 3137 Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR; 3138 Label COMPARE_WIDE_VECTORS_LOOP_AVX2; 3139 Label COMPARE_TAIL_LONG; 3140 Label COMPARE_WIDE_VECTORS_LOOP_AVX3; // used only _LP64 && AVX3 3141 3142 int pcmpmask = 0x19; 3143 if (ae == StrIntrinsicNode::LL) { 3144 pcmpmask &= ~0x01; 3145 } 3146 3147 // Setup to compare 16-chars (32-bytes) vectors, 3148 // start from first character again because it has aligned address. 3149 if (ae == StrIntrinsicNode::LL) { 3150 stride2 = 32; 3151 } else { 3152 stride2 = 16; 3153 } 3154 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3155 adr_stride = stride << scale; 3156 } else { 3157 adr_stride1 = 8; //stride << scale1; 3158 adr_stride2 = 16; //stride << scale2; 3159 } 3160 3161 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3162 // rax and rdx are used by pcmpestri as elements counters 3163 movl(result, cnt2); 3164 andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count 3165 jcc(Assembler::zero, COMPARE_TAIL_LONG); 3166 3167 // fast path : compare first 2 8-char vectors. 
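    // Each pcmpestri below runs in "equal each" mode with negated polarity
    // (imm8 0x19 for 16-bit elements, 0x18 once bit 0 is cleared for LL), so
    // CF is set when the two compared chunks differ and rcx receives the index
    // of the first mismatching element; jcc(below, ...) branches on that CF.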
3168 bind(COMPARE_16_CHARS); 3169 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3170 movdqu(vec1, Address(str1, 0)); 3171 } else { 3172 pmovzxbw(vec1, Address(str1, 0)); 3173 } 3174 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3175 jccb(Assembler::below, COMPARE_INDEX_CHAR); 3176 3177 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3178 movdqu(vec1, Address(str1, adr_stride)); 3179 pcmpestri(vec1, Address(str2, adr_stride), pcmpmask); 3180 } else { 3181 pmovzxbw(vec1, Address(str1, adr_stride1)); 3182 pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask); 3183 } 3184 jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS); 3185 addl(cnt1, stride); 3186 3187 // Compare the characters at index in cnt1 3188 bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character 3189 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 3190 subl(result, cnt2); 3191 jmp(POP_LABEL); 3192 3193 // Setup the registers to start vector comparison loop 3194 bind(COMPARE_WIDE_VECTORS); 3195 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3196 lea(str1, Address(str1, result, scale)); 3197 lea(str2, Address(str2, result, scale)); 3198 } else { 3199 lea(str1, Address(str1, result, scale1)); 3200 lea(str2, Address(str2, result, scale2)); 3201 } 3202 subl(result, stride2); 3203 subl(cnt2, stride2); 3204 jcc(Assembler::zero, COMPARE_WIDE_TAIL); 3205 negptr(result); 3206 3207 // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest) 3208 bind(COMPARE_WIDE_VECTORS_LOOP); 3209 3210 #ifdef _LP64 3211 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 3212 cmpl(cnt2, stride2x2); 3213 jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2); 3214 testl(cnt2, stride2x2-1); // cnt2 holds the vector count 3215 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2); // means we cannot subtract by 0x40 3216 3217 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 3218 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3219 evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit); 3220 evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3221 } else { 3222 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit); 3223 evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3224 } 3225 kortestql(mask, mask); 3226 jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED); // miscompare 3227 addptr(result, stride2x2); // update since we already compared at this addr 3228 subl(cnt2, stride2x2); // and sub the size too 3229 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3); 3230 3231 vpxor(vec1, vec1); 3232 jmpb(COMPARE_WIDE_TAIL); 3233 }//if (VM_Version::supports_avx512vlbw()) 3234 #endif // _LP64 3235 3236 3237 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3238 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3239 vmovdqu(vec1, Address(str1, result, scale)); 3240 vpxor(vec1, Address(str2, result, scale)); 3241 } else { 3242 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit); 3243 vpxor(vec1, Address(str2, result, scale2)); 3244 } 3245 vptest(vec1, vec1); 3246 jcc(Assembler::notZero, VECTOR_NOT_EQUAL); 3247 addptr(result, stride2); 3248 subl(cnt2, stride2); 3249 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP); 3250 // clean upper bits of YMM registers 
3251 vpxor(vec1, vec1); 3252 3253 // compare wide vectors tail 3254 bind(COMPARE_WIDE_TAIL); 3255 testptr(result, result); 3256 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3257 3258 movl(result, stride2); 3259 movl(cnt2, result); 3260 negptr(result); 3261 jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3262 3263 // Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors. 3264 bind(VECTOR_NOT_EQUAL); 3265 // clean upper bits of YMM registers 3266 vpxor(vec1, vec1); 3267 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3268 lea(str1, Address(str1, result, scale)); 3269 lea(str2, Address(str2, result, scale)); 3270 } else { 3271 lea(str1, Address(str1, result, scale1)); 3272 lea(str2, Address(str2, result, scale2)); 3273 } 3274 jmp(COMPARE_16_CHARS); 3275 3276 // Compare tail chars, length between 1 to 15 chars 3277 bind(COMPARE_TAIL_LONG); 3278 movl(cnt2, result); 3279 cmpl(cnt2, stride); 3280 jcc(Assembler::less, COMPARE_SMALL_STR); 3281 3282 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3283 movdqu(vec1, Address(str1, 0)); 3284 } else { 3285 pmovzxbw(vec1, Address(str1, 0)); 3286 } 3287 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3288 jcc(Assembler::below, COMPARE_INDEX_CHAR); 3289 subptr(cnt2, stride); 3290 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3291 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3292 lea(str1, Address(str1, result, scale)); 3293 lea(str2, Address(str2, result, scale)); 3294 } else { 3295 lea(str1, Address(str1, result, scale1)); 3296 lea(str2, Address(str2, result, scale2)); 3297 } 3298 negptr(cnt2); 3299 jmpb(WHILE_HEAD_LABEL); 3300 3301 bind(COMPARE_SMALL_STR); 3302 } else if (UseSSE42Intrinsics) { 3303 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL; 3304 int pcmpmask = 0x19; 3305 // Setup to compare 8-char (16-byte) vectors, 3306 // start from first character again because it has aligned address. 
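    // This SSE4.2-only path processes one xmm register (stride elements) per
    // iteration, driven entirely by pcmpestri; the register contract of
    // pcmpestri is spelled out in the comment block further below.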
3307 movl(result, cnt2); 3308 andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count 3309 if (ae == StrIntrinsicNode::LL) { 3310 pcmpmask &= ~0x01; 3311 } 3312 jcc(Assembler::zero, COMPARE_TAIL); 3313 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3314 lea(str1, Address(str1, result, scale)); 3315 lea(str2, Address(str2, result, scale)); 3316 } else { 3317 lea(str1, Address(str1, result, scale1)); 3318 lea(str2, Address(str2, result, scale2)); 3319 } 3320 negptr(result); 3321 3322 // pcmpestri 3323 // inputs: 3324 // vec1- substring 3325 // rax - negative string length (elements count) 3326 // mem - scanned string 3327 // rdx - string length (elements count) 3328 // pcmpmask - cmp mode: 11000 (string compare with negated result) 3329 // + 00 (unsigned bytes) or + 01 (unsigned shorts) 3330 // outputs: 3331 // rcx - first mismatched element index 3332 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3333 3334 bind(COMPARE_WIDE_VECTORS); 3335 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3336 movdqu(vec1, Address(str1, result, scale)); 3337 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 3338 } else { 3339 pmovzxbw(vec1, Address(str1, result, scale1)); 3340 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 3341 } 3342 // After pcmpestri cnt1(rcx) contains mismatched element index 3343 3344 jccb(Assembler::below, VECTOR_NOT_EQUAL); // CF==1 3345 addptr(result, stride); 3346 subptr(cnt2, stride); 3347 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 3348 3349 // compare wide vectors tail 3350 testptr(result, result); 3351 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3352 3353 movl(cnt2, stride); 3354 movl(result, stride); 3355 negptr(result); 3356 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3357 movdqu(vec1, Address(str1, result, scale)); 3358 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 3359 } else { 3360 pmovzxbw(vec1, Address(str1, result, scale1)); 3361 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 3362 } 3363 jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL); 3364 3365 // Mismatched characters in the vectors 3366 bind(VECTOR_NOT_EQUAL); 3367 addptr(cnt1, result); 3368 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 3369 subl(result, cnt2); 3370 jmpb(POP_LABEL); 3371 3372 bind(COMPARE_TAIL); // limit is zero 3373 movl(cnt2, result); 3374 // Fallthru to tail compare 3375 } 3376 // Shift str2 and str1 to the end of the arrays, negate min 3377 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3378 lea(str1, Address(str1, cnt2, scale)); 3379 lea(str2, Address(str2, cnt2, scale)); 3380 } else { 3381 lea(str1, Address(str1, cnt2, scale1)); 3382 lea(str2, Address(str2, cnt2, scale2)); 3383 } 3384 decrementl(cnt2); // first character was compared already 3385 negptr(cnt2); 3386 3387 // Compare the rest of the elements 3388 bind(WHILE_HEAD_LABEL); 3389 load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae); 3390 subl(result, cnt1); 3391 jccb(Assembler::notZero, POP_LABEL); 3392 increment(cnt2); 3393 jccb(Assembler::notZero, WHILE_HEAD_LABEL); 3394 3395 // Strings are equal up to min length. Return the length difference. 
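  // The value popped below is the cnt1 - cnt2 difference saved in the
  // prologue; for UU it is still a byte count, so it is halved before returning.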
3396 bind(LENGTH_DIFF_LABEL); 3397 pop(result); 3398 if (ae == StrIntrinsicNode::UU) { 3399 // Divide diff by 2 to get number of chars 3400 sarl(result, 1); 3401 } 3402 jmpb(DONE_LABEL); 3403 3404 #ifdef _LP64 3405 if (VM_Version::supports_avx512vlbw()) { 3406 3407 bind(COMPARE_WIDE_VECTORS_LOOP_FAILED); 3408 3409 kmovql(cnt1, mask); 3410 notq(cnt1); 3411 bsfq(cnt2, cnt1); 3412 if (ae != StrIntrinsicNode::LL) { 3413 // Divide diff by 2 to get number of chars 3414 sarl(cnt2, 1); 3415 } 3416 addq(result, cnt2); 3417 if (ae == StrIntrinsicNode::LL) { 3418 load_unsigned_byte(cnt1, Address(str2, result)); 3419 load_unsigned_byte(result, Address(str1, result)); 3420 } else if (ae == StrIntrinsicNode::UU) { 3421 load_unsigned_short(cnt1, Address(str2, result, scale)); 3422 load_unsigned_short(result, Address(str1, result, scale)); 3423 } else { 3424 load_unsigned_short(cnt1, Address(str2, result, scale2)); 3425 load_unsigned_byte(result, Address(str1, result, scale1)); 3426 } 3427 subl(result, cnt1); 3428 jmpb(POP_LABEL); 3429 }//if (VM_Version::supports_avx512vlbw()) 3430 #endif // _LP64 3431 3432 // Discard the stored length difference 3433 bind(POP_LABEL); 3434 pop(cnt1); 3435 3436 // That's it 3437 bind(DONE_LABEL); 3438 if(ae == StrIntrinsicNode::UL) { 3439 negl(result); 3440 } 3441 3442 } 3443 3444 // Search for Non-ASCII character (Negative byte value) in a byte array, 3445 // return true if it has any and false otherwise. 3446 // ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java 3447 // @IntrinsicCandidate 3448 // private static boolean hasNegatives(byte[] ba, int off, int len) { 3449 // for (int i = off; i < off + len; i++) { 3450 // if (ba[i] < 0) { 3451 // return true; 3452 // } 3453 // } 3454 // return false; 3455 // } 3456 void C2_MacroAssembler::has_negatives(Register ary1, Register len, 3457 Register result, Register tmp1, 3458 XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) { 3459 // rsi: byte array 3460 // rcx: len 3461 // rax: result 3462 ShortBranchVerifier sbv(this); 3463 assert_different_registers(ary1, len, result, tmp1); 3464 assert_different_registers(vec1, vec2); 3465 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE; 3466 3467 // len == 0 3468 testl(len, len); 3469 jcc(Assembler::zero, FALSE_LABEL); 3470 3471 if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512 3472 VM_Version::supports_avx512vlbw() && 3473 VM_Version::supports_bmi2()) { 3474 3475 Label test_64_loop, test_tail; 3476 Register tmp3_aliased = len; 3477 3478 movl(tmp1, len); 3479 vpxor(vec2, vec2, vec2, Assembler::AVX_512bit); 3480 3481 andl(tmp1, 64 - 1); // tail count (in chars) 0x3F 3482 andl(len, ~(64 - 1)); // vector count (in chars) 3483 jccb(Assembler::zero, test_tail); 3484 3485 lea(ary1, Address(ary1, len, Address::times_1)); 3486 negptr(len); 3487 3488 bind(test_64_loop); 3489 // Check whether our 64 elements of size byte contain negatives 3490 evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit); 3491 kortestql(mask1, mask1); 3492 jcc(Assembler::notZero, TRUE_LABEL); 3493 3494 addptr(len, 64); 3495 jccb(Assembler::notZero, test_64_loop); 3496 3497 3498 bind(test_tail); 3499 // bail out when there is nothing to be done 3500 testl(tmp1, -1); 3501 jcc(Assembler::zero, FALSE_LABEL); 3502 3503 // ~(~0 << len) applied up to two times (for 32-bit scenario) 3504 #ifdef _LP64 3505 mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF); 3506 shlxq(tmp3_aliased, tmp3_aliased, tmp1); 3507 notq(tmp3_aliased); 3508 kmovql(mask2, 
tmp3_aliased); 3509 #else 3510 Label k_init; 3511 jmp(k_init); 3512 3513 // We could not read 64-bits from a general purpose register thus we move 3514 // data required to compose 64 1's to the instruction stream 3515 // We emit 64 byte wide series of elements from 0..63 which later on would 3516 // be used as a compare targets with tail count contained in tmp1 register. 3517 // Result would be a k register having tmp1 consecutive number or 1 3518 // counting from least significant bit. 3519 address tmp = pc(); 3520 emit_int64(0x0706050403020100); 3521 emit_int64(0x0F0E0D0C0B0A0908); 3522 emit_int64(0x1716151413121110); 3523 emit_int64(0x1F1E1D1C1B1A1918); 3524 emit_int64(0x2726252423222120); 3525 emit_int64(0x2F2E2D2C2B2A2928); 3526 emit_int64(0x3736353433323130); 3527 emit_int64(0x3F3E3D3C3B3A3938); 3528 3529 bind(k_init); 3530 lea(len, InternalAddress(tmp)); 3531 // create mask to test for negative byte inside a vector 3532 evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit); 3533 evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit); 3534 3535 #endif 3536 evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit); 3537 ktestq(mask1, mask2); 3538 jcc(Assembler::notZero, TRUE_LABEL); 3539 3540 jmp(FALSE_LABEL); 3541 } else { 3542 movl(result, len); // copy 3543 3544 if (UseAVX >= 2 && UseSSE >= 2) { 3545 // With AVX2, use 32-byte vector compare 3546 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 3547 3548 // Compare 32-byte vectors 3549 andl(result, 0x0000001f); // tail count (in bytes) 3550 andl(len, 0xffffffe0); // vector count (in bytes) 3551 jccb(Assembler::zero, COMPARE_TAIL); 3552 3553 lea(ary1, Address(ary1, len, Address::times_1)); 3554 negptr(len); 3555 3556 movl(tmp1, 0x80808080); // create mask to test for Unicode chars in vector 3557 movdl(vec2, tmp1); 3558 vpbroadcastd(vec2, vec2, Assembler::AVX_256bit); 3559 3560 bind(COMPARE_WIDE_VECTORS); 3561 vmovdqu(vec1, Address(ary1, len, Address::times_1)); 3562 vptest(vec1, vec2); 3563 jccb(Assembler::notZero, TRUE_LABEL); 3564 addptr(len, 32); 3565 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 3566 3567 testl(result, result); 3568 jccb(Assembler::zero, FALSE_LABEL); 3569 3570 vmovdqu(vec1, Address(ary1, result, Address::times_1, -32)); 3571 vptest(vec1, vec2); 3572 jccb(Assembler::notZero, TRUE_LABEL); 3573 jmpb(FALSE_LABEL); 3574 3575 bind(COMPARE_TAIL); // len is zero 3576 movl(len, result); 3577 // Fallthru to tail compare 3578 } else if (UseSSE42Intrinsics) { 3579 // With SSE4.2, use double quad vector compare 3580 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 3581 3582 // Compare 16-byte vectors 3583 andl(result, 0x0000000f); // tail count (in bytes) 3584 andl(len, 0xfffffff0); // vector count (in bytes) 3585 jcc(Assembler::zero, COMPARE_TAIL); 3586 3587 lea(ary1, Address(ary1, len, Address::times_1)); 3588 negptr(len); 3589 3590 movl(tmp1, 0x80808080); 3591 movdl(vec2, tmp1); 3592 pshufd(vec2, vec2, 0); 3593 3594 bind(COMPARE_WIDE_VECTORS); 3595 movdqu(vec1, Address(ary1, len, Address::times_1)); 3596 ptest(vec1, vec2); 3597 jcc(Assembler::notZero, TRUE_LABEL); 3598 addptr(len, 16); 3599 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 3600 3601 testl(result, result); 3602 jcc(Assembler::zero, FALSE_LABEL); 3603 3604 movdqu(vec1, Address(ary1, result, Address::times_1, -16)); 3605 ptest(vec1, vec2); 3606 jccb(Assembler::notZero, TRUE_LABEL); 3607 jmpb(FALSE_LABEL); 3608 3609 bind(COMPARE_TAIL); // len is zero 3610 movl(len, result); 3611 // Fallthru to tail compare 3612 } 3613 } 3614 // Compare 4-byte vectors 3615 
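  // Scalar fallback: AND each 4-byte group with 0x80808080; a non-zero result
  // means at least one byte in the group has its sign bit set (is negative).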
andl(len, 0xfffffffc); // vector count (in bytes) 3616 jccb(Assembler::zero, COMPARE_CHAR); 3617 3618 lea(ary1, Address(ary1, len, Address::times_1)); 3619 negptr(len); 3620 3621 bind(COMPARE_VECTORS); 3622 movl(tmp1, Address(ary1, len, Address::times_1)); 3623 andl(tmp1, 0x80808080); 3624 jccb(Assembler::notZero, TRUE_LABEL); 3625 addptr(len, 4); 3626 jcc(Assembler::notZero, COMPARE_VECTORS); 3627 3628 // Compare trailing char (final 2 bytes), if any 3629 bind(COMPARE_CHAR); 3630 testl(result, 0x2); // tail char 3631 jccb(Assembler::zero, COMPARE_BYTE); 3632 load_unsigned_short(tmp1, Address(ary1, 0)); 3633 andl(tmp1, 0x00008080); 3634 jccb(Assembler::notZero, TRUE_LABEL); 3635 subptr(result, 2); 3636 lea(ary1, Address(ary1, 2)); 3637 3638 bind(COMPARE_BYTE); 3639 testl(result, 0x1); // tail byte 3640 jccb(Assembler::zero, FALSE_LABEL); 3641 load_unsigned_byte(tmp1, Address(ary1, 0)); 3642 andl(tmp1, 0x00000080); 3643 jccb(Assembler::notEqual, TRUE_LABEL); 3644 jmpb(FALSE_LABEL); 3645 3646 bind(TRUE_LABEL); 3647 movl(result, 1); // return true 3648 jmpb(DONE); 3649 3650 bind(FALSE_LABEL); 3651 xorl(result, result); // return false 3652 3653 // That's it 3654 bind(DONE); 3655 if (UseAVX >= 2 && UseSSE >= 2) { 3656 // clean upper bits of YMM registers 3657 vpxor(vec1, vec1); 3658 vpxor(vec2, vec2); 3659 } 3660 } 3661 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings. 3662 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2, 3663 Register limit, Register result, Register chr, 3664 XMMRegister vec1, XMMRegister vec2, bool is_char, KRegister mask) { 3665 ShortBranchVerifier sbv(this); 3666 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE; 3667 3668 int length_offset = arrayOopDesc::length_offset_in_bytes(); 3669 int base_offset = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE); 3670 3671 if (is_array_equ) { 3672 // Check the input args 3673 cmpoop(ary1, ary2); 3674 jcc(Assembler::equal, TRUE_LABEL); 3675 3676 // Need additional checks for arrays_equals. 3677 testptr(ary1, ary1); 3678 jcc(Assembler::zero, FALSE_LABEL); 3679 testptr(ary2, ary2); 3680 jcc(Assembler::zero, FALSE_LABEL); 3681 3682 // Check the lengths 3683 movl(limit, Address(ary1, length_offset)); 3684 cmpl(limit, Address(ary2, length_offset)); 3685 jcc(Assembler::notEqual, FALSE_LABEL); 3686 } 3687 3688 // count == 0 3689 testl(limit, limit); 3690 jcc(Assembler::zero, TRUE_LABEL); 3691 3692 if (is_array_equ) { 3693 // Load array address 3694 lea(ary1, Address(ary1, base_offset)); 3695 lea(ary2, Address(ary2, base_offset)); 3696 } 3697 3698 if (is_array_equ && is_char) { 3699 // arrays_equals when used for char[]. 
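    // limit holds an element (char) count at this point; doubling it lets the
    // vector and tail loops below treat char[] and byte[] inputs uniformly as bytes.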
    shll(limit, 1);      // byte count != 0
  }
  movl(result, limit); // copy

  if (UseAVX >= 2) {
    // With AVX2, use 32-byte vector compare
    Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;

    // Compare 32-byte vectors
    andl(result, 0x0000001f);  // tail count (in bytes)
    andl(limit, 0xffffffe0);   // vector count (in bytes)
    jcc(Assembler::zero, COMPARE_TAIL);

    lea(ary1, Address(ary1, limit, Address::times_1));
    lea(ary2, Address(ary2, limit, Address::times_1));
    negptr(limit);

#ifdef _LP64
    if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
      Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;

      cmpl(limit, -64);
      jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);

      bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop

      evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
      evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
      kortestql(mask, mask);
      jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
      addptr(limit, 64);  // update since we already compared at this addr
      cmpl(limit, -64);
      jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);

      // At this point we may still need to compare -limit+result bytes.
      // We could execute the next two instructions and just continue via the non-wide path:
      //  cmpl(limit, 0);
      //  jcc(Assembler::equal, COMPARE_TAIL); // true
      // But since we stopped at the points ary{1,2}+limit which are
      // not farther than 64 bytes from the ends of arrays ary{1,2}+result
      // (|limit| <= 32 and result < 32),
      // we may just compare the last 64 bytes.
3742 // 3743 addptr(result, -64); // it is safe, bc we just came from this area 3744 evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit); 3745 evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit); 3746 kortestql(mask, mask); 3747 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 3748 3749 jmp(TRUE_LABEL); 3750 3751 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3752 3753 }//if (VM_Version::supports_avx512vlbw()) 3754 #endif //_LP64 3755 bind(COMPARE_WIDE_VECTORS); 3756 vmovdqu(vec1, Address(ary1, limit, Address::times_1)); 3757 vmovdqu(vec2, Address(ary2, limit, Address::times_1)); 3758 vpxor(vec1, vec2); 3759 3760 vptest(vec1, vec1); 3761 jcc(Assembler::notZero, FALSE_LABEL); 3762 addptr(limit, 32); 3763 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 3764 3765 testl(result, result); 3766 jcc(Assembler::zero, TRUE_LABEL); 3767 3768 vmovdqu(vec1, Address(ary1, result, Address::times_1, -32)); 3769 vmovdqu(vec2, Address(ary2, result, Address::times_1, -32)); 3770 vpxor(vec1, vec2); 3771 3772 vptest(vec1, vec1); 3773 jccb(Assembler::notZero, FALSE_LABEL); 3774 jmpb(TRUE_LABEL); 3775 3776 bind(COMPARE_TAIL); // limit is zero 3777 movl(limit, result); 3778 // Fallthru to tail compare 3779 } else if (UseSSE42Intrinsics) { 3780 // With SSE4.2, use double quad vector compare 3781 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 3782 3783 // Compare 16-byte vectors 3784 andl(result, 0x0000000f); // tail count (in bytes) 3785 andl(limit, 0xfffffff0); // vector count (in bytes) 3786 jcc(Assembler::zero, COMPARE_TAIL); 3787 3788 lea(ary1, Address(ary1, limit, Address::times_1)); 3789 lea(ary2, Address(ary2, limit, Address::times_1)); 3790 negptr(limit); 3791 3792 bind(COMPARE_WIDE_VECTORS); 3793 movdqu(vec1, Address(ary1, limit, Address::times_1)); 3794 movdqu(vec2, Address(ary2, limit, Address::times_1)); 3795 pxor(vec1, vec2); 3796 3797 ptest(vec1, vec1); 3798 jcc(Assembler::notZero, FALSE_LABEL); 3799 addptr(limit, 16); 3800 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 3801 3802 testl(result, result); 3803 jcc(Assembler::zero, TRUE_LABEL); 3804 3805 movdqu(vec1, Address(ary1, result, Address::times_1, -16)); 3806 movdqu(vec2, Address(ary2, result, Address::times_1, -16)); 3807 pxor(vec1, vec2); 3808 3809 ptest(vec1, vec1); 3810 jccb(Assembler::notZero, FALSE_LABEL); 3811 jmpb(TRUE_LABEL); 3812 3813 bind(COMPARE_TAIL); // limit is zero 3814 movl(limit, result); 3815 // Fallthru to tail compare 3816 } 3817 3818 // Compare 4-byte vectors 3819 andl(limit, 0xfffffffc); // vector count (in bytes) 3820 jccb(Assembler::zero, COMPARE_CHAR); 3821 3822 lea(ary1, Address(ary1, limit, Address::times_1)); 3823 lea(ary2, Address(ary2, limit, Address::times_1)); 3824 negptr(limit); 3825 3826 bind(COMPARE_VECTORS); 3827 movl(chr, Address(ary1, limit, Address::times_1)); 3828 cmpl(chr, Address(ary2, limit, Address::times_1)); 3829 jccb(Assembler::notEqual, FALSE_LABEL); 3830 addptr(limit, 4); 3831 jcc(Assembler::notZero, COMPARE_VECTORS); 3832 3833 // Compare trailing char (final 2 bytes), if any 3834 bind(COMPARE_CHAR); 3835 testl(result, 0x2); // tail char 3836 jccb(Assembler::zero, COMPARE_BYTE); 3837 load_unsigned_short(chr, Address(ary1, 0)); 3838 load_unsigned_short(limit, Address(ary2, 0)); 3839 cmpl(chr, limit); 3840 jccb(Assembler::notEqual, FALSE_LABEL); 3841 3842 if (is_array_equ && is_char) { 3843 bind(COMPARE_BYTE); 3844 } else { 3845 lea(ary1, Address(ary1, 2)); 3846 lea(ary2, Address(ary2, 2)); 3847 3848 bind(COMPARE_BYTE); 3849 testl(result, 0x1); 
                       // tail byte
    jccb(Assembler::zero, TRUE_LABEL);
    load_unsigned_byte(chr, Address(ary1, 0));
    load_unsigned_byte(limit, Address(ary2, 0));
    cmpl(chr, limit);
    jccb(Assembler::notEqual, FALSE_LABEL);
  }
  bind(TRUE_LABEL);
  movl(result, 1);   // return true
  jmpb(DONE);

  bind(FALSE_LABEL);
  xorl(result, result); // return false

  // That's it
  bind(DONE);
  if (UseAVX >= 2) {
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    vpxor(vec2, vec2);
  }
}

#ifdef _LP64
void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
                                              Register tmp, KRegister ktmp, int masklen, int vec_enc) {
  assert(VM_Version::supports_avx512vlbw(), "");
  vpxor(xtmp, xtmp, xtmp, vec_enc);
  vpsubb(xtmp, xtmp, mask, vec_enc);
  evpmovb2m(ktmp, xtmp, vec_enc);
  kmovql(tmp, ktmp);
  switch(opc) {
    case Op_VectorMaskTrueCount:
      popcntq(dst, tmp);
      break;
    case Op_VectorMaskLastTrue:
      mov64(dst, -1);
      bsrq(tmp, tmp);
      cmov(Assembler::notZero, dst, tmp);
      break;
    case Op_VectorMaskFirstTrue:
      mov64(dst, masklen);
      bsfq(tmp, tmp);
      cmov(Assembler::notZero, dst, tmp);
      break;
    default: assert(false, "Unhandled mask operation");
  }
}

void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
                                              XMMRegister xtmp1, Register tmp, int masklen, int vec_enc) {
  assert(VM_Version::supports_avx(), "");
  vpxor(xtmp, xtmp, xtmp, vec_enc);
  vpsubb(xtmp, xtmp, mask, vec_enc);
  vpmovmskb(tmp, xtmp, vec_enc);
  if (masklen < 64) {
    andq(tmp, (((jlong)1 << masklen) - 1));
  }
  switch(opc) {
    case Op_VectorMaskTrueCount:
      popcntq(dst, tmp);
      break;
    case Op_VectorMaskLastTrue:
      mov64(dst, -1);
      bsrq(tmp, tmp);
      cmov(Assembler::notZero, dst, tmp);
      break;
    case Op_VectorMaskFirstTrue:
      mov64(dst, masklen);
      bsfq(tmp, tmp);
      cmov(Assembler::notZero, dst, tmp);
      break;
    default: assert(false, "Unhandled mask operation");
  }
}
#endif

void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
                                        XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
                                        int vlen_enc) {
  assert(VM_Version::supports_avx512bw(), "");
  // Byte shuffles are in-lane operations: (ev)pshufb picks the source element
  // using the low 4 bits of each shuffle byte, so every shuffle index is
  // implicitly normalized to the range 0-15. Indices that are 16 apart
  // (e.g. 0, 16, 32 and 48) therefore select the same relative position,
  // just in different 128 bit lanes of the source.
  movl(rtmp, 16);
  evpbroadcastb(xtmp1, rtmp, vlen_enc);

  // Compute a mask for the shuffle vector by comparing indices with the expression
  // INDEX < 16, broadcast the first 128 bit lane across the entire vector, shuffle
  // the vector lanes using the original shuffle indices and move the shuffled lanes
  // corresponding to a true mask to the destination vector.
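  // Worked example (assuming a 512-bit vector): a shuffle index of 42 falls in
  // [32, 48), so only the third pass below selects it; that pass broadcasts the
  // third 128 bit lane of src (bytes 32..47) with evshufi64x2(0xAA), and the
  // masked in-lane shuffle uses 42 & 0xF == 10, i.e. source byte 32 + 10 == 42.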
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
  evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);

  // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
  // and broadcasting second 128 bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
  vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
  // and broadcasting third 128 bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp2, Assembler::nlt, true, vlen_enc);
  vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
  // and broadcasting fourth 128 bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
  vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
}
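// For reference, the byte rearrange generated above computes, element-wise, the
// equivalent of the following scalar sketch (vlen is the vector length in bytes
// and every shuffle index is assumed to lie in the range [0, vlen)):
//
//   for (int i = 0; i < vlen; i++) {
//     dst[i] = src[shuffle[i]];
//   }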