/*
 * Copyright (c) 2020, 2021, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "oops/methodData.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/opcodes.hpp"
#include "opto/subnode.hpp"
#include "runtime/biasedLocking.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/stubRoutines.hpp"

inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
  switch (vlen_in_bytes) {
    case  4: // fall-through
    case  8: // fall-through
    case 16: return Assembler::AVX_128bit;
    case 32: return Assembler::AVX_256bit;
    case 64: return Assembler::AVX_512bit;

    default: {
      ShouldNotReachHere();
      return Assembler::AVX_NoVec;
    }
  }
}

void C2_MacroAssembler::setvectmask(Register dst, Register src, KRegister mask) {
  guarantee(PostLoopMultiversioning, "must be");
  Assembler::movl(dst, 1);
  Assembler::shlxl(dst, dst, src);
  Assembler::decl(dst);
  Assembler::kmovdl(mask, dst);
  Assembler::movl(dst, src);
}

void C2_MacroAssembler::restorevectmask(KRegister mask) {
  guarantee(PostLoopMultiversioning, "must be");
  Assembler::knotwl(mask, k0);
}

#if INCLUDE_RTM_OPT

// Update rtm_counters based on abort status
// input: abort_status
//        rtm_counters (RTMLockingCounters*)
// flags are killed
void C2_MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {

  atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
  if (PrintPreciseRTMLockingStatistics) {
    for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
      Label check_abort;
      testl(abort_status, (1<<i));
      jccb(Assembler::equal, check_abort);
      atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
      bind(check_abort);
    }
  }
}

// Branch if (random & (count-1) != 0), count is 2^n
// tmp, scr and flags are killed
void C2_MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
  assert(tmp == rax, "");
  assert(scr == rdx, "");
  rdtsc(); // modifies EDX:EAX
  andptr(tmp, count-1);
  jccb(Assembler::notZero, brLabel);
}

// Perform abort ratio calculation, set no_rtm bit if high ratio
// input:  rtm_counters_Reg (RTMLockingCounters* address)
// tmpReg, rtm_counters_Reg and flags are killed
void C2_MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
                                                    Register rtm_counters_Reg,
                                                    RTMLockingCounters* rtm_counters,
                                                    Metadata* method_data) {
  Label L_done, L_check_always_rtm1, L_check_always_rtm2;

  if (RTMLockingCalculationDelay > 0) {
    // Delay calculation
    movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()), tmpReg);
    testptr(tmpReg, tmpReg);
    jccb(Assembler::equal, L_done);
  }
  // Abort ratio calculation only if abort_count > RTMAbortThreshold
  //   Aborted transactions = abort_count * 100
  //   All transactions = total_count * RTMTotalCountIncrRate
  //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)

  movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
  cmpptr(tmpReg, RTMAbortThreshold);
  jccb(Assembler::below, L_check_always_rtm2);
  imulptr(tmpReg, tmpReg, 100);

  Register scrReg = rtm_counters_Reg;
  movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
  imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
  imulptr(scrReg, scrReg, RTMAbortRatio);
  cmpptr(tmpReg, scrReg);
  jccb(Assembler::below, L_check_always_rtm1);
  if (method_data != NULL) {
    // set rtm_state to "no rtm" in MDO
    mov_metadata(tmpReg, method_data);
    lock();
    orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), NoRTM);
  }
  jmpb(L_done);
  bind(L_check_always_rtm1);
  // Reload RTMLockingCounters* address
  lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
  bind(L_check_always_rtm2);
  movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
  cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
  jccb(Assembler::below, L_done);
  if (method_data != NULL) {
    // set rtm_state to "always rtm" in MDO
    mov_metadata(tmpReg, method_data);
    lock();
    orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), UseRTM);
  }
  bind(L_done);
}

// Update counters and perform abort ratio calculation
// input: abort_status_Reg
// rtm_counters_Reg, flags are killed
void C2_MacroAssembler::rtm_profiling(Register abort_status_Reg,
                                      Register rtm_counters_Reg,
                                      RTMLockingCounters* rtm_counters,
                                      Metadata* method_data,
                                      bool profile_rtm) {

  assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
  // update rtm counters based on rax value at abort
  // reads abort_status_Reg, updates flags
  lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
  rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
  if (profile_rtm) {
    // Save abort status because abort_status_Reg is used by following code.
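    // (It is only saved/restored when RTMRetryCount > 0, since only then does the
    //  caller still need abort_status_Reg afterwards, for rtm_retry_lock_on_abort.)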
    if (RTMRetryCount > 0) {
      push(abort_status_Reg);
    }
    assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
    rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
    // restore abort status
    if (RTMRetryCount > 0) {
      pop(abort_status_Reg);
    }
  }
}

// Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
// inputs: retry_count_Reg
//       : abort_status_Reg
// output: retry_count_Reg decremented by 1
// flags are killed
void C2_MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
  Label doneRetry;
  assert(abort_status_Reg == rax, "");
  // The abort reason bits are in eax (see all states in rtmLocking.hpp)
  // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
  // if reason is in 0x6 and retry count != 0 then retry
  andptr(abort_status_Reg, 0x6);
  jccb(Assembler::zero, doneRetry);
  testl(retry_count_Reg, retry_count_Reg);
  jccb(Assembler::zero, doneRetry);
  pause();
  decrementl(retry_count_Reg);
  jmp(retryLabel);
  bind(doneRetry);
}

// Spin and retry if lock is busy.
// inputs: box_Reg (monitor address)
//       : retry_count_Reg
// output: retry_count_Reg decremented by 1
//       : clear z flag if retry count exceeded
// tmp_Reg, scr_Reg, flags are killed
void C2_MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
                                               Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
  Label SpinLoop, SpinExit, doneRetry;
  int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);

  testl(retry_count_Reg, retry_count_Reg);
  jccb(Assembler::zero, doneRetry);
  decrementl(retry_count_Reg);
  movptr(scr_Reg, RTMSpinLoopCount);

  bind(SpinLoop);
  pause();
  decrementl(scr_Reg);
  jccb(Assembler::lessEqual, SpinExit);
  movptr(tmp_Reg, Address(box_Reg, owner_offset));
  testptr(tmp_Reg, tmp_Reg);
  jccb(Assembler::notZero, SpinLoop);

  bind(SpinExit);
  jmp(retryLabel);
  bind(doneRetry);
  incrementl(retry_count_Reg); // clear z flag
}

// Use RTM for normal stack locks
// Input: objReg (object to lock)
void C2_MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
                                          Register retry_on_abort_count_Reg,
                                          RTMLockingCounters* stack_rtm_counters,
                                          Metadata* method_data, bool profile_rtm,
                                          Label& DONE_LABEL, Label& IsInflated) {
  assert(UseRTMForStackLocks, "why call this otherwise?");
  assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
  assert(tmpReg == rax, "");
  assert(scrReg == rdx, "");
  Label L_rtm_retry, L_decrement_retry, L_on_abort;

  if (RTMRetryCount > 0) {
    movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
    bind(L_rtm_retry);
  }
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
  testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral|biased
  jcc(Assembler::notZero, IsInflated);

  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    Label L_noincrement;
    if (RTMTotalCountIncrRate > 1) {
      // tmpReg, scrReg and flags are killed
      branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
    }
    assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
    atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
    bind(L_noincrement);
  }
  xbegin(L_on_abort);
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
  andptr(tmpReg, markWord::biased_lock_mask_in_place);              // look at 3 lock bits
  cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 001 unlocked
  jcc(Assembler::equal, DONE_LABEL);                                // all done if unlocked

  Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
  if (UseRTMXendForLockBusy) {
    xend();
    movptr(abort_status_Reg, 0x2); // Set the abort status to 2 (so we can retry)
    jmp(L_decrement_retry);
  } else {
    xabort(0);
  }
  bind(L_on_abort);
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
  }
  bind(L_decrement_retry);
  if (RTMRetryCount > 0) {
    // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
  }
}

// Use RTM for inflated locks
// inputs: objReg (object to lock)
//         boxReg (on-stack box address (displaced header location) - KILLED)
//         tmpReg (ObjectMonitor address + markWord::monitor_value)
void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
                                             Register scrReg, Register retry_on_busy_count_Reg,
                                             Register retry_on_abort_count_Reg,
                                             RTMLockingCounters* rtm_counters,
                                             Metadata* method_data, bool profile_rtm,
                                             Label& DONE_LABEL) {
  assert(UseRTMLocking, "why call this otherwise?");
  assert(tmpReg == rax, "");
  assert(scrReg == rdx, "");
  Label L_rtm_retry, L_decrement_retry, L_on_abort;
  int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);

  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
  movptr(boxReg, tmpReg); // Save ObjectMonitor address

  if (RTMRetryCount > 0) {
    movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
    movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
    bind(L_rtm_retry);
  }
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    Label L_noincrement;
    if (RTMTotalCountIncrRate > 1) {
      // tmpReg, scrReg and flags are killed
      branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
    }
    assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
    atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
    bind(L_noincrement);
  }
  xbegin(L_on_abort);
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
  movptr(tmpReg, Address(tmpReg, owner_offset));
  testptr(tmpReg, tmpReg);
  jcc(Assembler::zero, DONE_LABEL);
  if (UseRTMXendForLockBusy) {
    xend();
    jmp(L_decrement_retry);
  } else {
    xabort(0);
  }
  bind(L_on_abort);
  Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
  }
  if (RTMRetryCount > 0) {
    // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
  }

  movptr(tmpReg, Address(boxReg, owner_offset));
  testptr(tmpReg, tmpReg);
  jccb(Assembler::notZero, L_decrement_retry);

  // Appears unlocked - try to swing _owner from null to non-null.
  // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
#ifdef _LP64
  Register threadReg = r15_thread;
#else
  get_thread(scrReg);
  Register threadReg = scrReg;
#endif
  lock();
  cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg

  if (RTMRetryCount > 0) {
    // success: done; otherwise retry
    jccb(Assembler::equal, DONE_LABEL);
    bind(L_decrement_retry);
    // Spin and retry if lock is busy.
    rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
  } else {
    bind(L_decrement_retry);
  }
}

#endif // INCLUDE_RTM_OPT

// fast_lock and fast_unlock used by C2

// Because the transitions from emitted code to the runtime
// monitorenter/exit helper stubs are so slow it's critical that
// we inline both the stack-locking fast path and the inflated fast path.
//
// See also: cmpFastLock and cmpFastUnlock.
//
// What follows is a specialized inline transliteration of the code
// in enter() and exit(). If we're concerned about I$ bloat, another
// option would be to emit TrySlowEnter and TrySlowExit methods
// at startup-time. These methods would accept arguments as
// (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
// indications in the icc.ZFlag. fast_lock and fast_unlock would simply
// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
// In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
//
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods. That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
// to park() or unpark() threads. We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) explicit barriers or fence operations.
//
// TODO:
//
// * Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
//   This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
//   Given TLAB allocation, Self is usually manifested in a register, so passing it into
//   the lock operators would typically be faster than reifying Self.
//
// * Ideally I'd define the primitives as:
//      fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
//      fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED.
//   Unfortunately ADLC bugs prevent us from expressing the ideal form.
//   Instead, we're stuck with rather awkward and brittle register assignments below.
//   Furthermore the register assignments are overconstrained, possibly resulting in
//   sub-optimal code near the synchronization site.
//
// * Eliminate the sp-proximity tests and just use "== Self" tests instead.
//   Alternately, use a better sp-proximity test.
//
// * Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
//   Either one is sufficient to uniquely identify a thread.
//   TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
//
// * Intrinsify notify() and notifyAll() for the common cases where the
//   object is locked by the calling thread but the waitlist is empty.
//   This avoids the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
//
// * Use jccb and jmpb instead of jcc and jmp to improve code density.
//   But beware of excessive branch density on AMD Opterons.
//
// * Both fast_lock and fast_unlock set the ICC.ZF to indicate success
//   or failure of the fast path. If the fast path fails then we pass
//   control to the slow path, typically in C. In fast_lock and
//   fast_unlock we often branch to DONE_LABEL, just to find that C2
//   will emit a conditional branch immediately after the node.
//   So we have branches to branches and lots of ICC.ZF games.
//   Instead, it might be better to have C2 pass a "FailureLabel"
//   into fast_lock and fast_unlock. In the case of success, control
//   will drop through the node. ICC.ZF is undefined at exit.
//   In the case of failure, the node will branch directly to the
//   FailureLabel.


// obj: object to lock
// box: on-stack box address (displaced header location) - KILLED
// rax: tmp -- KILLED
// scr: tmp -- KILLED
void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
                                  Register scrReg, Register cx1Reg, Register cx2Reg,
                                  BiasedLockingCounters* counters,
                                  RTMLockingCounters* rtm_counters,
                                  RTMLockingCounters* stack_rtm_counters,
                                  Metadata* method_data,
                                  bool use_rtm, bool profile_rtm) {
  // Ensure the register assignments are disjoint
  assert(tmpReg == rax, "");

  if (use_rtm) {
    assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
  } else {
    assert(cx2Reg == noreg, "");
    assert_different_registers(objReg, boxReg, tmpReg, scrReg);
  }

  if (counters != NULL) {
    atomic_incl(ExternalAddress((address)counters->total_entry_count_addr()), scrReg);
  }

  // Possible cases that we'll encounter in fast_lock
  // ------------------------------------------------
  // * Inflated
  //    -- unlocked
  //    -- Locked
  //       = by self
  //       = by other
  // * biased
  //    -- by Self
  //    -- by other
  // * neutral
  // * stack-locked
  //    -- by self
  //       = sp-proximity test hits
  //       = sp-proximity test generates false-negative
  //    -- by other
  //

  Label IsInflated, DONE_LABEL;

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(tmpReg, objReg, cx1Reg);
    movl(tmpReg, Address(tmpReg, Klass::access_flags_offset()));
    testl(tmpReg, JVM_ACC_IS_VALUE_BASED_CLASS);
    jcc(Assembler::notZero, DONE_LABEL);
  }

  // it's stack-locked, biased or neutral
  // TODO: optimize away redundant LDs of obj->mark and improve the markword triage
  // order to reduce the number of conditional branches in the most common cases.
  // Beware -- there's a subtle invariant that the fetch of the markword
  // at [FETCH], below, will never observe a biased encoding (*101b).
  // If this invariant is not held we risk exclusion (safety) failure.
  if (UseBiasedLocking && !UseOptoBiasInlining) {
    biased_locking_enter(boxReg, objReg, tmpReg, scrReg, cx1Reg, false, DONE_LABEL, NULL, counters);
  }

#if INCLUDE_RTM_OPT
  if (UseRTMForStackLocks && use_rtm) {
    rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
                      stack_rtm_counters, method_data, profile_rtm,
                      DONE_LABEL, IsInflated);
  }
#endif // INCLUDE_RTM_OPT

  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // [FETCH]
  testptr(tmpReg, markWord::monitor_value);                         // inflated vs stack-locked|neutral|biased
  jccb(Assembler::notZero, IsInflated);

  // Attempt stack-locking ...
  orptr (tmpReg, markWord::unlocked_value);
  movptr(Address(boxReg, 0), tmpReg); // Anticipate successful CAS
  lock();
  cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Updates tmpReg
  if (counters != NULL) {
    cond_inc32(Assembler::equal,
               ExternalAddress((address)counters->fast_path_entry_count_addr()));
  }
  jcc(Assembler::equal, DONE_LABEL); // Success

  // Recursive locking.
  // The object is stack-locked: markword contains stack pointer to BasicLock.
  // Locked by current thread if difference with current SP is less than one page.
  subptr(tmpReg, rsp);
  // Next instruction sets ZFlag == 1 (Success) if difference is less than one page.
  andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) );
  movptr(Address(boxReg, 0), tmpReg);
  if (counters != NULL) {
    cond_inc32(Assembler::equal,
               ExternalAddress((address)counters->fast_path_entry_count_addr()));
  }
  jmp(DONE_LABEL);

  bind(IsInflated);
  // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value

#if INCLUDE_RTM_OPT
  // Use the same RTM locking code in 32- and 64-bit VM.
  if (use_rtm) {
    rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
                         rtm_counters, method_data, profile_rtm, DONE_LABEL);
  } else {
#endif // INCLUDE_RTM_OPT

#ifndef _LP64
  // The object is inflated.

  // boxReg refers to the on-stack BasicLock in the current frame.
  // We'd like to write:
  //   set box->_displaced_header = markWord::unused_mark(). Any non-0 value suffices.
  // This is convenient but results in a ST-before-CAS penalty. The following CAS suffers
  // additional latency as we have another ST in the store buffer that must drain.

  // avoid ST-before-CAS
  // register juggle because we need tmpReg for cmpxchgptr below
  movptr(scrReg, boxReg);
  movptr(boxReg, tmpReg); // consider: LEA box, [tmp-2]

  // Optimistic form: consider XORL tmpReg,tmpReg
  movptr(tmpReg, NULL_WORD);

  // Appears unlocked - try to swing _owner from null to non-null.
  // Ideally, I'd manifest "Self" with get_thread and then attempt
  // to CAS the register containing Self into m->Owner.
  // But we don't have enough registers, so instead we can either try to CAS
  // rsp or the address of the box (in scr) into &m->owner. If the CAS succeeds
  // we later store "Self" into m->Owner. Transiently storing a stack address
  // (rsp or the address of the box) into m->owner is harmless.
  // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
  lock();
  cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  movptr(Address(scrReg, 0), 3); // box->_displaced_header = 3
  // If we weren't able to swing _owner from NULL to the BasicLock
  // then take the slow path.
  jccb  (Assembler::notZero, DONE_LABEL);
  // update _owner from BasicLock to thread
  get_thread (scrReg); // beware: clobbers ICCs
  movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
  xorptr(boxReg, boxReg); // set icc.ZFlag = 1 to indicate success

  // If the CAS fails we can either retry or pass control to the slow path.
  // We use the latter tactic.
  // Pass the CAS result in the icc.ZFlag into DONE_LABEL.
  // If the CAS was successful ...
  //   Self has acquired the lock
  //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
  // Intentional fall-through into DONE_LABEL ...
#else // _LP64
  // It's inflated and we use scrReg for ObjectMonitor* in this section.
  movq(scrReg, tmpReg);
  xorq(tmpReg, tmpReg);
  lock();
  cmpxchgptr(r15_thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // Unconditionally set box->_displaced_header = markWord::unused_mark().
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
  // Propagate ICC.ZF from CAS above into DONE_LABEL.
  jcc(Assembler::equal, DONE_LABEL);    // CAS above succeeded; propagate ZF = 1 (success)

  cmpptr(r15_thread, rax);              // Check if we are already the owner (recursive lock)
  jcc(Assembler::notEqual, DONE_LABEL); // If not recursive, ZF = 0 at this point (fail)
  incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  xorq(rax, rax);                       // Set ZF = 1 (success) for recursive lock, denoting locking success
#endif // _LP64
#if INCLUDE_RTM_OPT
  } // use_rtm()
#endif
  // DONE_LABEL is a hot target - we'd really like to place it at the
  // start of cache line by padding with NOPs.
  // See the AMD and Intel software optimization manuals for the
  // most efficient "long" NOP encodings.
  // Unfortunately none of our alignment mechanisms suffice.
  bind(DONE_LABEL);

  // At DONE_LABEL the icc ZFlag is set as follows ...
  // fast_unlock uses the same protocol.
  // ZFlag == 1 -> Success
  // ZFlag == 0 -> Failure - force control through the slow path
}

// obj: object to unlock
// box: box address (displaced header location), killed. Must be EAX.
// tmp: killed, cannot be obj nor box.
//
// Some commentary on balanced locking:
//
// fast_lock and fast_unlock are emitted only for provably balanced lock sites.
// Methods that don't have provably balanced locking are forced to run in the
// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
// The interpreter provides two properties:
// I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired by the current activation (frame). Recall that the
//      interpreter maintains an on-stack list of locks currently held by
//      a frame.
// I2:  If a method attempts to unlock an object that is not held by the
//      frame the interpreter throws IMSX.
//
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
// B() doesn't have provably balanced locking so it runs in the interpreter.
// Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O
// is still locked by A().
//
// The only other source of unbalanced locking would be JNI. The "Java Native Interface:
// Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
// should not be unlocked by "normal" java-level locking and vice-versa. The specification
// doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably, given that the spec legislates the JNI case as undefined, our implementation
// could reasonably *avoid* checking owner in fast_unlock().
// In the interest of performance we elide the m->Owner == Self check in unlock.
// A perfectly viable alternative is to elide the owner check except when
// Xcheck:jni is enabled.

void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
  assert(boxReg == rax, "");
  assert_different_registers(objReg, boxReg, tmpReg);

  Label DONE_LABEL, Stacked, CheckSucc;

  // Critically, the biased locking test must have precedence over
  // and appear before the (box->dhw == 0) recursive stack-lock test.
  if (UseBiasedLocking && !UseOptoBiasInlining) {
    biased_locking_exit(objReg, tmpReg, DONE_LABEL);
  }

#if INCLUDE_RTM_OPT
  if (UseRTMForStackLocks && use_rtm) {
    assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
    Label L_regular_unlock;
    movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
    andptr(tmpReg, markWord::biased_lock_mask_in_place);              // look at 3 lock bits
    cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 001 unlocked
    jccb(Assembler::notEqual, L_regular_unlock);                      // if !HLE RegularLock
    xend();                                                           // otherwise end...
    jmp(DONE_LABEL);                                                  // ... and we're done
    bind(L_regular_unlock);
  }
#endif

  cmpptr(Address(boxReg, 0), (int32_t)NULL_WORD);                   // Examine the displaced header
  jcc   (Assembler::zero, DONE_LABEL);                              // 0 indicates recursive stack-lock
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Examine the object's markword
  testptr(tmpReg, markWord::monitor_value);                         // Inflated?
  jccb  (Assembler::zero, Stacked);

  // It's inflated.
#if INCLUDE_RTM_OPT
  if (use_rtm) {
    Label L_regular_inflated_unlock;
    int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
    movptr(boxReg, Address(tmpReg, owner_offset));
    testptr(boxReg, boxReg);
    jccb(Assembler::notZero, L_regular_inflated_unlock);
    xend();
    jmpb(DONE_LABEL);
    bind(L_regular_inflated_unlock);
  }
#endif

  // Despite our balanced locking property we still check that m->_owner == Self
  // as java routines or native JNI code called by this thread might
  // have released the lock.
  // Refer to the comments in synchronizer.cpp for how we might encode extra
  // state in _succ so we can avoid fetching EntryList|cxq.
  //
  // If there's no contention try a 1-0 exit. That is, exit without
  // a costly MEMBAR or CAS. See synchronizer.cpp for details on how
  // we detect and recover from the race that the 1-0 exit admits.
  //
  // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
  // before it STs null into _owner, releasing the lock. Updates
  // to data protected by the critical section must be visible before
  // we drop the lock (and thus before any other thread could acquire
  // the lock and observe the fields protected by the lock).
  // IA32's memory-model is SPO, so STs are ordered with respect to
  // each other and there's no need for an explicit barrier (fence).
  // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
#ifndef _LP64
  get_thread (boxReg);

  // Note that we could employ various encoding schemes to reduce
  // the number of loads below (currently 4) to just 2 or 3.
  // Refer to the comments in synchronizer.cpp.
  // In practice the chain of fetches doesn't seem to impact performance, however.
  xorptr(boxReg, boxReg);
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jccb  (Assembler::notZero, DONE_LABEL);
  movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
  jccb  (Assembler::notZero, CheckSucc);
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
  jmpb  (DONE_LABEL);

  bind (Stacked);
  // It's not inflated and it's not recursively stack-locked and it's not biased.
  // It must be stack-locked.
  // Try to reset the header to displaced header.
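  // (fast_lock stored the displaced markword in the BasicLock box and left obj->mark
  //  pointing at the box, so the CAS below, with RAX == boxReg as the comparand,
  //  succeeds only if the markword still points at our box.)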
  // The "box" value on the stack is stable, so we can reload it
  // and be assured we observe the same value as above.
  movptr(tmpReg, Address(boxReg, 0));
  lock();
  cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
  // Intentional fall-thru into DONE_LABEL

  // DONE_LABEL is a hot target - we'd really like to place it at the
  // start of cache line by padding with NOPs.
  // See the AMD and Intel software optimization manuals for the
  // most efficient "long" NOP encodings.
  // Unfortunately none of our alignment mechanisms suffice.
  bind (CheckSucc);
#else // _LP64
  // It's inflated
  Label LNotRecursive, LSuccess, LGoSlowPath;

  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
  jccb(Assembler::equal, LNotRecursive);

  // Recursive inflated unlock
  decq(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jmpb(LSuccess);

  bind(LNotRecursive);
  movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
  jccb  (Assembler::notZero, CheckSucc);
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
  jmpb  (DONE_LABEL);

  // Try to avoid passing control into the slow_path ...
  bind  (CheckSucc);

  // The following optional optimization can be elided if necessary
  // Effectively: if (succ == null) goto slow path
  // The code reduces the window for a race, however,
  // and thus benefits performance.
  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
  jccb  (Assembler::zero, LGoSlowPath);

  xorptr(boxReg, boxReg);
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);

  // Memory barrier/fence
  // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
  // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
  // This is faster on Nehalem and AMD Shanghai/Barcelona.
  // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
  // We might also restructure (ST Owner=0;barrier;LD _Succ) to
  // (mov box,0; xchgq box, &m->Owner; LD _succ) .
  lock(); addl(Address(rsp, 0), 0);

  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
  jccb  (Assembler::notZero, LSuccess);

  // Rare inopportune interleaving - race.
  // The successor vanished in the small window above.
  // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
  // We need to ensure progress and succession.
  // Try to reacquire the lock.
  // If that fails then the new owner is responsible for succession and this
  // thread needs to take no further action and can exit via the fast path (success).
  // If the re-acquire succeeds then pass control into the slow path.
  // As implemented, this latter mode is horrible because we generated more
  // coherence traffic on the lock *and* artificially extended the critical section
  // length by virtue of passing control into the slow path.

  // box is really RAX -- the following CMPXCHG depends on that binding
  // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
  lock();
  cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // There's no successor so we tried to regrab the lock.
  // If that didn't work, then another thread grabbed the
  // lock so we're done (and exit was a success).
  jccb  (Assembler::notEqual, LSuccess);
  // Intentional fall-through into slow path

  bind  (LGoSlowPath);
  orl   (boxReg, 1); // set ICC.ZF=0 to indicate failure
  jmpb  (DONE_LABEL);

  bind  (LSuccess);
  testl (boxReg, 0); // set ICC.ZF=1 to indicate success
  jmpb  (DONE_LABEL);

  bind  (Stacked);
  movptr(tmpReg, Address (boxReg, 0)); // re-fetch
  lock();
  cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box

#endif
  bind(DONE_LABEL);
}

//-------------------------------------------------------------------------------------------
// Generic instruction support for use in .ad files C2 code generation

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, Register scr) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVD) {
    andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), scr);
  } else {
    assert((opcode == Op_NegVD), "opcode should be Op_NegVD");
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), scr);
  }
}

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
  if (opcode == Op_AbsVD) {
    vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, scr);
  } else {
    assert((opcode == Op_NegVD), "opcode should be Op_NegVD");
    vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, scr);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, Register scr) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVF) {
    andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), scr);
  } else {
    assert((opcode == Op_NegVF), "opcode should be Op_NegVF");
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), scr);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
  if (opcode == Op_AbsVF) {
    vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, scr);
  } else {
    assert((opcode == Op_NegVF), "opcode should be Op_NegVF");
    vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, scr);
  }
}

void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
  assert(tmp == xnoreg || elem_bt == T_LONG, "unused");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      pminsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pminsw(dst, src);
    } else if (elem_bt == T_INT) {
      pminsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      movdqu(xmm0, dst);
      pcmpgtq(xmm0, src);
      blendvpd(dst, src); // xmm0 as mask
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      pmaxsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pmaxsw(dst, src);
    } else if (elem_bt == T_INT) {
      pmaxsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      movdqu(xmm0, src);
      pcmpgtq(xmm0, dst);
      blendvpd(dst, src); // xmm0 as mask
    }
  }
}

void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
                                 XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                 int vlen_enc) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      vpminsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpminsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpminsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpminsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src1, src2, dst, vlen_enc);
      }
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      vpmaxsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpmaxsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpmaxsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpmaxsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src2, src1, dst, vlen_enc);
      }
    }
  }
}

// Float/Double min max

void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
                                   XMMRegister dst, XMMRegister a, XMMRegister b,
                                   XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                   int vlen_enc) {
  assert(UseAVX > 0, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(a, b, tmp, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);

  if (!is_double_word && is_min) {
    vblendvps(atmp, a, b, a, vlen_enc);
    vblendvps(btmp, b, a, a, vlen_enc);
    vminps(tmp, atmp, btmp, vlen_enc);
    vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvps(dst, tmp, atmp, btmp, vlen_enc);
  } else if (!is_double_word && !is_min) {
    vblendvps(btmp, b, a, b, vlen_enc);
    vblendvps(atmp, a, b, b, vlen_enc);
    vmaxps(tmp, atmp, btmp, vlen_enc);
    vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvps(dst, tmp, atmp, btmp, vlen_enc);
  } else if (is_double_word && is_min) {
    vblendvpd(atmp, a, b, a, vlen_enc);
    vblendvpd(btmp, b, a, a, vlen_enc);
    vminpd(tmp, atmp, btmp, vlen_enc);
    vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
  } else {
    assert(is_double_word && !is_min, "sanity");
    vblendvpd(btmp, b, a, b, vlen_enc);
    vblendvpd(atmp, a, b, b, vlen_enc);
    vmaxpd(tmp, atmp, btmp, vlen_enc);
    vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
  }
}

void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
                                    XMMRegister dst, XMMRegister a, XMMRegister b,
                                    KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
                                    int vlen_enc) {
  assert(UseAVX > 2, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(dst, a, b, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);
  bool merge = true;

  if (!is_double_word && is_min) {
    evpmovd2m(ktmp, a, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vminps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (!is_double_word && !is_min) {
    evpmovd2m(ktmp, b, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (is_double_word && is_min) {
    evpmovq2m(ktmp, a, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vminpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  } else {
    assert(is_double_word && !is_min, "sanity");
    evpmovq2m(ktmp, b, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  }
}

// Float/Double signum
void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst,
                                  XMMRegister zero, XMMRegister one,
                                  Register scratch) {
  assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");

  Label DONE_LABEL;

  if (opcode == Op_SignumF) {
    assert(UseSSE > 0, "required");
    ucomiss(dst, zero);
    jcc(Assembler::equal, DONE_LABEL);  // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
    jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN
    movflt(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), scratch);
  } else if (opcode == Op_SignumD) {
    assert(UseSSE > 1, "required");
    ucomisd(dst, zero);
    jcc(Assembler::equal, DONE_LABEL);  // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
    jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN
    movdbl(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), scratch);
  }

  bind(DONE_LABEL);
}

void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
  if (sign) {
    pmovsxbw(dst, src);
  } else {
    pmovzxbw(dst, src);
  }
}

void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbw(dst, src, vector_len);
  } else {
    vpmovzxbw(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbd(dst, src, vector_len);
  } else {
    vpmovzxbd(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxwd(dst, src, vector_len);
  } else {
    vpmovzxwd(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
                                     int shift, int vector_len) {
  if (opcode == Op_RotateLeftV) {
    if (etype == T_INT) {
      evprold(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprolq(dst, src, shift, vector_len);
    }
  } else {
    assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
    if (etype == T_INT) {
      evprord(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprorq(dst, src, shift, vector_len);
    }
  }
}

void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
                                     XMMRegister shift, int vector_len) {
  if (opcode == Op_RotateLeftV) {
    if (etype == T_INT) {
      evprolvd(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprolvq(dst, src, shift, vector_len);
    }
  } else {
    assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
    if (etype == T_INT) {
      evprorvd(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprorvq(dst, src, shift, vector_len);
    }
  }
}

void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
  if (opcode == Op_RShiftVI) {
    psrad(dst, shift);
  } else if (opcode == Op_LShiftVI) {
    pslld(dst, shift);
  } else {
    assert((opcode == Op_URShiftVI), "opcode should be Op_URShiftVI");
    psrld(dst, shift);
  }
}

void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVI:  psrad(dst, shift); break;
    case Op_LShiftVI:  pslld(dst, shift); break;
    case Op_URShiftVI: psrld(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
  if (opcode == Op_RShiftVI) {
    vpsrad(dst, nds, shift, vector_len);
  } else if (opcode == Op_LShiftVI) {
    vpslld(dst, nds, shift, vector_len);
  } else {
    assert((opcode == Op_URShiftVI), "opcode should be Op_URShiftVI");
    vpsrld(dst, nds, shift, vector_len);
  }
}

void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
    case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
    case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  psraw(dst, shift); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  psllw(dst, shift); break;

    case Op_URShiftVS: // fall-through
    case Op_URShiftVB: psrlw(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;

    case Op_URShiftVS: // fall-through
    case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
    case Op_LShiftVL:  psllq(dst, shift); break;
    case Op_URShiftVL: psrlq(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
  if (opcode == Op_RShiftVL) {
    psrlq(dst, shift); // using srl to implement sra on pre-avx512 systems
  } else if (opcode == Op_LShiftVL) {
    psllq(dst, shift);
  } else {
    assert((opcode == Op_URShiftVL), "opcode should be Op_URShiftVL");
    psrlq(dst, shift);
  }
}

void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVL:  evpsraq(dst, src, shift, vlen_enc); break;
    case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
    case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
  if (opcode == Op_RShiftVL) {
    evpsraq(dst, nds, shift, vector_len);
  } else if (opcode == Op_LShiftVL) {
    vpsllq(dst, nds, shift, vector_len);
  } else {
    assert((opcode == Op_URShiftVL), "opcode should be Op_URShiftVL");
    vpsrlq(dst, nds, shift, vector_len);
  }
}

void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  // fall-through
    case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  // fall-through
    case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;

    case Op_URShiftVB: // fall-through
    case Op_URShiftVS: // fall-through
    case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;

    case Op_URShiftVB: // fall-through
    case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
  assert(UseAVX >= 2, "required");
  switch (opcode) {
    case Op_RShiftVL: {
      if (UseAVX > 2) {
        assert(tmp == xnoreg, "not used");
        if (!VM_Version::supports_avx512vl()) {
          vlen_enc = Assembler::AVX_512bit;
        }
        evpsravq(dst, src, shift, vlen_enc);
      } else {
        vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
        vpsrlvq(dst, src, shift, vlen_enc);
        vpsrlvq(tmp, tmp, shift, vlen_enc);
        vpxor(dst, dst, tmp, vlen_enc);
        vpsubq(dst, dst, tmp, vlen_enc);
      }
      break;
    }
    case Op_LShiftVL: {
      assert(tmp == xnoreg, "not used");
      vpsllvq(dst, src, shift, vlen_enc);
      break;
    }
    case Op_URShiftVL: {
      assert(tmp == xnoreg, "not used");
      vpsrlvq(dst, src, shift, vlen_enc);
      break;
    }
    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

// Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst
void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch) {
  assert(opcode == Op_LShiftVB ||
         opcode == Op_RShiftVB ||
         opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
  bool sign = (opcode != Op_URShiftVB);
  assert(vector_len == 0, "required");
  vextendbd(sign, dst, src, 1);
  vpmovzxbd(vtmp, shift, 1);
  varshiftd(opcode, dst, dst, vtmp, 1);
  vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, scratch);
  vextracti128_high(vtmp, dst);
  vpackusdw(dst, dst, vtmp, 0);
}

// Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst
void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch) {
  assert(opcode == Op_LShiftVB ||
         opcode == Op_RShiftVB ||
         opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
  bool sign = (opcode != Op_URShiftVB);
  int ext_vector_len = vector_len + 1;
  vextendbw(sign, dst, src, ext_vector_len);
  vpmovzxbw(vtmp, shift, ext_vector_len);
  varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
  vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, scratch);
  if (vector_len == 0) {
    vextracti128_high(vtmp, dst);
    vpackuswb(dst, dst, vtmp, vector_len);
  } else {
    vextracti64x4_high(vtmp, dst);
    vpackuswb(dst, dst, vtmp, vector_len);
    vpermq(dst, dst, 0xD8, vector_len);
  }
}

void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
  switch(typ) {
    case T_BYTE:
      pinsrb(dst, val, idx);
      break;
    case T_SHORT:
      pinsrw(dst, val, idx);
      break;
    case T_INT:
      pinsrd(dst, val, idx);
      break;
    case T_LONG:
      pinsrq(dst, val, idx);
      break;
    default:
      assert(false, "Should not reach here.");
      break;
  }
}

void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
XMMRegister dst, XMMRegister src, Register val, int idx) { 1393 switch(typ) { 1394 case T_BYTE: 1395 vpinsrb(dst, src, val, idx); 1396 break; 1397 case T_SHORT: 1398 vpinsrw(dst, src, val, idx); 1399 break; 1400 case T_INT: 1401 vpinsrd(dst, src, val, idx); 1402 break; 1403 case T_LONG: 1404 vpinsrq(dst, src, val, idx); 1405 break; 1406 default: 1407 assert(false,"Should not reach here."); 1408 break; 1409 } 1410 } 1411 1412 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) { 1413 switch(typ) { 1414 case T_INT: 1415 vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len); 1416 break; 1417 case T_FLOAT: 1418 vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len); 1419 break; 1420 case T_LONG: 1421 vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len); 1422 break; 1423 case T_DOUBLE: 1424 vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len); 1425 break; 1426 default: 1427 assert(false,"Should not reach here."); 1428 break; 1429 } 1430 } 1431 1432 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) { 1433 switch(typ) { 1434 case T_INT: 1435 evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len); 1436 break; 1437 case T_FLOAT: 1438 evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len); 1439 break; 1440 case T_LONG: 1441 evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len); 1442 break; 1443 case T_DOUBLE: 1444 evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len); 1445 break; 1446 default: 1447 assert(false,"Should not reach here."); 1448 break; 1449 } 1450 } 1451 1452 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) { 1453 switch(typ) { 1454 case T_INT: 1455 evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len); 1456 break; 1457 case T_FLOAT: 1458 evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len); 1459 break; 1460 case T_LONG: 1461 evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len); 1462 break; 1463 case T_DOUBLE: 1464 evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len); 1465 break; 1466 default: 1467 assert(false,"Should not reach here."); 1468 break; 1469 } 1470 } 1471 1472 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) { 1473 if (vlen_in_bytes <= 16) { 1474 pxor (dst, dst); 1475 psubb(dst, src); 1476 switch (elem_bt) { 1477 case T_BYTE: /* nothing to do */ break; 1478 case T_SHORT: pmovsxbw(dst, dst); break; 1479 case T_INT: pmovsxbd(dst, dst); break; 1480 case T_FLOAT: pmovsxbd(dst, dst); break; 1481 case T_LONG: pmovsxbq(dst, dst); break; 1482 case T_DOUBLE: pmovsxbq(dst, dst); break; 1483 1484 default: assert(false, "%s", type2name(elem_bt)); 1485 } 1486 } else { 1487 assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, ""); 1488 int vlen_enc = vector_length_encoding(vlen_in_bytes); 1489 1490 vpxor (dst, dst, dst, vlen_enc); 1491 vpsubb(dst, dst, src, is_legacy ? 
AVX_256bit : vlen_enc); 1492 1493 switch (elem_bt) { 1494 case T_BYTE: /* nothing to do */ break; 1495 case T_SHORT: vpmovsxbw(dst, dst, vlen_enc); break; 1496 case T_INT: vpmovsxbd(dst, dst, vlen_enc); break; 1497 case T_FLOAT: vpmovsxbd(dst, dst, vlen_enc); break; 1498 case T_LONG: vpmovsxbq(dst, dst, vlen_enc); break; 1499 case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break; 1500 1501 default: assert(false, "%s", type2name(elem_bt)); 1502 } 1503 } 1504 } 1505 1506 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, Register scratch, int vlen_in_bytes) { 1507 ExternalAddress addr(StubRoutines::x86::vector_iota_indices()); 1508 if (vlen_in_bytes == 4) { 1509 movdl(dst, addr); 1510 } else if (vlen_in_bytes == 8) { 1511 movq(dst, addr); 1512 } else if (vlen_in_bytes == 16) { 1513 movdqu(dst, addr, scratch); 1514 } else if (vlen_in_bytes == 32) { 1515 vmovdqu(dst, addr, scratch); 1516 } else { 1517 assert(vlen_in_bytes == 64, "%d", vlen_in_bytes); 1518 evmovdqub(dst, k0, addr, false /*merge*/, Assembler::AVX_512bit, scratch); 1519 } 1520 } 1521 1522 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles. 1523 1524 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) { 1525 int vector_len = Assembler::AVX_128bit; 1526 1527 switch (opcode) { 1528 case Op_AndReductionV: pand(dst, src); break; 1529 case Op_OrReductionV: por (dst, src); break; 1530 case Op_XorReductionV: pxor(dst, src); break; 1531 case Op_MinReductionV: 1532 switch (typ) { 1533 case T_BYTE: pminsb(dst, src); break; 1534 case T_SHORT: pminsw(dst, src); break; 1535 case T_INT: pminsd(dst, src); break; 1536 case T_LONG: assert(UseAVX > 2, "required"); 1537 vpminsq(dst, dst, src, Assembler::AVX_128bit); break; 1538 default: assert(false, "wrong type"); 1539 } 1540 break; 1541 case Op_MaxReductionV: 1542 switch (typ) { 1543 case T_BYTE: pmaxsb(dst, src); break; 1544 case T_SHORT: pmaxsw(dst, src); break; 1545 case T_INT: pmaxsd(dst, src); break; 1546 case T_LONG: assert(UseAVX > 2, "required"); 1547 vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break; 1548 default: assert(false, "wrong type"); 1549 } 1550 break; 1551 case Op_AddReductionVF: addss(dst, src); break; 1552 case Op_AddReductionVD: addsd(dst, src); break; 1553 case Op_AddReductionVI: 1554 switch (typ) { 1555 case T_BYTE: paddb(dst, src); break; 1556 case T_SHORT: paddw(dst, src); break; 1557 case T_INT: paddd(dst, src); break; 1558 default: assert(false, "wrong type"); 1559 } 1560 break; 1561 case Op_AddReductionVL: paddq(dst, src); break; 1562 case Op_MulReductionVF: mulss(dst, src); break; 1563 case Op_MulReductionVD: mulsd(dst, src); break; 1564 case Op_MulReductionVI: 1565 switch (typ) { 1566 case T_SHORT: pmullw(dst, src); break; 1567 case T_INT: pmulld(dst, src); break; 1568 default: assert(false, "wrong type"); 1569 } 1570 break; 1571 case Op_MulReductionVL: assert(UseAVX > 2, "required"); 1572 vpmullq(dst, dst, src, vector_len); break; 1573 default: assert(false, "wrong opcode"); 1574 } 1575 } 1576 1577 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 1578 int vector_len = Assembler::AVX_256bit; 1579 1580 switch (opcode) { 1581 case Op_AndReductionV: vpand(dst, src1, src2, vector_len); break; 1582 case Op_OrReductionV: vpor (dst, src1, src2, vector_len); break; 1583 case Op_XorReductionV: vpxor(dst, src1, src2, vector_len); break; 1584 case Op_MinReductionV: 1585 switch (typ) { 1586 case T_BYTE: 
vpminsb(dst, src1, src2, vector_len); break; 1587 case T_SHORT: vpminsw(dst, src1, src2, vector_len); break; 1588 case T_INT: vpminsd(dst, src1, src2, vector_len); break; 1589 case T_LONG: assert(UseAVX > 2, "required"); 1590 vpminsq(dst, src1, src2, vector_len); break; 1591 default: assert(false, "wrong type"); 1592 } 1593 break; 1594 case Op_MaxReductionV: 1595 switch (typ) { 1596 case T_BYTE: vpmaxsb(dst, src1, src2, vector_len); break; 1597 case T_SHORT: vpmaxsw(dst, src1, src2, vector_len); break; 1598 case T_INT: vpmaxsd(dst, src1, src2, vector_len); break; 1599 case T_LONG: assert(UseAVX > 2, "required"); 1600 vpmaxsq(dst, src1, src2, vector_len); break; 1601 default: assert(false, "wrong type"); 1602 } 1603 break; 1604 case Op_AddReductionVI: 1605 switch (typ) { 1606 case T_BYTE: vpaddb(dst, src1, src2, vector_len); break; 1607 case T_SHORT: vpaddw(dst, src1, src2, vector_len); break; 1608 case T_INT: vpaddd(dst, src1, src2, vector_len); break; 1609 default: assert(false, "wrong type"); 1610 } 1611 break; 1612 case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break; 1613 case Op_MulReductionVI: 1614 switch (typ) { 1615 case T_SHORT: vpmullw(dst, src1, src2, vector_len); break; 1616 case T_INT: vpmulld(dst, src1, src2, vector_len); break; 1617 default: assert(false, "wrong type"); 1618 } 1619 break; 1620 case Op_MulReductionVL: vpmullq(dst, src1, src2, vector_len); break; 1621 default: assert(false, "wrong opcode"); 1622 } 1623 } 1624 1625 void C2_MacroAssembler::reduce_fp(int opcode, int vlen, 1626 XMMRegister dst, XMMRegister src, 1627 XMMRegister vtmp1, XMMRegister vtmp2) { 1628 switch (opcode) { 1629 case Op_AddReductionVF: 1630 case Op_MulReductionVF: 1631 reduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 1632 break; 1633 1634 case Op_AddReductionVD: 1635 case Op_MulReductionVD: 1636 reduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 1637 break; 1638 1639 default: assert(false, "wrong opcode"); 1640 } 1641 } 1642 1643 void C2_MacroAssembler::reduceB(int opcode, int vlen, 1644 Register dst, Register src1, XMMRegister src2, 1645 XMMRegister vtmp1, XMMRegister vtmp2) { 1646 switch (vlen) { 1647 case 8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1648 case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1649 case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1650 case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1651 1652 default: assert(false, "wrong vector length"); 1653 } 1654 } 1655 1656 void C2_MacroAssembler::mulreduceB(int opcode, int vlen, 1657 Register dst, Register src1, XMMRegister src2, 1658 XMMRegister vtmp1, XMMRegister vtmp2) { 1659 switch (vlen) { 1660 case 8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1661 case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1662 case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1663 case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1664 1665 default: assert(false, "wrong vector length"); 1666 } 1667 } 1668 1669 void C2_MacroAssembler::reduceS(int opcode, int vlen, 1670 Register dst, Register src1, XMMRegister src2, 1671 XMMRegister vtmp1, XMMRegister vtmp2) { 1672 switch (vlen) { 1673 case 4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1674 case 8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1675 case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1676 case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1677 1678 default: assert(false, "wrong 
vector length"); 1679 } 1680 } 1681 1682 void C2_MacroAssembler::reduceI(int opcode, int vlen, 1683 Register dst, Register src1, XMMRegister src2, 1684 XMMRegister vtmp1, XMMRegister vtmp2) { 1685 switch (vlen) { 1686 case 2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1687 case 4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1688 case 8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1689 case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1690 1691 default: assert(false, "wrong vector length"); 1692 } 1693 } 1694 1695 #ifdef _LP64 1696 void C2_MacroAssembler::reduceL(int opcode, int vlen, 1697 Register dst, Register src1, XMMRegister src2, 1698 XMMRegister vtmp1, XMMRegister vtmp2) { 1699 switch (vlen) { 1700 case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1701 case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1702 case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1703 1704 default: assert(false, "wrong vector length"); 1705 } 1706 } 1707 #endif // _LP64 1708 1709 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 1710 switch (vlen) { 1711 case 2: 1712 assert(vtmp2 == xnoreg, ""); 1713 reduce2F(opcode, dst, src, vtmp1); 1714 break; 1715 case 4: 1716 assert(vtmp2 == xnoreg, ""); 1717 reduce4F(opcode, dst, src, vtmp1); 1718 break; 1719 case 8: 1720 reduce8F(opcode, dst, src, vtmp1, vtmp2); 1721 break; 1722 case 16: 1723 reduce16F(opcode, dst, src, vtmp1, vtmp2); 1724 break; 1725 default: assert(false, "wrong vector length"); 1726 } 1727 } 1728 1729 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 1730 switch (vlen) { 1731 case 2: 1732 assert(vtmp2 == xnoreg, ""); 1733 reduce2D(opcode, dst, src, vtmp1); 1734 break; 1735 case 4: 1736 reduce4D(opcode, dst, src, vtmp1, vtmp2); 1737 break; 1738 case 8: 1739 reduce8D(opcode, dst, src, vtmp1, vtmp2); 1740 break; 1741 default: assert(false, "wrong vector length"); 1742 } 1743 } 1744 1745 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1746 if (opcode == Op_AddReductionVI) { 1747 if (vtmp1 != src2) { 1748 movdqu(vtmp1, src2); 1749 } 1750 phaddd(vtmp1, vtmp1); 1751 } else { 1752 pshufd(vtmp1, src2, 0x1); 1753 reduce_operation_128(T_INT, opcode, vtmp1, src2); 1754 } 1755 movdl(vtmp2, src1); 1756 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 1757 movdl(dst, vtmp1); 1758 } 1759 1760 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1761 if (opcode == Op_AddReductionVI) { 1762 if (vtmp1 != src2) { 1763 movdqu(vtmp1, src2); 1764 } 1765 phaddd(vtmp1, src2); 1766 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1767 } else { 1768 pshufd(vtmp2, src2, 0xE); 1769 reduce_operation_128(T_INT, opcode, vtmp2, src2); 1770 reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1771 } 1772 } 1773 1774 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1775 if (opcode == Op_AddReductionVI) { 1776 vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit); 1777 vextracti128_high(vtmp2, vtmp1); 1778 vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit); 1779 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1780 } else { 1781 vextracti128_high(vtmp1, src2); 1782 reduce_operation_128(T_INT, 
opcode, vtmp1, src2); 1783 reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1784 } 1785 } 1786 1787 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1788 vextracti64x4_high(vtmp2, src2); 1789 reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2); 1790 reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1791 } 1792 1793 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1794 pshufd(vtmp2, src2, 0x1); 1795 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 1796 movdqu(vtmp1, vtmp2); 1797 psrldq(vtmp1, 2); 1798 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 1799 movdqu(vtmp2, vtmp1); 1800 psrldq(vtmp2, 1); 1801 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 1802 movdl(vtmp2, src1); 1803 pmovsxbd(vtmp1, vtmp1); 1804 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 1805 pextrb(dst, vtmp1, 0x0); 1806 movsbl(dst, dst); 1807 } 1808 1809 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1810 pshufd(vtmp1, src2, 0xE); 1811 reduce_operation_128(T_BYTE, opcode, vtmp1, src2); 1812 reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1813 } 1814 1815 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1816 vextracti128_high(vtmp2, src2); 1817 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 1818 reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1819 } 1820 1821 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1822 vextracti64x4_high(vtmp1, src2); 1823 reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2); 1824 reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1825 } 1826 1827 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1828 pmovsxbw(vtmp2, src2); 1829 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1830 } 1831 1832 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1833 if (UseAVX > 1) { 1834 int vector_len = Assembler::AVX_256bit; 1835 vpmovsxbw(vtmp1, src2, vector_len); 1836 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1837 } else { 1838 pmovsxbw(vtmp2, src2); 1839 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1840 pshufd(vtmp2, src2, 0x1); 1841 pmovsxbw(vtmp2, src2); 1842 reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 1843 } 1844 } 1845 1846 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1847 if (UseAVX > 2 && VM_Version::supports_avx512bw()) { 1848 int vector_len = Assembler::AVX_512bit; 1849 vpmovsxbw(vtmp1, src2, vector_len); 1850 reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1851 } else { 1852 assert(UseAVX >= 2,"Should not reach here."); 1853 mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); 1854 vextracti128_high(vtmp2, src2); 1855 mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 1856 } 1857 } 1858 1859 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1860 mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); 1861 vextracti64x4_high(vtmp2, src2); 1862 mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 
1863 } 1864 1865 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1866 if (opcode == Op_AddReductionVI) { 1867 if (vtmp1 != src2) { 1868 movdqu(vtmp1, src2); 1869 } 1870 phaddw(vtmp1, vtmp1); 1871 phaddw(vtmp1, vtmp1); 1872 } else { 1873 pshufd(vtmp2, src2, 0x1); 1874 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 1875 movdqu(vtmp1, vtmp2); 1876 psrldq(vtmp1, 2); 1877 reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2); 1878 } 1879 movdl(vtmp2, src1); 1880 pmovsxwd(vtmp1, vtmp1); 1881 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 1882 pextrw(dst, vtmp1, 0x0); 1883 movswl(dst, dst); 1884 } 1885 1886 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1887 if (opcode == Op_AddReductionVI) { 1888 if (vtmp1 != src2) { 1889 movdqu(vtmp1, src2); 1890 } 1891 phaddw(vtmp1, src2); 1892 } else { 1893 pshufd(vtmp1, src2, 0xE); 1894 reduce_operation_128(T_SHORT, opcode, vtmp1, src2); 1895 } 1896 reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1897 } 1898 1899 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1900 if (opcode == Op_AddReductionVI) { 1901 int vector_len = Assembler::AVX_256bit; 1902 vphaddw(vtmp2, src2, src2, vector_len); 1903 vpermq(vtmp2, vtmp2, 0xD8, vector_len); 1904 } else { 1905 vextracti128_high(vtmp2, src2); 1906 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 1907 } 1908 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1909 } 1910 1911 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1912 int vector_len = Assembler::AVX_256bit; 1913 vextracti64x4_high(vtmp1, src2); 1914 reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2); 1915 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1916 } 1917 1918 #ifdef _LP64 1919 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1920 pshufd(vtmp2, src2, 0xE); 1921 reduce_operation_128(T_LONG, opcode, vtmp2, src2); 1922 movdq(vtmp1, src1); 1923 reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2); 1924 movdq(dst, vtmp1); 1925 } 1926 1927 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1928 vextracti128_high(vtmp1, src2); 1929 reduce_operation_128(T_LONG, opcode, vtmp1, src2); 1930 reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1931 } 1932 1933 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1934 vextracti64x4_high(vtmp2, src2); 1935 reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2); 1936 reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1937 } 1938 1939 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) { 1940 assert(ArrayOperationPartialInlineSize > 0 && ArrayOperationPartialInlineSize <= 64, "invalid"); 1941 mov64(temp, -1L); 1942 bzhiq(temp, temp, len); 1943 kmovql(dst, temp); 1944 } 1945 #endif // _LP64 1946 1947 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 1948 reduce_operation_128(T_FLOAT, opcode, dst, src); 1949 pshufd(vtmp, src, 0x1); 1950 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 1951 } 1952 1953 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister 
dst, XMMRegister src, XMMRegister vtmp) { 1954 reduce2F(opcode, dst, src, vtmp); 1955 pshufd(vtmp, src, 0x2); 1956 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 1957 pshufd(vtmp, src, 0x3); 1958 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 1959 } 1960 1961 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 1962 reduce4F(opcode, dst, src, vtmp2); 1963 vextractf128_high(vtmp2, src); 1964 reduce4F(opcode, dst, vtmp2, vtmp1); 1965 } 1966 1967 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 1968 reduce8F(opcode, dst, src, vtmp1, vtmp2); 1969 vextracti64x4_high(vtmp1, src); 1970 reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2); 1971 } 1972 1973 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 1974 reduce_operation_128(T_DOUBLE, opcode, dst, src); 1975 pshufd(vtmp, src, 0xE); 1976 reduce_operation_128(T_DOUBLE, opcode, dst, vtmp); 1977 } 1978 1979 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 1980 reduce2D(opcode, dst, src, vtmp2); 1981 vextractf128_high(vtmp2, src); 1982 reduce2D(opcode, dst, vtmp2, vtmp1); 1983 } 1984 1985 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 1986 reduce4D(opcode, dst, src, vtmp1, vtmp2); 1987 vextracti64x4_high(vtmp1, src); 1988 reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2); 1989 } 1990 1991 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, int vector_len) { 1992 MacroAssembler::evmovdqu(type, kmask, dst, src, vector_len); 1993 } 1994 1995 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, int vector_len) { 1996 MacroAssembler::evmovdqu(type, kmask, dst, src, vector_len); 1997 } 1998 1999 2000 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid, 2001 XMMRegister dst, XMMRegister src, 2002 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2003 XMMRegister xmm_0, XMMRegister xmm_1) { 2004 int permconst[] = {1, 14}; 2005 XMMRegister wsrc = src; 2006 XMMRegister wdst = xmm_0; 2007 XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1; 2008 2009 int vlen_enc = Assembler::AVX_128bit; 2010 if (vlen == 16) { 2011 vlen_enc = Assembler::AVX_256bit; 2012 } 2013 2014 for (int i = log2(vlen) - 1; i >=0; i--) { 2015 if (i == 0 && !is_dst_valid) { 2016 wdst = dst; 2017 } 2018 if (i == 3) { 2019 vextracti64x4_high(wtmp, wsrc); 2020 } else if (i == 2) { 2021 vextracti128_high(wtmp, wsrc); 2022 } else { // i = [0,1] 2023 vpermilps(wtmp, wsrc, permconst[i], vlen_enc); 2024 } 2025 vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2026 wsrc = wdst; 2027 vlen_enc = Assembler::AVX_128bit; 2028 } 2029 if (is_dst_valid) { 2030 vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2031 } 2032 } 2033 2034 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src, 2035 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2036 XMMRegister xmm_0, XMMRegister xmm_1) { 2037 XMMRegister wsrc = src; 2038 XMMRegister wdst = xmm_0; 2039 XMMRegister wtmp = (xmm_1 == xnoreg) ? 
xmm_0: xmm_1; 2040 int vlen_enc = Assembler::AVX_128bit; 2041 if (vlen == 8) { 2042 vlen_enc = Assembler::AVX_256bit; 2043 } 2044 for (int i = log2(vlen) - 1; i >=0; i--) { 2045 if (i == 0 && !is_dst_valid) { 2046 wdst = dst; 2047 } 2048 if (i == 1) { 2049 vextracti128_high(wtmp, wsrc); 2050 } else if (i == 2) { 2051 vextracti64x4_high(wtmp, wsrc); 2052 } else { 2053 assert(i == 0, "%d", i); 2054 vpermilpd(wtmp, wsrc, 1, vlen_enc); 2055 } 2056 vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2057 wsrc = wdst; 2058 vlen_enc = Assembler::AVX_128bit; 2059 } 2060 if (is_dst_valid) { 2061 vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2062 } 2063 } 2064 2065 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) { 2066 switch (bt) { 2067 case T_BYTE: pextrb(dst, src, idx); break; 2068 case T_SHORT: pextrw(dst, src, idx); break; 2069 case T_INT: pextrd(dst, src, idx); break; 2070 case T_LONG: pextrq(dst, src, idx); break; 2071 2072 default: 2073 assert(false,"Should not reach here."); 2074 break; 2075 } 2076 } 2077 2078 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) { 2079 int esize = type2aelembytes(typ); 2080 int elem_per_lane = 16/esize; 2081 int lane = elemindex / elem_per_lane; 2082 int eindex = elemindex % elem_per_lane; 2083 2084 if (lane >= 2) { 2085 assert(UseAVX > 2, "required"); 2086 vextractf32x4(dst, src, lane & 3); 2087 return dst; 2088 } else if (lane > 0) { 2089 assert(UseAVX > 0, "required"); 2090 vextractf128(dst, src, lane); 2091 return dst; 2092 } else { 2093 return src; 2094 } 2095 } 2096 2097 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) { 2098 if (typ == T_BYTE) { 2099 movsbl(dst, dst); 2100 } else if (typ == T_SHORT) { 2101 movswl(dst, dst); 2102 } 2103 } 2104 2105 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) { 2106 int esize = type2aelembytes(typ); 2107 int elem_per_lane = 16/esize; 2108 int eindex = elemindex % elem_per_lane; 2109 assert(is_integral_type(typ),"required"); 2110 2111 if (eindex == 0) { 2112 if (typ == T_LONG) { 2113 movq(dst, src); 2114 } else { 2115 movdl(dst, src); 2116 movsxl(typ, dst); 2117 } 2118 } else { 2119 extract(typ, dst, src, eindex); 2120 movsxl(typ, dst); 2121 } 2122 } 2123 2124 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, Register tmp, XMMRegister vtmp) { 2125 int esize = type2aelembytes(typ); 2126 int elem_per_lane = 16/esize; 2127 int eindex = elemindex % elem_per_lane; 2128 assert((typ == T_FLOAT || typ == T_DOUBLE),"required"); 2129 2130 if (eindex == 0) { 2131 movq(dst, src); 2132 } else { 2133 if (typ == T_FLOAT) { 2134 if (UseAVX == 0) { 2135 movdqu(dst, src); 2136 pshufps(dst, dst, eindex); 2137 } else { 2138 vpshufps(dst, src, src, eindex, Assembler::AVX_128bit); 2139 } 2140 } else { 2141 if (UseAVX == 0) { 2142 movdqu(dst, src); 2143 psrldq(dst, eindex*esize); 2144 } else { 2145 vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit); 2146 } 2147 movq(dst, dst); 2148 } 2149 } 2150 // Zero upper bits 2151 if (typ == T_FLOAT) { 2152 if (UseAVX == 0) { 2153 assert((vtmp != xnoreg) && (tmp != noreg), "required."); 2154 movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), tmp); 2155 pand(dst, vtmp); 2156 } else { 2157 assert((tmp != noreg), "required."); 2158 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, 
tmp); 2159 } 2160 } 2161 } 2162 2163 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) { 2164 switch(typ) { 2165 case T_BYTE: 2166 case T_BOOLEAN: 2167 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2168 break; 2169 case T_SHORT: 2170 case T_CHAR: 2171 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2172 break; 2173 case T_INT: 2174 case T_FLOAT: 2175 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2176 break; 2177 case T_LONG: 2178 case T_DOUBLE: 2179 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2180 break; 2181 default: 2182 assert(false,"Should not reach here."); 2183 break; 2184 } 2185 } 2186 2187 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral adr, int comparison, int vector_len, Register scratch) { 2188 switch(typ) { 2189 case T_BOOLEAN: 2190 case T_BYTE: 2191 evpcmpb(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch); 2192 break; 2193 case T_CHAR: 2194 case T_SHORT: 2195 evpcmpw(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch); 2196 break; 2197 case T_INT: 2198 case T_FLOAT: 2199 evpcmpd(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch); 2200 break; 2201 case T_LONG: 2202 case T_DOUBLE: 2203 evpcmpq(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch); 2204 break; 2205 default: 2206 assert(false,"Should not reach here."); 2207 break; 2208 } 2209 } 2210 2211 void C2_MacroAssembler::vpcmpu(BasicType typ, XMMRegister dst, XMMRegister src1, XMMRegister src2, ComparisonPredicate comparison, 2212 int vlen_in_bytes, XMMRegister vtmp1, XMMRegister vtmp2, Register scratch) { 2213 int vlen_enc = vector_length_encoding(vlen_in_bytes*2); 2214 switch (typ) { 2215 case T_BYTE: 2216 vpmovzxbw(vtmp1, src1, vlen_enc); 2217 vpmovzxbw(vtmp2, src2, vlen_enc); 2218 vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::W, vlen_enc, scratch); 2219 vpacksswb(dst, dst, dst, vlen_enc); 2220 break; 2221 case T_SHORT: 2222 vpmovzxwd(vtmp1, src1, vlen_enc); 2223 vpmovzxwd(vtmp2, src2, vlen_enc); 2224 vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::D, vlen_enc, scratch); 2225 vpackssdw(dst, dst, dst, vlen_enc); 2226 break; 2227 case T_INT: 2228 vpmovzxdq(vtmp1, src1, vlen_enc); 2229 vpmovzxdq(vtmp2, src2, vlen_enc); 2230 vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::Q, vlen_enc, scratch); 2231 vpermilps(dst, dst, 8, vlen_enc); 2232 break; 2233 default: 2234 assert(false, "Should not reach here"); 2235 } 2236 if (vlen_in_bytes == 16) { 2237 vpermpd(dst, dst, 0x8, vlen_enc); 2238 } 2239 } 2240 2241 void C2_MacroAssembler::vpcmpu32(BasicType typ, XMMRegister dst, XMMRegister src1, XMMRegister src2, ComparisonPredicate comparison, int vlen_in_bytes, 2242 XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3, Register scratch) { 2243 int vlen_enc = vector_length_encoding(vlen_in_bytes); 2244 switch (typ) { 2245 case T_BYTE: 2246 vpmovzxbw(vtmp1, src1, vlen_enc); 2247 vpmovzxbw(vtmp2, src2, vlen_enc); 2248 vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::W, vlen_enc, scratch); 2249 vextracti128(vtmp1, src1, 1); 2250 vextracti128(vtmp2, src2, 1); 2251 vpmovzxbw(vtmp1, vtmp1, vlen_enc); 2252 vpmovzxbw(vtmp2, vtmp2, vlen_enc); 2253 vpcmpCCW(vtmp3, vtmp1, vtmp2, comparison, Assembler::W, vlen_enc, scratch); 2254 
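// Note (explanatory): vpacksswb packs within each 128-bit lane, so the byte results of the low and
// high halves come out lane-interleaved; the vpermpd with 0xd8 (qword order 0,2,1,3) below restores
// the natural element order. The T_SHORT case below uses the same pack-then-permute pattern.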
vpacksswb(dst, dst, vtmp3, vlen_enc); 2255 vpermpd(dst, dst, 0xd8, vlen_enc); 2256 break; 2257 case T_SHORT: 2258 vpmovzxwd(vtmp1, src1, vlen_enc); 2259 vpmovzxwd(vtmp2, src2, vlen_enc); 2260 vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::D, vlen_enc, scratch); 2261 vextracti128(vtmp1, src1, 1); 2262 vextracti128(vtmp2, src2, 1); 2263 vpmovzxwd(vtmp1, vtmp1, vlen_enc); 2264 vpmovzxwd(vtmp2, vtmp2, vlen_enc); 2265 vpcmpCCW(vtmp3, vtmp1, vtmp2, comparison, Assembler::D, vlen_enc, scratch); 2266 vpackssdw(dst, dst, vtmp3, vlen_enc); 2267 vpermpd(dst, dst, 0xd8, vlen_enc); 2268 break; 2269 case T_INT: 2270 vpmovzxdq(vtmp1, src1, vlen_enc); 2271 vpmovzxdq(vtmp2, src2, vlen_enc); 2272 vpcmpCCW(dst, vtmp1, vtmp2, comparison, Assembler::Q, vlen_enc, scratch); 2273 vpshufd(dst, dst, 8, vlen_enc); 2274 vpermq(dst, dst, 8, vlen_enc); 2275 vextracti128(vtmp1, src1, 1); 2276 vextracti128(vtmp2, src2, 1); 2277 vpmovzxdq(vtmp1, vtmp1, vlen_enc); 2278 vpmovzxdq(vtmp2, vtmp2, vlen_enc); 2279 vpcmpCCW(vtmp3, vtmp1, vtmp2, comparison, Assembler::Q, vlen_enc, scratch); 2280 vpshufd(vtmp3, vtmp3, 8, vlen_enc); 2281 vpermq(vtmp3, vtmp3, 0x80, vlen_enc); 2282 vpblendd(dst, dst, vtmp3, 0xf0, vlen_enc); 2283 break; 2284 default: 2285 assert(false, "Should not reach here"); 2286 } 2287 } 2288 2289 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) { 2290 switch(typ) { 2291 case T_BYTE: 2292 evpblendmb(dst, kmask, src1, src2, merge, vector_len); 2293 break; 2294 case T_SHORT: 2295 evpblendmw(dst, kmask, src1, src2, merge, vector_len); 2296 break; 2297 case T_INT: 2298 case T_FLOAT: 2299 evpblendmd(dst, kmask, src1, src2, merge, vector_len); 2300 break; 2301 case T_LONG: 2302 case T_DOUBLE: 2303 evpblendmq(dst, kmask, src1, src2, merge, vector_len); 2304 break; 2305 default: 2306 assert(false,"Should not reach here."); 2307 break; 2308 } 2309 } 2310 2311 void C2_MacroAssembler::vectortest(int bt, int vlen, XMMRegister src1, XMMRegister src2, 2312 XMMRegister vtmp1, XMMRegister vtmp2, KRegister mask) { 2313 switch(vlen) { 2314 case 4: 2315 assert(vtmp1 != xnoreg, "required."); 2316 // Broadcast lower 32 bits to 128 bits before ptest 2317 pshufd(vtmp1, src1, 0x0); 2318 if (bt == BoolTest::overflow) { 2319 assert(vtmp2 != xnoreg, "required."); 2320 pshufd(vtmp2, src2, 0x0); 2321 } else { 2322 assert(vtmp2 == xnoreg, "required."); 2323 vtmp2 = src2; 2324 } 2325 ptest(vtmp1, vtmp2); 2326 break; 2327 case 8: 2328 assert(vtmp1 != xnoreg, "required."); 2329 // Broadcast lower 64 bits to 128 bits before ptest 2330 pshufd(vtmp1, src1, 0x4); 2331 if (bt == BoolTest::overflow) { 2332 assert(vtmp2 != xnoreg, "required."); 2333 pshufd(vtmp2, src2, 0x4); 2334 } else { 2335 assert(vtmp2 == xnoreg, "required."); 2336 vtmp2 = src2; 2337 } 2338 ptest(vtmp1, vtmp2); 2339 break; 2340 case 16: 2341 assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required."); 2342 ptest(src1, src2); 2343 break; 2344 case 32: 2345 assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required."); 2346 vptest(src1, src2, Assembler::AVX_256bit); 2347 break; 2348 case 64: 2349 { 2350 assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required."); 2351 evpcmpeqb(mask, src1, src2, Assembler::AVX_512bit); 2352 if (bt == BoolTest::ne) { 2353 ktestql(mask, mask); 2354 } else { 2355 assert(bt == BoolTest::overflow, "required"); 2356 kortestql(mask, mask); 2357 } 2358 } 2359 break; 2360 default: 2361 assert(false,"Should not reach here."); 2362 break; 2363 } 2364 } 2365 2366 
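// Explanatory note on the pcmpestri immediates used by the string intrinsics below:
//   0x0c = 00001100b : unsigned bytes, aggregation "equal ordered" (substring search)
//   0x0d = 00001101b : unsigned words, aggregation "equal ordered" (substring search)
// In this mode ECX receives the index of the least significant match, CF is set when any
// (possibly partial, at the end of the data) match is found, and OF is set when a match
// starts at element 0 -- these are the flag tests the code below relies on.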
//------------------------------------------------------------------------------------------- 2367 2368 // IndexOf for constant substrings with size >= 8 chars 2369 // which don't need to be loaded through stack. 2370 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2, 2371 Register cnt1, Register cnt2, 2372 int int_cnt2, Register result, 2373 XMMRegister vec, Register tmp, 2374 int ae) { 2375 ShortBranchVerifier sbv(this); 2376 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2377 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2378 2379 // This method uses the pcmpestri instruction with bound registers 2380 // inputs: 2381 // xmm - substring 2382 // rax - substring length (elements count) 2383 // mem - scanned string 2384 // rdx - string length (elements count) 2385 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2386 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2387 // outputs: 2388 // rcx - matched index in string 2389 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2390 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2391 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2392 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2393 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1; 2394 2395 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, 2396 RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR, 2397 MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE; 2398 2399 // Note, inline_string_indexOf() generates checks: 2400 // if (substr.count > string.count) return -1; 2401 // if (substr.count == 0) return 0; 2402 assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars"); 2403 2404 // Load substring. 2405 if (ae == StrIntrinsicNode::UL) { 2406 pmovzxbw(vec, Address(str2, 0)); 2407 } else { 2408 movdqu(vec, Address(str2, 0)); 2409 } 2410 movl(cnt2, int_cnt2); 2411 movptr(result, str1); // string addr 2412 2413 if (int_cnt2 > stride) { 2414 jmpb(SCAN_TO_SUBSTR); 2415 2416 // Reload substr for rescan, this code 2417 // is executed only for large substrings (> 8 chars) 2418 bind(RELOAD_SUBSTR); 2419 if (ae == StrIntrinsicNode::UL) { 2420 pmovzxbw(vec, Address(str2, 0)); 2421 } else { 2422 movdqu(vec, Address(str2, 0)); 2423 } 2424 negptr(cnt2); // Jumped here with negative cnt2, convert to positive 2425 2426 bind(RELOAD_STR); 2427 // We came here after the beginning of the substring was 2428 // matched but the rest of it was not so we need to search 2429 // again. Start from the next element after the previous match. 2430 2431 // cnt2 is number of substring reminding elements and 2432 // cnt1 is number of string reminding elements when cmp failed. 
2433 // Restored cnt1 = cnt1 - cnt2 + int_cnt2 2434 subl(cnt1, cnt2); 2435 addl(cnt1, int_cnt2); 2436 movl(cnt2, int_cnt2); // Now restore cnt2 2437 2438 decrementl(cnt1); // Shift to next element 2439 cmpl(cnt1, cnt2); 2440 jcc(Assembler::negative, RET_NOT_FOUND); // Left less than substring 2441 2442 addptr(result, (1<<scale1)); 2443 2444 } // (int_cnt2 > 8) 2445 2446 // Scan string for start of substr in 16-byte vectors 2447 bind(SCAN_TO_SUBSTR); 2448 pcmpestri(vec, Address(result, 0), mode); 2449 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 2450 subl(cnt1, stride); 2451 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 2452 cmpl(cnt1, cnt2); 2453 jccb(Assembler::negative, RET_NOT_FOUND); // Left less than substring 2454 addptr(result, 16); 2455 jmpb(SCAN_TO_SUBSTR); 2456 2457 // Found a potential substr 2458 bind(FOUND_CANDIDATE); 2459 // Matched whole vector if first element matched (tmp(rcx) == 0). 2460 if (int_cnt2 == stride) { 2461 jccb(Assembler::overflow, RET_FOUND); // OF == 1 2462 } else { // int_cnt2 > 8 2463 jccb(Assembler::overflow, FOUND_SUBSTR); 2464 } 2465 // After pcmpestri tmp(rcx) contains matched element index 2466 // Compute start addr of substr 2467 lea(result, Address(result, tmp, scale1)); 2468 2469 // Make sure string is still long enough 2470 subl(cnt1, tmp); 2471 cmpl(cnt1, cnt2); 2472 if (int_cnt2 == stride) { 2473 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 2474 } else { // int_cnt2 > 8 2475 jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD); 2476 } 2477 // Left less than substring. 2478 2479 bind(RET_NOT_FOUND); 2480 movl(result, -1); 2481 jmp(EXIT); 2482 2483 if (int_cnt2 > stride) { 2484 // This code is optimized for the case when whole substring 2485 // is matched if its head is matched. 2486 bind(MATCH_SUBSTR_HEAD); 2487 pcmpestri(vec, Address(result, 0), mode); 2488 // Reload only the string if it does not match 2489 jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0 2490 2491 Label CONT_SCAN_SUBSTR; 2492 // Compare the rest of substring (> 8 chars). 2493 bind(FOUND_SUBSTR); 2494 // First 8 chars are already matched.
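// Note (explanatory): the code below negates cnt2 and biases it by 'stride' so that it counts up
// toward zero; the effective index used at CONT_SCAN_SUBSTR is (int_cnt2 + cnt2), which walks
// successive vector-sized chunks of the substring tail.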
2495 negptr(cnt2); 2496 addptr(cnt2, stride); 2497 2498 bind(SCAN_SUBSTR); 2499 subl(cnt1, stride); 2500 cmpl(cnt2, -stride); // Do not read beyond substring 2501 jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR); 2502 // Back-up strings to avoid reading beyond substring: 2503 // cnt1 = cnt1 - cnt2 + 8 2504 addl(cnt1, cnt2); // cnt2 is negative 2505 addl(cnt1, stride); 2506 movl(cnt2, stride); negptr(cnt2); 2507 bind(CONT_SCAN_SUBSTR); 2508 if (int_cnt2 < (int)G) { 2509 int tail_off1 = int_cnt2<<scale1; 2510 int tail_off2 = int_cnt2<<scale2; 2511 if (ae == StrIntrinsicNode::UL) { 2512 pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2)); 2513 } else { 2514 movdqu(vec, Address(str2, cnt2, scale2, tail_off2)); 2515 } 2516 pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode); 2517 } else { 2518 // calculate index in register to avoid integer overflow (int_cnt2*2) 2519 movl(tmp, int_cnt2); 2520 addptr(tmp, cnt2); 2521 if (ae == StrIntrinsicNode::UL) { 2522 pmovzxbw(vec, Address(str2, tmp, scale2, 0)); 2523 } else { 2524 movdqu(vec, Address(str2, tmp, scale2, 0)); 2525 } 2526 pcmpestri(vec, Address(result, tmp, scale1, 0), mode); 2527 } 2528 // Need to reload strings pointers if not matched whole vector 2529 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 2530 addptr(cnt2, stride); 2531 jcc(Assembler::negative, SCAN_SUBSTR); 2532 // Fall through if found full substring 2533 2534 } // (int_cnt2 > 8) 2535 2536 bind(RET_FOUND); 2537 // Found result if we matched full small substring. 2538 // Compute substr offset 2539 subptr(result, str1); 2540 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 2541 shrl(result, 1); // index 2542 } 2543 bind(EXIT); 2544 2545 } // string_indexofC8 2546 2547 // Small strings are loaded through stack if they cross page boundary. 2548 void C2_MacroAssembler::string_indexof(Register str1, Register str2, 2549 Register cnt1, Register cnt2, 2550 int int_cnt2, Register result, 2551 XMMRegister vec, Register tmp, 2552 int ae) { 2553 ShortBranchVerifier sbv(this); 2554 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2555 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2556 2557 // 2558 // int_cnt2 is length of small (< 8 chars) constant substring 2559 // or (-1) for non constant substring in which case its length 2560 // is in cnt2 register. 2561 // 2562 // Note, inline_string_indexOf() generates checks: 2563 // if (substr.count > string.count) return -1; 2564 // if (substr.count == 0) return 0; 2565 // 2566 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2567 assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0"); 2568 // This method uses the pcmpestri instruction with bound registers 2569 // inputs: 2570 // xmm - substring 2571 // rax - substring length (elements count) 2572 // mem - scanned string 2573 // rdx - string length (elements count) 2574 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2575 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2576 // outputs: 2577 // rcx - matched index in string 2578 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2579 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2580 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2581 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? 
Address::times_1 : scale1; 2582 2583 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR, 2584 RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR, 2585 FOUND_CANDIDATE; 2586 2587 { //======================================================== 2588 // We don't know where these strings are located 2589 // and we can't read beyond them. Load them through stack. 2590 Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR; 2591 2592 movptr(tmp, rsp); // save old SP 2593 2594 if (int_cnt2 > 0) { // small (< 8 chars) constant substring 2595 if (int_cnt2 == (1>>scale2)) { // One byte 2596 assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding"); 2597 load_unsigned_byte(result, Address(str2, 0)); 2598 movdl(vec, result); // move 32 bits 2599 } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) { // Three bytes 2600 // Not enough header space in 32-bit VM: 12+3 = 15. 2601 movl(result, Address(str2, -1)); 2602 shrl(result, 8); 2603 movdl(vec, result); // move 32 bits 2604 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) { // One char 2605 load_unsigned_short(result, Address(str2, 0)); 2606 movdl(vec, result); // move 32 bits 2607 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars 2608 movdl(vec, Address(str2, 0)); // move 32 bits 2609 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars 2610 movq(vec, Address(str2, 0)); // move 64 bits 2611 } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7}) 2612 // Array header size is 12 bytes in 32-bit VM 2613 // + 6 bytes for 3 chars == 18 bytes, 2614 // enough space to load vec and shift. 2615 assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity"); 2616 if (ae == StrIntrinsicNode::UL) { 2617 int tail_off = int_cnt2-8; 2618 pmovzxbw(vec, Address(str2, tail_off)); 2619 psrldq(vec, -2*tail_off); 2620 } 2621 else { 2622 int tail_off = int_cnt2*(1<<scale2); 2623 movdqu(vec, Address(str2, tail_off-16)); 2624 psrldq(vec, 16-tail_off); 2625 } 2626 } 2627 } else { // not constant substring 2628 cmpl(cnt2, stride); 2629 jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough 2630 2631 // We can read beyond string if srt+16 does not cross page boundary 2632 // since heaps are aligned and mapped by pages. 2633 assert(os::vm_page_size() < (int)G, "default page should be small"); 2634 movl(result, str2); // We need only low 32 bits 2635 andl(result, (os::vm_page_size()-1)); 2636 cmpl(result, (os::vm_page_size()-16)); 2637 jccb(Assembler::belowEqual, CHECK_STR); 2638 2639 // Move small strings to stack to allow load 16 bytes into vec. 2640 subptr(rsp, 16); 2641 int stk_offset = wordSize-(1<<scale2); 2642 push(cnt2); 2643 2644 bind(COPY_SUBSTR); 2645 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) { 2646 load_unsigned_byte(result, Address(str2, cnt2, scale2, -1)); 2647 movb(Address(rsp, cnt2, scale2, stk_offset), result); 2648 } else if (ae == StrIntrinsicNode::UU) { 2649 load_unsigned_short(result, Address(str2, cnt2, scale2, -2)); 2650 movw(Address(rsp, cnt2, scale2, stk_offset), result); 2651 } 2652 decrement(cnt2); 2653 jccb(Assembler::notZero, COPY_SUBSTR); 2654 2655 pop(cnt2); 2656 movptr(str2, rsp); // New substring address 2657 } // non constant 2658 2659 bind(CHECK_STR); 2660 cmpl(cnt1, stride); 2661 jccb(Assembler::aboveEqual, BIG_STRINGS); 2662 2663 // Check cross page boundary. 
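// (e.g. with a 4K page: if (str1 & 0xfff) <= 0xff0, then str1+15 is still inside the same page,
//  so a 16-byte load starting at str1 cannot fault.)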
2664 movl(result, str1); // We need only low 32 bits 2665 andl(result, (os::vm_page_size()-1)); 2666 cmpl(result, (os::vm_page_size()-16)); 2667 jccb(Assembler::belowEqual, BIG_STRINGS); 2668 2669 subptr(rsp, 16); 2670 int stk_offset = -(1<<scale1); 2671 if (int_cnt2 < 0) { // not constant 2672 push(cnt2); 2673 stk_offset += wordSize; 2674 } 2675 movl(cnt2, cnt1); 2676 2677 bind(COPY_STR); 2678 if (ae == StrIntrinsicNode::LL) { 2679 load_unsigned_byte(result, Address(str1, cnt2, scale1, -1)); 2680 movb(Address(rsp, cnt2, scale1, stk_offset), result); 2681 } else { 2682 load_unsigned_short(result, Address(str1, cnt2, scale1, -2)); 2683 movw(Address(rsp, cnt2, scale1, stk_offset), result); 2684 } 2685 decrement(cnt2); 2686 jccb(Assembler::notZero, COPY_STR); 2687 2688 if (int_cnt2 < 0) { // not constant 2689 pop(cnt2); 2690 } 2691 movptr(str1, rsp); // New string address 2692 2693 bind(BIG_STRINGS); 2694 // Load substring. 2695 if (int_cnt2 < 0) { // -1 2696 if (ae == StrIntrinsicNode::UL) { 2697 pmovzxbw(vec, Address(str2, 0)); 2698 } else { 2699 movdqu(vec, Address(str2, 0)); 2700 } 2701 push(cnt2); // substr count 2702 push(str2); // substr addr 2703 push(str1); // string addr 2704 } else { 2705 // Small (< 8 chars) constant substrings are loaded already. 2706 movl(cnt2, int_cnt2); 2707 } 2708 push(tmp); // original SP 2709 2710 } // Finished loading 2711 2712 //======================================================== 2713 // Start search 2714 // 2715 2716 movptr(result, str1); // string addr 2717 2718 if (int_cnt2 < 0) { // Only for non constant substring 2719 jmpb(SCAN_TO_SUBSTR); 2720 2721 // SP saved at sp+0 2722 // String saved at sp+1*wordSize 2723 // Substr saved at sp+2*wordSize 2724 // Substr count saved at sp+3*wordSize 2725 2726 // Reload substr for rescan, this code 2727 // is executed only for large substrings (> 8 chars) 2728 bind(RELOAD_SUBSTR); 2729 movptr(str2, Address(rsp, 2*wordSize)); 2730 movl(cnt2, Address(rsp, 3*wordSize)); 2731 if (ae == StrIntrinsicNode::UL) { 2732 pmovzxbw(vec, Address(str2, 0)); 2733 } else { 2734 movdqu(vec, Address(str2, 0)); 2735 } 2736 // We came here after the beginning of the substring was 2737 // matched but the rest of it was not so we need to search 2738 // again. Start from the next element after the previous match. 2739 subptr(str1, result); // Restore counter 2740 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 2741 shrl(str1, 1); 2742 } 2743 addl(cnt1, str1); 2744 decrementl(cnt1); // Shift to next element 2745 cmpl(cnt1, cnt2); 2746 jcc(Assembler::negative, RET_NOT_FOUND); // Left less then substring 2747 2748 addptr(result, (1<<scale1)); 2749 } // non constant 2750 2751 // Scan string for start of substr in 16-byte vectors 2752 bind(SCAN_TO_SUBSTR); 2753 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2754 pcmpestri(vec, Address(result, 0), mode); 2755 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 2756 subl(cnt1, stride); 2757 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 2758 cmpl(cnt1, cnt2); 2759 jccb(Assembler::negative, RET_NOT_FOUND); // Left less then substring 2760 addptr(result, 16); 2761 2762 bind(ADJUST_STR); 2763 cmpl(cnt1, stride); // Do not read beyond string 2764 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 2765 // Back-up string to avoid reading beyond string. 
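// (i.e. re-position result so the next 16-byte pcmpestri read ends exactly at the string end,
//  and rescan only the last 'stride' elements.)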
2766 lea(result, Address(result, cnt1, scale1, -16)); 2767 movl(cnt1, stride); 2768 jmpb(SCAN_TO_SUBSTR); 2769 2770 // Found a potential substr 2771 bind(FOUND_CANDIDATE); 2772 // After pcmpestri tmp(rcx) contains matched element index 2773 2774 // Make sure string is still long enough 2775 subl(cnt1, tmp); 2776 cmpl(cnt1, cnt2); 2777 jccb(Assembler::greaterEqual, FOUND_SUBSTR); 2778 // Left less then substring. 2779 2780 bind(RET_NOT_FOUND); 2781 movl(result, -1); 2782 jmp(CLEANUP); 2783 2784 bind(FOUND_SUBSTR); 2785 // Compute start addr of substr 2786 lea(result, Address(result, tmp, scale1)); 2787 if (int_cnt2 > 0) { // Constant substring 2788 // Repeat search for small substring (< 8 chars) 2789 // from new point without reloading substring. 2790 // Have to check that we don't read beyond string. 2791 cmpl(tmp, stride-int_cnt2); 2792 jccb(Assembler::greater, ADJUST_STR); 2793 // Fall through if matched whole substring. 2794 } else { // non constant 2795 assert(int_cnt2 == -1, "should be != 0"); 2796 2797 addl(tmp, cnt2); 2798 // Found result if we matched whole substring. 2799 cmpl(tmp, stride); 2800 jcc(Assembler::lessEqual, RET_FOUND); 2801 2802 // Repeat search for small substring (<= 8 chars) 2803 // from new point 'str1' without reloading substring. 2804 cmpl(cnt2, stride); 2805 // Have to check that we don't read beyond string. 2806 jccb(Assembler::lessEqual, ADJUST_STR); 2807 2808 Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG; 2809 // Compare the rest of substring (> 8 chars). 2810 movptr(str1, result); 2811 2812 cmpl(tmp, cnt2); 2813 // First 8 chars are already matched. 2814 jccb(Assembler::equal, CHECK_NEXT); 2815 2816 bind(SCAN_SUBSTR); 2817 pcmpestri(vec, Address(str1, 0), mode); 2818 // Need to reload strings pointers if not matched whole vector 2819 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 2820 2821 bind(CHECK_NEXT); 2822 subl(cnt2, stride); 2823 jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring 2824 addptr(str1, 16); 2825 if (ae == StrIntrinsicNode::UL) { 2826 addptr(str2, 8); 2827 } else { 2828 addptr(str2, 16); 2829 } 2830 subl(cnt1, stride); 2831 cmpl(cnt2, stride); // Do not read beyond substring 2832 jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR); 2833 // Back-up strings to avoid reading beyond substring. 
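// (i.e. back str1/str2 up by (stride - cnt2) elements so the last vector read ends exactly at the
//  substring end; the counter fixup below keeps things consistent: cnt1 = cnt1 - cnt2 + stride,
//  cnt2 = stride.)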
2834 2835 if (ae == StrIntrinsicNode::UL) { 2836 lea(str2, Address(str2, cnt2, scale2, -8)); 2837 lea(str1, Address(str1, cnt2, scale1, -16)); 2838 } else { 2839 lea(str2, Address(str2, cnt2, scale2, -16)); 2840 lea(str1, Address(str1, cnt2, scale1, -16)); 2841 } 2842 subl(cnt1, cnt2); 2843 movl(cnt2, stride); 2844 addl(cnt1, stride); 2845 bind(CONT_SCAN_SUBSTR); 2846 if (ae == StrIntrinsicNode::UL) { 2847 pmovzxbw(vec, Address(str2, 0)); 2848 } else { 2849 movdqu(vec, Address(str2, 0)); 2850 } 2851 jmp(SCAN_SUBSTR); 2852 2853 bind(RET_FOUND_LONG); 2854 movptr(str1, Address(rsp, wordSize)); 2855 } // non constant 2856 2857 bind(RET_FOUND); 2858 // Compute substr offset 2859 subptr(result, str1); 2860 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 2861 shrl(result, 1); // index 2862 } 2863 bind(CLEANUP); 2864 pop(rsp); // restore SP 2865 2866 } // string_indexof 2867 2868 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result, 2869 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 2870 ShortBranchVerifier sbv(this); 2871 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2872 2873 int stride = 8; 2874 2875 Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP, 2876 SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP, 2877 RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT, 2878 FOUND_SEQ_CHAR, DONE_LABEL; 2879 2880 movptr(result, str1); 2881 if (UseAVX >= 2) { 2882 cmpl(cnt1, stride); 2883 jcc(Assembler::less, SCAN_TO_CHAR); 2884 cmpl(cnt1, 2*stride); 2885 jcc(Assembler::less, SCAN_TO_8_CHAR_INIT); 2886 movdl(vec1, ch); 2887 vpbroadcastw(vec1, vec1, Assembler::AVX_256bit); 2888 vpxor(vec2, vec2); 2889 movl(tmp, cnt1); 2890 andl(tmp, 0xFFFFFFF0); //vector count (in chars) 2891 andl(cnt1,0x0000000F); //tail count (in chars) 2892 2893 bind(SCAN_TO_16_CHAR_LOOP); 2894 vmovdqu(vec3, Address(result, 0)); 2895 vpcmpeqw(vec3, vec3, vec1, 1); 2896 vptest(vec2, vec3); 2897 jcc(Assembler::carryClear, FOUND_CHAR); 2898 addptr(result, 32); 2899 subl(tmp, 2*stride); 2900 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); 2901 jmp(SCAN_TO_8_CHAR); 2902 bind(SCAN_TO_8_CHAR_INIT); 2903 movdl(vec1, ch); 2904 pshuflw(vec1, vec1, 0x00); 2905 pshufd(vec1, vec1, 0); 2906 pxor(vec2, vec2); 2907 } 2908 bind(SCAN_TO_8_CHAR); 2909 cmpl(cnt1, stride); 2910 jcc(Assembler::less, SCAN_TO_CHAR); 2911 if (UseAVX < 2) { 2912 movdl(vec1, ch); 2913 pshuflw(vec1, vec1, 0x00); 2914 pshufd(vec1, vec1, 0); 2915 pxor(vec2, vec2); 2916 } 2917 movl(tmp, cnt1); 2918 andl(tmp, 0xFFFFFFF8); //vector count (in chars) 2919 andl(cnt1,0x00000007); //tail count (in chars) 2920 2921 bind(SCAN_TO_8_CHAR_LOOP); 2922 movdqu(vec3, Address(result, 0)); 2923 pcmpeqw(vec3, vec1); 2924 ptest(vec2, vec3); 2925 jcc(Assembler::carryClear, FOUND_CHAR); 2926 addptr(result, 16); 2927 subl(tmp, stride); 2928 jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP); 2929 bind(SCAN_TO_CHAR); 2930 testl(cnt1, cnt1); 2931 jcc(Assembler::zero, RET_NOT_FOUND); 2932 bind(SCAN_TO_CHAR_LOOP); 2933 load_unsigned_short(tmp, Address(result, 0)); 2934 cmpl(ch, tmp); 2935 jccb(Assembler::equal, FOUND_SEQ_CHAR); 2936 addptr(result, 2); 2937 subl(cnt1, 1); 2938 jccb(Assembler::zero, RET_NOT_FOUND); 2939 jmp(SCAN_TO_CHAR_LOOP); 2940 2941 bind(RET_NOT_FOUND); 2942 movl(result, -1); 2943 jmpb(DONE_LABEL); 2944 2945 bind(FOUND_CHAR); 2946 if (UseAVX >= 2) { 2947 vpmovmskb(tmp, vec3); 2948 } else { 2949 pmovmskb(tmp, vec3); 2950 } 2951 bsfl(ch, tmp); 2952 addptr(result, ch); 2953 2954 bind(FOUND_SEQ_CHAR); 2955 
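// result points at the matching char; convert it to a char index from the start of the string
// (byte offset divided by 2, since this intrinsic operates on UTF-16 data).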
subptr(result, str1); 2956 shrl(result, 1); 2957 2958 bind(DONE_LABEL); 2959 } // string_indexof_char 2960 2961 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result, 2962 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 2963 ShortBranchVerifier sbv(this); 2964 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2965 2966 int stride = 16; 2967 2968 Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP, 2969 SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP, 2970 RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT, 2971 FOUND_SEQ_CHAR, DONE_LABEL; 2972 2973 movptr(result, str1); 2974 if (UseAVX >= 2) { 2975 cmpl(cnt1, stride); 2976 jcc(Assembler::less, SCAN_TO_CHAR_INIT); 2977 cmpl(cnt1, stride*2); 2978 jcc(Assembler::less, SCAN_TO_16_CHAR_INIT); 2979 movdl(vec1, ch); 2980 vpbroadcastb(vec1, vec1, Assembler::AVX_256bit); 2981 vpxor(vec2, vec2); 2982 movl(tmp, cnt1); 2983 andl(tmp, 0xFFFFFFE0); //vector count (in chars) 2984 andl(cnt1,0x0000001F); //tail count (in chars) 2985 2986 bind(SCAN_TO_32_CHAR_LOOP); 2987 vmovdqu(vec3, Address(result, 0)); 2988 vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit); 2989 vptest(vec2, vec3); 2990 jcc(Assembler::carryClear, FOUND_CHAR); 2991 addptr(result, 32); 2992 subl(tmp, stride*2); 2993 jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP); 2994 jmp(SCAN_TO_16_CHAR); 2995 2996 bind(SCAN_TO_16_CHAR_INIT); 2997 movdl(vec1, ch); 2998 pxor(vec2, vec2); 2999 pshufb(vec1, vec2); 3000 } 3001 3002 bind(SCAN_TO_16_CHAR); 3003 cmpl(cnt1, stride); 3004 jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entires left 3005 if (UseAVX < 2) { 3006 movdl(vec1, ch); 3007 pxor(vec2, vec2); 3008 pshufb(vec1, vec2); 3009 } 3010 movl(tmp, cnt1); 3011 andl(tmp, 0xFFFFFFF0); //vector count (in bytes) 3012 andl(cnt1,0x0000000F); //tail count (in bytes) 3013 3014 bind(SCAN_TO_16_CHAR_LOOP); 3015 movdqu(vec3, Address(result, 0)); 3016 pcmpeqb(vec3, vec1); 3017 ptest(vec2, vec3); 3018 jcc(Assembler::carryClear, FOUND_CHAR); 3019 addptr(result, 16); 3020 subl(tmp, stride); 3021 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items... 
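// Fewer than 16 bytes are left in cnt1; finish with the scalar byte-at-a-time loop below.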
3022 3023 bind(SCAN_TO_CHAR_INIT); 3024 testl(cnt1, cnt1); 3025 jcc(Assembler::zero, RET_NOT_FOUND); 3026 bind(SCAN_TO_CHAR_LOOP); 3027 load_unsigned_byte(tmp, Address(result, 0)); 3028 cmpl(ch, tmp); 3029 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3030 addptr(result, 1); 3031 subl(cnt1, 1); 3032 jccb(Assembler::zero, RET_NOT_FOUND); 3033 jmp(SCAN_TO_CHAR_LOOP); 3034 3035 bind(RET_NOT_FOUND); 3036 movl(result, -1); 3037 jmpb(DONE_LABEL); 3038 3039 bind(FOUND_CHAR); 3040 if (UseAVX >= 2) { 3041 vpmovmskb(tmp, vec3); 3042 } else { 3043 pmovmskb(tmp, vec3); 3044 } 3045 bsfl(ch, tmp); 3046 addptr(result, ch); 3047 3048 bind(FOUND_SEQ_CHAR); 3049 subptr(result, str1); 3050 3051 bind(DONE_LABEL); 3052 } // stringL_indexof_char 3053 3054 // helper function for string_compare 3055 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2, 3056 Address::ScaleFactor scale, Address::ScaleFactor scale1, 3057 Address::ScaleFactor scale2, Register index, int ae) { 3058 if (ae == StrIntrinsicNode::LL) { 3059 load_unsigned_byte(elem1, Address(str1, index, scale, 0)); 3060 load_unsigned_byte(elem2, Address(str2, index, scale, 0)); 3061 } else if (ae == StrIntrinsicNode::UU) { 3062 load_unsigned_short(elem1, Address(str1, index, scale, 0)); 3063 load_unsigned_short(elem2, Address(str2, index, scale, 0)); 3064 } else { 3065 load_unsigned_byte(elem1, Address(str1, index, scale1, 0)); 3066 load_unsigned_short(elem2, Address(str2, index, scale2, 0)); 3067 } 3068 } 3069 3070 // Compare strings, used for char[] and byte[]. 3071 void C2_MacroAssembler::string_compare(Register str1, Register str2, 3072 Register cnt1, Register cnt2, Register result, 3073 XMMRegister vec1, int ae, KRegister mask) { 3074 ShortBranchVerifier sbv(this); 3075 Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL; 3076 Label COMPARE_WIDE_VECTORS_LOOP_FAILED; // used only _LP64 && AVX3 3077 int stride, stride2, adr_stride, adr_stride1, adr_stride2; 3078 int stride2x2 = 0x40; 3079 Address::ScaleFactor scale = Address::no_scale; 3080 Address::ScaleFactor scale1 = Address::no_scale; 3081 Address::ScaleFactor scale2 = Address::no_scale; 3082 3083 if (ae != StrIntrinsicNode::LL) { 3084 stride2x2 = 0x20; 3085 } 3086 3087 if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) { 3088 shrl(cnt2, 1); 3089 } 3090 // Compute the minimum of the string lengths and the 3091 // difference of the string lengths (stack). 3092 // Do the conditional move stuff 3093 movl(result, cnt1); 3094 subl(cnt1, cnt2); 3095 push(cnt1); 3096 cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2) 3097 3098 // Is the minimum length zero? 
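// (At this point cnt2 holds the minimum of the two lengths and the signed
// length difference has been pushed on the stack; if the minimum is zero the
// result is simply that difference, roughly matching the Java contract of
// returning the length difference when one string is a prefix of the other.)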
3099 testl(cnt2, cnt2); 3100 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3101 if (ae == StrIntrinsicNode::LL) { 3102 // Load first bytes 3103 load_unsigned_byte(result, Address(str1, 0)); // result = str1[0] 3104 load_unsigned_byte(cnt1, Address(str2, 0)); // cnt1 = str2[0] 3105 } else if (ae == StrIntrinsicNode::UU) { 3106 // Load first characters 3107 load_unsigned_short(result, Address(str1, 0)); 3108 load_unsigned_short(cnt1, Address(str2, 0)); 3109 } else { 3110 load_unsigned_byte(result, Address(str1, 0)); 3111 load_unsigned_short(cnt1, Address(str2, 0)); 3112 } 3113 subl(result, cnt1); 3114 jcc(Assembler::notZero, POP_LABEL); 3115 3116 if (ae == StrIntrinsicNode::UU) { 3117 // Divide length by 2 to get number of chars 3118 shrl(cnt2, 1); 3119 } 3120 cmpl(cnt2, 1); 3121 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3122 3123 // Check if the strings start at the same location and setup scale and stride 3124 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3125 cmpptr(str1, str2); 3126 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3127 if (ae == StrIntrinsicNode::LL) { 3128 scale = Address::times_1; 3129 stride = 16; 3130 } else { 3131 scale = Address::times_2; 3132 stride = 8; 3133 } 3134 } else { 3135 scale1 = Address::times_1; 3136 scale2 = Address::times_2; 3137 // scale not used 3138 stride = 8; 3139 } 3140 3141 if (UseAVX >= 2 && UseSSE42Intrinsics) { 3142 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR; 3143 Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR; 3144 Label COMPARE_WIDE_VECTORS_LOOP_AVX2; 3145 Label COMPARE_TAIL_LONG; 3146 Label COMPARE_WIDE_VECTORS_LOOP_AVX3; // used only _LP64 && AVX3 3147 3148 int pcmpmask = 0x19; 3149 if (ae == StrIntrinsicNode::LL) { 3150 pcmpmask &= ~0x01; 3151 } 3152 3153 // Setup to compare 16-chars (32-bytes) vectors, 3154 // start from first character again because it has aligned address. 3155 if (ae == StrIntrinsicNode::LL) { 3156 stride2 = 32; 3157 } else { 3158 stride2 = 16; 3159 } 3160 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3161 adr_stride = stride << scale; 3162 } else { 3163 adr_stride1 = 8; //stride << scale1; 3164 adr_stride2 = 16; //stride << scale2; 3165 } 3166 3167 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3168 // rax and rdx are used by pcmpestri as elements counters 3169 movl(result, cnt2); 3170 andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count 3171 jcc(Assembler::zero, COMPARE_TAIL_LONG); 3172 3173 // fast path : compare first 2 8-char vectors. 
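// (Each pcmpestri below compares one 16-byte vector of elements; CF==1
// signals a mismatch, in which case rcx holds the index of the first
// differing element, so two back-to-back compares cover this fast path
// before entering the wide-vector loop.)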
3174 bind(COMPARE_16_CHARS); 3175 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3176 movdqu(vec1, Address(str1, 0)); 3177 } else { 3178 pmovzxbw(vec1, Address(str1, 0)); 3179 } 3180 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3181 jccb(Assembler::below, COMPARE_INDEX_CHAR); 3182 3183 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3184 movdqu(vec1, Address(str1, adr_stride)); 3185 pcmpestri(vec1, Address(str2, adr_stride), pcmpmask); 3186 } else { 3187 pmovzxbw(vec1, Address(str1, adr_stride1)); 3188 pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask); 3189 } 3190 jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS); 3191 addl(cnt1, stride); 3192 3193 // Compare the characters at index in cnt1 3194 bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character 3195 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 3196 subl(result, cnt2); 3197 jmp(POP_LABEL); 3198 3199 // Setup the registers to start vector comparison loop 3200 bind(COMPARE_WIDE_VECTORS); 3201 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3202 lea(str1, Address(str1, result, scale)); 3203 lea(str2, Address(str2, result, scale)); 3204 } else { 3205 lea(str1, Address(str1, result, scale1)); 3206 lea(str2, Address(str2, result, scale2)); 3207 } 3208 subl(result, stride2); 3209 subl(cnt2, stride2); 3210 jcc(Assembler::zero, COMPARE_WIDE_TAIL); 3211 negptr(result); 3212 3213 // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest) 3214 bind(COMPARE_WIDE_VECTORS_LOOP); 3215 3216 #ifdef _LP64 3217 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 3218 cmpl(cnt2, stride2x2); 3219 jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2); 3220 testl(cnt2, stride2x2-1); // cnt2 holds the vector count 3221 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2); // means we cannot subtract by 0x40 3222 3223 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 3224 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3225 evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit); 3226 evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3227 } else { 3228 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit); 3229 evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3230 } 3231 kortestql(mask, mask); 3232 jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED); // miscompare 3233 addptr(result, stride2x2); // update since we already compared at this addr 3234 subl(cnt2, stride2x2); // and sub the size too 3235 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3); 3236 3237 vpxor(vec1, vec1); 3238 jmpb(COMPARE_WIDE_TAIL); 3239 }//if (VM_Version::supports_avx512vlbw()) 3240 #endif // _LP64 3241 3242 3243 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3244 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3245 vmovdqu(vec1, Address(str1, result, scale)); 3246 vpxor(vec1, Address(str2, result, scale)); 3247 } else { 3248 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit); 3249 vpxor(vec1, Address(str2, result, scale2)); 3250 } 3251 vptest(vec1, vec1); 3252 jcc(Assembler::notZero, VECTOR_NOT_EQUAL); 3253 addptr(result, stride2); 3254 subl(cnt2, stride2); 3255 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP); 3256 // clean upper bits of YMM registers 
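// (zeroing the full ymm register here helps avoid AVX-to-SSE transition
// penalties in the legacy-SSE pcmpestri code that may follow)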
3257 vpxor(vec1, vec1); 3258 3259 // compare wide vectors tail 3260 bind(COMPARE_WIDE_TAIL); 3261 testptr(result, result); 3262 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3263 3264 movl(result, stride2); 3265 movl(cnt2, result); 3266 negptr(result); 3267 jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3268 3269 // Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors. 3270 bind(VECTOR_NOT_EQUAL); 3271 // clean upper bits of YMM registers 3272 vpxor(vec1, vec1); 3273 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3274 lea(str1, Address(str1, result, scale)); 3275 lea(str2, Address(str2, result, scale)); 3276 } else { 3277 lea(str1, Address(str1, result, scale1)); 3278 lea(str2, Address(str2, result, scale2)); 3279 } 3280 jmp(COMPARE_16_CHARS); 3281 3282 // Compare tail chars, length between 1 to 15 chars 3283 bind(COMPARE_TAIL_LONG); 3284 movl(cnt2, result); 3285 cmpl(cnt2, stride); 3286 jcc(Assembler::less, COMPARE_SMALL_STR); 3287 3288 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3289 movdqu(vec1, Address(str1, 0)); 3290 } else { 3291 pmovzxbw(vec1, Address(str1, 0)); 3292 } 3293 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3294 jcc(Assembler::below, COMPARE_INDEX_CHAR); 3295 subptr(cnt2, stride); 3296 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3297 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3298 lea(str1, Address(str1, result, scale)); 3299 lea(str2, Address(str2, result, scale)); 3300 } else { 3301 lea(str1, Address(str1, result, scale1)); 3302 lea(str2, Address(str2, result, scale2)); 3303 } 3304 negptr(cnt2); 3305 jmpb(WHILE_HEAD_LABEL); 3306 3307 bind(COMPARE_SMALL_STR); 3308 } else if (UseSSE42Intrinsics) { 3309 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL; 3310 int pcmpmask = 0x19; 3311 // Setup to compare 8-char (16-byte) vectors, 3312 // start from first character again because it has aligned address. 
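// (result keeps the full element count while cnt2 is rounded down to a
// multiple of 'stride'; illustratively: vecCount = count & ~(stride - 1),
// tailCount = count & (stride - 1).)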
3313 movl(result, cnt2); 3314 andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count 3315 if (ae == StrIntrinsicNode::LL) { 3316 pcmpmask &= ~0x01; 3317 } 3318 jcc(Assembler::zero, COMPARE_TAIL); 3319 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3320 lea(str1, Address(str1, result, scale)); 3321 lea(str2, Address(str2, result, scale)); 3322 } else { 3323 lea(str1, Address(str1, result, scale1)); 3324 lea(str2, Address(str2, result, scale2)); 3325 } 3326 negptr(result); 3327 3328 // pcmpestri 3329 // inputs: 3330 // vec1- substring 3331 // rax - negative string length (elements count) 3332 // mem - scanned string 3333 // rdx - string length (elements count) 3334 // pcmpmask - cmp mode: 11000 (string compare with negated result) 3335 // + 00 (unsigned bytes) or + 01 (unsigned shorts) 3336 // outputs: 3337 // rcx - first mismatched element index 3338 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3339 3340 bind(COMPARE_WIDE_VECTORS); 3341 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3342 movdqu(vec1, Address(str1, result, scale)); 3343 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 3344 } else { 3345 pmovzxbw(vec1, Address(str1, result, scale1)); 3346 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 3347 } 3348 // After pcmpestri cnt1(rcx) contains mismatched element index 3349 3350 jccb(Assembler::below, VECTOR_NOT_EQUAL); // CF==1 3351 addptr(result, stride); 3352 subptr(cnt2, stride); 3353 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 3354 3355 // compare wide vectors tail 3356 testptr(result, result); 3357 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3358 3359 movl(cnt2, stride); 3360 movl(result, stride); 3361 negptr(result); 3362 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3363 movdqu(vec1, Address(str1, result, scale)); 3364 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 3365 } else { 3366 pmovzxbw(vec1, Address(str1, result, scale1)); 3367 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 3368 } 3369 jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL); 3370 3371 // Mismatched characters in the vectors 3372 bind(VECTOR_NOT_EQUAL); 3373 addptr(cnt1, result); 3374 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 3375 subl(result, cnt2); 3376 jmpb(POP_LABEL); 3377 3378 bind(COMPARE_TAIL); // limit is zero 3379 movl(cnt2, result); 3380 // Fallthru to tail compare 3381 } 3382 // Shift str2 and str1 to the end of the arrays, negate min 3383 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3384 lea(str1, Address(str1, cnt2, scale)); 3385 lea(str2, Address(str2, cnt2, scale)); 3386 } else { 3387 lea(str1, Address(str1, cnt2, scale1)); 3388 lea(str2, Address(str2, cnt2, scale2)); 3389 } 3390 decrementl(cnt2); // first character was compared already 3391 negptr(cnt2); 3392 3393 // Compare the rest of the elements 3394 bind(WHILE_HEAD_LABEL); 3395 load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae); 3396 subl(result, cnt1); 3397 jccb(Assembler::notZero, POP_LABEL); 3398 increment(cnt2); 3399 jccb(Assembler::notZero, WHILE_HEAD_LABEL); 3400 3401 // Strings are equal up to min length. Return the length difference. 
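// (The difference pushed at the start of the method is popped into 'result';
// for UU it is halved to convert a byte difference into chars, and for UL the
// sign is flipped at DONE_LABEL since that case compares the operands in
// swapped, Latin-1-first order. Roughly the Java-level "return len1 - len2;"
// for the equal-prefix case.)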
3402 bind(LENGTH_DIFF_LABEL); 3403 pop(result); 3404 if (ae == StrIntrinsicNode::UU) { 3405 // Divide diff by 2 to get number of chars 3406 sarl(result, 1); 3407 } 3408 jmpb(DONE_LABEL); 3409 3410 #ifdef _LP64 3411 if (VM_Version::supports_avx512vlbw()) { 3412 3413 bind(COMPARE_WIDE_VECTORS_LOOP_FAILED); 3414 3415 kmovql(cnt1, mask); 3416 notq(cnt1); 3417 bsfq(cnt2, cnt1); 3418 if (ae != StrIntrinsicNode::LL) { 3419 // Divide diff by 2 to get number of chars 3420 sarl(cnt2, 1); 3421 } 3422 addq(result, cnt2); 3423 if (ae == StrIntrinsicNode::LL) { 3424 load_unsigned_byte(cnt1, Address(str2, result)); 3425 load_unsigned_byte(result, Address(str1, result)); 3426 } else if (ae == StrIntrinsicNode::UU) { 3427 load_unsigned_short(cnt1, Address(str2, result, scale)); 3428 load_unsigned_short(result, Address(str1, result, scale)); 3429 } else { 3430 load_unsigned_short(cnt1, Address(str2, result, scale2)); 3431 load_unsigned_byte(result, Address(str1, result, scale1)); 3432 } 3433 subl(result, cnt1); 3434 jmpb(POP_LABEL); 3435 }//if (VM_Version::supports_avx512vlbw()) 3436 #endif // _LP64 3437 3438 // Discard the stored length difference 3439 bind(POP_LABEL); 3440 pop(cnt1); 3441 3442 // That's it 3443 bind(DONE_LABEL); 3444 if(ae == StrIntrinsicNode::UL) { 3445 negl(result); 3446 } 3447 3448 } 3449 3450 // Search for Non-ASCII character (Negative byte value) in a byte array, 3451 // return true if it has any and false otherwise. 3452 // ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java 3453 // @IntrinsicCandidate 3454 // private static boolean hasNegatives(byte[] ba, int off, int len) { 3455 // for (int i = off; i < off + len; i++) { 3456 // if (ba[i] < 0) { 3457 // return true; 3458 // } 3459 // } 3460 // return false; 3461 // } 3462 void C2_MacroAssembler::has_negatives(Register ary1, Register len, 3463 Register result, Register tmp1, 3464 XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) { 3465 // rsi: byte array 3466 // rcx: len 3467 // rax: result 3468 ShortBranchVerifier sbv(this); 3469 assert_different_registers(ary1, len, result, tmp1); 3470 assert_different_registers(vec1, vec2); 3471 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE; 3472 3473 // len == 0 3474 testl(len, len); 3475 jcc(Assembler::zero, FALSE_LABEL); 3476 3477 if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512 3478 VM_Version::supports_avx512vlbw() && 3479 VM_Version::supports_bmi2()) { 3480 3481 Label test_64_loop, test_tail; 3482 Register tmp3_aliased = len; 3483 3484 movl(tmp1, len); 3485 vpxor(vec2, vec2, vec2, Assembler::AVX_512bit); 3486 3487 andl(tmp1, 64 - 1); // tail count (in chars) 0x3F 3488 andl(len, ~(64 - 1)); // vector count (in chars) 3489 jccb(Assembler::zero, test_tail); 3490 3491 lea(ary1, Address(ary1, len, Address::times_1)); 3492 negptr(len); 3493 3494 bind(test_64_loop); 3495 // Check whether our 64 elements of size byte contain negatives 3496 evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit); 3497 kortestql(mask1, mask1); 3498 jcc(Assembler::notZero, TRUE_LABEL); 3499 3500 addptr(len, 64); 3501 jccb(Assembler::notZero, test_64_loop); 3502 3503 3504 bind(test_tail); 3505 // bail out when there is nothing to be done 3506 testl(tmp1, -1); 3507 jcc(Assembler::zero, FALSE_LABEL); 3508 3509 // ~(~0 << len) applied up to two times (for 32-bit scenario) 3510 #ifdef _LP64 3511 mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF); 3512 shlxq(tmp3_aliased, tmp3_aliased, tmp1); 3513 notq(tmp3_aliased); 3514 kmovql(mask2, 
tmp3_aliased); 3515 #else 3516 Label k_init; 3517 jmp(k_init); 3518 3519 // We could not read 64-bits from a general purpose register thus we move 3520 // data required to compose 64 1's to the instruction stream 3521 // We emit 64 byte wide series of elements from 0..63 which later on would 3522 // be used as a compare targets with tail count contained in tmp1 register. 3523 // Result would be a k register having tmp1 consecutive number or 1 3524 // counting from least significant bit. 3525 address tmp = pc(); 3526 emit_int64(0x0706050403020100); 3527 emit_int64(0x0F0E0D0C0B0A0908); 3528 emit_int64(0x1716151413121110); 3529 emit_int64(0x1F1E1D1C1B1A1918); 3530 emit_int64(0x2726252423222120); 3531 emit_int64(0x2F2E2D2C2B2A2928); 3532 emit_int64(0x3736353433323130); 3533 emit_int64(0x3F3E3D3C3B3A3938); 3534 3535 bind(k_init); 3536 lea(len, InternalAddress(tmp)); 3537 // create mask to test for negative byte inside a vector 3538 evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit); 3539 evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit); 3540 3541 #endif 3542 evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit); 3543 ktestq(mask1, mask2); 3544 jcc(Assembler::notZero, TRUE_LABEL); 3545 3546 jmp(FALSE_LABEL); 3547 } else { 3548 movl(result, len); // copy 3549 3550 if (UseAVX >= 2 && UseSSE >= 2) { 3551 // With AVX2, use 32-byte vector compare 3552 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 3553 3554 // Compare 32-byte vectors 3555 andl(result, 0x0000001f); // tail count (in bytes) 3556 andl(len, 0xffffffe0); // vector count (in bytes) 3557 jccb(Assembler::zero, COMPARE_TAIL); 3558 3559 lea(ary1, Address(ary1, len, Address::times_1)); 3560 negptr(len); 3561 3562 movl(tmp1, 0x80808080); // create mask to test for Unicode chars in vector 3563 movdl(vec2, tmp1); 3564 vpbroadcastd(vec2, vec2, Assembler::AVX_256bit); 3565 3566 bind(COMPARE_WIDE_VECTORS); 3567 vmovdqu(vec1, Address(ary1, len, Address::times_1)); 3568 vptest(vec1, vec2); 3569 jccb(Assembler::notZero, TRUE_LABEL); 3570 addptr(len, 32); 3571 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 3572 3573 testl(result, result); 3574 jccb(Assembler::zero, FALSE_LABEL); 3575 3576 vmovdqu(vec1, Address(ary1, result, Address::times_1, -32)); 3577 vptest(vec1, vec2); 3578 jccb(Assembler::notZero, TRUE_LABEL); 3579 jmpb(FALSE_LABEL); 3580 3581 bind(COMPARE_TAIL); // len is zero 3582 movl(len, result); 3583 // Fallthru to tail compare 3584 } else if (UseSSE42Intrinsics) { 3585 // With SSE4.2, use double quad vector compare 3586 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 3587 3588 // Compare 16-byte vectors 3589 andl(result, 0x0000000f); // tail count (in bytes) 3590 andl(len, 0xfffffff0); // vector count (in bytes) 3591 jcc(Assembler::zero, COMPARE_TAIL); 3592 3593 lea(ary1, Address(ary1, len, Address::times_1)); 3594 negptr(len); 3595 3596 movl(tmp1, 0x80808080); 3597 movdl(vec2, tmp1); 3598 pshufd(vec2, vec2, 0); 3599 3600 bind(COMPARE_WIDE_VECTORS); 3601 movdqu(vec1, Address(ary1, len, Address::times_1)); 3602 ptest(vec1, vec2); 3603 jcc(Assembler::notZero, TRUE_LABEL); 3604 addptr(len, 16); 3605 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 3606 3607 testl(result, result); 3608 jcc(Assembler::zero, FALSE_LABEL); 3609 3610 movdqu(vec1, Address(ary1, result, Address::times_1, -16)); 3611 ptest(vec1, vec2); 3612 jccb(Assembler::notZero, TRUE_LABEL); 3613 jmpb(FALSE_LABEL); 3614 3615 bind(COMPARE_TAIL); // len is zero 3616 movl(len, result); 3617 // Fallthru to tail compare 3618 } 3619 } 3620 // Compare 4-byte vectors 3621 
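// (Each remaining aligned 4-byte word is tested against the sign-bit mask
// 0x80808080; any set bit means a negative byte was seen. Illustrative
// Java-style sketch of this scalar tail ('word' is a hypothetical
// little-endian 4-byte load):
//   for (int i = 0; i + 4 <= len; i += 4) {
//     if ((word(ba, off + i) & 0x80808080) != 0) return true;
//   }
// The 2-byte and 1-byte tails below are handled the same way with the
// masks 0x8080 and 0x80.)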
andl(len, 0xfffffffc); // vector count (in bytes) 3622 jccb(Assembler::zero, COMPARE_CHAR); 3623 3624 lea(ary1, Address(ary1, len, Address::times_1)); 3625 negptr(len); 3626 3627 bind(COMPARE_VECTORS); 3628 movl(tmp1, Address(ary1, len, Address::times_1)); 3629 andl(tmp1, 0x80808080); 3630 jccb(Assembler::notZero, TRUE_LABEL); 3631 addptr(len, 4); 3632 jcc(Assembler::notZero, COMPARE_VECTORS); 3633 3634 // Compare trailing char (final 2 bytes), if any 3635 bind(COMPARE_CHAR); 3636 testl(result, 0x2); // tail char 3637 jccb(Assembler::zero, COMPARE_BYTE); 3638 load_unsigned_short(tmp1, Address(ary1, 0)); 3639 andl(tmp1, 0x00008080); 3640 jccb(Assembler::notZero, TRUE_LABEL); 3641 subptr(result, 2); 3642 lea(ary1, Address(ary1, 2)); 3643 3644 bind(COMPARE_BYTE); 3645 testl(result, 0x1); // tail byte 3646 jccb(Assembler::zero, FALSE_LABEL); 3647 load_unsigned_byte(tmp1, Address(ary1, 0)); 3648 andl(tmp1, 0x00000080); 3649 jccb(Assembler::notEqual, TRUE_LABEL); 3650 jmpb(FALSE_LABEL); 3651 3652 bind(TRUE_LABEL); 3653 movl(result, 1); // return true 3654 jmpb(DONE); 3655 3656 bind(FALSE_LABEL); 3657 xorl(result, result); // return false 3658 3659 // That's it 3660 bind(DONE); 3661 if (UseAVX >= 2 && UseSSE >= 2) { 3662 // clean upper bits of YMM registers 3663 vpxor(vec1, vec1); 3664 vpxor(vec2, vec2); 3665 } 3666 } 3667 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings. 3668 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2, 3669 Register limit, Register result, Register chr, 3670 XMMRegister vec1, XMMRegister vec2, bool is_char, KRegister mask) { 3671 ShortBranchVerifier sbv(this); 3672 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE; 3673 3674 int length_offset = arrayOopDesc::length_offset_in_bytes(); 3675 int base_offset = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE); 3676 3677 if (is_array_equ) { 3678 // Check the input args 3679 cmpoop(ary1, ary2); 3680 jcc(Assembler::equal, TRUE_LABEL); 3681 3682 // Need additional checks for arrays_equals. 3683 testptr(ary1, ary1); 3684 jcc(Assembler::zero, FALSE_LABEL); 3685 testptr(ary2, ary2); 3686 jcc(Assembler::zero, FALSE_LABEL); 3687 3688 // Check the lengths 3689 movl(limit, Address(ary1, length_offset)); 3690 cmpl(limit, Address(ary2, length_offset)); 3691 jcc(Assembler::notEqual, FALSE_LABEL); 3692 } 3693 3694 // count == 0 3695 testl(limit, limit); 3696 jcc(Assembler::zero, TRUE_LABEL); 3697 3698 if (is_array_equ) { 3699 // Load array address 3700 lea(ary1, Address(ary1, base_offset)); 3701 lea(ary2, Address(ary2, base_offset)); 3702 } 3703 3704 if (is_array_equ && is_char) { 3705 // arrays_equals when used for char[]. 
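// ('limit' currently holds the element count; for char[] each element is two
// bytes, so it is doubled here to get the byte count used by the vector
// loops, i.e. byteCount = charCount * 2.)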
3706     shll(limit, 1);      // byte count != 0
3707   }
3708   movl(result, limit); // copy
3709
3710   if (UseAVX >= 2) {
3711     // With AVX2, use 32-byte vector compare
3712     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
3713
3714     // Compare 32-byte vectors
3715     andl(result, 0x0000001f);  //   tail count (in bytes)
3716     andl(limit, 0xffffffe0);   // vector count (in bytes)
3717     jcc(Assembler::zero, COMPARE_TAIL);
3718
3719     lea(ary1, Address(ary1, limit, Address::times_1));
3720     lea(ary2, Address(ary2, limit, Address::times_1));
3721     negptr(limit);
3722
3723 #ifdef _LP64
3724     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
3725       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
3726
3727       cmpl(limit, -64);
3728       jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
3729
3730       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
3731
3732       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
3733       evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
3734       kortestql(mask, mask);
3735       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
3736       addptr(limit, 64);  // update since we already compared at this addr
3737       cmpl(limit, -64);
3738       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
3739
3740       // At this point we may still need to compare -limit+result bytes.
3741       // We could execute the next two instructions and just continue via the non-wide path:
3742       //  cmpl(limit, 0);
3743       //  jcc(Assembler::equal, COMPARE_TAIL);  // true
3744       // But since we stopped at the points ary{1,2}+limit which are
3745       // not farther than 64 bytes from the ends of arrays ary{1,2}+result
3746       // (|limit| <= 32 and result < 32),
3747       // we may just compare the last 64 bytes.
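// (This is the usual overlapping-tail trick: re-comparing a few bytes that
// were already verified is harmless for an equality check, so a single
// unconditional 64-byte compare of the range [end-64, end) finishes the job
// without a scalar loop; illustratively: return equals64(ary1 + len - 64,
// ary2 + len - 64), where equals64 is a hypothetical 64-byte comparison.)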
3748 // 3749 addptr(result, -64); // it is safe, bc we just came from this area 3750 evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit); 3751 evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit); 3752 kortestql(mask, mask); 3753 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 3754 3755 jmp(TRUE_LABEL); 3756 3757 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3758 3759 }//if (VM_Version::supports_avx512vlbw()) 3760 #endif //_LP64 3761 bind(COMPARE_WIDE_VECTORS); 3762 vmovdqu(vec1, Address(ary1, limit, Address::times_1)); 3763 vmovdqu(vec2, Address(ary2, limit, Address::times_1)); 3764 vpxor(vec1, vec2); 3765 3766 vptest(vec1, vec1); 3767 jcc(Assembler::notZero, FALSE_LABEL); 3768 addptr(limit, 32); 3769 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 3770 3771 testl(result, result); 3772 jcc(Assembler::zero, TRUE_LABEL); 3773 3774 vmovdqu(vec1, Address(ary1, result, Address::times_1, -32)); 3775 vmovdqu(vec2, Address(ary2, result, Address::times_1, -32)); 3776 vpxor(vec1, vec2); 3777 3778 vptest(vec1, vec1); 3779 jccb(Assembler::notZero, FALSE_LABEL); 3780 jmpb(TRUE_LABEL); 3781 3782 bind(COMPARE_TAIL); // limit is zero 3783 movl(limit, result); 3784 // Fallthru to tail compare 3785 } else if (UseSSE42Intrinsics) { 3786 // With SSE4.2, use double quad vector compare 3787 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 3788 3789 // Compare 16-byte vectors 3790 andl(result, 0x0000000f); // tail count (in bytes) 3791 andl(limit, 0xfffffff0); // vector count (in bytes) 3792 jcc(Assembler::zero, COMPARE_TAIL); 3793 3794 lea(ary1, Address(ary1, limit, Address::times_1)); 3795 lea(ary2, Address(ary2, limit, Address::times_1)); 3796 negptr(limit); 3797 3798 bind(COMPARE_WIDE_VECTORS); 3799 movdqu(vec1, Address(ary1, limit, Address::times_1)); 3800 movdqu(vec2, Address(ary2, limit, Address::times_1)); 3801 pxor(vec1, vec2); 3802 3803 ptest(vec1, vec1); 3804 jcc(Assembler::notZero, FALSE_LABEL); 3805 addptr(limit, 16); 3806 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 3807 3808 testl(result, result); 3809 jcc(Assembler::zero, TRUE_LABEL); 3810 3811 movdqu(vec1, Address(ary1, result, Address::times_1, -16)); 3812 movdqu(vec2, Address(ary2, result, Address::times_1, -16)); 3813 pxor(vec1, vec2); 3814 3815 ptest(vec1, vec1); 3816 jccb(Assembler::notZero, FALSE_LABEL); 3817 jmpb(TRUE_LABEL); 3818 3819 bind(COMPARE_TAIL); // limit is zero 3820 movl(limit, result); 3821 // Fallthru to tail compare 3822 } 3823 3824 // Compare 4-byte vectors 3825 andl(limit, 0xfffffffc); // vector count (in bytes) 3826 jccb(Assembler::zero, COMPARE_CHAR); 3827 3828 lea(ary1, Address(ary1, limit, Address::times_1)); 3829 lea(ary2, Address(ary2, limit, Address::times_1)); 3830 negptr(limit); 3831 3832 bind(COMPARE_VECTORS); 3833 movl(chr, Address(ary1, limit, Address::times_1)); 3834 cmpl(chr, Address(ary2, limit, Address::times_1)); 3835 jccb(Assembler::notEqual, FALSE_LABEL); 3836 addptr(limit, 4); 3837 jcc(Assembler::notZero, COMPARE_VECTORS); 3838 3839 // Compare trailing char (final 2 bytes), if any 3840 bind(COMPARE_CHAR); 3841 testl(result, 0x2); // tail char 3842 jccb(Assembler::zero, COMPARE_BYTE); 3843 load_unsigned_short(chr, Address(ary1, 0)); 3844 load_unsigned_short(limit, Address(ary2, 0)); 3845 cmpl(chr, limit); 3846 jccb(Assembler::notEqual, FALSE_LABEL); 3847 3848 if (is_array_equ && is_char) { 3849 bind(COMPARE_BYTE); 3850 } else { 3851 lea(ary1, Address(ary1, 2)); 3852 lea(ary2, Address(ary2, 2)); 3853 3854 bind(COMPARE_BYTE); 3855 testl(result, 0x1); 
// tail byte 3856 jccb(Assembler::zero, TRUE_LABEL); 3857 load_unsigned_byte(chr, Address(ary1, 0)); 3858 load_unsigned_byte(limit, Address(ary2, 0)); 3859 cmpl(chr, limit); 3860 jccb(Assembler::notEqual, FALSE_LABEL); 3861 } 3862 bind(TRUE_LABEL); 3863 movl(result, 1); // return true 3864 jmpb(DONE); 3865 3866 bind(FALSE_LABEL); 3867 xorl(result, result); // return false 3868 3869 // That's it 3870 bind(DONE); 3871 if (UseAVX >= 2) { 3872 // clean upper bits of YMM registers 3873 vpxor(vec1, vec1); 3874 vpxor(vec2, vec2); 3875 } 3876 } 3877 3878 #ifdef _LP64 3879 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp, 3880 Register tmp, KRegister ktmp, int masklen, int vec_enc) { 3881 assert(VM_Version::supports_avx512vlbw(), ""); 3882 vpxor(xtmp, xtmp, xtmp, vec_enc); 3883 vpsubb(xtmp, xtmp, mask, vec_enc); 3884 evpmovb2m(ktmp, xtmp, vec_enc); 3885 kmovql(tmp, ktmp); 3886 switch(opc) { 3887 case Op_VectorMaskTrueCount: 3888 popcntq(dst, tmp); 3889 break; 3890 case Op_VectorMaskLastTrue: 3891 mov64(dst, -1); 3892 bsrq(tmp, tmp); 3893 cmov(Assembler::notZero, dst, tmp); 3894 break; 3895 case Op_VectorMaskFirstTrue: 3896 mov64(dst, masklen); 3897 bsfq(tmp, tmp); 3898 cmov(Assembler::notZero, dst, tmp); 3899 break; 3900 default: assert(false, "Unhandled mask operation"); 3901 } 3902 } 3903 3904 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp, 3905 XMMRegister xtmp1, Register tmp, int masklen, int vec_enc) { 3906 assert(VM_Version::supports_avx(), ""); 3907 vpxor(xtmp, xtmp, xtmp, vec_enc); 3908 vpsubb(xtmp, xtmp, mask, vec_enc); 3909 vpmovmskb(tmp, xtmp, vec_enc); 3910 if (masklen < 64) { 3911 andq(tmp, (((jlong)1 << masklen) - 1)); 3912 } 3913 switch(opc) { 3914 case Op_VectorMaskTrueCount: 3915 popcntq(dst, tmp); 3916 break; 3917 case Op_VectorMaskLastTrue: 3918 mov64(dst, -1); 3919 bsrq(tmp, tmp); 3920 cmov(Assembler::notZero, dst, tmp); 3921 break; 3922 case Op_VectorMaskFirstTrue: 3923 mov64(dst, masklen); 3924 bsfq(tmp, tmp); 3925 cmov(Assembler::notZero, dst, tmp); 3926 break; 3927 default: assert(false, "Unhandled mask operation"); 3928 } 3929 } 3930 #endif 3931 3932 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1, 3933 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp, 3934 int vlen_enc) { 3935 assert(VM_Version::supports_avx512bw(), ""); 3936 // Byte shuffles are inlane operations and indices are determined using 3937 // lower 4 bit of each shuffle lane, thus all shuffle indices are 3938 // normalized to index range 0-15. This makes sure that all the multiples 3939 // of an index value are placed at same relative position in 128 bit 3940 // lane i.e. elements corresponding to shuffle indices 16, 32 and 64 3941 // will be 16th element in their respective 128 bit lanes. 3942 movl(rtmp, 16); 3943 evpbroadcastb(xtmp1, rtmp, vlen_enc); 3944 3945 // Compute a mask for shuffle vector by comparing indices with expression INDEX < 16, 3946 // Broadcast first 128 bit lane across entire vector, shuffle the vector lanes using 3947 // original shuffle indices and move the shuffled lanes corresponding to true 3948 // mask to destination vector. 
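// (Net effect, assuming every shuffle index already lies within the vector,
// i.e. 0..63 for a 512-bit vector; an illustrative scalar sketch only:
//   for (int i = 0; i < vlen; i++) { dst[i] = src[shuffle[i]]; }
// Each of the four masked passes below handles exactly those destination
// bytes whose shuffle index selects one particular 16-byte source lane.)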
3949   evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
3950   evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
3951   evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);
3952
3953   // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
3954   // and broadcasting second 128 bit lane.
3955   evpcmpb(ktmp,  k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
3956   vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
3957   evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
3958   evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
3959   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
3960
3961   // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
3962   // and broadcasting third 128 bit lane.
3963   evpcmpb(ktmp,  k0, shuffle,  xtmp2, Assembler::nlt, true, vlen_enc);
3964   vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
3965   evpcmpb(ktmp, ktmp, shuffle,  xtmp1, Assembler::lt, true, vlen_enc);
3966   evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
3967   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
3968
3969   // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
3970   // and broadcasting fourth 128 bit lane.
3971   evpcmpb(ktmp,  k0, shuffle,  xtmp1, Assembler::nlt, true, vlen_enc);
3972   vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
3973   evpcmpb(ktmp, ktmp, shuffle,  xtmp2, Assembler::lt, true, vlen_enc);
3974   evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
3975   evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
3976 }