/*
 * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "oops/methodData.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/output.hpp"
#include "opto/opcodes.hpp"
#include "opto/subnode.hpp"
#include "runtime/globals.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/checkedCast.hpp"
#include "utilities/globalDefinitions.hpp"
#include "utilities/powerOfTwo.hpp"
#include "utilities/sizes.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

// C2 compiled method's prolog code.
void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {

  // WARNING: Initial instruction MUST be 5 bytes or longer so that
  // NativeJump::patch_verified_entry will be able to patch out the entry
  // code safely. The push to verify stack depth is ok at 5 bytes,
  // the frame allocation can be either 3 or 6 bytes. So if we don't do
  // stack bang then we must use the 6 byte frame allocation even if
  // we have no frame. :-(
  assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");

  assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  // Remove word for return addr
  framesize -= wordSize;
  stack_bang_size -= wordSize;

  // Calls to C2R adapters often do not accept exceptional returns.
  // We require that their callers must bang for them. But be careful, because
  // some VM calls (such as call site linkage) can use several kilobytes of
  // stack. But the stack safety zone should account for that.
  // See bugs 4446381, 4468289, 4497237.
  if (stack_bang_size > 0) {
    generate_stack_overflow_check(stack_bang_size);

    // We always push rbp, so that on return to interpreter rbp will be
    // restored correctly and we can correct the stack.
    push(rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      mov(rbp, rsp);
    }
    // Remove word for ebp
    framesize -= wordSize;

    // Create frame
    if (framesize) {
      subptr(rsp, framesize);
    }
  } else {
    // Create frame (force generation of a 4 byte immediate value)
    subptr_imm32(rsp, framesize);

    // Save RBP register now.
    framesize -= wordSize;
    movptr(Address(rsp, framesize), rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      movptr(rbp, rsp);
      if (framesize > 0) {
        addptr(rbp, framesize);
      }
    }
  }

  if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
    framesize -= wordSize;
    movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
  }

#ifndef _LP64
  // If method sets FPU control word do it now
  if (fp_mode_24b) {
    fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24()));
  }
  if (UseSSE >= 2 && VerifyFPU) {
    verify_FPU(0, "FPU stack must be clean on entry");
  }
#endif

#ifdef ASSERT
  if (VerifyStackAtCalls) {
    Label L;
    push(rax);
    mov(rax, rsp);
    andptr(rax, StackAlignmentInBytes-1);
    cmpptr(rax, StackAlignmentInBytes-wordSize);
    pop(rax);
    jcc(Assembler::equal, L);
    STOP("Stack is not properly aligned!");
    bind(L);
  }
#endif

  if (!is_stub) {
    BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
#ifdef _LP64
    if (BarrierSet::barrier_set()->barrier_set_nmethod() != nullptr) {
      // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
      Label dummy_slow_path;
      Label dummy_continuation;
      Label* slow_path = &dummy_slow_path;
      Label* continuation = &dummy_continuation;
      if (!Compile::current()->output()->in_scratch_emit_size()) {
        // Use real labels from actual stub when not emitting code for the purpose of measuring its size
        C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
        Compile::current()->output()->add_stub(stub);
        slow_path = &stub->entry();
        continuation = &stub->continuation();
      }
      bs->nmethod_entry_barrier(this, slow_path, continuation);
    }
#else
    // Don't bother with out-of-line nmethod entry barrier stub for x86_32.
    bs->nmethod_entry_barrier(this, nullptr /* slow_path */, nullptr /* continuation */);
#endif
  }
}
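// Maps a vector length in bytes to the AVX length encoding used by the
// assembler. Vectors shorter than 16 bytes are still emitted with the
// 128-bit encoding; the unused upper lanes are simply ignored by callers.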
inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
  switch (vlen_in_bytes) {
    case  4: // fall-through
    case  8: // fall-through
    case 16: return Assembler::AVX_128bit;
    case 32: return Assembler::AVX_256bit;
    case 64: return Assembler::AVX_512bit;

    default: {
      ShouldNotReachHere();
      return Assembler::AVX_NoVec;
    }
  }
}

#if INCLUDE_RTM_OPT

// Update rtm_counters based on abort status
// input: abort_status
//        rtm_counters (RTMLockingCounters*)
// flags are killed
void C2_MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {

  atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
  if (PrintPreciseRTMLockingStatistics) {
    for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
      Label check_abort;
      testl(abort_status, (1<<i));
      jccb(Assembler::equal, check_abort);
      atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
      bind(check_abort);
    }
  }
}

// Branch if (random & (count-1) != 0), count is 2^n
// tmp, scr and flags are killed
void C2_MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
  assert(tmp == rax, "");
  assert(scr == rdx, "");
  rdtsc(); // modifies EDX:EAX
  andptr(tmp, count-1);
  jccb(Assembler::notZero, brLabel);
}

// Perform abort ratio calculation, set no_rtm bit if high ratio
// input: rtm_counters_Reg (RTMLockingCounters* address)
// tmpReg, rtm_counters_Reg and flags are killed
void C2_MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
                                                    Register rtm_counters_Reg,
                                                    RTMLockingCounters* rtm_counters,
                                                    Metadata* method_data) {
  Label L_done, L_check_always_rtm1, L_check_always_rtm2;

  if (RTMLockingCalculationDelay > 0) {
    // Delay calculation
    movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()));
    testptr(tmpReg, tmpReg);
    jccb(Assembler::equal, L_done);
  }
  // Abort ratio calculation only if abort_count > RTMAbortThreshold
  //   Aborted transactions = abort_count * 100
  //   All transactions = total_count * RTMTotalCountIncrRate
  //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)

  movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
  cmpptr(tmpReg, RTMAbortThreshold);
  jccb(Assembler::below, L_check_always_rtm2);
  imulptr(tmpReg, tmpReg, 100);

  Register scrReg = rtm_counters_Reg;
  movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
  imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
  imulptr(scrReg, scrReg, RTMAbortRatio);
  cmpptr(tmpReg, scrReg);
  jccb(Assembler::below, L_check_always_rtm1);
  if (method_data != nullptr) {
    // set rtm_state to "no rtm" in MDO
    mov_metadata(tmpReg, method_data);
    lock();
    orl(Address(tmpReg, MethodData::rtm_state_offset()), NoRTM);
  }
  jmpb(L_done);
  bind(L_check_always_rtm1);
  // Reload RTMLockingCounters* address
  lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
  bind(L_check_always_rtm2);
  movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
  cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
  jccb(Assembler::below, L_done);
  if (method_data != nullptr) {
    // set rtm_state to "always rtm" in MDO
    mov_metadata(tmpReg, method_data);
    lock();
    orl(Address(tmpReg, MethodData::rtm_state_offset()), UseRTM);
  }
  bind(L_done);
}

// Update counters and perform abort ratio calculation
// input: abort_status_Reg
// rtm_counters_Reg, flags are killed
void C2_MacroAssembler::rtm_profiling(Register abort_status_Reg,
                                      Register rtm_counters_Reg,
                                      RTMLockingCounters* rtm_counters,
                                      Metadata* method_data,
                                      bool profile_rtm) {

  assert(rtm_counters != nullptr, "should not be null when profiling RTM");
  // update rtm counters based on rax value at abort
  // reads abort_status_Reg, updates flags
  lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
  rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
  if (profile_rtm) {
    // Save abort status because abort_status_Reg is used by following code.
    if (RTMRetryCount > 0) {
      push(abort_status_Reg);
    }
    assert(rtm_counters != nullptr, "should not be null when profiling RTM");
    rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
    // restore abort status
    if (RTMRetryCount > 0) {
      pop(abort_status_Reg);
    }
  }
}

// Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
// inputs: retry_count_Reg
//       : abort_status_Reg
// output: retry_count_Reg decremented by 1
// flags are killed
void C2_MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
  Label doneRetry;
  assert(abort_status_Reg == rax, "");
  // The abort reason bits are in eax (see all states in rtmLocking.hpp)
  // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
  // if reason is in 0x6 and retry count != 0 then retry
  andptr(abort_status_Reg, 0x6);
  jccb(Assembler::zero, doneRetry);
  testl(retry_count_Reg, retry_count_Reg);
  jccb(Assembler::zero, doneRetry);
  pause();
  decrementl(retry_count_Reg);
  jmp(retryLabel);
  bind(doneRetry);
}

// Spin and retry if lock is busy.
// inputs: box_Reg (monitor address)
//       : retry_count_Reg
// output: retry_count_Reg decremented by 1
//       : clear z flag if retry count exceeded
// tmp_Reg, scr_Reg, flags are killed
void C2_MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
                                               Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
  Label SpinLoop, SpinExit, doneRetry;
  int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);

  testl(retry_count_Reg, retry_count_Reg);
  jccb(Assembler::zero, doneRetry);
  decrementl(retry_count_Reg);
  movptr(scr_Reg, RTMSpinLoopCount);

  bind(SpinLoop);
  pause();
  decrementl(scr_Reg);
  jccb(Assembler::lessEqual, SpinExit);
  movptr(tmp_Reg, Address(box_Reg, owner_offset));
  testptr(tmp_Reg, tmp_Reg);
  jccb(Assembler::notZero, SpinLoop);

  bind(SpinExit);
  jmp(retryLabel);
  bind(doneRetry);
  incrementl(retry_count_Reg); // clear z flag
}

// Use RTM for normal stack locks
// Input: objReg (object to lock)
void C2_MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
                                          Register retry_on_abort_count_Reg,
                                          RTMLockingCounters* stack_rtm_counters,
                                          Metadata* method_data, bool profile_rtm,
                                          Label& DONE_LABEL, Label& IsInflated) {
  assert(UseRTMForStackLocks, "why call this otherwise?");
  assert(tmpReg == rax, "");
  assert(scrReg == rdx, "");
  Label L_rtm_retry, L_decrement_retry, L_on_abort;

  if (RTMRetryCount > 0) {
    movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
    bind(L_rtm_retry);
  }
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
  testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral
  jcc(Assembler::notZero, IsInflated);

  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    Label L_noincrement;
    if (RTMTotalCountIncrRate > 1) {
      // tmpReg, scrReg and flags are killed
      branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
    }
    assert(stack_rtm_counters != nullptr, "should not be null when profiling RTM");
    atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
    bind(L_noincrement);
  }
  xbegin(L_on_abort);
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
  andptr(tmpReg, markWord::lock_mask_in_place);                     // look at 2 lock bits
  cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 01 unlocked
  jcc(Assembler::equal, DONE_LABEL);                                // all done if unlocked

  Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
  if (UseRTMXendForLockBusy) {
    xend();
    movptr(abort_status_Reg, 0x2); // Set the abort status to 2 (so we can retry)
    jmp(L_decrement_retry);
  }
  else {
    xabort(0);
  }
  bind(L_on_abort);
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
  }
  bind(L_decrement_retry);
  if (RTMRetryCount > 0) {
    // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
  }
}

// Use RTM for inflating locks
// inputs: objReg (object to lock)
//         boxReg (on-stack box address (displaced header location) - KILLED)
//         tmpReg (ObjectMonitor address + markWord::monitor_value)
void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
                                             Register scrReg, Register retry_on_busy_count_Reg,
                                             Register retry_on_abort_count_Reg,
                                             RTMLockingCounters* rtm_counters,
                                             Metadata* method_data, bool profile_rtm,
                                             Label& DONE_LABEL) {
  assert(UseRTMLocking, "why call this otherwise?");
  assert(tmpReg == rax, "");
  assert(scrReg == rdx, "");
  Label L_rtm_retry, L_decrement_retry, L_on_abort;
  int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);

  movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
  movptr(boxReg, tmpReg); // Save ObjectMonitor address

  if (RTMRetryCount > 0) {
    movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
    movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
    bind(L_rtm_retry);
  }
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    Label L_noincrement;
    if (RTMTotalCountIncrRate > 1) {
      // tmpReg, scrReg and flags are killed
      branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
    }
    assert(rtm_counters != nullptr, "should not be null when profiling RTM");
    atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
    bind(L_noincrement);
  }
  xbegin(L_on_abort);
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
  movptr(tmpReg, Address(tmpReg, owner_offset));
  testptr(tmpReg, tmpReg);
  jcc(Assembler::zero, DONE_LABEL);
  if (UseRTMXendForLockBusy) {
    xend();
    jmp(L_decrement_retry);
  }
  else {
    xabort(0);
  }
  bind(L_on_abort);
  Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
  }
  if (RTMRetryCount > 0) {
    // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
  }

  movptr(tmpReg, Address(boxReg, owner_offset));
  testptr(tmpReg, tmpReg);
  jccb(Assembler::notZero, L_decrement_retry);

  // Appears unlocked - try to swing _owner from null to non-null.
  // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
#ifdef _LP64
  Register threadReg = r15_thread;
#else
  get_thread(scrReg);
  Register threadReg = scrReg;
#endif
  lock();
  cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg

  if (RTMRetryCount > 0) {
    // success done else retry
    jccb(Assembler::equal, DONE_LABEL);
    bind(L_decrement_retry);
    // Spin and retry if lock is busy.
    rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
  }
  else {
    bind(L_decrement_retry);
  }
}

#endif // INCLUDE_RTM_OPT

// fast_lock and fast_unlock used by C2

// Because the transitions from emitted code to the runtime
// monitorenter/exit helper stubs are so slow it's critical that
// we inline both the stack-locking fast path and the inflated fast path.
//
// See also: cmpFastLock and cmpFastUnlock.
//
// What follows is a specialized inline transliteration of the code
// in enter() and exit(). If we're concerned about I$ bloat another
// option would be to emit TrySlowEnter and TrySlowExit methods
// at startup-time. These methods would accept arguments as
// (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
// indications in the icc.ZFlag. fast_lock and fast_unlock would simply
// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
// In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
//
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods. That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
// to park() or unpark() threads. We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) explicit barriers or fence operations.
//
// TODO:
//
// *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
//    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
//    Given TLAB allocation, Self is usually manifested in a register, so passing it into
//    the lock operators would typically be faster than reifying Self.
//
// *  Ideally I'd define the primitives as:
//       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
//       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
//    Unfortunately ADLC bugs prevent us from expressing the ideal form.
//    Instead, we're stuck with the rather awkward and brittle register assignments below.
//    Furthermore the register assignments are overconstrained, possibly resulting in
//    sub-optimal code near the synchronization site.
//
// *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
//    Alternately, use a better sp-proximity test.
//
// *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
//    Either one is sufficient to uniquely identify a thread.
//    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
//
// *  Intrinsify notify() and notifyAll() for the common cases where the
//    object is locked by the calling thread but the waitlist is empty.
//    Avoid the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
//
// *  Use jccb and jmpb instead of jcc and jmp to improve code density.
//    But beware of excessive branch density on AMD Opterons.
//
// *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
//    or failure of the fast path. If the fast path fails then we pass
//    control to the slow path, typically in C. In fast_lock and
//    fast_unlock we often branch to DONE_LABEL, just to find that C2
//    will emit a conditional branch immediately after the node.
//    So we have branches to branches and lots of ICC.ZF games.
//    Instead, it might be better to have C2 pass a "FailureLabel"
//    into fast_lock and fast_unlock. In the case of success, control
//    will drop through the node. ICC.ZF is undefined at exit.
//    In the case of failure, the node will branch directly to the
//    FailureLabel


// obj: object to lock
// box: on-stack box address (displaced header location) - KILLED
// rax: tmp -- KILLED
// scr: tmp -- KILLED
void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
                                  Register scrReg, Register cx1Reg, Register cx2Reg, Register thread,
                                  RTMLockingCounters* rtm_counters,
                                  RTMLockingCounters* stack_rtm_counters,
                                  Metadata* method_data,
                                  bool use_rtm, bool profile_rtm) {
  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
  // Ensure the register assignments are disjoint
  assert(tmpReg == rax, "");

  if (use_rtm) {
    assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
  } else {
    assert(cx1Reg == noreg, "");
    assert(cx2Reg == noreg, "");
    assert_different_registers(objReg, boxReg, tmpReg, scrReg);
  }

  // Possible cases that we'll encounter in fast_lock
  // ------------------------------------------------
  // * Inflated
  //    -- unlocked
  //    -- Locked
  //       = by self
  //       = by other
  // * neutral
  // * stack-locked
  //    -- by self
  //       = sp-proximity test hits
  //       = sp-proximity test generates false-negative
  //    -- by other
  //

  Label IsInflated, DONE_LABEL, NO_COUNT, COUNT;

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(tmpReg, objReg, scrReg);
    movl(tmpReg, Address(tmpReg, Klass::access_flags_offset()));
    testl(tmpReg, JVM_ACC_IS_VALUE_BASED_CLASS);
    jcc(Assembler::notZero, DONE_LABEL);
  }

#if INCLUDE_RTM_OPT
  if (UseRTMForStackLocks && use_rtm) {
    assert(LockingMode != LM_MONITOR, "LockingMode == 0 (LM_MONITOR) and +UseRTMForStackLocks are mutually exclusive");
    rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
                      stack_rtm_counters, method_data, profile_rtm,
                      DONE_LABEL, IsInflated);
  }
#endif // INCLUDE_RTM_OPT

  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // [FETCH]
  testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral
  jcc(Assembler::notZero, IsInflated);

  if (LockingMode == LM_MONITOR) {
    // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
    testptr(objReg, objReg);
  } else {
    assert(LockingMode == LM_LEGACY, "must be");
    // Attempt stack-locking ...
    orptr (tmpReg, markWord::unlocked_value);
    movptr(Address(boxReg, 0), tmpReg); // Anticipate successful CAS
    lock();
    cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Updates tmpReg
    jcc(Assembler::equal, COUNT); // Success

    // Recursive locking.
    // The object is stack-locked: markword contains stack pointer to BasicLock.
    // Locked by current thread if difference with current SP is less than one page.
    subptr(tmpReg, rsp);
    // Next instruction sets ZFlag == 1 (Success) if the difference is less than one page.
    andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - (int)os::vm_page_size())) );
    movptr(Address(boxReg, 0), tmpReg);
  }
  jmp(DONE_LABEL);

  bind(IsInflated);
  // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value

#if INCLUDE_RTM_OPT
  // Use the same RTM locking code in 32- and 64-bit VM.
  if (use_rtm) {
    rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
                         rtm_counters, method_data, profile_rtm, DONE_LABEL);
  } else {
#endif // INCLUDE_RTM_OPT

#ifndef _LP64
  // The object is inflated.

  // boxReg refers to the on-stack BasicLock in the current frame.
  // We'd like to write:
  //   set box->_displaced_header = markWord::unused_mark(). Any non-0 value suffices.
  // This is convenient but results in a ST-before-CAS penalty. The following CAS suffers
  // additional latency as we have another ST in the store buffer that must drain.

  // avoid ST-before-CAS
  // register juggle because we need tmpReg for cmpxchgptr below
  movptr(scrReg, boxReg);
  movptr(boxReg, tmpReg); // consider: LEA box, [tmp-2]

  // Optimistic form: consider XORL tmpReg,tmpReg
  movptr(tmpReg, NULL_WORD);

  // Appears unlocked - try to swing _owner from null to non-null.
  // Ideally, I'd manifest "Self" with get_thread and then attempt
  // to CAS the register containing Self into m->Owner.
  // But we don't have enough registers, so instead we can either try to CAS
  // rsp or the address of the box (in scr) into &m->owner. If the CAS succeeds
  // we later store "Self" into m->Owner. Transiently storing a stack address
  // (rsp or the address of the box) into m->owner is harmless.
  // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
  lock();
  cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  movptr(Address(scrReg, 0), 3); // box->_displaced_header = 3
  // If we weren't able to swing _owner from null to the BasicLock
  // then take the slow path.
  jccb(Assembler::notZero, NO_COUNT);
  // update _owner from BasicLock to thread
  get_thread(scrReg); // beware: clobbers ICCs
  movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
  xorptr(boxReg, boxReg); // set icc.ZFlag = 1 to indicate success

  // If the CAS fails we can either retry or pass control to the slow path.
  // We use the latter tactic.
  // Pass the CAS result in the icc.ZFlag into DONE_LABEL
  // If the CAS was successful ...
  //   Self has acquired the lock
  //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
  // Intentional fall-through into DONE_LABEL ...
#else // _LP64
  // It's inflated and we use scrReg for ObjectMonitor* in this section.
  movq(scrReg, tmpReg);
  xorq(tmpReg, tmpReg);
  lock();
  cmpxchgptr(thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // Unconditionally set box->_displaced_header = markWord::unused_mark().
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
  // Propagate ICC.ZF from CAS above into DONE_LABEL.
  jccb(Assembler::equal, COUNT);       // CAS above succeeded; propagate ZF = 1 (success)

  cmpptr(thread, rax);                 // Check if we are already the owner (recursive lock)
  jccb(Assembler::notEqual, NO_COUNT); // If not recursive, ZF = 0 at this point (fail)
  incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
#endif // _LP64
#if INCLUDE_RTM_OPT
  } // use_rtm()
#endif
  bind(DONE_LABEL);

  // ZFlag == 1 count in fast path
  // ZFlag == 0 count in slow path
  jccb(Assembler::notZero, NO_COUNT); // jump if ZFlag == 0

  bind(COUNT);
  // Count monitors in fast path
  increment(Address(thread, JavaThread::held_monitor_count_offset()));

  xorl(tmpReg, tmpReg); // Set ZF == 1

  bind(NO_COUNT);

  // At NO_COUNT the icc ZFlag is set as follows ...
  // fast_unlock uses the same protocol.
  // ZFlag == 1 -> Success
  // ZFlag == 0 -> Failure - force control through the slow path
}

// obj: object to unlock
// box: box address (displaced header location), killed. Must be EAX.
// tmp: killed, cannot be obj nor box.
//
// Some commentary on balanced locking:
//
// fast_lock and fast_unlock are emitted only for provably balanced lock sites.
// Methods that don't have provably balanced locking are forced to run in the
// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
// The interpreter provides two properties:
// I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired by the current activation (frame). Recall that the
//      interpreter maintains an on-stack list of locks currently held by
//      a frame.
// I2:  If a method attempts to unlock an object that is not held by the
//      frame, the interpreter throws IMSX.
//
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
// B() doesn't have provably balanced locking so it runs in the interpreter.
// Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O
// is still locked by A().
//
// The only other source of unbalanced locking would be JNI. The "Java Native Interface:
// Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
// should not be unlocked by "normal" java-level locking and vice-versa. The specification
// doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably, given that the spec legislates the JNI case as undefined, our implementation
// could reasonably *avoid* checking owner in fast_unlock().
// In the interest of performance we elide m->Owner==Self check in unlock.
// A perfectly viable alternative is to elide the owner check except when
// Xcheck:jni is enabled.

void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
  assert(boxReg == rax, "");
  assert_different_registers(objReg, boxReg, tmpReg);

  Label DONE_LABEL, Stacked, COUNT, NO_COUNT;

#if INCLUDE_RTM_OPT
  if (UseRTMForStackLocks && use_rtm) {
    assert(LockingMode != LM_MONITOR, "LockingMode == 0 (LM_MONITOR) and +UseRTMForStackLocks are mutually exclusive");
    Label L_regular_unlock;
    movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
    andptr(tmpReg, markWord::lock_mask_in_place);                     // look at 2 lock bits
    cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 01 unlocked
    jccb(Assembler::notEqual, L_regular_unlock);                      // if !HLE RegularLock
    xend();                                                           // otherwise end...
    jmp(DONE_LABEL);                                                  // ... and we're done
    bind(L_regular_unlock);
  }
#endif

  if (LockingMode == LM_LEGACY) {
    cmpptr(Address(boxReg, 0), NULL_WORD); // Examine the displaced header
    jcc(Assembler::zero, COUNT);           // 0 indicates recursive stack-lock
  }
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Examine the object's markword
  if (LockingMode != LM_MONITOR) {
    testptr(tmpReg, markWord::monitor_value); // Inflated?
    jcc(Assembler::zero, Stacked);
  }

  // It's inflated.

#if INCLUDE_RTM_OPT
  if (use_rtm) {
    Label L_regular_inflated_unlock;
    int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
    movptr(boxReg, Address(tmpReg, owner_offset));
    testptr(boxReg, boxReg);
    jccb(Assembler::notZero, L_regular_inflated_unlock);
    xend();
    jmp(DONE_LABEL);
    bind(L_regular_inflated_unlock);
  }
#endif

  // Despite our balanced locking property we still check that m->_owner == Self
  // as java routines or native JNI code called by this thread might
  // have released the lock.
  // Refer to the comments in synchronizer.cpp for how we might encode extra
  // state in _succ so we can avoid fetching EntryList|cxq.
  //
  // If there's no contention try a 1-0 exit. That is, exit without
  // a costly MEMBAR or CAS. See synchronizer.cpp for details on how
  // we detect and recover from the race that the 1-0 exit admits.
  //
  // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
  // before it STs null into _owner, releasing the lock. Updates
  // to data protected by the critical section must be visible before
  // we drop the lock (and thus before any other thread could acquire
  // the lock and observe the fields protected by the lock).
  // IA32's memory-model is SPO, so STs are ordered with respect to
  // each other and there's no need for an explicit barrier (fence).
  // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
#ifndef _LP64
  // Note that we could employ various encoding schemes to reduce
  // the number of loads below (currently 4) to just 2 or 3.
  // Refer to the comments in synchronizer.cpp.
  // In practice the chain of fetches doesn't seem to impact performance, however.
  xorptr(boxReg, boxReg);
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jccb(Assembler::notZero, DONE_LABEL);
  movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
  jccb(Assembler::notZero, DONE_LABEL);
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
  jmpb(DONE_LABEL);
#else // _LP64
  // It's inflated
  Label CheckSucc, LNotRecursive, LSuccess, LGoSlowPath;

  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
  jccb(Assembler::equal, LNotRecursive);

  // Recursive inflated unlock
  decq(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jmpb(LSuccess);

  bind(LNotRecursive);
  movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
  jccb(Assembler::notZero, CheckSucc);
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
  jmpb(DONE_LABEL);

  // Try to avoid passing control into the slow_path ...
  bind(CheckSucc);

  // The following optional optimization can be elided if necessary
  // Effectively: if (succ == null) goto slow path
  // The code reduces the window for a race, however,
  // and thus benefits performance.
  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
  jccb(Assembler::zero, LGoSlowPath);

  xorptr(boxReg, boxReg);
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);

  // Memory barrier/fence
  // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
  // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
  // This is faster on Nehalem and AMD Shanghai/Barcelona.
  // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
  // We might also restructure (ST Owner=0;barrier;LD _Succ) to
  // (mov box,0; xchgq box, &m->Owner; LD _succ) .
  lock(); addl(Address(rsp, 0), 0);

  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
  jccb(Assembler::notZero, LSuccess);

  // Rare inopportune interleaving - race.
  // The successor vanished in the small window above.
  // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
  // We need to ensure progress and succession.
  // Try to reacquire the lock.
  // If that fails then the new owner is responsible for succession and this
  // thread needs to take no further action and can exit via the fast path (success).
  // If the re-acquire succeeds then pass control into the slow path.
  // As implemented, this latter mode is horrible because we generate more
  // coherence traffic on the lock *and* artificially extend the critical section
  // length by virtue of passing control into the slow path.

  // box is really RAX -- the following CMPXCHG depends on that binding
  // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
  lock();
  cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // There's no successor, so we tried to regrab the lock.
  // If that didn't work, then another thread grabbed the
  // lock so we're done (and exit was a success).
  jccb(Assembler::notEqual, LSuccess);
  // Intentional fall-through into slow path

  bind(LGoSlowPath);
  orl(boxReg, 1); // set ICC.ZF=0 to indicate failure
  jmpb(DONE_LABEL);

  bind(LSuccess);
  testl(boxReg, 0); // set ICC.ZF=1 to indicate success
  jmpb(DONE_LABEL);

#endif
  if (LockingMode == LM_LEGACY) {
    bind(Stacked);
    movptr(tmpReg, Address(boxReg, 0)); // re-fetch
    lock();
    cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
    // Intentional fall-thru into DONE_LABEL
  }

  bind(DONE_LABEL);

  // ZFlag == 1 count in fast path
  // ZFlag == 0 count in slow path
  jccb(Assembler::notZero, NO_COUNT);

  bind(COUNT);
  // Count monitors in fast path
#ifndef _LP64
  get_thread(tmpReg);
  decrementl(Address(tmpReg, JavaThread::held_monitor_count_offset()));
#else // _LP64
  decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
#endif

  xorl(tmpReg, tmpReg); // Set ZF == 1

  bind(NO_COUNT);
}
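// Lightweight locking fast path (LM_LIGHTWEIGHT).
// obj: object to lock
// box: used only as a scratch register (holds the lock-stack top)
// rax_reg: must be rax; cmpxchg comparand and ZF result
// t: scratch register (holds the mark word / tagged ObjectMonitor*)
// On exit ZF == 1 indicates success and ZF == 0 routes control to the slow
// path, following the same flag protocol as fast_lock/fast_unlock above.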
void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register rax_reg,
                                              Register t, Register thread) {
  assert(LockingMode == LM_LIGHTWEIGHT, "must be");
  assert(rax_reg == rax, "Used for CAS");
  assert_different_registers(obj, box, rax_reg, t, thread);

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. ZF value is irrelevant.
  Label locked;
  // Finish fast lock unsuccessfully. MUST jump with ZF == 0
  Label slow_path;

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(rax_reg, obj, t);
    movl(rax_reg, Address(rax_reg, Klass::access_flags_offset()));
    testl(rax_reg, JVM_ACC_IS_VALUE_BASED_CLASS);
    jcc(Assembler::notZero, slow_path);
  }

  const Register mark = t;

  { // Lightweight Lock

    Label push;

    const Register top = box;

    // Load the mark.
    movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));

    // Prefetch top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    // Check for monitor (0b10).
    testptr(mark, markWord::monitor_value);
    jcc(Assembler::notZero, inflated);

    // Check if lock-stack is full.
    cmpl(top, LockStack::end_offset() - 1);
    jcc(Assembler::greater, slow_path);

    // Check if recursive.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    jccb(Assembler::equal, push);

    // Try to lock. Transition lock bits 0b01 => 0b00
    movptr(rax_reg, mark);
    orptr(rax_reg, markWord::unlocked_value);
    andptr(mark, ~(int32_t)markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    jcc(Assembler::notEqual, slow_path);

    bind(push);
    // After successful lock, push object on lock-stack.
    movptr(Address(thread, top), obj);
    addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
    jmpb(locked);
  }

  { // Handle inflated monitor.
    bind(inflated);

    const Register tagged_monitor = mark;

    // CAS owner (null => current thread).
    xorptr(rax_reg, rax_reg);
    lock(); cmpxchgptr(thread, Address(tagged_monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
    jccb(Assembler::equal, locked);

    // Check if recursive.
    cmpptr(thread, rax_reg);
    jccb(Assembler::notEqual, slow_path);

    // Recursive.
    increment(Address(tagged_monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  }

  bind(locked);
  increment(Address(thread, JavaThread::held_monitor_count_offset()));
  // Set ZF = 1
  xorl(rax_reg, rax_reg);

#ifdef ASSERT
  // Check that locked label is reached with ZF set.
  Label zf_correct;
  Label zf_bad_zero;
  jcc(Assembler::zero, zf_correct);
  jmp(zf_bad_zero);
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with ZF not set.
  jcc(Assembler::notZero, zf_correct);
  stop("Fast Lock ZF != 0");
  bind(zf_bad_zero);
  stop("Fast Lock ZF != 1");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}
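// Lightweight unlocking fast path (LM_LIGHTWEIGHT).
// obj: object to unlock
// reg_rax: must be rax; cmpxchg comparand, also holds the lock-stack top
// t: scratch register (holds the mark word / tagged ObjectMonitor*)
// On exit ZF == 1 indicates success; ZF == 0 routes control to the slow-path stub.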
void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register reg_rax, Register t, Register thread) {
  assert(LockingMode == LM_LIGHTWEIGHT, "must be");
  assert(reg_rax == rax, "Used for CAS");
  assert_different_registers(obj, reg_rax, t);

  // Handle inflated monitor.
  Label inflated, inflated_check_lock_stack;
  // Finish fast unlock successfully. MUST jump with ZF == 1
  Label unlocked;

  // Assume success.
  decrement(Address(thread, JavaThread::held_monitor_count_offset()));

  const Register mark = t;
  const Register top = reg_rax;

  Label dummy;
  C2FastUnlockLightweightStub* stub = nullptr;

  if (!Compile::current()->output()->in_scratch_emit_size()) {
    stub = new (Compile::current()->comp_arena()) C2FastUnlockLightweightStub(obj, mark, reg_rax, thread);
    Compile::current()->output()->add_stub(stub);
  }

  Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();
  Label& check_successor = stub == nullptr ? dummy : stub->check_successor();

  { // Lightweight Unlock

    // Load top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    // Prefetch mark.
    movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));

    // Check if obj is top of lock-stack.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    // Top of lock stack was not obj. Must be monitor.
    jcc(Assembler::notEqual, inflated_check_lock_stack);

    // Pop lock-stack.
    DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
    subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);

    // Check if recursive.
    cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
    jcc(Assembler::equal, unlocked);

    // We elide the monitor check, let the CAS fail instead.

    // Try to unlock. Transition lock bits 0b00 => 0b01
    movptr(reg_rax, mark);
    andptr(reg_rax, ~(int32_t)markWord::lock_mask);
    orptr(mark, markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    jcc(Assembler::notEqual, push_and_slow_path);
    jmp(unlocked);
  }


  { // Handle inflated monitor.
    bind(inflated_check_lock_stack);
#ifdef ASSERT
    Label check_done;
    subl(top, oopSize);
    cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
    jcc(Assembler::below, check_done);
    cmpptr(obj, Address(thread, top));
    jccb(Assembler::notEqual, inflated_check_lock_stack);
    stop("Fast Unlock lock on stack");
    bind(check_done);
    testptr(mark, markWord::monitor_value);
    jccb(Assembler::notZero, inflated);
    stop("Fast Unlock not monitor");
#endif

    bind(inflated);

    // mark contains the tagged ObjectMonitor*.
    const Register monitor = mark;

#ifndef _LP64
    // Check if recursive.
    xorptr(reg_rax, reg_rax);
    orptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
    jcc(Assembler::notZero, check_successor);

    // Check if the entry lists are empty.
    movptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
    orptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
    jcc(Assembler::notZero, check_successor);

    // Release lock.
    movptr(Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
#else // _LP64
    Label recursive;

    // Check if recursive.
    cmpptr(Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
    jccb(Assembler::notEqual, recursive);

    // Check if the entry lists are empty.
    movptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
    orptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
    jcc(Assembler::notZero, check_successor);

    // Release lock.
    movptr(Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
    jmpb(unlocked);

    // Recursive unlock.
    bind(recursive);
    decrement(Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
    xorl(t, t);
#endif
  }

  bind(unlocked);
  if (stub != nullptr) {
    bind(stub->unlocked_continuation());
  }

#ifdef ASSERT
  // Check that unlocked label is reached with ZF set.
  Label zf_correct;
  jcc(Assembler::zero, zf_correct);
  stop("Fast Unlock ZF != 1");
#endif

  if (stub != nullptr) {
    bind(stub->slow_path_continuation());
  }
#ifdef ASSERT
  // Check that stub->continuation() label is reached with ZF not set.
  jccb(Assembler::notZero, zf_correct);
  stop("Fast Unlock ZF != 0");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}

//-------------------------------------------------------------------------------------------
// Generic instructions support for use in .ad files C2 code generation

// Vector abs/neg via sign-bit arithmetic: Abs clears the IEEE sign bit
// (AND with the vector_*_sign_mask stub constant), Neg flips it (XOR with
// the vector_*_sign_flip stub constant).
void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVD) {
    andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
  } else {
    assert((opcode == Op_NegVD), "opcode should be Op_NegVD");
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
  }
}

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
  if (opcode == Op_AbsVD) {
    vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
  } else {
    assert((opcode == Op_NegVD), "opcode should be Op_NegVD");
    vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVF) {
    andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
  } else {
    assert((opcode == Op_NegVF), "opcode should be Op_NegVF");
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
  if (opcode == Op_AbsVF) {
    vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
  } else {
    assert((opcode == Op_NegVF), "opcode should be Op_NegVF");
    vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
  }
}
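// Integer vector min/max.
// The T_BYTE/T_SHORT/T_INT cases map directly to pmin*/pmax* instructions.
// There is no packed signed 64-bit min/max before AVX-512VL, so the T_LONG
// cases are emulated with a signed compare (pcmpgtq) followed by a blend;
// the SSE4.1 blendvpd implicitly uses xmm0 as its mask operand, which is
// why the T_LONG paths below require tmp == xmm0.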
void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
  assert(tmp == xnoreg || elem_bt == T_LONG, "unused");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      pminsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pminsw(dst, src);
    } else if (elem_bt == T_INT) {
      pminsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      movdqu(xmm0, dst);
      pcmpgtq(xmm0, src);
      blendvpd(dst, src); // xmm0 as mask
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      pmaxsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pmaxsw(dst, src);
    } else if (elem_bt == T_INT) {
      pmaxsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      movdqu(xmm0, src);
      pcmpgtq(xmm0, dst);
      blendvpd(dst, src); // xmm0 as mask
    }
  }
}

void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
                                 XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                 int vlen_enc) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      vpminsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpminsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpminsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpminsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src1, src2, dst, vlen_enc);
      }
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      vpmaxsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpmaxsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpmaxsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpmaxsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src2, src1, dst, vlen_enc);
      }
    }
  }
}

// Float/Double min max

void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
                                   XMMRegister dst, XMMRegister a, XMMRegister b,
                                   XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                   int vlen_enc) {
  assert(UseAVX > 0, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(a, tmp, atmp, btmp);
  assert_different_registers(b, tmp, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);

  /* Note on 'non-obvious' assembly sequence:
   *
   * While there are vminps/vmaxps instructions, there are two important differences between hardware
   * and Java on how they handle floats:
   *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
   *  b. NaN is not necessarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
   *
   * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
   *  a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
   *                (only useful when signs differ, noop otherwise)
   *  b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])
   *
   * Following pseudo code describes the algorithm for max[FD] (Min algorithm is on similar lines):
   *   btmp = (b < +0.0) ? a : b
   *   atmp = (b < +0.0) ? b : a
   *   Tmp  = Max_Float(atmp, btmp)
   *   Res  = (atmp == NaN) ? atmp : Tmp
   */

  void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
  void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
  void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
  XMMRegister mask;

  if (!is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vminps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (!is_double_word && !is_min) {
    mask = b;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vmaxps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vminpd;
    vcmp = &MacroAssembler::vcmppd;
  } else {
    assert(is_double_word && !is_min, "sanity");
    mask = b;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vmaxpd;
    vcmp = &MacroAssembler::vcmppd;
  }

  // Make sure EnableX86ECoreOpts isn't disabled on register overlaps
  XMMRegister maxmin, scratch;
  if (dst == btmp) {
    maxmin = btmp;
    scratch = tmp;
  } else {
    maxmin = tmp;
    scratch = btmp;
  }

  bool precompute_mask = EnableX86ECoreOpts && UseAVX > 1;
  if (precompute_mask && !is_double_word) {
    vpsrad(tmp, mask, 32, vlen_enc);
    mask = tmp;
  } else if (precompute_mask && is_double_word) {
    vpxor(tmp, tmp, tmp, vlen_enc);
    vpcmpgtq(tmp, tmp, mask, vlen_enc);
    mask = tmp;
  }

  (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);
  (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);
  (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
  (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
  (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch);
}
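// AVX-512 variant of vminmax_fp above, using k-mask registers instead of
// vector blends. evpmovd2m/evpmovq2m copy each lane's sign bit into ktmp,
// which the blends use to bias the inputs so that vmin/vmax honors the
// -0.0 < +0.0 ordering; the final UNORD_Q compare plus masked move selects
// atmp whenever it is NaN, per the pseudo code in vminmax_fp.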
vminpd(dst, atmp, btmp, vlen_enc); 1431 evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1432 evmovdquq(dst, ktmp, atmp, merge, vlen_enc); 1433 } else { 1434 assert(is_double_word && !is_min, "sanity"); 1435 evpmovq2m(ktmp, b, vlen_enc); 1436 evblendmpd(atmp, ktmp, a, b, merge, vlen_enc); 1437 evblendmpd(btmp, ktmp, b, a, merge, vlen_enc); 1438 vmaxpd(dst, atmp, btmp, vlen_enc); 1439 evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1440 evmovdquq(dst, ktmp, atmp, merge, vlen_enc); 1441 } 1442 } 1443 1444 // Float/Double signum 1445 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) { 1446 assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity"); 1447 1448 Label DONE_LABEL; 1449 1450 if (opcode == Op_SignumF) { 1451 assert(UseSSE > 0, "required"); 1452 ucomiss(dst, zero); 1453 jcc(Assembler::equal, DONE_LABEL); // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument 1454 jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN 1455 movflt(dst, one); 1456 jcc(Assembler::above, DONE_LABEL); 1457 xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg); 1458 } else if (opcode == Op_SignumD) { 1459 assert(UseSSE > 1, "required"); 1460 ucomisd(dst, zero); 1461 jcc(Assembler::equal, DONE_LABEL); // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument 1462 jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN 1463 movdbl(dst, one); 1464 jcc(Assembler::above, DONE_LABEL); 1465 xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg); 1466 } 1467 1468 bind(DONE_LABEL); 1469 } 1470 1471 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) { 1472 if (sign) { 1473 pmovsxbw(dst, src); 1474 } else { 1475 pmovzxbw(dst, src); 1476 } 1477 } 1478 1479 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1480 if (sign) { 1481 vpmovsxbw(dst, src, vector_len); 1482 } else { 1483 vpmovzxbw(dst, src, vector_len); 1484 } 1485 } 1486 1487 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1488 if (sign) { 1489 vpmovsxbd(dst, src, vector_len); 1490 } else { 1491 vpmovzxbd(dst, src, vector_len); 1492 } 1493 } 1494 1495 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1496 if (sign) { 1497 vpmovsxwd(dst, src, vector_len); 1498 } else { 1499 vpmovzxwd(dst, src, vector_len); 1500 } 1501 } 1502 1503 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src, 1504 int shift, int vector_len) { 1505 if (opcode == Op_RotateLeftV) { 1506 if (etype == T_INT) { 1507 evprold(dst, src, shift, vector_len); 1508 } else { 1509 assert(etype == T_LONG, "expected type T_LONG"); 1510 evprolq(dst, src, shift, vector_len); 1511 } 1512 } else { 1513 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV"); 1514 if (etype == T_INT) { 1515 evprord(dst, src, shift, vector_len); 1516 } else { 1517 assert(etype == T_LONG, "expected type T_LONG"); 1518 evprorq(dst, src, shift, vector_len); 1519 } 1520 } 1521 } 1522 1523 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src, 1524 XMMRegister shift, int vector_len) { 1525 if (opcode == Op_RotateLeftV) { 1526 if (etype == T_INT) { 1527 evprolvd(dst, src, shift, vector_len); 1528 } else { 1529 assert(etype == 
T_LONG, "expected type T_LONG");
      evprolvq(dst, src, shift, vector_len);
    }
  } else {
    assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
    if (etype == T_INT) {
      evprorvd(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprorvq(dst, src, shift, vector_len);
    }
  }
}

void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
  if (opcode == Op_RShiftVI) {
    psrad(dst, shift);
  } else if (opcode == Op_LShiftVI) {
    pslld(dst, shift);
  } else {
    assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
    psrld(dst, shift);
  }
}

void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVI:  psrad(dst, shift); break;
    case Op_LShiftVI:  pslld(dst, shift); break;
    case Op_URShiftVI: psrld(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
  if (opcode == Op_RShiftVI) {
    vpsrad(dst, nds, shift, vector_len);
  } else if (opcode == Op_LShiftVI) {
    vpslld(dst, nds, shift, vector_len);
  } else {
    assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
    vpsrld(dst, nds, shift, vector_len);
  }
}

void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
    case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
    case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  psraw(dst, shift); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  psllw(dst, shift); break;

    case Op_URShiftVS: // fall-through
    case Op_URShiftVB: psrlw(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;

    case Op_URShiftVS: // fall-through
    case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
    case Op_LShiftVL:  psllq(dst, shift); break;
    case Op_URShiftVL: psrlq(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
  if (opcode == Op_RShiftVL) {
    psrlq(dst, shift); // using srl to implement sra on pre-avx512 systems
  } else if (opcode == Op_LShiftVL) {
psllq(dst, shift); 1630 } else { 1631 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL"); 1632 psrlq(dst, shift); 1633 } 1634 } 1635 1636 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1637 switch (opcode) { 1638 case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break; 1639 case Op_LShiftVL: vpsllq(dst, src, shift, vlen_enc); break; 1640 case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break; 1641 1642 default: assert(false, "%s", NodeClassNames[opcode]); 1643 } 1644 } 1645 1646 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 1647 if (opcode == Op_RShiftVL) { 1648 evpsraq(dst, nds, shift, vector_len); 1649 } else if (opcode == Op_LShiftVL) { 1650 vpsllq(dst, nds, shift, vector_len); 1651 } else { 1652 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL"); 1653 vpsrlq(dst, nds, shift, vector_len); 1654 } 1655 } 1656 1657 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1658 switch (opcode) { 1659 case Op_RShiftVB: // fall-through 1660 case Op_RShiftVS: // fall-through 1661 case Op_RShiftVI: vpsravd(dst, src, shift, vlen_enc); break; 1662 1663 case Op_LShiftVB: // fall-through 1664 case Op_LShiftVS: // fall-through 1665 case Op_LShiftVI: vpsllvd(dst, src, shift, vlen_enc); break; 1666 1667 case Op_URShiftVB: // fall-through 1668 case Op_URShiftVS: // fall-through 1669 case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break; 1670 1671 default: assert(false, "%s", NodeClassNames[opcode]); 1672 } 1673 } 1674 1675 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1676 switch (opcode) { 1677 case Op_RShiftVB: // fall-through 1678 case Op_RShiftVS: evpsravw(dst, src, shift, vlen_enc); break; 1679 1680 case Op_LShiftVB: // fall-through 1681 case Op_LShiftVS: evpsllvw(dst, src, shift, vlen_enc); break; 1682 1683 case Op_URShiftVB: // fall-through 1684 case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break; 1685 1686 default: assert(false, "%s", NodeClassNames[opcode]); 1687 } 1688 } 1689 1690 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) { 1691 assert(UseAVX >= 2, "required"); 1692 switch (opcode) { 1693 case Op_RShiftVL: { 1694 if (UseAVX > 2) { 1695 assert(tmp == xnoreg, "not used"); 1696 if (!VM_Version::supports_avx512vl()) { 1697 vlen_enc = Assembler::AVX_512bit; 1698 } 1699 evpsravq(dst, src, shift, vlen_enc); 1700 } else { 1701 vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask())); 1702 vpsrlvq(dst, src, shift, vlen_enc); 1703 vpsrlvq(tmp, tmp, shift, vlen_enc); 1704 vpxor(dst, dst, tmp, vlen_enc); 1705 vpsubq(dst, dst, tmp, vlen_enc); 1706 } 1707 break; 1708 } 1709 case Op_LShiftVL: { 1710 assert(tmp == xnoreg, "not used"); 1711 vpsllvq(dst, src, shift, vlen_enc); 1712 break; 1713 } 1714 case Op_URShiftVL: { 1715 assert(tmp == xnoreg, "not used"); 1716 vpsrlvq(dst, src, shift, vlen_enc); 1717 break; 1718 } 1719 default: assert(false, "%s", NodeClassNames[opcode]); 1720 } 1721 } 1722 1723 // Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst 1724 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) { 1725 assert(opcode == Op_LShiftVB || 1726 opcode == Op_RShiftVB || 1727 
opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]); 1728 bool sign = (opcode != Op_URShiftVB); 1729 assert(vector_len == 0, "required"); 1730 vextendbd(sign, dst, src, 1); 1731 vpmovzxbd(vtmp, shift, 1); 1732 varshiftd(opcode, dst, dst, vtmp, 1); 1733 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg); 1734 vextracti128_high(vtmp, dst); 1735 vpackusdw(dst, dst, vtmp, 0); 1736 } 1737 1738 // Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst 1739 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) { 1740 assert(opcode == Op_LShiftVB || 1741 opcode == Op_RShiftVB || 1742 opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]); 1743 bool sign = (opcode != Op_URShiftVB); 1744 int ext_vector_len = vector_len + 1; 1745 vextendbw(sign, dst, src, ext_vector_len); 1746 vpmovzxbw(vtmp, shift, ext_vector_len); 1747 varshiftw(opcode, dst, dst, vtmp, ext_vector_len); 1748 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg); 1749 if (vector_len == 0) { 1750 vextracti128_high(vtmp, dst); 1751 vpackuswb(dst, dst, vtmp, vector_len); 1752 } else { 1753 vextracti64x4_high(vtmp, dst); 1754 vpackuswb(dst, dst, vtmp, vector_len); 1755 vpermq(dst, dst, 0xD8, vector_len); 1756 } 1757 } 1758 1759 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) { 1760 switch(typ) { 1761 case T_BYTE: 1762 pinsrb(dst, val, idx); 1763 break; 1764 case T_SHORT: 1765 pinsrw(dst, val, idx); 1766 break; 1767 case T_INT: 1768 pinsrd(dst, val, idx); 1769 break; 1770 case T_LONG: 1771 pinsrq(dst, val, idx); 1772 break; 1773 default: 1774 assert(false,"Should not reach here."); 1775 break; 1776 } 1777 } 1778 1779 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) { 1780 switch(typ) { 1781 case T_BYTE: 1782 vpinsrb(dst, src, val, idx); 1783 break; 1784 case T_SHORT: 1785 vpinsrw(dst, src, val, idx); 1786 break; 1787 case T_INT: 1788 vpinsrd(dst, src, val, idx); 1789 break; 1790 case T_LONG: 1791 vpinsrq(dst, src, val, idx); 1792 break; 1793 default: 1794 assert(false,"Should not reach here."); 1795 break; 1796 } 1797 } 1798 1799 #ifdef _LP64 1800 void C2_MacroAssembler::vgather8b_masked_offset(BasicType elem_bt, 1801 XMMRegister dst, Register base, 1802 Register idx_base, 1803 Register offset, Register mask, 1804 Register mask_idx, Register rtmp, 1805 int vlen_enc) { 1806 vpxor(dst, dst, dst, vlen_enc); 1807 if (elem_bt == T_SHORT) { 1808 for (int i = 0; i < 4; i++) { 1809 // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0 1810 Label skip_load; 1811 btq(mask, mask_idx); 1812 jccb(Assembler::carryClear, skip_load); 1813 movl(rtmp, Address(idx_base, i * 4)); 1814 if (offset != noreg) { 1815 addl(rtmp, offset); 1816 } 1817 pinsrw(dst, Address(base, rtmp, Address::times_2), i); 1818 bind(skip_load); 1819 incq(mask_idx); 1820 } 1821 } else { 1822 assert(elem_bt == T_BYTE, ""); 1823 for (int i = 0; i < 8; i++) { 1824 // dst[i] = mask[i] ? 
src[offset + idx_base[i]] : 0
      Label skip_load;
      btq(mask, mask_idx);
      jccb(Assembler::carryClear, skip_load);
      movl(rtmp, Address(idx_base, i * 4));
      if (offset != noreg) {
        addl(rtmp, offset);
      }
      pinsrb(dst, Address(base, rtmp), i);
      bind(skip_load);
      incq(mask_idx);
    }
  }
}
#endif // _LP64

void C2_MacroAssembler::vgather8b_offset(BasicType elem_bt, XMMRegister dst,
                                         Register base, Register idx_base,
                                         Register offset, Register rtmp,
                                         int vlen_enc) {
  vpxor(dst, dst, dst, vlen_enc);
  if (elem_bt == T_SHORT) {
    for (int i = 0; i < 4; i++) {
      // dst[i] = src[offset + idx_base[i]]
      movl(rtmp, Address(idx_base, i * 4));
      if (offset != noreg) {
        addl(rtmp, offset);
      }
      pinsrw(dst, Address(base, rtmp, Address::times_2), i);
    }
  } else {
    assert(elem_bt == T_BYTE, "");
    for (int i = 0; i < 8; i++) {
      // dst[i] = src[offset + idx_base[i]]
      movl(rtmp, Address(idx_base, i * 4));
      if (offset != noreg) {
        addl(rtmp, offset);
      }
      pinsrb(dst, Address(base, rtmp), i);
    }
  }
}

/*
 * Gather using a hybrid algorithm: first partially unroll a scalar loop
 * to accumulate values from gather indices into a quad-word (64-bit) slice.
 * A slice may hold 8 bytes or 4 short values. This is followed by a vector
 * permutation to place the slice into the appropriate vector lane
 * locations in the destination vector. The following pseudo code describes the
 * algorithm in detail:
 *
 * DST_VEC = ZERO_VEC
 * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..}
 * TWO_VEC    = {2, 2, 2, 2, 2, 2, 2, 2, 2..}
 * FOREACH_ITER:
 *     TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES
 *     TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX
 *     DST_VEC = DST_VEC OR TEMP_PERM_VEC
 *     PERM_INDEX = PERM_INDEX - TWO_VEC
 *
 * With each iteration, the doubleword permute indices (0,1) corresponding
 * to the gathered quadword get right-shifted by two lane positions.
 *
 */
void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst,
                                        Register base, Register idx_base,
                                        Register offset, Register mask,
                                        XMMRegister xtmp1, XMMRegister xtmp2,
                                        XMMRegister temp_dst, Register rtmp,
                                        Register mask_idx, Register length,
                                        int vector_len, int vlen_enc) {
  Label GATHER8_LOOP;
  assert(is_subword_type(elem_ty), "");
  movl(length, vector_len);
  vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...}
  vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...}
  vallones(xtmp2, vlen_enc);
  vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc);
  vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...}
  load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...}

  bind(GATHER8_LOOP);
  // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES
  if (mask == noreg) {
    vgather8b_offset(elem_ty, temp_dst, base, idx_base, offset, rtmp, vlen_enc);
  } else {
    LP64_ONLY(vgather8b_masked_offset(elem_ty, temp_dst, base, idx_base, offset, mask, mask_idx, rtmp, vlen_enc));
  }
  // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1)
  vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ?
vlen_enc : Assembler::AVX_256bit); 1914 // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2) 1915 vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc); 1916 // DST_VEC = DST_VEC OR TEMP_PERM_VEC 1917 vpor(dst, dst, temp_dst, vlen_enc); 1918 addptr(idx_base, 32 >> (type2aelembytes(elem_ty) - 1)); 1919 subl(length, 8 >> (type2aelembytes(elem_ty) - 1)); 1920 jcc(Assembler::notEqual, GATHER8_LOOP); 1921 } 1922 1923 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) { 1924 switch(typ) { 1925 case T_INT: 1926 vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len); 1927 break; 1928 case T_FLOAT: 1929 vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len); 1930 break; 1931 case T_LONG: 1932 vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len); 1933 break; 1934 case T_DOUBLE: 1935 vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len); 1936 break; 1937 default: 1938 assert(false,"Should not reach here."); 1939 break; 1940 } 1941 } 1942 1943 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) { 1944 switch(typ) { 1945 case T_INT: 1946 evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len); 1947 break; 1948 case T_FLOAT: 1949 evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len); 1950 break; 1951 case T_LONG: 1952 evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len); 1953 break; 1954 case T_DOUBLE: 1955 evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len); 1956 break; 1957 default: 1958 assert(false,"Should not reach here."); 1959 break; 1960 } 1961 } 1962 1963 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) { 1964 switch(typ) { 1965 case T_INT: 1966 evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len); 1967 break; 1968 case T_FLOAT: 1969 evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len); 1970 break; 1971 case T_LONG: 1972 evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len); 1973 break; 1974 case T_DOUBLE: 1975 evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len); 1976 break; 1977 default: 1978 assert(false,"Should not reach here."); 1979 break; 1980 } 1981 } 1982 1983 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) { 1984 if (vlen_in_bytes <= 16) { 1985 pxor (dst, dst); 1986 psubb(dst, src); 1987 switch (elem_bt) { 1988 case T_BYTE: /* nothing to do */ break; 1989 case T_SHORT: pmovsxbw(dst, dst); break; 1990 case T_INT: pmovsxbd(dst, dst); break; 1991 case T_FLOAT: pmovsxbd(dst, dst); break; 1992 case T_LONG: pmovsxbq(dst, dst); break; 1993 case T_DOUBLE: pmovsxbq(dst, dst); break; 1994 1995 default: assert(false, "%s", type2name(elem_bt)); 1996 } 1997 } else { 1998 assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, ""); 1999 int vlen_enc = vector_length_encoding(vlen_in_bytes); 2000 2001 vpxor (dst, dst, dst, vlen_enc); 2002 vpsubb(dst, dst, src, is_legacy ? 
AVX_256bit : vlen_enc); 2003 2004 switch (elem_bt) { 2005 case T_BYTE: /* nothing to do */ break; 2006 case T_SHORT: vpmovsxbw(dst, dst, vlen_enc); break; 2007 case T_INT: vpmovsxbd(dst, dst, vlen_enc); break; 2008 case T_FLOAT: vpmovsxbd(dst, dst, vlen_enc); break; 2009 case T_LONG: vpmovsxbq(dst, dst, vlen_enc); break; 2010 case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break; 2011 2012 default: assert(false, "%s", type2name(elem_bt)); 2013 } 2014 } 2015 } 2016 2017 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) { 2018 if (novlbwdq) { 2019 vpmovsxbd(xtmp, src, vlen_enc); 2020 evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()), 2021 Assembler::eq, true, vlen_enc, noreg); 2022 } else { 2023 vpxor(xtmp, xtmp, xtmp, vlen_enc); 2024 vpsubb(xtmp, xtmp, src, vlen_enc); 2025 evpmovb2m(dst, xtmp, vlen_enc); 2026 } 2027 } 2028 2029 void C2_MacroAssembler::load_vector(XMMRegister dst, Address src, int vlen_in_bytes) { 2030 switch (vlen_in_bytes) { 2031 case 4: movdl(dst, src); break; 2032 case 8: movq(dst, src); break; 2033 case 16: movdqu(dst, src); break; 2034 case 32: vmovdqu(dst, src); break; 2035 case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break; 2036 default: ShouldNotReachHere(); 2037 } 2038 } 2039 2040 void C2_MacroAssembler::load_vector(XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) { 2041 assert(rscratch != noreg || always_reachable(src), "missing"); 2042 2043 if (reachable(src)) { 2044 load_vector(dst, as_Address(src), vlen_in_bytes); 2045 } else { 2046 lea(rscratch, src); 2047 load_vector(dst, Address(rscratch, 0), vlen_in_bytes); 2048 } 2049 } 2050 2051 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) { 2052 int vlen_enc = vector_length_encoding(vlen); 2053 if (VM_Version::supports_avx()) { 2054 if (bt == T_LONG) { 2055 if (VM_Version::supports_avx2()) { 2056 vpbroadcastq(dst, src, vlen_enc); 2057 } else { 2058 vmovddup(dst, src, vlen_enc); 2059 } 2060 } else if (bt == T_DOUBLE) { 2061 if (vlen_enc != Assembler::AVX_128bit) { 2062 vbroadcastsd(dst, src, vlen_enc, noreg); 2063 } else { 2064 vmovddup(dst, src, vlen_enc); 2065 } 2066 } else { 2067 if (VM_Version::supports_avx2() && is_integral_type(bt)) { 2068 vpbroadcastd(dst, src, vlen_enc); 2069 } else { 2070 vbroadcastss(dst, src, vlen_enc); 2071 } 2072 } 2073 } else if (VM_Version::supports_sse3()) { 2074 movddup(dst, src); 2075 } else { 2076 movq(dst, src); 2077 if (vlen == 16) { 2078 punpcklqdq(dst, dst); 2079 } 2080 } 2081 } 2082 2083 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) { 2084 // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64. 2085 int offset = exact_log2(type2aelembytes(bt)) << 6; 2086 if (is_floating_point_type(bt)) { 2087 offset += 128; 2088 } 2089 ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset); 2090 load_vector(dst, addr, vlen_in_bytes); 2091 } 2092 2093 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles. 
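// The reduce* helpers below all follow the same halving pattern: the upper half of the
// vector is extracted and combined lanewise into the lower half, repeating until one
// lane remains, which is then combined with the incoming scalar accumulator (src1).
// A scalar sketch of that recursion for an integer add reduction (illustrative only,
// not generated code; reduce_add_sketch is a hypothetical name):
//
//   int reduce_add_sketch(int* lanes, int n, int acc) { // n is a power of two
//     while (n > 1) {
//       for (int i = 0; i < n / 2; i++) {
//         lanes[i] += lanes[i + n / 2]; // fold upper half into lower half
//       }
//       n /= 2;
//     }
//     return acc + lanes[0];
//   }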
2094 2095 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) { 2096 int vector_len = Assembler::AVX_128bit; 2097 2098 switch (opcode) { 2099 case Op_AndReductionV: pand(dst, src); break; 2100 case Op_OrReductionV: por (dst, src); break; 2101 case Op_XorReductionV: pxor(dst, src); break; 2102 case Op_MinReductionV: 2103 switch (typ) { 2104 case T_BYTE: pminsb(dst, src); break; 2105 case T_SHORT: pminsw(dst, src); break; 2106 case T_INT: pminsd(dst, src); break; 2107 case T_LONG: assert(UseAVX > 2, "required"); 2108 vpminsq(dst, dst, src, Assembler::AVX_128bit); break; 2109 default: assert(false, "wrong type"); 2110 } 2111 break; 2112 case Op_MaxReductionV: 2113 switch (typ) { 2114 case T_BYTE: pmaxsb(dst, src); break; 2115 case T_SHORT: pmaxsw(dst, src); break; 2116 case T_INT: pmaxsd(dst, src); break; 2117 case T_LONG: assert(UseAVX > 2, "required"); 2118 vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break; 2119 default: assert(false, "wrong type"); 2120 } 2121 break; 2122 case Op_AddReductionVF: addss(dst, src); break; 2123 case Op_AddReductionVD: addsd(dst, src); break; 2124 case Op_AddReductionVI: 2125 switch (typ) { 2126 case T_BYTE: paddb(dst, src); break; 2127 case T_SHORT: paddw(dst, src); break; 2128 case T_INT: paddd(dst, src); break; 2129 default: assert(false, "wrong type"); 2130 } 2131 break; 2132 case Op_AddReductionVL: paddq(dst, src); break; 2133 case Op_MulReductionVF: mulss(dst, src); break; 2134 case Op_MulReductionVD: mulsd(dst, src); break; 2135 case Op_MulReductionVI: 2136 switch (typ) { 2137 case T_SHORT: pmullw(dst, src); break; 2138 case T_INT: pmulld(dst, src); break; 2139 default: assert(false, "wrong type"); 2140 } 2141 break; 2142 case Op_MulReductionVL: assert(UseAVX > 2, "required"); 2143 evpmullq(dst, dst, src, vector_len); break; 2144 default: assert(false, "wrong opcode"); 2145 } 2146 } 2147 2148 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 2149 int vector_len = Assembler::AVX_256bit; 2150 2151 switch (opcode) { 2152 case Op_AndReductionV: vpand(dst, src1, src2, vector_len); break; 2153 case Op_OrReductionV: vpor (dst, src1, src2, vector_len); break; 2154 case Op_XorReductionV: vpxor(dst, src1, src2, vector_len); break; 2155 case Op_MinReductionV: 2156 switch (typ) { 2157 case T_BYTE: vpminsb(dst, src1, src2, vector_len); break; 2158 case T_SHORT: vpminsw(dst, src1, src2, vector_len); break; 2159 case T_INT: vpminsd(dst, src1, src2, vector_len); break; 2160 case T_LONG: assert(UseAVX > 2, "required"); 2161 vpminsq(dst, src1, src2, vector_len); break; 2162 default: assert(false, "wrong type"); 2163 } 2164 break; 2165 case Op_MaxReductionV: 2166 switch (typ) { 2167 case T_BYTE: vpmaxsb(dst, src1, src2, vector_len); break; 2168 case T_SHORT: vpmaxsw(dst, src1, src2, vector_len); break; 2169 case T_INT: vpmaxsd(dst, src1, src2, vector_len); break; 2170 case T_LONG: assert(UseAVX > 2, "required"); 2171 vpmaxsq(dst, src1, src2, vector_len); break; 2172 default: assert(false, "wrong type"); 2173 } 2174 break; 2175 case Op_AddReductionVI: 2176 switch (typ) { 2177 case T_BYTE: vpaddb(dst, src1, src2, vector_len); break; 2178 case T_SHORT: vpaddw(dst, src1, src2, vector_len); break; 2179 case T_INT: vpaddd(dst, src1, src2, vector_len); break; 2180 default: assert(false, "wrong type"); 2181 } 2182 break; 2183 case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break; 2184 case Op_MulReductionVI: 2185 switch (typ) { 2186 
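      // Note: T_BYTE is absent here by design; x86 has no lanewise byte multiply,
      // so byte multiply reductions are widened to short in the mulreduce*B
      // helpers further down.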
case T_SHORT: vpmullw(dst, src1, src2, vector_len); break; 2187 case T_INT: vpmulld(dst, src1, src2, vector_len); break; 2188 default: assert(false, "wrong type"); 2189 } 2190 break; 2191 case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break; 2192 default: assert(false, "wrong opcode"); 2193 } 2194 } 2195 2196 void C2_MacroAssembler::reduce_fp(int opcode, int vlen, 2197 XMMRegister dst, XMMRegister src, 2198 XMMRegister vtmp1, XMMRegister vtmp2) { 2199 switch (opcode) { 2200 case Op_AddReductionVF: 2201 case Op_MulReductionVF: 2202 reduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 2203 break; 2204 2205 case Op_AddReductionVD: 2206 case Op_MulReductionVD: 2207 reduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 2208 break; 2209 2210 default: assert(false, "wrong opcode"); 2211 } 2212 } 2213 2214 void C2_MacroAssembler::reduceB(int opcode, int vlen, 2215 Register dst, Register src1, XMMRegister src2, 2216 XMMRegister vtmp1, XMMRegister vtmp2) { 2217 switch (vlen) { 2218 case 8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2219 case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2220 case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2221 case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2222 2223 default: assert(false, "wrong vector length"); 2224 } 2225 } 2226 2227 void C2_MacroAssembler::mulreduceB(int opcode, int vlen, 2228 Register dst, Register src1, XMMRegister src2, 2229 XMMRegister vtmp1, XMMRegister vtmp2) { 2230 switch (vlen) { 2231 case 8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2232 case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2233 case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2234 case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2235 2236 default: assert(false, "wrong vector length"); 2237 } 2238 } 2239 2240 void C2_MacroAssembler::reduceS(int opcode, int vlen, 2241 Register dst, Register src1, XMMRegister src2, 2242 XMMRegister vtmp1, XMMRegister vtmp2) { 2243 switch (vlen) { 2244 case 4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2245 case 8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2246 case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2247 case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2248 2249 default: assert(false, "wrong vector length"); 2250 } 2251 } 2252 2253 void C2_MacroAssembler::reduceI(int opcode, int vlen, 2254 Register dst, Register src1, XMMRegister src2, 2255 XMMRegister vtmp1, XMMRegister vtmp2) { 2256 switch (vlen) { 2257 case 2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2258 case 4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2259 case 8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2260 case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2261 2262 default: assert(false, "wrong vector length"); 2263 } 2264 } 2265 2266 #ifdef _LP64 2267 void C2_MacroAssembler::reduceL(int opcode, int vlen, 2268 Register dst, Register src1, XMMRegister src2, 2269 XMMRegister vtmp1, XMMRegister vtmp2) { 2270 switch (vlen) { 2271 case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2272 case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2273 case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2274 2275 default: assert(false, "wrong vector length"); 2276 } 2277 } 2278 #endif // _LP64 2279 2280 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister 
src, XMMRegister vtmp1, XMMRegister vtmp2) { 2281 switch (vlen) { 2282 case 2: 2283 assert(vtmp2 == xnoreg, ""); 2284 reduce2F(opcode, dst, src, vtmp1); 2285 break; 2286 case 4: 2287 assert(vtmp2 == xnoreg, ""); 2288 reduce4F(opcode, dst, src, vtmp1); 2289 break; 2290 case 8: 2291 reduce8F(opcode, dst, src, vtmp1, vtmp2); 2292 break; 2293 case 16: 2294 reduce16F(opcode, dst, src, vtmp1, vtmp2); 2295 break; 2296 default: assert(false, "wrong vector length"); 2297 } 2298 } 2299 2300 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2301 switch (vlen) { 2302 case 2: 2303 assert(vtmp2 == xnoreg, ""); 2304 reduce2D(opcode, dst, src, vtmp1); 2305 break; 2306 case 4: 2307 reduce4D(opcode, dst, src, vtmp1, vtmp2); 2308 break; 2309 case 8: 2310 reduce8D(opcode, dst, src, vtmp1, vtmp2); 2311 break; 2312 default: assert(false, "wrong vector length"); 2313 } 2314 } 2315 2316 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2317 if (opcode == Op_AddReductionVI) { 2318 if (vtmp1 != src2) { 2319 movdqu(vtmp1, src2); 2320 } 2321 phaddd(vtmp1, vtmp1); 2322 } else { 2323 pshufd(vtmp1, src2, 0x1); 2324 reduce_operation_128(T_INT, opcode, vtmp1, src2); 2325 } 2326 movdl(vtmp2, src1); 2327 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2328 movdl(dst, vtmp1); 2329 } 2330 2331 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2332 if (opcode == Op_AddReductionVI) { 2333 if (vtmp1 != src2) { 2334 movdqu(vtmp1, src2); 2335 } 2336 phaddd(vtmp1, src2); 2337 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2338 } else { 2339 pshufd(vtmp2, src2, 0xE); 2340 reduce_operation_128(T_INT, opcode, vtmp2, src2); 2341 reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2342 } 2343 } 2344 2345 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2346 if (opcode == Op_AddReductionVI) { 2347 vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit); 2348 vextracti128_high(vtmp2, vtmp1); 2349 vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit); 2350 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2351 } else { 2352 vextracti128_high(vtmp1, src2); 2353 reduce_operation_128(T_INT, opcode, vtmp1, src2); 2354 reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2355 } 2356 } 2357 2358 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2359 vextracti64x4_high(vtmp2, src2); 2360 reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2); 2361 reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2362 } 2363 2364 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2365 pshufd(vtmp2, src2, 0x1); 2366 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 2367 movdqu(vtmp1, vtmp2); 2368 psrldq(vtmp1, 2); 2369 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 2370 movdqu(vtmp2, vtmp1); 2371 psrldq(vtmp2, 1); 2372 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 2373 movdl(vtmp2, src1); 2374 pmovsxbd(vtmp1, vtmp1); 2375 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2376 pextrb(dst, vtmp1, 0x0); 2377 movsbl(dst, dst); 2378 } 2379 2380 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2381 
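  // 16-byte reduction: pshufd(0xE) below moves the upper 8 bytes of src2 into the
  // low half of vtmp1, the lanewise op folds the two halves together, and reduce8B
  // finishes the remaining 8-byte tail.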
pshufd(vtmp1, src2, 0xE); 2382 reduce_operation_128(T_BYTE, opcode, vtmp1, src2); 2383 reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2384 } 2385 2386 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2387 vextracti128_high(vtmp2, src2); 2388 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 2389 reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2390 } 2391 2392 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2393 vextracti64x4_high(vtmp1, src2); 2394 reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2); 2395 reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2396 } 2397 2398 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2399 pmovsxbw(vtmp2, src2); 2400 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2401 } 2402 2403 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2404 if (UseAVX > 1) { 2405 int vector_len = Assembler::AVX_256bit; 2406 vpmovsxbw(vtmp1, src2, vector_len); 2407 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2408 } else { 2409 pmovsxbw(vtmp2, src2); 2410 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2411 pshufd(vtmp2, src2, 0x1); 2412 pmovsxbw(vtmp2, src2); 2413 reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2414 } 2415 } 2416 2417 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2418 if (UseAVX > 2 && VM_Version::supports_avx512bw()) { 2419 int vector_len = Assembler::AVX_512bit; 2420 vpmovsxbw(vtmp1, src2, vector_len); 2421 reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2422 } else { 2423 assert(UseAVX >= 2,"Should not reach here."); 2424 mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); 2425 vextracti128_high(vtmp2, src2); 2426 mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2427 } 2428 } 2429 2430 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2431 mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); 2432 vextracti64x4_high(vtmp2, src2); 2433 mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2434 } 2435 2436 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2437 if (opcode == Op_AddReductionVI) { 2438 if (vtmp1 != src2) { 2439 movdqu(vtmp1, src2); 2440 } 2441 phaddw(vtmp1, vtmp1); 2442 phaddw(vtmp1, vtmp1); 2443 } else { 2444 pshufd(vtmp2, src2, 0x1); 2445 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2446 movdqu(vtmp1, vtmp2); 2447 psrldq(vtmp1, 2); 2448 reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2); 2449 } 2450 movdl(vtmp2, src1); 2451 pmovsxwd(vtmp1, vtmp1); 2452 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2453 pextrw(dst, vtmp1, 0x0); 2454 movswl(dst, dst); 2455 } 2456 2457 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2458 if (opcode == Op_AddReductionVI) { 2459 if (vtmp1 != src2) { 2460 movdqu(vtmp1, src2); 2461 } 2462 phaddw(vtmp1, src2); 2463 } else { 2464 pshufd(vtmp1, src2, 0xE); 2465 reduce_operation_128(T_SHORT, opcode, vtmp1, src2); 2466 } 2467 reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2468 } 2469 2470 void 
C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2471 if (opcode == Op_AddReductionVI) { 2472 int vector_len = Assembler::AVX_256bit; 2473 vphaddw(vtmp2, src2, src2, vector_len); 2474 vpermq(vtmp2, vtmp2, 0xD8, vector_len); 2475 } else { 2476 vextracti128_high(vtmp2, src2); 2477 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2478 } 2479 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2480 } 2481 2482 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2483 int vector_len = Assembler::AVX_256bit; 2484 vextracti64x4_high(vtmp1, src2); 2485 reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2); 2486 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2487 } 2488 2489 #ifdef _LP64 2490 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2491 pshufd(vtmp2, src2, 0xE); 2492 reduce_operation_128(T_LONG, opcode, vtmp2, src2); 2493 movdq(vtmp1, src1); 2494 reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2); 2495 movdq(dst, vtmp1); 2496 } 2497 2498 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2499 vextracti128_high(vtmp1, src2); 2500 reduce_operation_128(T_LONG, opcode, vtmp1, src2); 2501 reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2502 } 2503 2504 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2505 vextracti64x4_high(vtmp2, src2); 2506 reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2); 2507 reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2508 } 2509 2510 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) { 2511 mov64(temp, -1L); 2512 bzhiq(temp, temp, len); 2513 kmovql(dst, temp); 2514 } 2515 #endif // _LP64 2516 2517 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2518 reduce_operation_128(T_FLOAT, opcode, dst, src); 2519 pshufd(vtmp, src, 0x1); 2520 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2521 } 2522 2523 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2524 reduce2F(opcode, dst, src, vtmp); 2525 pshufd(vtmp, src, 0x2); 2526 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2527 pshufd(vtmp, src, 0x3); 2528 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2529 } 2530 2531 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2532 reduce4F(opcode, dst, src, vtmp2); 2533 vextractf128_high(vtmp2, src); 2534 reduce4F(opcode, dst, vtmp2, vtmp1); 2535 } 2536 2537 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2538 reduce8F(opcode, dst, src, vtmp1, vtmp2); 2539 vextracti64x4_high(vtmp1, src); 2540 reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2); 2541 } 2542 2543 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2544 reduce_operation_128(T_DOUBLE, opcode, dst, src); 2545 pshufd(vtmp, src, 0xE); 2546 reduce_operation_128(T_DOUBLE, opcode, dst, vtmp); 2547 } 2548 2549 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2550 reduce2D(opcode, dst, src, vtmp2); 2551 vextractf128_high(vtmp2, src); 2552 
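  // Fold the upper 128-bit lane (extracted into vtmp2 above) into the running
  // result already accumulated in dst.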
reduce2D(opcode, dst, vtmp2, vtmp1); 2553 } 2554 2555 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2556 reduce4D(opcode, dst, src, vtmp1, vtmp2); 2557 vextracti64x4_high(vtmp1, src); 2558 reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2); 2559 } 2560 2561 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) { 2562 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2563 } 2564 2565 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) { 2566 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2567 } 2568 2569 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask, 2570 int vec_enc) { 2571 switch(elem_bt) { 2572 case T_INT: 2573 case T_FLOAT: 2574 vmaskmovps(dst, src, mask, vec_enc); 2575 break; 2576 case T_LONG: 2577 case T_DOUBLE: 2578 vmaskmovpd(dst, src, mask, vec_enc); 2579 break; 2580 default: 2581 fatal("Unsupported type %s", type2name(elem_bt)); 2582 break; 2583 } 2584 } 2585 2586 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask, 2587 int vec_enc) { 2588 switch(elem_bt) { 2589 case T_INT: 2590 case T_FLOAT: 2591 vmaskmovps(dst, src, mask, vec_enc); 2592 break; 2593 case T_LONG: 2594 case T_DOUBLE: 2595 vmaskmovpd(dst, src, mask, vec_enc); 2596 break; 2597 default: 2598 fatal("Unsupported type %s", type2name(elem_bt)); 2599 break; 2600 } 2601 } 2602 2603 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid, 2604 XMMRegister dst, XMMRegister src, 2605 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2606 XMMRegister xmm_0, XMMRegister xmm_1) { 2607 const int permconst[] = {1, 14}; 2608 XMMRegister wsrc = src; 2609 XMMRegister wdst = xmm_0; 2610 XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1; 2611 2612 int vlen_enc = Assembler::AVX_128bit; 2613 if (vlen == 16) { 2614 vlen_enc = Assembler::AVX_256bit; 2615 } 2616 2617 for (int i = log2(vlen) - 1; i >=0; i--) { 2618 if (i == 0 && !is_dst_valid) { 2619 wdst = dst; 2620 } 2621 if (i == 3) { 2622 vextracti64x4_high(wtmp, wsrc); 2623 } else if (i == 2) { 2624 vextracti128_high(wtmp, wsrc); 2625 } else { // i = [0,1] 2626 vpermilps(wtmp, wsrc, permconst[i], vlen_enc); 2627 } 2628 vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2629 wsrc = wdst; 2630 vlen_enc = Assembler::AVX_128bit; 2631 } 2632 if (is_dst_valid) { 2633 vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2634 } 2635 } 2636 2637 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src, 2638 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2639 XMMRegister xmm_0, XMMRegister xmm_1) { 2640 XMMRegister wsrc = src; 2641 XMMRegister wdst = xmm_0; 2642 XMMRegister wtmp = (xmm_1 == xnoreg) ? 
xmm_0: xmm_1; 2643 int vlen_enc = Assembler::AVX_128bit; 2644 if (vlen == 8) { 2645 vlen_enc = Assembler::AVX_256bit; 2646 } 2647 for (int i = log2(vlen) - 1; i >=0; i--) { 2648 if (i == 0 && !is_dst_valid) { 2649 wdst = dst; 2650 } 2651 if (i == 1) { 2652 vextracti128_high(wtmp, wsrc); 2653 } else if (i == 2) { 2654 vextracti64x4_high(wtmp, wsrc); 2655 } else { 2656 assert(i == 0, "%d", i); 2657 vpermilpd(wtmp, wsrc, 1, vlen_enc); 2658 } 2659 vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2660 wsrc = wdst; 2661 vlen_enc = Assembler::AVX_128bit; 2662 } 2663 if (is_dst_valid) { 2664 vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2665 } 2666 } 2667 2668 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) { 2669 switch (bt) { 2670 case T_BYTE: pextrb(dst, src, idx); break; 2671 case T_SHORT: pextrw(dst, src, idx); break; 2672 case T_INT: pextrd(dst, src, idx); break; 2673 case T_LONG: pextrq(dst, src, idx); break; 2674 2675 default: 2676 assert(false,"Should not reach here."); 2677 break; 2678 } 2679 } 2680 2681 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) { 2682 int esize = type2aelembytes(typ); 2683 int elem_per_lane = 16/esize; 2684 int lane = elemindex / elem_per_lane; 2685 int eindex = elemindex % elem_per_lane; 2686 2687 if (lane >= 2) { 2688 assert(UseAVX > 2, "required"); 2689 vextractf32x4(dst, src, lane & 3); 2690 return dst; 2691 } else if (lane > 0) { 2692 assert(UseAVX > 0, "required"); 2693 vextractf128(dst, src, lane); 2694 return dst; 2695 } else { 2696 return src; 2697 } 2698 } 2699 2700 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) { 2701 if (typ == T_BYTE) { 2702 movsbl(dst, dst); 2703 } else if (typ == T_SHORT) { 2704 movswl(dst, dst); 2705 } 2706 } 2707 2708 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) { 2709 int esize = type2aelembytes(typ); 2710 int elem_per_lane = 16/esize; 2711 int eindex = elemindex % elem_per_lane; 2712 assert(is_integral_type(typ),"required"); 2713 2714 if (eindex == 0) { 2715 if (typ == T_LONG) { 2716 movq(dst, src); 2717 } else { 2718 movdl(dst, src); 2719 movsxl(typ, dst); 2720 } 2721 } else { 2722 extract(typ, dst, src, eindex); 2723 movsxl(typ, dst); 2724 } 2725 } 2726 2727 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) { 2728 int esize = type2aelembytes(typ); 2729 int elem_per_lane = 16/esize; 2730 int eindex = elemindex % elem_per_lane; 2731 assert((typ == T_FLOAT || typ == T_DOUBLE),"required"); 2732 2733 if (eindex == 0) { 2734 movq(dst, src); 2735 } else { 2736 if (typ == T_FLOAT) { 2737 if (UseAVX == 0) { 2738 movdqu(dst, src); 2739 shufps(dst, dst, eindex); 2740 } else { 2741 vshufps(dst, src, src, eindex, Assembler::AVX_128bit); 2742 } 2743 } else { 2744 if (UseAVX == 0) { 2745 movdqu(dst, src); 2746 psrldq(dst, eindex*esize); 2747 } else { 2748 vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit); 2749 } 2750 movq(dst, dst); 2751 } 2752 } 2753 // Zero upper bits 2754 if (typ == T_FLOAT) { 2755 if (UseAVX == 0) { 2756 assert(vtmp != xnoreg, "required."); 2757 movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg); 2758 pand(dst, vtmp); 2759 } else { 2760 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg); 2761 } 2762 } 2763 } 2764 2765 void 
C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) { 2766 switch(typ) { 2767 case T_BYTE: 2768 case T_BOOLEAN: 2769 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2770 break; 2771 case T_SHORT: 2772 case T_CHAR: 2773 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2774 break; 2775 case T_INT: 2776 case T_FLOAT: 2777 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2778 break; 2779 case T_LONG: 2780 case T_DOUBLE: 2781 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2782 break; 2783 default: 2784 assert(false,"Should not reach here."); 2785 break; 2786 } 2787 } 2788 2789 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) { 2790 assert(rscratch != noreg || always_reachable(src2), "missing"); 2791 2792 switch(typ) { 2793 case T_BOOLEAN: 2794 case T_BYTE: 2795 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2796 break; 2797 case T_CHAR: 2798 case T_SHORT: 2799 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2800 break; 2801 case T_INT: 2802 case T_FLOAT: 2803 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2804 break; 2805 case T_LONG: 2806 case T_DOUBLE: 2807 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2808 break; 2809 default: 2810 assert(false,"Should not reach here."); 2811 break; 2812 } 2813 } 2814 2815 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) { 2816 switch(typ) { 2817 case T_BYTE: 2818 evpblendmb(dst, kmask, src1, src2, merge, vector_len); 2819 break; 2820 case T_SHORT: 2821 evpblendmw(dst, kmask, src1, src2, merge, vector_len); 2822 break; 2823 case T_INT: 2824 case T_FLOAT: 2825 evpblendmd(dst, kmask, src1, src2, merge, vector_len); 2826 break; 2827 case T_LONG: 2828 case T_DOUBLE: 2829 evpblendmq(dst, kmask, src1, src2, merge, vector_len); 2830 break; 2831 default: 2832 assert(false,"Should not reach here."); 2833 break; 2834 } 2835 } 2836 2837 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) { 2838 assert(vlen_in_bytes <= 32, ""); 2839 int esize = type2aelembytes(bt); 2840 if (vlen_in_bytes == 32) { 2841 assert(vtmp == xnoreg, "required."); 2842 if (esize >= 4) { 2843 vtestps(src1, src2, AVX_256bit); 2844 } else { 2845 vptest(src1, src2, AVX_256bit); 2846 } 2847 return; 2848 } 2849 if (vlen_in_bytes < 16) { 2850 // Duplicate the lower part to fill the whole register, 2851 // Don't need to do so for src2 2852 assert(vtmp != xnoreg, "required"); 2853 int shuffle_imm = (vlen_in_bytes == 4) ? 
0x00 : 0x04; 2854 pshufd(vtmp, src1, shuffle_imm); 2855 } else { 2856 assert(vtmp == xnoreg, "required"); 2857 vtmp = src1; 2858 } 2859 if (esize >= 4 && VM_Version::supports_avx()) { 2860 vtestps(vtmp, src2, AVX_128bit); 2861 } else { 2862 ptest(vtmp, src2); 2863 } 2864 } 2865 2866 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) { 2867 assert(UseAVX >= 2, "required"); 2868 #ifdef ASSERT 2869 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2870 bool is_bw_supported = VM_Version::supports_avx512bw(); 2871 if (is_bw && !is_bw_supported) { 2872 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2873 assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16), 2874 "XMM register should be 0-15"); 2875 } 2876 #endif // ASSERT 2877 switch (elem_bt) { 2878 case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return; 2879 case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return; 2880 case T_INT: vpaddd(dst, src1, src2, vlen_enc); return; 2881 case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return; 2882 case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return; 2883 case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return; 2884 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2885 } 2886 } 2887 2888 #ifdef _LP64 2889 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) { 2890 assert(UseAVX >= 2, "required"); 2891 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2892 bool is_vl = vlen_enc != Assembler::AVX_512bit; 2893 if ((UseAVX > 2) && 2894 (!is_bw || VM_Version::supports_avx512bw()) && 2895 (!is_vl || VM_Version::supports_avx512vl())) { 2896 switch (elem_bt) { 2897 case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return; 2898 case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return; 2899 case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return; 2900 case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return; 2901 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2902 } 2903 } else { 2904 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2905 assert((dst->encoding() < 16),"XMM register should be 0-15"); 2906 switch (elem_bt) { 2907 case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return; 2908 case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return; 2909 case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return; 2910 case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return; 2911 case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return; 2912 case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return; 2913 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2914 } 2915 } 2916 } 2917 #endif 2918 2919 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) { 2920 switch (to_elem_bt) { 2921 case T_SHORT: 2922 vpmovsxbw(dst, src, vlen_enc); 2923 break; 2924 case T_INT: 2925 vpmovsxbd(dst, src, vlen_enc); 2926 break; 2927 case T_FLOAT: 2928 vpmovsxbd(dst, src, vlen_enc); 2929 vcvtdq2ps(dst, dst, vlen_enc); 2930 break; 2931 case T_LONG: 2932 vpmovsxbq(dst, src, vlen_enc); 2933 break; 2934 case T_DOUBLE: { 2935 int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? 
Assembler::AVX_256bit : Assembler::AVX_128bit;
      vpmovsxbd(dst, src, mid_vlen_enc);
      vcvtdq2pd(dst, dst, vlen_enc);
      break;
    }
    default:
      fatal("Unsupported type %s", type2name(to_elem_bt));
      break;
  }
}

//-------------------------------------------------------------------------------------------

// IndexOf for constant substrings with size >= 8 chars
// which don't need to be loaded through the stack.
void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
                                         Register cnt1, Register cnt2,
                                         int int_cnt2, Register result,
                                         XMMRegister vec, Register tmp,
                                         int ae) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");

  // This method uses the pcmpestri instruction with bound registers
  //   inputs:
  //     xmm - substring
  //     rax - substring length (elements count)
  //     mem - scanned string
  //     rdx - string length (elements count)
  //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
  //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
  //   outputs:
  //     rcx - matched index in string
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
  int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
  int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
  Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
  Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;

  Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
        RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
        MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;
  assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");

  // Load substring.
  if (ae == StrIntrinsicNode::UL) {
    pmovzxbw(vec, Address(str2, 0));
  } else {
    movdqu(vec, Address(str2, 0));
  }
  movl(cnt2, int_cnt2);
  movptr(result, str1); // string addr

  if (int_cnt2 > stride) {
    jmpb(SCAN_TO_SUBSTR);

    // Reload substr for rescan, this code
    // is executed only for large substrings (> 8 chars)
    bind(RELOAD_SUBSTR);
    if (ae == StrIntrinsicNode::UL) {
      pmovzxbw(vec, Address(str2, 0));
    } else {
      movdqu(vec, Address(str2, 0));
    }
    negptr(cnt2); // Jumped here with negative cnt2, convert to positive

    bind(RELOAD_STR);
    // We came here after the beginning of the substring was
    // matched but the rest of it was not, so we need to search
    // again. Start from the next element after the previous match.

    // cnt2 is number of substring remaining elements and
    // cnt1 is number of string remaining elements when cmp failed.
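    // Worked example with illustrative numbers: for int_cnt2 == 12, if the tail
    // compare failed with cnt2 == 4 substring elements and cnt1 == 20 string
    // elements left, then 12 - 4 == 8 elements were consumed past the match
    // point, and the restore below yields cnt1 = 20 - 4 + 12 == 28, the element
    // count measured from the match start again.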
    // Restored cnt1 = cnt1 - cnt2 + int_cnt2
    subl(cnt1, cnt2);
    addl(cnt1, int_cnt2);
    movl(cnt2, int_cnt2); // Now restore cnt2

    decrementl(cnt1);     // Shift to next element
    cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring

    addptr(result, (1<<scale1));

  } // (int_cnt2 > 8)

  // Scan string for start of substr in 16-byte vectors
  bind(SCAN_TO_SUBSTR);
  pcmpestri(vec, Address(result, 0), mode);
  jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
  subl(cnt1, stride);
  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
  cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
  addptr(result, 16);
  jmpb(SCAN_TO_SUBSTR);

  // Found a potential substr
  bind(FOUND_CANDIDATE);
  // Matched whole vector if first element matched (tmp(rcx) == 0).
  if (int_cnt2 == stride) {
    jccb(Assembler::overflow, RET_FOUND);    // OF == 1
  } else { // int_cnt2 > 8
    jccb(Assembler::overflow, FOUND_SUBSTR);
  }
  // After pcmpestri tmp(rcx) contains matched element index
  // Compute start addr of substr
  lea(result, Address(result, tmp, scale1));

  // Make sure string is still long enough
  subl(cnt1, tmp);
  cmpl(cnt1, cnt2);
  if (int_cnt2 == stride) {
    jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
  } else { // int_cnt2 > 8
    jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
  }
  // Left less than substring.

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmp(EXIT);

  if (int_cnt2 > stride) {
    // This code is optimized for the case when whole substring
    // is matched if its head is matched.
    bind(MATCH_SUBSTR_HEAD);
    pcmpestri(vec, Address(result, 0), mode);
    // Reload only the string if it does not match
    jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0

    Label CONT_SCAN_SUBSTR;
    // Compare the rest of substring (> 8 chars).
    bind(FOUND_SUBSTR);
    // First 8 chars are already matched.
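    // From here on the remaining tail of the substring is scanned with a
    // negative index: the negptr/addptr pair below sets
    // cnt2 = -(int_cnt2 - stride), so the loop can walk toward zero and
    // detect completion from the sign of cnt2 alone.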
    negptr(cnt2);
    addptr(cnt2, stride);

    bind(SCAN_SUBSTR);
    subl(cnt1, stride);
    cmpl(cnt2, -stride); // Do not read beyond substring
    jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
    // Back-up strings to avoid reading beyond substring:
    // cnt1 = cnt1 - cnt2 + 8
    addl(cnt1, cnt2); // cnt2 is negative
    addl(cnt1, stride);
    movl(cnt2, stride); negptr(cnt2);
    bind(CONT_SCAN_SUBSTR);
    if (int_cnt2 < (int)G) {
      int tail_off1 = int_cnt2<<scale1;
      int tail_off2 = int_cnt2<<scale2;
      if (ae == StrIntrinsicNode::UL) {
        pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
      } else {
        movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
      }
      pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
    } else {
      // calculate index in register to avoid integer overflow (int_cnt2*2)
      movl(tmp, int_cnt2);
      addptr(tmp, cnt2);
      if (ae == StrIntrinsicNode::UL) {
        pmovzxbw(vec, Address(str2, tmp, scale2, 0));
      } else {
        movdqu(vec, Address(str2, tmp, scale2, 0));
      }
      pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
    }
    // Need to reload string pointers if we did not match the whole vector
    jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
    addptr(cnt2, stride);
    jcc(Assembler::negative, SCAN_SUBSTR);
    // Fall through if found full substring

  } // (int_cnt2 > 8)

  bind(RET_FOUND);
  // Found result if we matched full small substring.
  // Compute substr offset
  subptr(result, str1);
  if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
    shrl(result, 1); // index
  }
  bind(EXIT);

} // string_indexofC8

// Small strings are loaded through stack if they cross page boundary.
void C2_MacroAssembler::string_indexof(Register str1, Register str2,
                                       Register cnt1, Register cnt2,
                                       int int_cnt2, Register result,
                                       XMMRegister vec, Register tmp,
                                       int ae) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");

  //
  // int_cnt2 is length of small (< 8 chars) constant substring
  // or (-1) for non constant substring in which case its length
  // is in cnt2 register.
  //
  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;
  //
  int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
  assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
  // This method uses the pcmpestri instruction with bound registers
  //   inputs:
  //     xmm - substring
  //     rax - substring length (elements count)
  //     mem - scanned string
  //     rdx - string length (elements count)
  //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
  //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
  //   outputs:
  //     rcx - matched index in string
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
  int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
  Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
  Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ?
Address::times_1 : scale1;

  Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
        RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
        FOUND_CANDIDATE;

  { //========================================================
    // We don't know where these strings are located
    // and we can't read beyond them. Load them through stack.
    Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;

    movptr(tmp, rsp); // save old SP

    if (int_cnt2 > 0) { // small (< 8 chars) constant substring
      if (int_cnt2 == (1>>scale2)) { // One byte
        assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
        load_unsigned_byte(result, Address(str2, 0));
        movdl(vec, result); // move 32 bits
      } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) { // Three bytes
        // Not enough header space in 32-bit VM: 12+3 = 15.
        movl(result, Address(str2, -1));
        shrl(result, 8);
        movdl(vec, result); // move 32 bits
      } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) { // One char
        load_unsigned_short(result, Address(str2, 0));
        movdl(vec, result); // move 32 bits
      } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
        movdl(vec, Address(str2, 0)); // move 32 bits
      } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
        movq(vec, Address(str2, 0));  // move 64 bits
      } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 = {2, ..., 7})
        // Array header size is 12 bytes in 32-bit VM
        // + 6 bytes for 3 chars == 18 bytes,
        // enough space to load vec and shift.
        assert(HeapWordSize*TypeArrayKlass::header_size() >= 12, "sanity");
        if (ae == StrIntrinsicNode::UL) {
          int tail_off = int_cnt2-8;
          pmovzxbw(vec, Address(str2, tail_off));
          psrldq(vec, -2*tail_off);
        }
        else {
          int tail_off = int_cnt2*(1<<scale2);
          movdqu(vec, Address(str2, tail_off-16));
          psrldq(vec, 16-tail_off);
        }
      }
    } else { // not constant substring
      cmpl(cnt2, stride);
      jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough

      // We can read beyond string if str+16 does not cross page boundary
      // since heaps are aligned and mapped by pages.
      assert(os::vm_page_size() < (int)G, "default page should be small");
      movl(result, str2); // We need only low 32 bits
      andl(result, ((int)os::vm_page_size()-1));
      cmpl(result, ((int)os::vm_page_size()-16));
      jccb(Assembler::belowEqual, CHECK_STR);

      // Move small strings to stack to allow load 16 bytes into vec.
      subptr(rsp, 16);
      int stk_offset = wordSize-(1<<scale2);
      push(cnt2);

      bind(COPY_SUBSTR);
      if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
        load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
        movb(Address(rsp, cnt2, scale2, stk_offset), result);
      } else if (ae == StrIntrinsicNode::UU) {
        load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
        movw(Address(rsp, cnt2, scale2, stk_offset), result);
      }
      decrement(cnt2);
      jccb(Assembler::notZero, COPY_SUBSTR);

      pop(cnt2);
      movptr(str2, rsp); // New substring address
    } // non constant

    bind(CHECK_STR);
    cmpl(cnt1, stride);
    jccb(Assembler::aboveEqual, BIG_STRINGS);

    // Check cross page boundary.
    movl(result, str1); // We need only low 32 bits
    andl(result, ((int)os::vm_page_size()-1));
    cmpl(result, ((int)os::vm_page_size()-16));
    jccb(Assembler::belowEqual, BIG_STRINGS);

    subptr(rsp, 16);
    int stk_offset = -(1<<scale1);
    if (int_cnt2 < 0) { // not constant
      push(cnt2);
      stk_offset += wordSize;
    }
    movl(cnt2, cnt1);

    bind(COPY_STR);
    if (ae == StrIntrinsicNode::LL) {
      load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
      movb(Address(rsp, cnt2, scale1, stk_offset), result);
    } else {
      load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
      movw(Address(rsp, cnt2, scale1, stk_offset), result);
    }
    decrement(cnt2);
    jccb(Assembler::notZero, COPY_STR);

    if (int_cnt2 < 0) { // not constant
      pop(cnt2);
    }
    movptr(str1, rsp); // New string address

    bind(BIG_STRINGS);
    // Load substring.
    if (int_cnt2 < 0) { // -1
      if (ae == StrIntrinsicNode::UL) {
        pmovzxbw(vec, Address(str2, 0));
      } else {
        movdqu(vec, Address(str2, 0));
      }
      push(cnt2); // substr count
      push(str2); // substr addr
      push(str1); // string addr
    } else {
      // Small (< 8 chars) constant substrings are loaded already.
      movl(cnt2, int_cnt2);
    }
    push(tmp); // original SP

  } // Finished loading

  //========================================================
  // Start search
  //

  movptr(result, str1); // string addr

  if (int_cnt2 < 0) { // Only for non constant substring
    jmpb(SCAN_TO_SUBSTR);

    // SP saved at sp+0
    // String saved at sp+1*wordSize
    // Substr saved at sp+2*wordSize
    // Substr count saved at sp+3*wordSize

    // Reload substr for rescan, this code
    // is executed only for large substrings (> 8 chars)
    bind(RELOAD_SUBSTR);
    movptr(str2, Address(rsp, 2*wordSize));
    movl(cnt2, Address(rsp, 3*wordSize));
    if (ae == StrIntrinsicNode::UL) {
      pmovzxbw(vec, Address(str2, 0));
    } else {
      movdqu(vec, Address(str2, 0));
    }
    // We came here after the beginning of the substring was
    // matched but the rest of it was not so we need to search
    // again. Start from the next element after the previous match.
    subptr(str1, result); // Restore counter
    if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
      shrl(str1, 1);
    }
    addl(cnt1, str1);
    decrementl(cnt1);   // Shift to next element
    cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring

    addptr(result, (1<<scale1));
  } // non constant

  // Scan string for start of substr in 16-byte vectors
  bind(SCAN_TO_SUBSTR);
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
  pcmpestri(vec, Address(result, 0), mode);
  jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
  subl(cnt1, stride);
  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
  cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
  addptr(result, 16);

  bind(ADJUST_STR);
  cmpl(cnt1, stride); // Do not read beyond string
  jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
  // Back-up string to avoid reading beyond string.
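  // The rewind below repositions result so that the final 16-byte read ends
  // exactly at the last element of the string; the leading part of that
  // vector overlaps chars that were already scanned, which is harmless when
  // searching for the start of the substring again.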
  lea(result, Address(result, cnt1, scale1, -16));
  movl(cnt1, stride);
  jmpb(SCAN_TO_SUBSTR);

  // Found a potential substr
  bind(FOUND_CANDIDATE);
  // After pcmpestri tmp(rcx) contains matched element index

  // Make sure string is still long enough
  subl(cnt1, tmp);
  cmpl(cnt1, cnt2);
  jccb(Assembler::greaterEqual, FOUND_SUBSTR);
  // Left less than substring.

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmp(CLEANUP);

  bind(FOUND_SUBSTR);
  // Compute start addr of substr
  lea(result, Address(result, tmp, scale1));
  if (int_cnt2 > 0) { // Constant substring
    // Repeat search for small substring (< 8 chars)
    // from new point without reloading substring.
    // Have to check that we don't read beyond string.
    cmpl(tmp, stride-int_cnt2);
    jccb(Assembler::greater, ADJUST_STR);
    // Fall through if matched whole substring.
  } else { // non constant
    assert(int_cnt2 == -1, "should be != 0");

    addl(tmp, cnt2);
    // Found result if we matched whole substring.
    cmpl(tmp, stride);
    jcc(Assembler::lessEqual, RET_FOUND);

    // Repeat search for small substring (<= 8 chars)
    // from new point 'str1' without reloading substring.
    cmpl(cnt2, stride);
    // Have to check that we don't read beyond string.
    jccb(Assembler::lessEqual, ADJUST_STR);

    Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
    // Compare the rest of substring (> 8 chars).
    movptr(str1, result);

    cmpl(tmp, cnt2);
    // First 8 chars are already matched.
    jccb(Assembler::equal, CHECK_NEXT);

    bind(SCAN_SUBSTR);
    pcmpestri(vec, Address(str1, 0), mode);
    // Need to reload string pointers if we did not match the whole vector
    jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0

    bind(CHECK_NEXT);
    subl(cnt2, stride);
    jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
    addptr(str1, 16);
    if (ae == StrIntrinsicNode::UL) {
      addptr(str2, 8);
    } else {
      addptr(str2, 16);
    }
    subl(cnt1, stride);
    cmpl(cnt2, stride); // Do not read beyond substring
    jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
    // Back-up strings to avoid reading beyond substring.
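    // In the UL case the latin1 substring advances 8 bytes per stride while
    // the UTF-16 string advances 16, hence the different rebase offsets below.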
3414 3415 if (ae == StrIntrinsicNode::UL) { 3416 lea(str2, Address(str2, cnt2, scale2, -8)); 3417 lea(str1, Address(str1, cnt2, scale1, -16)); 3418 } else { 3419 lea(str2, Address(str2, cnt2, scale2, -16)); 3420 lea(str1, Address(str1, cnt2, scale1, -16)); 3421 } 3422 subl(cnt1, cnt2); 3423 movl(cnt2, stride); 3424 addl(cnt1, stride); 3425 bind(CONT_SCAN_SUBSTR); 3426 if (ae == StrIntrinsicNode::UL) { 3427 pmovzxbw(vec, Address(str2, 0)); 3428 } else { 3429 movdqu(vec, Address(str2, 0)); 3430 } 3431 jmp(SCAN_SUBSTR); 3432 3433 bind(RET_FOUND_LONG); 3434 movptr(str1, Address(rsp, wordSize)); 3435 } // non constant 3436 3437 bind(RET_FOUND); 3438 // Compute substr offset 3439 subptr(result, str1); 3440 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3441 shrl(result, 1); // index 3442 } 3443 bind(CLEANUP); 3444 pop(rsp); // restore SP 3445 3446 } // string_indexof 3447 3448 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3449 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3450 ShortBranchVerifier sbv(this); 3451 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3452 3453 int stride = 8; 3454 3455 Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP, 3456 SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP, 3457 RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT, 3458 FOUND_SEQ_CHAR, DONE_LABEL; 3459 3460 movptr(result, str1); 3461 if (UseAVX >= 2) { 3462 cmpl(cnt1, stride); 3463 jcc(Assembler::less, SCAN_TO_CHAR); 3464 cmpl(cnt1, 2*stride); 3465 jcc(Assembler::less, SCAN_TO_8_CHAR_INIT); 3466 movdl(vec1, ch); 3467 vpbroadcastw(vec1, vec1, Assembler::AVX_256bit); 3468 vpxor(vec2, vec2); 3469 movl(tmp, cnt1); 3470 andl(tmp, 0xFFFFFFF0); //vector count (in chars) 3471 andl(cnt1,0x0000000F); //tail count (in chars) 3472 3473 bind(SCAN_TO_16_CHAR_LOOP); 3474 vmovdqu(vec3, Address(result, 0)); 3475 vpcmpeqw(vec3, vec3, vec1, 1); 3476 vptest(vec2, vec3); 3477 jcc(Assembler::carryClear, FOUND_CHAR); 3478 addptr(result, 32); 3479 subl(tmp, 2*stride); 3480 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); 3481 jmp(SCAN_TO_8_CHAR); 3482 bind(SCAN_TO_8_CHAR_INIT); 3483 movdl(vec1, ch); 3484 pshuflw(vec1, vec1, 0x00); 3485 pshufd(vec1, vec1, 0); 3486 pxor(vec2, vec2); 3487 } 3488 bind(SCAN_TO_8_CHAR); 3489 cmpl(cnt1, stride); 3490 jcc(Assembler::less, SCAN_TO_CHAR); 3491 if (UseAVX < 2) { 3492 movdl(vec1, ch); 3493 pshuflw(vec1, vec1, 0x00); 3494 pshufd(vec1, vec1, 0); 3495 pxor(vec2, vec2); 3496 } 3497 movl(tmp, cnt1); 3498 andl(tmp, 0xFFFFFFF8); //vector count (in chars) 3499 andl(cnt1,0x00000007); //tail count (in chars) 3500 3501 bind(SCAN_TO_8_CHAR_LOOP); 3502 movdqu(vec3, Address(result, 0)); 3503 pcmpeqw(vec3, vec1); 3504 ptest(vec2, vec3); 3505 jcc(Assembler::carryClear, FOUND_CHAR); 3506 addptr(result, 16); 3507 subl(tmp, stride); 3508 jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP); 3509 bind(SCAN_TO_CHAR); 3510 testl(cnt1, cnt1); 3511 jcc(Assembler::zero, RET_NOT_FOUND); 3512 bind(SCAN_TO_CHAR_LOOP); 3513 load_unsigned_short(tmp, Address(result, 0)); 3514 cmpl(ch, tmp); 3515 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3516 addptr(result, 2); 3517 subl(cnt1, 1); 3518 jccb(Assembler::zero, RET_NOT_FOUND); 3519 jmp(SCAN_TO_CHAR_LOOP); 3520 3521 bind(RET_NOT_FOUND); 3522 movl(result, -1); 3523 jmpb(DONE_LABEL); 3524 3525 bind(FOUND_CHAR); 3526 if (UseAVX >= 2) { 3527 vpmovmskb(tmp, vec3); 3528 } else { 3529 pmovmskb(tmp, vec3); 3530 } 3531 bsfl(ch, tmp); 3532 addptr(result, ch); 3533 3534 bind(FOUND_SEQ_CHAR); 3535 
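  // result currently holds the address of the matching char; turn it into a
  // zero-based index: subtract the base address and halve the byte distance,
  // since UTF-16 chars are two bytes wide.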
subptr(result, str1); 3536 shrl(result, 1); 3537 3538 bind(DONE_LABEL); 3539 } // string_indexof_char 3540 3541 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3542 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3543 ShortBranchVerifier sbv(this); 3544 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3545 3546 int stride = 16; 3547 3548 Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP, 3549 SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP, 3550 RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT, 3551 FOUND_SEQ_CHAR, DONE_LABEL; 3552 3553 movptr(result, str1); 3554 if (UseAVX >= 2) { 3555 cmpl(cnt1, stride); 3556 jcc(Assembler::less, SCAN_TO_CHAR_INIT); 3557 cmpl(cnt1, stride*2); 3558 jcc(Assembler::less, SCAN_TO_16_CHAR_INIT); 3559 movdl(vec1, ch); 3560 vpbroadcastb(vec1, vec1, Assembler::AVX_256bit); 3561 vpxor(vec2, vec2); 3562 movl(tmp, cnt1); 3563 andl(tmp, 0xFFFFFFE0); //vector count (in chars) 3564 andl(cnt1,0x0000001F); //tail count (in chars) 3565 3566 bind(SCAN_TO_32_CHAR_LOOP); 3567 vmovdqu(vec3, Address(result, 0)); 3568 vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit); 3569 vptest(vec2, vec3); 3570 jcc(Assembler::carryClear, FOUND_CHAR); 3571 addptr(result, 32); 3572 subl(tmp, stride*2); 3573 jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP); 3574 jmp(SCAN_TO_16_CHAR); 3575 3576 bind(SCAN_TO_16_CHAR_INIT); 3577 movdl(vec1, ch); 3578 pxor(vec2, vec2); 3579 pshufb(vec1, vec2); 3580 } 3581 3582 bind(SCAN_TO_16_CHAR); 3583 cmpl(cnt1, stride); 3584 jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left 3585 if (UseAVX < 2) { 3586 movdl(vec1, ch); 3587 pxor(vec2, vec2); 3588 pshufb(vec1, vec2); 3589 } 3590 movl(tmp, cnt1); 3591 andl(tmp, 0xFFFFFFF0); //vector count (in bytes) 3592 andl(cnt1,0x0000000F); //tail count (in bytes) 3593 3594 bind(SCAN_TO_16_CHAR_LOOP); 3595 movdqu(vec3, Address(result, 0)); 3596 pcmpeqb(vec3, vec1); 3597 ptest(vec2, vec3); 3598 jcc(Assembler::carryClear, FOUND_CHAR); 3599 addptr(result, 16); 3600 subl(tmp, stride); 3601 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items... 
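  // Scalar tail: fewer than 16 bytes remain at this point, so they are
  // examined one at a time below.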
3602 3603 bind(SCAN_TO_CHAR_INIT); 3604 testl(cnt1, cnt1); 3605 jcc(Assembler::zero, RET_NOT_FOUND); 3606 bind(SCAN_TO_CHAR_LOOP); 3607 load_unsigned_byte(tmp, Address(result, 0)); 3608 cmpl(ch, tmp); 3609 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3610 addptr(result, 1); 3611 subl(cnt1, 1); 3612 jccb(Assembler::zero, RET_NOT_FOUND); 3613 jmp(SCAN_TO_CHAR_LOOP); 3614 3615 bind(RET_NOT_FOUND); 3616 movl(result, -1); 3617 jmpb(DONE_LABEL); 3618 3619 bind(FOUND_CHAR); 3620 if (UseAVX >= 2) { 3621 vpmovmskb(tmp, vec3); 3622 } else { 3623 pmovmskb(tmp, vec3); 3624 } 3625 bsfl(ch, tmp); 3626 addptr(result, ch); 3627 3628 bind(FOUND_SEQ_CHAR); 3629 subptr(result, str1); 3630 3631 bind(DONE_LABEL); 3632 } // stringL_indexof_char 3633 3634 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) { 3635 switch (eltype) { 3636 case T_BOOLEAN: return sizeof(jboolean); 3637 case T_BYTE: return sizeof(jbyte); 3638 case T_SHORT: return sizeof(jshort); 3639 case T_CHAR: return sizeof(jchar); 3640 case T_INT: return sizeof(jint); 3641 default: 3642 ShouldNotReachHere(); 3643 return -1; 3644 } 3645 } 3646 3647 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) { 3648 switch (eltype) { 3649 // T_BOOLEAN used as surrogate for unsigned byte 3650 case T_BOOLEAN: movzbl(dst, src); break; 3651 case T_BYTE: movsbl(dst, src); break; 3652 case T_SHORT: movswl(dst, src); break; 3653 case T_CHAR: movzwl(dst, src); break; 3654 case T_INT: movl(dst, src); break; 3655 default: 3656 ShouldNotReachHere(); 3657 } 3658 } 3659 3660 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) { 3661 load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8); 3662 } 3663 3664 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) { 3665 load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8); 3666 } 3667 3668 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) { 3669 const int vlen = Assembler::AVX_256bit; 3670 switch (eltype) { 3671 case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3672 case T_BYTE: vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3673 case T_SHORT: vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3674 case T_CHAR: vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3675 case T_INT: 3676 // do nothing 3677 break; 3678 default: 3679 ShouldNotReachHere(); 3680 } 3681 } 3682 3683 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result, 3684 Register index, Register tmp2, Register tmp3, XMMRegister vnext, 3685 XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3, 3686 XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3, 3687 XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3, 3688 BasicType eltype) { 3689 ShortBranchVerifier sbv(this); 3690 assert(UseAVX >= 2, "AVX2 intrinsics are required"); 3691 assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3); 3692 assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3); 3693 3694 Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN, 3695 SHORT_UNROLLED_LOOP_EXIT, 3696 UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME, 3697 UNROLLED_VECTOR_LOOP_BEGIN, 3698 END; 3699 switch (eltype) { 3700 case T_BOOLEAN: 
BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break; 3701 case T_CHAR: BLOCK_COMMENT("arrays_hashcode(char) {"); break; 3702 case T_BYTE: BLOCK_COMMENT("arrays_hashcode(byte) {"); break; 3703 case T_SHORT: BLOCK_COMMENT("arrays_hashcode(short) {"); break; 3704 case T_INT: BLOCK_COMMENT("arrays_hashcode(int) {"); break; 3705 default: BLOCK_COMMENT("arrays_hashcode {"); break; 3706 } 3707 3708 // For "renaming" for readibility of the code 3709 const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 }, 3710 vresult[] = { vresult0, vresult1, vresult2, vresult3 }, 3711 vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 }; 3712 3713 const int elsize = arrays_hashcode_elsize(eltype); 3714 3715 /* 3716 if (cnt1 >= 2) { 3717 if (cnt1 >= 32) { 3718 UNROLLED VECTOR LOOP 3719 } 3720 UNROLLED SCALAR LOOP 3721 } 3722 SINGLE SCALAR 3723 */ 3724 3725 cmpl(cnt1, 32); 3726 jcc(Assembler::less, SHORT_UNROLLED_BEGIN); 3727 3728 // cnt1 >= 32 && generate_vectorized_loop 3729 xorl(index, index); 3730 3731 // vresult = IntVector.zero(I256); 3732 for (int idx = 0; idx < 4; idx++) { 3733 vpxor(vresult[idx], vresult[idx]); 3734 } 3735 // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]); 3736 Register bound = tmp2; 3737 Register next = tmp3; 3738 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint)))); 3739 movl(next, Address(tmp2, 0)); 3740 movdl(vnext, next); 3741 vpbroadcastd(vnext, vnext, Assembler::AVX_256bit); 3742 3743 // index = 0; 3744 // bound = cnt1 & ~(32 - 1); 3745 movl(bound, cnt1); 3746 andl(bound, ~(32 - 1)); 3747 // for (; index < bound; index += 32) { 3748 bind(UNROLLED_VECTOR_LOOP_BEGIN); 3749 // result *= next; 3750 imull(result, next); 3751 // loop fission to upfront the cost of fetching from memory, OOO execution 3752 // can then hopefully do a better job of prefetching 3753 for (int idx = 0; idx < 4; idx++) { 3754 arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype); 3755 } 3756 // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7]; 3757 for (int idx = 0; idx < 4; idx++) { 3758 vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit); 3759 arrays_hashcode_elvcast(vtmp[idx], eltype); 3760 vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit); 3761 } 3762 // index += 32; 3763 addl(index, 32); 3764 // index < bound; 3765 cmpl(index, bound); 3766 jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN); 3767 // } 3768 3769 lea(ary1, Address(ary1, bound, Address::times(elsize))); 3770 subl(cnt1, bound); 3771 // release bound 3772 3773 // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1); 3774 for (int idx = 0; idx < 4; idx++) { 3775 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint)))); 3776 arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT); 3777 vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit); 3778 } 3779 // result += vresult.reduceLanes(ADD); 3780 for (int idx = 0; idx < 4; idx++) { 3781 reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]); 3782 } 3783 3784 // } else if (cnt1 < 32) { 3785 3786 bind(SHORT_UNROLLED_BEGIN); 3787 // int i = 1; 3788 movl(index, 1); 3789 cmpl(index, cnt1); 3790 jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT); 3791 3792 // for (; i < cnt1 ; i += 2) { 3793 bind(SHORT_UNROLLED_LOOP_BEGIN); 3794 movl(tmp3, 961); 3795 imull(result, tmp3); 3796 
arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3797 movl(tmp3, tmp2); 3798 shll(tmp3, 5); 3799 subl(tmp3, tmp2); 3800 addl(result, tmp3); 3801 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype); 3802 addl(result, tmp3); 3803 addl(index, 2); 3804 cmpl(index, cnt1); 3805 jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN); 3806 3807 // } 3808 // if (i >= cnt1) { 3809 bind(SHORT_UNROLLED_LOOP_EXIT); 3810 jccb(Assembler::greater, END); 3811 movl(tmp2, result); 3812 shll(result, 5); 3813 subl(result, tmp2); 3814 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3815 addl(result, tmp3); 3816 // } 3817 bind(END); 3818 3819 BLOCK_COMMENT("} // arrays_hashcode"); 3820 3821 } // arrays_hashcode 3822 3823 // helper function for string_compare 3824 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2, 3825 Address::ScaleFactor scale, Address::ScaleFactor scale1, 3826 Address::ScaleFactor scale2, Register index, int ae) { 3827 if (ae == StrIntrinsicNode::LL) { 3828 load_unsigned_byte(elem1, Address(str1, index, scale, 0)); 3829 load_unsigned_byte(elem2, Address(str2, index, scale, 0)); 3830 } else if (ae == StrIntrinsicNode::UU) { 3831 load_unsigned_short(elem1, Address(str1, index, scale, 0)); 3832 load_unsigned_short(elem2, Address(str2, index, scale, 0)); 3833 } else { 3834 load_unsigned_byte(elem1, Address(str1, index, scale1, 0)); 3835 load_unsigned_short(elem2, Address(str2, index, scale2, 0)); 3836 } 3837 } 3838 3839 // Compare strings, used for char[] and byte[]. 3840 void C2_MacroAssembler::string_compare(Register str1, Register str2, 3841 Register cnt1, Register cnt2, Register result, 3842 XMMRegister vec1, int ae, KRegister mask) { 3843 ShortBranchVerifier sbv(this); 3844 Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL; 3845 Label COMPARE_WIDE_VECTORS_LOOP_FAILED; // used only _LP64 && AVX3 3846 int stride, stride2, adr_stride, adr_stride1, adr_stride2; 3847 int stride2x2 = 0x40; 3848 Address::ScaleFactor scale = Address::no_scale; 3849 Address::ScaleFactor scale1 = Address::no_scale; 3850 Address::ScaleFactor scale2 = Address::no_scale; 3851 3852 if (ae != StrIntrinsicNode::LL) { 3853 stride2x2 = 0x20; 3854 } 3855 3856 if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) { 3857 shrl(cnt2, 1); 3858 } 3859 // Compute the minimum of the string lengths and the 3860 // difference of the string lengths (stack). 3861 // Do the conditional move stuff 3862 movl(result, cnt1); 3863 subl(cnt1, cnt2); 3864 push(cnt1); 3865 cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2) 3866 3867 // Is the minimum length zero? 
3868 testl(cnt2, cnt2); 3869 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3870 if (ae == StrIntrinsicNode::LL) { 3871 // Load first bytes 3872 load_unsigned_byte(result, Address(str1, 0)); // result = str1[0] 3873 load_unsigned_byte(cnt1, Address(str2, 0)); // cnt1 = str2[0] 3874 } else if (ae == StrIntrinsicNode::UU) { 3875 // Load first characters 3876 load_unsigned_short(result, Address(str1, 0)); 3877 load_unsigned_short(cnt1, Address(str2, 0)); 3878 } else { 3879 load_unsigned_byte(result, Address(str1, 0)); 3880 load_unsigned_short(cnt1, Address(str2, 0)); 3881 } 3882 subl(result, cnt1); 3883 jcc(Assembler::notZero, POP_LABEL); 3884 3885 if (ae == StrIntrinsicNode::UU) { 3886 // Divide length by 2 to get number of chars 3887 shrl(cnt2, 1); 3888 } 3889 cmpl(cnt2, 1); 3890 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3891 3892 // Check if the strings start at the same location and setup scale and stride 3893 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3894 cmpptr(str1, str2); 3895 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3896 if (ae == StrIntrinsicNode::LL) { 3897 scale = Address::times_1; 3898 stride = 16; 3899 } else { 3900 scale = Address::times_2; 3901 stride = 8; 3902 } 3903 } else { 3904 scale1 = Address::times_1; 3905 scale2 = Address::times_2; 3906 // scale not used 3907 stride = 8; 3908 } 3909 3910 if (UseAVX >= 2 && UseSSE42Intrinsics) { 3911 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR; 3912 Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR; 3913 Label COMPARE_WIDE_VECTORS_LOOP_AVX2; 3914 Label COMPARE_TAIL_LONG; 3915 Label COMPARE_WIDE_VECTORS_LOOP_AVX3; // used only _LP64 && AVX3 3916 3917 int pcmpmask = 0x19; 3918 if (ae == StrIntrinsicNode::LL) { 3919 pcmpmask &= ~0x01; 3920 } 3921 3922 // Setup to compare 16-chars (32-bytes) vectors, 3923 // start from first character again because it has aligned address. 3924 if (ae == StrIntrinsicNode::LL) { 3925 stride2 = 32; 3926 } else { 3927 stride2 = 16; 3928 } 3929 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3930 adr_stride = stride << scale; 3931 } else { 3932 adr_stride1 = 8; //stride << scale1; 3933 adr_stride2 = 16; //stride << scale2; 3934 } 3935 3936 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3937 // rax and rdx are used by pcmpestri as elements counters 3938 movl(result, cnt2); 3939 andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count 3940 jcc(Assembler::zero, COMPARE_TAIL_LONG); 3941 3942 // fast path : compare first 2 8-char vectors. 
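    // A note on the immediate: pcmpmask 0x19 (0b11001) appears to encode
    // "equal each" aggregation with negated result polarity on unsigned
    // shorts, and clearing bit 0 for LL switches the element size to
    // unsigned bytes; the SSE4.2 path below documents the same layout.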
3943 bind(COMPARE_16_CHARS); 3944 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3945 movdqu(vec1, Address(str1, 0)); 3946 } else { 3947 pmovzxbw(vec1, Address(str1, 0)); 3948 } 3949 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3950 jccb(Assembler::below, COMPARE_INDEX_CHAR); 3951 3952 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3953 movdqu(vec1, Address(str1, adr_stride)); 3954 pcmpestri(vec1, Address(str2, adr_stride), pcmpmask); 3955 } else { 3956 pmovzxbw(vec1, Address(str1, adr_stride1)); 3957 pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask); 3958 } 3959 jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS); 3960 addl(cnt1, stride); 3961 3962 // Compare the characters at index in cnt1 3963 bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character 3964 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 3965 subl(result, cnt2); 3966 jmp(POP_LABEL); 3967 3968 // Setup the registers to start vector comparison loop 3969 bind(COMPARE_WIDE_VECTORS); 3970 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3971 lea(str1, Address(str1, result, scale)); 3972 lea(str2, Address(str2, result, scale)); 3973 } else { 3974 lea(str1, Address(str1, result, scale1)); 3975 lea(str2, Address(str2, result, scale2)); 3976 } 3977 subl(result, stride2); 3978 subl(cnt2, stride2); 3979 jcc(Assembler::zero, COMPARE_WIDE_TAIL); 3980 negptr(result); 3981 3982 // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest) 3983 bind(COMPARE_WIDE_VECTORS_LOOP); 3984 3985 #ifdef _LP64 3986 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 3987 cmpl(cnt2, stride2x2); 3988 jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2); 3989 testl(cnt2, stride2x2-1); // cnt2 holds the vector count 3990 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2); // means we cannot subtract by 0x40 3991 3992 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 3993 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3994 evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit); 3995 evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3996 } else { 3997 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit); 3998 evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3999 } 4000 kortestql(mask, mask); 4001 jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED); // miscompare 4002 addptr(result, stride2x2); // update since we already compared at this addr 4003 subl(cnt2, stride2x2); // and sub the size too 4004 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3); 4005 4006 vpxor(vec1, vec1); 4007 jmpb(COMPARE_WIDE_TAIL); 4008 }//if (VM_Version::supports_avx512vlbw()) 4009 #endif // _LP64 4010 4011 4012 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 4013 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 4014 vmovdqu(vec1, Address(str1, result, scale)); 4015 vpxor(vec1, Address(str2, result, scale)); 4016 } else { 4017 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit); 4018 vpxor(vec1, Address(str2, result, scale2)); 4019 } 4020 vptest(vec1, vec1); 4021 jcc(Assembler::notZero, VECTOR_NOT_EQUAL); 4022 addptr(result, stride2); 4023 subl(cnt2, stride2); 4024 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP); 4025 // clean upper bits of YMM registers 
    vpxor(vec1, vec1);

    // compare wide vectors tail
    bind(COMPARE_WIDE_TAIL);
    testptr(result, result);
    jcc(Assembler::zero, LENGTH_DIFF_LABEL);

    movl(result, stride2);
    movl(cnt2, result);
    negptr(result);
    jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);

    // Identifies the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
    bind(VECTOR_NOT_EQUAL);
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    jmp(COMPARE_16_CHARS);

    // Compare tail chars, length between 1 and 15 chars
    bind(COMPARE_TAIL_LONG);
    movl(cnt2, result);
    cmpl(cnt2, stride);
    jcc(Assembler::less, COMPARE_SMALL_STR);

    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, 0));
    } else {
      pmovzxbw(vec1, Address(str1, 0));
    }
    pcmpestri(vec1, Address(str2, 0), pcmpmask);
    jcc(Assembler::below, COMPARE_INDEX_CHAR);
    subptr(cnt2, stride);
    jcc(Assembler::zero, LENGTH_DIFF_LABEL);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    negptr(cnt2);
    jmpb(WHILE_HEAD_LABEL);

    bind(COMPARE_SMALL_STR);
  } else if (UseSSE42Intrinsics) {
    Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
    int pcmpmask = 0x19;
    // Setup to compare 8-char (16-byte) vectors,
    // start from first character again because it has aligned address.
4082 movl(result, cnt2); 4083 andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count 4084 if (ae == StrIntrinsicNode::LL) { 4085 pcmpmask &= ~0x01; 4086 } 4087 jcc(Assembler::zero, COMPARE_TAIL); 4088 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 4089 lea(str1, Address(str1, result, scale)); 4090 lea(str2, Address(str2, result, scale)); 4091 } else { 4092 lea(str1, Address(str1, result, scale1)); 4093 lea(str2, Address(str2, result, scale2)); 4094 } 4095 negptr(result); 4096 4097 // pcmpestri 4098 // inputs: 4099 // vec1- substring 4100 // rax - negative string length (elements count) 4101 // mem - scanned string 4102 // rdx - string length (elements count) 4103 // pcmpmask - cmp mode: 11000 (string compare with negated result) 4104 // + 00 (unsigned bytes) or + 01 (unsigned shorts) 4105 // outputs: 4106 // rcx - first mismatched element index 4107 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 4108 4109 bind(COMPARE_WIDE_VECTORS); 4110 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 4111 movdqu(vec1, Address(str1, result, scale)); 4112 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 4113 } else { 4114 pmovzxbw(vec1, Address(str1, result, scale1)); 4115 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 4116 } 4117 // After pcmpestri cnt1(rcx) contains mismatched element index 4118 4119 jccb(Assembler::below, VECTOR_NOT_EQUAL); // CF==1 4120 addptr(result, stride); 4121 subptr(cnt2, stride); 4122 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 4123 4124 // compare wide vectors tail 4125 testptr(result, result); 4126 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 4127 4128 movl(cnt2, stride); 4129 movl(result, stride); 4130 negptr(result); 4131 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 4132 movdqu(vec1, Address(str1, result, scale)); 4133 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 4134 } else { 4135 pmovzxbw(vec1, Address(str1, result, scale1)); 4136 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 4137 } 4138 jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL); 4139 4140 // Mismatched characters in the vectors 4141 bind(VECTOR_NOT_EQUAL); 4142 addptr(cnt1, result); 4143 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 4144 subl(result, cnt2); 4145 jmpb(POP_LABEL); 4146 4147 bind(COMPARE_TAIL); // limit is zero 4148 movl(cnt2, result); 4149 // Fallthru to tail compare 4150 } 4151 // Shift str2 and str1 to the end of the arrays, negate min 4152 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 4153 lea(str1, Address(str1, cnt2, scale)); 4154 lea(str2, Address(str2, cnt2, scale)); 4155 } else { 4156 lea(str1, Address(str1, cnt2, scale1)); 4157 lea(str2, Address(str2, cnt2, scale2)); 4158 } 4159 decrementl(cnt2); // first character was compared already 4160 negptr(cnt2); 4161 4162 // Compare the rest of the elements 4163 bind(WHILE_HEAD_LABEL); 4164 load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae); 4165 subl(result, cnt1); 4166 jccb(Assembler::notZero, POP_LABEL); 4167 increment(cnt2); 4168 jccb(Assembler::notZero, WHILE_HEAD_LABEL); 4169 4170 // Strings are equal up to min length. Return the length difference. 
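  // The length difference cnt1 - cnt2 was pushed on entry, before cnt2 was
  // clamped to the minimum of the two lengths; popping it here yields the
  // result for strings that are equal up to that minimum.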
  bind(LENGTH_DIFF_LABEL);
  pop(result);
  if (ae == StrIntrinsicNode::UU) {
    // Divide diff by 2 to get number of chars
    sarl(result, 1);
  }
  jmpb(DONE_LABEL);

#ifdef _LP64
  if (VM_Version::supports_avx512vlbw()) {

    bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);

    kmovql(cnt1, mask);
    notq(cnt1);
    bsfq(cnt2, cnt1);
    if (ae != StrIntrinsicNode::LL) {
      // Divide diff by 2 to get number of chars
      sarl(cnt2, 1);
    }
    addq(result, cnt2);
    if (ae == StrIntrinsicNode::LL) {
      load_unsigned_byte(cnt1, Address(str2, result));
      load_unsigned_byte(result, Address(str1, result));
    } else if (ae == StrIntrinsicNode::UU) {
      load_unsigned_short(cnt1, Address(str2, result, scale));
      load_unsigned_short(result, Address(str1, result, scale));
    } else {
      load_unsigned_short(cnt1, Address(str2, result, scale2));
      load_unsigned_byte(result, Address(str1, result, scale1));
    }
    subl(result, cnt1);
    jmpb(POP_LABEL);
  }//if (VM_Version::supports_avx512vlbw())
#endif // _LP64

  // Discard the stored length difference
  bind(POP_LABEL);
  pop(cnt1);

  // That's it
  bind(DONE_LABEL);
  if (ae == StrIntrinsicNode::UL) {
    negl(result);
  }

}

// Search for Non-ASCII character (Negative byte value) in a byte array,
// return the index of the first such character, otherwise the length
// of the array segment searched.
// ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
// @IntrinsicCandidate
// public static int countPositives(byte[] ba, int off, int len) {
//   for (int i = off; i < off + len; i++) {
//     if (ba[i] < 0) {
//       return i - off;
//     }
//   }
//   return len;
// }
void C2_MacroAssembler::count_positives(Register ary1, Register len,
                                        Register result, Register tmp1,
                                        XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
  // rsi: byte array
  // rcx: len
  // rax: result
  ShortBranchVerifier sbv(this);
  assert_different_registers(ary1, len, result, tmp1);
  assert_different_registers(vec1, vec2);
  Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;

  movl(result, len); // copy
  // len == 0
  testl(len, len);
  jcc(Assembler::zero, DONE);

  if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
      VM_Version::supports_avx512vlbw() &&
      VM_Version::supports_bmi2()) {

    Label test_64_loop, test_tail, BREAK_LOOP;
    movl(tmp1, len);
    vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);

    andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F
    andl(len, 0xffffffc0);  // vector count (in chars)
    jccb(Assembler::zero, test_tail);

    lea(ary1, Address(ary1, len, Address::times_1));
    negptr(len);

    bind(test_64_loop);
    // Check whether our 64 elements of size byte contain negatives
    evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
    kortestql(mask1, mask1);
    jcc(Assembler::notZero, BREAK_LOOP);

    addptr(len, 64);
    jccb(Assembler::notZero, test_64_loop);

    bind(test_tail);
    // bail out when there is nothing to be done
    testl(tmp1, -1);
    jcc(Assembler::zero, DONE);

    // check the tail for absence of negatives
    // ~(~0 << len) applied up to two times (for 32-bit scenario)
#ifdef _LP64
    {
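      // For instance (illustration only): with tmp1 == 3 tail bytes,
      // ~(~0 << 3) == 0b111, i.e. a k-mask selecting exactly the three
      // remaining byte lanes of the 64-byte vector.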
      Register tmp3_aliased = len;
      mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
      shlxq(tmp3_aliased, tmp3_aliased, tmp1);
      notq(tmp3_aliased);
      kmovql(mask2, tmp3_aliased);
    }
#else
    Label k_init;
    jmp(k_init);

    // We cannot read 64 bits from a general purpose register, thus we move
    // the data required to compose 64 1's to the instruction stream.
    // We emit a 64-byte wide series of elements from 0..63 which is later
    // used as a compare target with the tail count contained in the tmp1
    // register. The result is a k register having tmp1 consecutive 1 bits,
    // counting from the least significant bit.
    address tmp = pc();
    emit_int64(0x0706050403020100);
    emit_int64(0x0F0E0D0C0B0A0908);
    emit_int64(0x1716151413121110);
    emit_int64(0x1F1E1D1C1B1A1918);
    emit_int64(0x2726252423222120);
    emit_int64(0x2F2E2D2C2B2A2928);
    emit_int64(0x3736353433323130);
    emit_int64(0x3F3E3D3C3B3A3938);

    bind(k_init);
    lea(len, InternalAddress(tmp));
    // create mask to test for negative byte inside a vector
    evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
    evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit);

#endif
    evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
    ktestq(mask1, mask2);
    jcc(Assembler::zero, DONE);

    // do a full check for negative bytes in the tail
    movl(len, tmp1); // tmp1 holds low 6-bit from original len;
                     // ary1 already pointing to the right place
    jmpb(TAIL_START);

    bind(BREAK_LOOP);
    // At least one byte in the last 64 byte block was negative.
    // Set up to look at the last 64 bytes as if they were a tail
    lea(ary1, Address(ary1, len, Address::times_1));
    addptr(result, len);
    // Ignore the very last byte: if all others are positive,
    // it must be negative, so we can skip right to the 2+1 byte
    // end comparison at this point
    orl(result, 63);
    movl(len, 63);
    // Fallthru to tail compare
  } else {

    if (UseAVX >= 2 && UseSSE >= 2) {
      // With AVX2, use 32-byte vector compare
      Label COMPARE_WIDE_VECTORS, BREAK_LOOP;

      // Compare 32-byte vectors
      testl(len, 0xffffffe0); // vector count (in bytes)
      jccb(Assembler::zero, TAIL_START);

      andl(len, 0xffffffe0);
      lea(ary1, Address(ary1, len, Address::times_1));
      negptr(len);

      movl(tmp1, 0x80808080); // create mask to test for Unicode chars in vector
      movdl(vec2, tmp1);
      vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);

      bind(COMPARE_WIDE_VECTORS);
      vmovdqu(vec1, Address(ary1, len, Address::times_1));
      vptest(vec1, vec2);
      jccb(Assembler::notZero, BREAK_LOOP);
      addptr(len, 32);
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);

      testl(result, 0x0000001f); // any bytes remaining?
      jcc(Assembler::zero, DONE);

      // Quick test using the already prepared vector mask
      movl(len, result);
      andl(len, 0x0000001f);
      vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
      vptest(vec1, vec2);
      jcc(Assembler::zero, DONE);
      // There are zeros, jump to the tail to determine exactly where
      jmpb(TAIL_START);

      bind(BREAK_LOOP);
      // At least one byte in the last 32-byte vector is negative.
4374 // Set up to look at the last 32 bytes as if they were a tail 4375 lea(ary1, Address(ary1, len, Address::times_1)); 4376 addptr(result, len); 4377 // Ignore the very last byte: if all others are positive, 4378 // it must be negative, so we can skip right to the 2+1 byte 4379 // end comparison at this point 4380 orl(result, 31); 4381 movl(len, 31); 4382 // Fallthru to tail compare 4383 } else if (UseSSE42Intrinsics) { 4384 // With SSE4.2, use double quad vector compare 4385 Label COMPARE_WIDE_VECTORS, BREAK_LOOP; 4386 4387 // Compare 16-byte vectors 4388 testl(len, 0xfffffff0); // vector count (in bytes) 4389 jcc(Assembler::zero, TAIL_START); 4390 4391 andl(len, 0xfffffff0); 4392 lea(ary1, Address(ary1, len, Address::times_1)); 4393 negptr(len); 4394 4395 movl(tmp1, 0x80808080); 4396 movdl(vec2, tmp1); 4397 pshufd(vec2, vec2, 0); 4398 4399 bind(COMPARE_WIDE_VECTORS); 4400 movdqu(vec1, Address(ary1, len, Address::times_1)); 4401 ptest(vec1, vec2); 4402 jccb(Assembler::notZero, BREAK_LOOP); 4403 addptr(len, 16); 4404 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 4405 4406 testl(result, 0x0000000f); // len is zero, any bytes remaining? 4407 jcc(Assembler::zero, DONE); 4408 4409 // Quick test using the already prepared vector mask 4410 movl(len, result); 4411 andl(len, 0x0000000f); // tail count (in bytes) 4412 movdqu(vec1, Address(ary1, len, Address::times_1, -16)); 4413 ptest(vec1, vec2); 4414 jcc(Assembler::zero, DONE); 4415 jmpb(TAIL_START); 4416 4417 bind(BREAK_LOOP); 4418 // At least one byte in the last 16-byte vector is negative. 4419 // Set up and look at the last 16 bytes as if they were a tail 4420 lea(ary1, Address(ary1, len, Address::times_1)); 4421 addptr(result, len); 4422 // Ignore the very last byte: if all others are positive, 4423 // it must be negative, so we can skip right to the 2+1 byte 4424 // end comparison at this point 4425 orl(result, 15); 4426 movl(len, 15); 4427 // Fallthru to tail compare 4428 } 4429 } 4430 4431 bind(TAIL_START); 4432 // Compare 4-byte vectors 4433 andl(len, 0xfffffffc); // vector count (in bytes) 4434 jccb(Assembler::zero, COMPARE_CHAR); 4435 4436 lea(ary1, Address(ary1, len, Address::times_1)); 4437 negptr(len); 4438 4439 bind(COMPARE_VECTORS); 4440 movl(tmp1, Address(ary1, len, Address::times_1)); 4441 andl(tmp1, 0x80808080); 4442 jccb(Assembler::notZero, TAIL_ADJUST); 4443 addptr(len, 4); 4444 jccb(Assembler::notZero, COMPARE_VECTORS); 4445 4446 // Compare trailing char (final 2-3 bytes), if any 4447 bind(COMPARE_CHAR); 4448 4449 testl(result, 0x2); // tail char 4450 jccb(Assembler::zero, COMPARE_BYTE); 4451 load_unsigned_short(tmp1, Address(ary1, 0)); 4452 andl(tmp1, 0x00008080); 4453 jccb(Assembler::notZero, CHAR_ADJUST); 4454 lea(ary1, Address(ary1, 2)); 4455 4456 bind(COMPARE_BYTE); 4457 testl(result, 0x1); // tail byte 4458 jccb(Assembler::zero, DONE); 4459 load_unsigned_byte(tmp1, Address(ary1, 0)); 4460 testl(tmp1, 0x00000080); 4461 jccb(Assembler::zero, DONE); 4462 subptr(result, 1); 4463 jmpb(DONE); 4464 4465 bind(TAIL_ADJUST); 4466 // there are negative bits in the last 4 byte block. 4467 // Adjust result and check the next three bytes 4468 addptr(result, len); 4469 orl(result, 3); 4470 lea(ary1, Address(ary1, len, Address::times_1)); 4471 jmpb(COMPARE_CHAR); 4472 4473 bind(CHAR_ADJUST); 4474 // We are looking at a char + optional byte tail, and found that one 4475 // of the bytes in the char is negative. Adjust the result, check the 4476 // first byte and readjust if needed. 
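  // For example (illustration only): with tail bytes {0x41, 0x80} the loaded
  // short is 0x8041; bit 0x80 of the low byte is clear, so the negative byte
  // is the second one and result is bumped by one below.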
  andl(result, 0xfffffffc);
  testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
  jccb(Assembler::notZero, DONE);
  addptr(result, 1);

  // That's it
  bind(DONE);
  if (UseAVX >= 2 && UseSSE >= 2) {
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    vpxor(vec2, vec2);
  }
}

// Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
                                      Register limit, Register result, Register chr,
                                      XMMRegister vec1, XMMRegister vec2, bool is_char, KRegister mask) {
  ShortBranchVerifier sbv(this);
  Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;

  int length_offset = arrayOopDesc::length_offset_in_bytes();
  int base_offset   = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);

  if (is_array_equ) {
    // Check the input args
    cmpoop(ary1, ary2);
    jcc(Assembler::equal, TRUE_LABEL);

    // Need additional checks for arrays_equals.
    testptr(ary1, ary1);
    jcc(Assembler::zero, FALSE_LABEL);
    testptr(ary2, ary2);
    jcc(Assembler::zero, FALSE_LABEL);

    // Check the lengths
    movl(limit, Address(ary1, length_offset));
    cmpl(limit, Address(ary2, length_offset));
    jcc(Assembler::notEqual, FALSE_LABEL);
  }

  // count == 0
  testl(limit, limit);
  jcc(Assembler::zero, TRUE_LABEL);

  if (is_array_equ) {
    // Load array address
    lea(ary1, Address(ary1, base_offset));
    lea(ary2, Address(ary2, base_offset));
  }

  if (is_array_equ && is_char) {
    // arrays_equals when used for char[].
    shll(limit, 1); // byte count != 0
  }
  movl(result, limit); // copy

  if (UseAVX >= 2) {
    // With AVX2, use 32-byte vector compare
    Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;

    // Compare 32-byte vectors
    andl(result, 0x0000001f); // tail count (in bytes)
    andl(limit, 0xffffffe0);  // vector count (in bytes)
    jcc(Assembler::zero, COMPARE_TAIL);

    lea(ary1, Address(ary1, limit, Address::times_1));
    lea(ary2, Address(ary2, limit, Address::times_1));
    negptr(limit);

#ifdef _LP64
    if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
      Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;

      cmpl(limit, -64);
      jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);

      bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop

      evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
      evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
      kortestql(mask, mask);
      jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare
      addptr(limit, 64); // update since we already compared at this addr
      cmpl(limit, -64);
      jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);

      // At this point we may still need to compare -limit+result bytes.
      // We could execute the next two instructions and just continue via non-wide path:
      //  cmpl(limit, 0);
      //  jcc(Assembler::equal, COMPARE_TAIL); // true
      // But since we stopped at the points ary{1,2}+limit which are
      // not farther than 64 bytes from the ends of arrays ary{1,2}+result
      // (|limit| <= 32 and result < 32),
      // we may just compare the last 64 bytes.
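      // Note that re-reading those last 64 bytes overlaps data the loop has
      // already compared; for a pure equality check the redundant compare is
      // harmless and costs only one extra iteration.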
4572 // 4573 addptr(result, -64); // it is safe, bc we just came from this area 4574 evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit); 4575 evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit); 4576 kortestql(mask, mask); 4577 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4578 4579 jmp(TRUE_LABEL); 4580 4581 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 4582 4583 }//if (VM_Version::supports_avx512vlbw()) 4584 #endif //_LP64 4585 bind(COMPARE_WIDE_VECTORS); 4586 vmovdqu(vec1, Address(ary1, limit, Address::times_1)); 4587 vmovdqu(vec2, Address(ary2, limit, Address::times_1)); 4588 vpxor(vec1, vec2); 4589 4590 vptest(vec1, vec1); 4591 jcc(Assembler::notZero, FALSE_LABEL); 4592 addptr(limit, 32); 4593 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4594 4595 testl(result, result); 4596 jcc(Assembler::zero, TRUE_LABEL); 4597 4598 vmovdqu(vec1, Address(ary1, result, Address::times_1, -32)); 4599 vmovdqu(vec2, Address(ary2, result, Address::times_1, -32)); 4600 vpxor(vec1, vec2); 4601 4602 vptest(vec1, vec1); 4603 jccb(Assembler::notZero, FALSE_LABEL); 4604 jmpb(TRUE_LABEL); 4605 4606 bind(COMPARE_TAIL); // limit is zero 4607 movl(limit, result); 4608 // Fallthru to tail compare 4609 } else if (UseSSE42Intrinsics) { 4610 // With SSE4.2, use double quad vector compare 4611 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 4612 4613 // Compare 16-byte vectors 4614 andl(result, 0x0000000f); // tail count (in bytes) 4615 andl(limit, 0xfffffff0); // vector count (in bytes) 4616 jcc(Assembler::zero, COMPARE_TAIL); 4617 4618 lea(ary1, Address(ary1, limit, Address::times_1)); 4619 lea(ary2, Address(ary2, limit, Address::times_1)); 4620 negptr(limit); 4621 4622 bind(COMPARE_WIDE_VECTORS); 4623 movdqu(vec1, Address(ary1, limit, Address::times_1)); 4624 movdqu(vec2, Address(ary2, limit, Address::times_1)); 4625 pxor(vec1, vec2); 4626 4627 ptest(vec1, vec1); 4628 jcc(Assembler::notZero, FALSE_LABEL); 4629 addptr(limit, 16); 4630 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4631 4632 testl(result, result); 4633 jcc(Assembler::zero, TRUE_LABEL); 4634 4635 movdqu(vec1, Address(ary1, result, Address::times_1, -16)); 4636 movdqu(vec2, Address(ary2, result, Address::times_1, -16)); 4637 pxor(vec1, vec2); 4638 4639 ptest(vec1, vec1); 4640 jccb(Assembler::notZero, FALSE_LABEL); 4641 jmpb(TRUE_LABEL); 4642 4643 bind(COMPARE_TAIL); // limit is zero 4644 movl(limit, result); 4645 // Fallthru to tail compare 4646 } 4647 4648 // Compare 4-byte vectors 4649 andl(limit, 0xfffffffc); // vector count (in bytes) 4650 jccb(Assembler::zero, COMPARE_CHAR); 4651 4652 lea(ary1, Address(ary1, limit, Address::times_1)); 4653 lea(ary2, Address(ary2, limit, Address::times_1)); 4654 negptr(limit); 4655 4656 bind(COMPARE_VECTORS); 4657 movl(chr, Address(ary1, limit, Address::times_1)); 4658 cmpl(chr, Address(ary2, limit, Address::times_1)); 4659 jccb(Assembler::notEqual, FALSE_LABEL); 4660 addptr(limit, 4); 4661 jcc(Assembler::notZero, COMPARE_VECTORS); 4662 4663 // Compare trailing char (final 2 bytes), if any 4664 bind(COMPARE_CHAR); 4665 testl(result, 0x2); // tail char 4666 jccb(Assembler::zero, COMPARE_BYTE); 4667 load_unsigned_short(chr, Address(ary1, 0)); 4668 load_unsigned_short(limit, Address(ary2, 0)); 4669 cmpl(chr, limit); 4670 jccb(Assembler::notEqual, FALSE_LABEL); 4671 4672 if (is_array_equ && is_char) { 4673 bind(COMPARE_BYTE); 4674 } else { 4675 lea(ary1, Address(ary1, 2)); 4676 lea(ary2, Address(ary2, 2)); 4677 4678 bind(COMPARE_BYTE); 4679 testl(result, 0x1); 
// tail byte 4680 jccb(Assembler::zero, TRUE_LABEL); 4681 load_unsigned_byte(chr, Address(ary1, 0)); 4682 load_unsigned_byte(limit, Address(ary2, 0)); 4683 cmpl(chr, limit); 4684 jccb(Assembler::notEqual, FALSE_LABEL); 4685 } 4686 bind(TRUE_LABEL); 4687 movl(result, 1); // return true 4688 jmpb(DONE); 4689 4690 bind(FALSE_LABEL); 4691 xorl(result, result); // return false 4692 4693 // That's it 4694 bind(DONE); 4695 if (UseAVX >= 2) { 4696 // clean upper bits of YMM registers 4697 vpxor(vec1, vec1); 4698 vpxor(vec2, vec2); 4699 } 4700 } 4701 4702 #ifdef _LP64 4703 4704 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) { 4705 #define __ masm. 4706 Register dst = stub.data<0>(); 4707 XMMRegister src = stub.data<1>(); 4708 address target = stub.data<2>(); 4709 __ bind(stub.entry()); 4710 __ subptr(rsp, 8); 4711 __ movdbl(Address(rsp), src); 4712 __ call(RuntimeAddress(target)); 4713 __ pop(dst); 4714 __ jmp(stub.continuation()); 4715 #undef __ 4716 } 4717 4718 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) { 4719 assert(dst_bt == T_INT || dst_bt == T_LONG, ""); 4720 assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, ""); 4721 4722 address slowpath_target; 4723 if (dst_bt == T_INT) { 4724 if (src_bt == T_FLOAT) { 4725 cvttss2sil(dst, src); 4726 cmpl(dst, 0x80000000); 4727 slowpath_target = StubRoutines::x86::f2i_fixup(); 4728 } else { 4729 cvttsd2sil(dst, src); 4730 cmpl(dst, 0x80000000); 4731 slowpath_target = StubRoutines::x86::d2i_fixup(); 4732 } 4733 } else { 4734 if (src_bt == T_FLOAT) { 4735 cvttss2siq(dst, src); 4736 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip())); 4737 slowpath_target = StubRoutines::x86::f2l_fixup(); 4738 } else { 4739 cvttsd2siq(dst, src); 4740 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip())); 4741 slowpath_target = StubRoutines::x86::d2l_fixup(); 4742 } 4743 } 4744 4745 auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, 23, convertF2I_slowpath); 4746 jcc(Assembler::equal, stub->entry()); 4747 bind(stub->continuation()); 4748 } 4749 4750 #endif // _LP64 4751 4752 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4753 XMMRegister src1, int imm8, bool merge, int vlen_enc) { 4754 switch(ideal_opc) { 4755 case Op_LShiftVS: 4756 Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break; 4757 case Op_LShiftVI: 4758 Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break; 4759 case Op_LShiftVL: 4760 Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break; 4761 case Op_RShiftVS: 4762 Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break; 4763 case Op_RShiftVI: 4764 Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break; 4765 case Op_RShiftVL: 4766 Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break; 4767 case Op_URShiftVS: 4768 Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break; 4769 case Op_URShiftVI: 4770 Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break; 4771 case Op_URShiftVL: 4772 Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break; 4773 case Op_RotateRightV: 4774 evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4775 case Op_RotateLeftV: 4776 evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4777 default: 4778 fatal("Unsupported masked operation"); break; 4779 } 4780 } 4781 4782 void 
C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4783 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc, 4784 bool is_varshift) { 4785 switch (ideal_opc) { 4786 case Op_AddVB: 4787 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4788 case Op_AddVS: 4789 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4790 case Op_AddVI: 4791 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4792 case Op_AddVL: 4793 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4794 case Op_AddVF: 4795 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4796 case Op_AddVD: 4797 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4798 case Op_SubVB: 4799 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4800 case Op_SubVS: 4801 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4802 case Op_SubVI: 4803 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4804 case Op_SubVL: 4805 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4806 case Op_SubVF: 4807 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4808 case Op_SubVD: 4809 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4810 case Op_MulVS: 4811 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4812 case Op_MulVI: 4813 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4814 case Op_MulVL: 4815 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4816 case Op_MulVF: 4817 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4818 case Op_MulVD: 4819 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4820 case Op_DivVF: 4821 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4822 case Op_DivVD: 4823 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4824 case Op_SqrtVF: 4825 evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break; 4826 case Op_SqrtVD: 4827 evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break; 4828 case Op_AbsVB: 4829 evpabsb(dst, mask, src2, merge, vlen_enc); break; 4830 case Op_AbsVS: 4831 evpabsw(dst, mask, src2, merge, vlen_enc); break; 4832 case Op_AbsVI: 4833 evpabsd(dst, mask, src2, merge, vlen_enc); break; 4834 case Op_AbsVL: 4835 evpabsq(dst, mask, src2, merge, vlen_enc); break; 4836 case Op_FmaVF: 4837 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4838 case Op_FmaVD: 4839 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4840 case Op_VectorRearrange: 4841 evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break; 4842 case Op_LShiftVS: 4843 evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4844 case Op_LShiftVI: 4845 evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4846 case Op_LShiftVL: 4847 evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4848 case Op_RShiftVS: 4849 evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4850 case Op_RShiftVI: 4851 evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4852 case Op_RShiftVL: 4853 evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4854 case Op_URShiftVS: 4855 evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4856 case Op_URShiftVI: 4857 evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4858 case Op_URShiftVL: 4859 evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4860 case Op_RotateLeftV: 4861 evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4862 case Op_RotateRightV: 4863 evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break; 
4864 case Op_MaxV: 4865 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4866 case Op_MinV: 4867 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4868 case Op_XorV: 4869 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4870 case Op_OrV: 4871 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4872 case Op_AndV: 4873 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4874 default: 4875 fatal("Unsupported masked operation"); break; 4876 } 4877 } 4878 4879 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4880 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4881 switch (ideal_opc) { 4882 case Op_AddVB: 4883 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4884 case Op_AddVS: 4885 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4886 case Op_AddVI: 4887 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4888 case Op_AddVL: 4889 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4890 case Op_AddVF: 4891 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4892 case Op_AddVD: 4893 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4894 case Op_SubVB: 4895 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4896 case Op_SubVS: 4897 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4898 case Op_SubVI: 4899 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4900 case Op_SubVL: 4901 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4902 case Op_SubVF: 4903 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4904 case Op_SubVD: 4905 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4906 case Op_MulVS: 4907 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4908 case Op_MulVI: 4909 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4910 case Op_MulVL: 4911 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4912 case Op_MulVF: 4913 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4914 case Op_MulVD: 4915 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4916 case Op_DivVF: 4917 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4918 case Op_DivVD: 4919 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4920 case Op_FmaVF: 4921 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4922 case Op_FmaVD: 4923 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4924 case Op_MaxV: 4925 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4926 case Op_MinV: 4927 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4928 case Op_XorV: 4929 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4930 case Op_OrV: 4931 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4932 case Op_AndV: 4933 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4934 default: 4935 fatal("Unsupported masked operation"); break; 4936 } 4937 } 4938 4939 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst, 4940 KRegister src1, KRegister src2) { 4941 BasicType etype = T_ILLEGAL; 4942 switch(mask_len) { 4943 case 2: 4944 case 4: 4945 case 8: etype = T_BYTE; break; 4946 case 16: etype = T_SHORT; break; 4947 case 32: etype = T_INT; break; 4948 case 64: etype = T_LONG; break; 4949 default: fatal("Unsupported type"); break; 4950 } 4951 assert(etype != T_ILLEGAL, ""); 4952 switch(ideal_opc) { 4953 case Op_AndVMask: 4954 kand(etype, dst, src1, src2); break; 4955 case Op_OrVMask: 4956 kor(etype, dst, src1, src2); break; 4957 case Op_XorVMask: 
4958 kxor(etype, dst, src1, src2); break; 4959 default: 4960 fatal("Unsupported masked operation"); break; 4961 } 4962 } 4963 4964 /* 4965 * Following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the casting operation. 4966 * If src is NaN, the result is 0. 4967 * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE, 4968 * the result is equal to the value of Integer.MIN_VALUE. 4969 * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE, 4970 * the result is equal to the value of Integer.MAX_VALUE. 4971 */ 4972 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4973 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 4974 Register rscratch, AddressLiteral float_sign_flip, 4975 int vec_enc) { 4976 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4977 Label done; 4978 vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch); 4979 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 4980 vptest(xtmp2, xtmp2, vec_enc); 4981 jccb(Assembler::equal, done); 4982 4983 vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc); 4984 vpxor(xtmp1, xtmp1, xtmp4, vec_enc); 4985 4986 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 4987 vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc); 4988 vblendvps(dst, dst, xtmp4, xtmp3, vec_enc); 4989 4990 // Recompute the mask for remaining special values. 4991 vpxor(xtmp2, xtmp2, xtmp3, vec_enc); 4992 // Extract SRC values corresponding to TRUE mask lanes. 4993 vpand(xtmp4, xtmp2, src, vec_enc); 4994 // Flip mask bits so that MSB bits of MASK lanes corresponding to +ve special 4995 // values are set. 4996 vpxor(xtmp3, xtmp2, xtmp4, vec_enc); 4997 4998 vblendvps(dst, dst, xtmp1, xtmp3, vec_enc); 4999 bind(done); 5000 } 5001 5002 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5003 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 5004 Register rscratch, AddressLiteral float_sign_flip, 5005 int vec_enc) { 5006 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 5007 Label done; 5008 evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 5009 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 5010 kortestwl(ktmp1, ktmp1); 5011 jccb(Assembler::equal, done); 5012 5013 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5014 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 5015 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 5016 5017 kxorwl(ktmp1, ktmp1, ktmp2); 5018 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 5019 vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 5020 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 5021 bind(done); 5022 } 5023 5024 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5025 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 5026 Register rscratch, AddressLiteral double_sign_flip, 5027 int vec_enc) { 5028 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing"); 5029 5030 Label done; 5031 evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch); 5032 Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc); 5033 kortestwl(ktmp1, ktmp1); 5034 jccb(Assembler::equal, done); 5035 5036 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5037 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 5038 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc); 5039 5040 kxorwl(ktmp1, ktmp1, ktmp2); 
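// ktmp1 now flags the special lanes that are not NaN; the compare below
// selects the non-negative ones, and vpternlogq with function 0x11
// materializes ~xtmp1 (the bitwise complement of the sign-flip constant,
// i.e. Long.MAX_VALUE) to overwrite them.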
5041 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 5042 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 5043 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc); 5044 bind(done); 5045 } 5046 5047 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5048 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 5049 Register rscratch, AddressLiteral float_sign_flip, 5050 int vec_enc) { 5051 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 5052 Label done; 5053 evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 5054 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 5055 kortestwl(ktmp1, ktmp1); 5056 jccb(Assembler::equal, done); 5057 5058 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5059 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 5060 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 5061 5062 kxorwl(ktmp1, ktmp1, ktmp2); 5063 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 5064 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 5065 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 5066 bind(done); 5067 } 5068 5069 /* 5070 * Following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the casting operation. 5071 * If src is NaN, the result is 0. 5072 * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE, 5073 * the result is equal to the value of Long.MIN_VALUE. 5074 * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE, 5075 * the result is equal to the value of Long.MAX_VALUE. 5076 */ 5077 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5078 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 5079 Register rscratch, AddressLiteral double_sign_flip, 5080 int vec_enc) { 5081 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing"); 5082 5083 Label done; 5084 evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch); 5085 evpcmpeqq(ktmp1, xtmp1, dst, vec_enc); 5086 kortestwl(ktmp1, ktmp1); 5087 jccb(Assembler::equal, done); 5088 5089 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5090 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 5091 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc); 5092 5093 kxorwl(ktmp1, ktmp1, ktmp2); 5094 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 5095 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 5096 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc); 5097 bind(done); 5098 } 5099 5100 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero, 5101 XMMRegister xtmp, int index, int vec_enc) { 5102 assert(vec_enc < Assembler::AVX_512bit, ""); 5103 if (vec_enc == Assembler::AVX_256bit) { 5104 vextractf128_high(xtmp, src); 5105 vshufps(dst, src, xtmp, index, vec_enc); 5106 } else { 5107 vshufps(dst, src, zero, index, vec_enc); 5108 } 5109 } 5110 5111 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 5112 XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch, 5113 AddressLiteral float_sign_flip, int src_vec_enc) { 5114 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 5115 5116 Label done; 5117 // Compare the destination lanes with float_sign_flip 5118 // value to get mask for all special values. 
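// (Every NaN, +/-Inf or out-of-range source lane produced exactly
// 0x80000000 in dst during the preceding truncating conversion, so equality
// with float_sign_flip identifies all lanes that still need fixing.)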
5119 movdqu(xtmp1, float_sign_flip, rscratch); 5120 vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit); 5121 ptest(xtmp2, xtmp2); 5122 jccb(Assembler::equal, done); 5123 5124 // Flip float_sign_flip to get max integer value. 5125 vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit); 5126 pxor(xtmp1, xtmp4); 5127 5128 // Set destination lanes corresponding to unordered source lanes to zero. 5129 vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc); 5130 vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc); 5131 5132 // Shuffle mask vector and pack the lower double word from each quadword lane. 5133 vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc); 5134 vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit); 5135 5136 // Recompute the mask for remaining special values. 5137 pxor(xtmp2, xtmp3); 5138 // Extract mask corresponding to non-negative source lanes. 5139 vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc); 5140 5141 // Shuffle mask vector and pack the lower double word from each quadword lane. 5142 vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc); 5143 pand(xtmp3, xtmp2); 5144 5145 // Replace destination lanes holding the special value (0x80000000) with max int 5146 // if the corresponding source lane holds a +ve value. 5147 vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit); 5148 bind(done); 5149 } 5150 5151 5152 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero, 5153 XMMRegister xtmp, Register rscratch, int vec_enc) { 5154 switch(to_elem_bt) { 5155 case T_SHORT: 5156 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing"); 5157 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch); 5158 vpackusdw(dst, dst, zero, vec_enc); 5159 if (vec_enc == Assembler::AVX_256bit) { 5160 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc); 5161 } 5162 break; 5163 case T_BYTE: 5164 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing"); 5165 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch); 5166 vpackusdw(dst, dst, zero, vec_enc); 5167 if (vec_enc == Assembler::AVX_256bit) { 5168 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc); 5169 } 5170 vpackuswb(dst, dst, zero, vec_enc); 5171 break; 5172 default: assert(false, "%s", type2name(to_elem_bt)); 5173 } 5174 } 5175 5176 /* 5177 * Algorithm for vector D2L and F2I conversions: 5178 * a) Perform vector D2L/F2I cast. 5179 * b) Choose fast path if none of the result vector lanes contains the 0x80000000 value. 5180 * It signifies that the source value could be any of the special floating point 5181 * values (NaN, -Inf, Inf, Max, -Min). 5182 * c) Set destination to zero if source is a NaN value. 5183 * d) Replace 0x80000000 with MaxInt if the source lane contains a +ve value. 
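 *
 * A scalar sketch of the same per-lane fix-up (illustrative only, not code
 * that is compiled into the VM):
 *   int32_t cast_f2i(float f) {
 *     int32_t v = (int32_t)f;                  // lane result of the vector cast
 *     if (v != (int32_t)0x80000000) return v;  // fast path, no special value
 *     if (f != f) return 0;                    // NaN
 *     return (f > 0.0f) ? INT32_MAX : INT32_MIN;
 *   }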
5184 */ 5185 5186 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5187 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 5188 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 5189 int to_elem_sz = type2aelembytes(to_elem_bt); 5190 assert(to_elem_sz <= 4, ""); 5191 vcvttps2dq(dst, src, vec_enc); 5192 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc); 5193 if (to_elem_sz < 4) { 5194 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 5195 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc); 5196 } 5197 } 5198 5199 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5200 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip, 5201 Register rscratch, int vec_enc) { 5202 int to_elem_sz = type2aelembytes(to_elem_bt); 5203 assert(to_elem_sz <= 4, ""); 5204 vcvttps2dq(dst, src, vec_enc); 5205 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc); 5206 switch(to_elem_bt) { 5207 case T_INT: 5208 break; 5209 case T_SHORT: 5210 evpmovdw(dst, dst, vec_enc); 5211 break; 5212 case T_BYTE: 5213 evpmovdb(dst, dst, vec_enc); 5214 break; 5215 default: assert(false, "%s", type2name(to_elem_bt)); 5216 } 5217 } 5218 5219 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 5220 KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip, 5221 Register rscratch, int vec_enc) { 5222 evcvttps2qq(dst, src, vec_enc); 5223 vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc); 5224 } 5225 5226 // Handling for downcasting from double to integer or sub-word types on AVX2. 5227 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5228 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, 5229 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 5230 int to_elem_sz = type2aelembytes(to_elem_bt); 5231 assert(to_elem_sz < 8, ""); 5232 vcvttpd2dq(dst, src, vec_enc); 5233 vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch, 5234 float_sign_flip, vec_enc); 5235 if (to_elem_sz < 4) { 5236 // xtmp4 holds all zero lanes. 
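// (The narrowed doubleword results occupy only the low 128 bits of dst at
// this point, which is why the sub-word cast below uses a fixed AVX_128bit
// encoding.)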
5237 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit); 5238 } 5239 } 5240 5241 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, 5242 XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, 5243 KRegister ktmp2, AddressLiteral sign_flip, 5244 Register rscratch, int vec_enc) { 5245 if (VM_Version::supports_avx512dq()) { 5246 evcvttpd2qq(dst, src, vec_enc); 5247 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 5248 switch(to_elem_bt) { 5249 case T_LONG: 5250 break; 5251 case T_INT: 5252 evpmovsqd(dst, dst, vec_enc); 5253 break; 5254 case T_SHORT: 5255 evpmovsqd(dst, dst, vec_enc); 5256 evpmovdw(dst, dst, vec_enc); 5257 break; 5258 case T_BYTE: 5259 evpmovsqd(dst, dst, vec_enc); 5260 evpmovdb(dst, dst, vec_enc); 5261 break; 5262 default: assert(false, "%s", type2name(to_elem_bt)); 5263 } 5264 } else { 5265 assert(type2aelembytes(to_elem_bt) <= 4, ""); 5266 vcvttpd2dq(dst, src, vec_enc); 5267 vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 5268 switch(to_elem_bt) { 5269 case T_INT: 5270 break; 5271 case T_SHORT: 5272 evpmovdw(dst, dst, vec_enc); 5273 break; 5274 case T_BYTE: 5275 evpmovdb(dst, dst, vec_enc); 5276 break; 5277 default: assert(false, "%s", type2name(to_elem_bt)); 5278 } 5279 } 5280 } 5281 5282 #ifdef _LP64 5283 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src, 5284 AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5285 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 5286 // Perform the floor(val+0.5) operation under the influence of MXCSR.RC mode round-towards -inf, 5287 // and restore the original MXCSR.RC mode after that. 5288 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5289 5290 mov64(tmp, julong_cast(0.5L)); 5291 evpbroadcastq(xtmp1, tmp, vec_enc); 5292 vaddpd(xtmp1, src, xtmp1, vec_enc); 5293 evcvtpd2qq(dst, xtmp1, vec_enc); 5294 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 5295 double_sign_flip, vec_enc); 5296 5297 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5298 } 5299 5300 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src, 5301 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5302 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 5303 // Perform the floor(val+0.5) operation under the influence of MXCSR.RC mode round-towards -inf, 5304 // and restore the original MXCSR.RC mode after that. 
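// (Rounding to nearest with ties rounded up, i.e. Math.round semantics, is
// obtained as floor(x + 0.5); floor itself falls out of the cvtps2dq/cvtpd2qq
// conversion once MXCSR.RC is temporarily switched to round-toward-negative-
// infinity.)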
5305 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5306 5307 movl(tmp, jint_cast(0.5)); 5308 movq(xtmp1, tmp); 5309 vbroadcastss(xtmp1, xtmp1, vec_enc); 5310 vaddps(xtmp1, src, xtmp1, vec_enc); 5311 vcvtps2dq(dst, xtmp1, vec_enc); 5312 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 5313 float_sign_flip, vec_enc); 5314 5315 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5316 } 5317 5318 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src, 5319 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5320 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) { 5321 // Perform the floor(val+0.5) operation under the influence of MXCSR.RC mode round-towards -inf, 5322 // and restore the original MXCSR.RC mode after that. 5323 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5324 5325 movl(tmp, jint_cast(0.5)); 5326 movq(xtmp1, tmp); 5327 vbroadcastss(xtmp1, xtmp1, vec_enc); 5328 vaddps(xtmp1, src, xtmp1, vec_enc); 5329 vcvtps2dq(dst, xtmp1, vec_enc); 5330 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc); 5331 5332 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5333 } 5334 #endif // _LP64 5335 5336 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 5337 BasicType from_elem_bt, BasicType to_elem_bt) { 5338 switch (from_elem_bt) { 5339 case T_BYTE: 5340 switch (to_elem_bt) { 5341 case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break; 5342 case T_INT: vpmovzxbd(dst, src, vlen_enc); break; 5343 case T_LONG: vpmovzxbq(dst, src, vlen_enc); break; 5344 default: ShouldNotReachHere(); 5345 } 5346 break; 5347 case T_SHORT: 5348 switch (to_elem_bt) { 5349 case T_INT: vpmovzxwd(dst, src, vlen_enc); break; 5350 case T_LONG: vpmovzxwq(dst, src, vlen_enc); break; 5351 default: ShouldNotReachHere(); 5352 } 5353 break; 5354 case T_INT: 5355 assert(to_elem_bt == T_LONG, ""); 5356 vpmovzxdq(dst, src, vlen_enc); 5357 break; 5358 default: 5359 ShouldNotReachHere(); 5360 } 5361 } 5362 5363 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 5364 BasicType from_elem_bt, BasicType to_elem_bt) { 5365 switch (from_elem_bt) { 5366 case T_BYTE: 5367 switch (to_elem_bt) { 5368 case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break; 5369 case T_INT: vpmovsxbd(dst, src, vlen_enc); break; 5370 case T_LONG: vpmovsxbq(dst, src, vlen_enc); break; 5371 default: ShouldNotReachHere(); 5372 } 5373 break; 5374 case T_SHORT: 5375 switch (to_elem_bt) { 5376 case T_INT: vpmovsxwd(dst, src, vlen_enc); break; 5377 case T_LONG: vpmovsxwq(dst, src, vlen_enc); break; 5378 default: ShouldNotReachHere(); 5379 } 5380 break; 5381 case T_INT: 5382 assert(to_elem_bt == T_LONG, ""); 5383 vpmovsxdq(dst, src, vlen_enc); 5384 break; 5385 default: 5386 ShouldNotReachHere(); 5387 } 5388 } 5389 5390 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src, 5391 BasicType dst_bt, BasicType src_bt, int vlen) { 5392 int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen); 5393 assert(vlen_enc != AVX_512bit, ""); 5394 5395 int dst_bt_size = type2aelembytes(dst_bt); 5396 int src_bt_size = type2aelembytes(src_bt); 5397 if (dst_bt_size > src_bt_size) { 5398 switch (dst_bt_size / src_bt_size) { 5399 case 2: vpmovsxbw(dst, src, vlen_enc); break; 5400 case 4: vpmovsxbd(dst, src, 
vlen_enc); break; 5401 case 8: vpmovsxbq(dst, src, vlen_enc); break; 5402 default: ShouldNotReachHere(); 5403 } 5404 } else { 5405 assert(dst_bt_size < src_bt_size, ""); 5406 switch (src_bt_size / dst_bt_size) { 5407 case 2: { 5408 if (vlen_enc == AVX_128bit) { 5409 vpacksswb(dst, src, src, vlen_enc); 5410 } else { 5411 vpacksswb(dst, src, src, vlen_enc); 5412 vpermq(dst, dst, 0x08, vlen_enc); 5413 } 5414 break; 5415 } 5416 case 4: { 5417 if (vlen_enc == AVX_128bit) { 5418 vpackssdw(dst, src, src, vlen_enc); 5419 vpacksswb(dst, dst, dst, vlen_enc); 5420 } else { 5421 vpackssdw(dst, src, src, vlen_enc); 5422 vpermq(dst, dst, 0x08, vlen_enc); 5423 vpacksswb(dst, dst, dst, AVX_128bit); 5424 } 5425 break; 5426 } 5427 case 8: { 5428 if (vlen_enc == AVX_128bit) { 5429 vpshufd(dst, src, 0x08, vlen_enc); 5430 vpackssdw(dst, dst, dst, vlen_enc); 5431 vpacksswb(dst, dst, dst, vlen_enc); 5432 } else { 5433 vpshufd(dst, src, 0x08, vlen_enc); 5434 vpermq(dst, dst, 0x08, vlen_enc); 5435 vpackssdw(dst, dst, dst, AVX_128bit); 5436 vpacksswb(dst, dst, dst, AVX_128bit); 5437 } 5438 break; 5439 } 5440 default: ShouldNotReachHere(); 5441 } 5442 } 5443 } 5444 5445 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3, 5446 bool merge, BasicType bt, int vlen_enc) { 5447 if (bt == T_INT) { 5448 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 5449 } else { 5450 assert(bt == T_LONG, ""); 5451 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5452 } 5453 } 5454 5455 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3, 5456 bool merge, BasicType bt, int vlen_enc) { 5457 if (bt == T_INT) { 5458 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 5459 } else { 5460 assert(bt == T_LONG, ""); 5461 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5462 } 5463 } 5464 5465 #ifdef _LP64 5466 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1, 5467 Register rtmp2, XMMRegister xtmp, int mask_len, 5468 int vec_enc) { 5469 int index = 0; 5470 int vindex = 0; 5471 mov64(rtmp1, 0x0101010101010101L); 5472 pdepq(rtmp1, src, rtmp1); 5473 if (mask_len > 8) { 5474 movq(rtmp2, src); 5475 vpxor(xtmp, xtmp, xtmp, vec_enc); 5476 movq(xtmp, rtmp1); 5477 } 5478 movq(dst, rtmp1); 5479 5480 mask_len -= 8; 5481 while (mask_len > 0) { 5482 assert ((mask_len & 0x7) == 0, "mask must be multiple of 8"); 5483 index++; 5484 if ((index % 2) == 0) { 5485 pxor(xtmp, xtmp); 5486 } 5487 mov64(rtmp1, 0x0101010101010101L); 5488 shrq(rtmp2, 8); 5489 pdepq(rtmp1, rtmp2, rtmp1); 5490 pinsrq(xtmp, rtmp1, index % 2); 5491 vindex = index / 2; 5492 if (vindex) { 5493 // Write entire 16 byte vector when both 64 bit 5494 // lanes are updated to save redundant instructions. 
5495 if (index % 2) { 5496 vinsertf128(dst, dst, xtmp, vindex); 5497 } 5498 } else { 5499 vmovdqu(dst, xtmp); 5500 } 5501 mask_len -= 8; 5502 } 5503 } 5504 5505 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) { 5506 switch(opc) { 5507 case Op_VectorMaskTrueCount: 5508 popcntq(dst, tmp); 5509 break; 5510 case Op_VectorMaskLastTrue: 5511 if (VM_Version::supports_lzcnt()) { 5512 lzcntq(tmp, tmp); 5513 movl(dst, 63); 5514 subl(dst, tmp); 5515 } else { 5516 movl(dst, -1); 5517 bsrq(tmp, tmp); 5518 cmov32(Assembler::notZero, dst, tmp); 5519 } 5520 break; 5521 case Op_VectorMaskFirstTrue: 5522 if (VM_Version::supports_bmi1()) { 5523 if (masklen < 32) { 5524 orl(tmp, 1 << masklen); 5525 tzcntl(dst, tmp); 5526 } else if (masklen == 32) { 5527 tzcntl(dst, tmp); 5528 } else { 5529 assert(masklen == 64, ""); 5530 tzcntq(dst, tmp); 5531 } 5532 } else { 5533 if (masklen < 32) { 5534 orl(tmp, 1 << masklen); 5535 bsfl(dst, tmp); 5536 } else { 5537 assert(masklen == 32 || masklen == 64, ""); 5538 movl(dst, masklen); 5539 if (masklen == 32) { 5540 bsfl(tmp, tmp); 5541 } else { 5542 bsfq(tmp, tmp); 5543 } 5544 cmov32(Assembler::notZero, dst, tmp); 5545 } 5546 } 5547 break; 5548 case Op_VectorMaskToLong: 5549 assert(dst == tmp, "Dst and tmp should be the same for toLong operations"); 5550 break; 5551 default: assert(false, "Unhandled mask operation"); 5552 } 5553 } 5554 5555 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp, 5556 int masklen, int masksize, int vec_enc) { 5557 assert(VM_Version::supports_popcnt(), ""); 5558 5559 if (VM_Version::supports_avx512bw()) { 5560 kmovql(tmp, mask); 5561 } else { 5562 assert(masklen <= 16, ""); 5563 kmovwl(tmp, mask); 5564 } 5565 5566 // Masks generated out of partial vector comparison/replicate/mask manipulation 5567 // operations need to be clipped. 5568 if (masksize < 16 && opc != Op_VectorMaskFirstTrue) { 5569 andq(tmp, (1 << masklen) - 1); 5570 } 5571 5572 vector_mask_operation_helper(opc, dst, tmp, masklen); 5573 } 5574 5575 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp, 5576 Register tmp, int masklen, BasicType bt, int vec_enc) { 5577 assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) || 5578 (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), ""); 5579 assert(VM_Version::supports_popcnt(), ""); 5580 5581 bool need_clip = false; 5582 switch(bt) { 5583 case T_BOOLEAN: 5584 // While masks of other types contain 0 or -1, boolean masks contain lane values of 0 or 1. 5585 vpxor(xtmp, xtmp, xtmp, vec_enc); 5586 vpsubb(xtmp, xtmp, mask, vec_enc); 5587 vpmovmskb(tmp, xtmp, vec_enc); 5588 need_clip = masklen < 16; 5589 break; 5590 case T_BYTE: 5591 vpmovmskb(tmp, mask, vec_enc); 5592 need_clip = masklen < 16; 5593 break; 5594 case T_SHORT: 5595 vpacksswb(xtmp, mask, mask, vec_enc); 5596 if (masklen >= 16) { 5597 vpermpd(xtmp, xtmp, 8, vec_enc); 5598 } 5599 vpmovmskb(tmp, xtmp, Assembler::AVX_128bit); 5600 need_clip = masklen < 16; 5601 break; 5602 case T_INT: 5603 case T_FLOAT: 5604 vmovmskps(tmp, mask, vec_enc); 5605 need_clip = masklen < 4; 5606 break; 5607 case T_LONG: 5608 case T_DOUBLE: 5609 vmovmskpd(tmp, mask, vec_enc); 5610 need_clip = masklen < 2; 5611 break; 5612 default: assert(false, "Unhandled type, %s", type2name(bt)); 5613 } 5614 5615 // Masks generated out of partial vector comparison/replicate/mask manipulation 5616 // operations need to be clipped. 
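// e.g. an 8-lane byte mask read via vpmovmskb still carries 16 mask bits;
// the stale upper bits would corrupt the popcnt/bsf style reductions in
// vector_mask_operation_helper.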
5617 if (need_clip && opc != Op_VectorMaskFirstTrue) { 5618 // need_clip implies masklen < 32 5619 andq(tmp, (1 << masklen) - 1); 5620 } 5621 5622 vector_mask_operation_helper(opc, dst, tmp, masklen); 5623 } 5624 5625 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1, 5626 Register rtmp2, int mask_len) { 5627 kmov(rtmp1, src); 5628 andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len))); 5629 mov64(rtmp2, -1L); 5630 pextq(rtmp2, rtmp2, rtmp1); 5631 kmov(dst, rtmp2); 5632 } 5633 5634 #ifdef _LP64 5635 void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src, 5636 XMMRegister mask, Register rtmp, Register rscratch, 5637 XMMRegister permv, XMMRegister xtmp, BasicType bt, 5638 int vec_enc) { 5639 assert(type2aelembytes(bt) >= 4, ""); 5640 assert(opcode == Op_CompressV || opcode == Op_ExpandV, ""); 5641 address compress_perm_table = nullptr; 5642 address expand_perm_table = nullptr; 5643 if (type2aelembytes(bt) == 8) { 5644 compress_perm_table = StubRoutines::x86::compress_perm_table64(); 5645 expand_perm_table = StubRoutines::x86::expand_perm_table64(); 5646 vmovmskpd(rtmp, mask, vec_enc); 5647 } else { 5648 compress_perm_table = StubRoutines::x86::compress_perm_table32(); 5649 expand_perm_table = StubRoutines::x86::expand_perm_table32(); 5650 vmovmskps(rtmp, mask, vec_enc); 5651 } 5652 shlq(rtmp, 5); // for 32 byte permute row. 5653 if (opcode == Op_CompressV) { 5654 lea(rscratch, ExternalAddress(compress_perm_table)); 5655 } else { 5656 lea(rscratch, ExternalAddress(expand_perm_table)); 5657 } 5658 addptr(rtmp, rscratch); 5659 vmovdqu(permv, Address(rtmp)); 5660 vpermps(dst, permv, src, Assembler::AVX_256bit); 5661 vpxor(xtmp, xtmp, xtmp, vec_enc); 5662 // Blend the result with a zero vector using the permute mask: each column entry 5663 // in a permute table row contains either a valid permute index or -1 (the 5664 // default) value, so the row can double as a blending mask after 5665 // compressing/expanding the source vector lanes. 
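// The -1 entries have their most significant bit set, so the vblendvps
// below substitutes zero into exactly those destination lanes that received
// no source element.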
5666 vblendvps(dst, dst, xtmp, permv, vec_enc, false, permv); 5667 } 5668 #endif 5669 5670 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask, 5671 bool merge, BasicType bt, int vec_enc) { 5672 if (opcode == Op_CompressV) { 5673 switch(bt) { 5674 case T_BYTE: 5675 evpcompressb(dst, mask, src, merge, vec_enc); 5676 break; 5677 case T_CHAR: 5678 case T_SHORT: 5679 evpcompressw(dst, mask, src, merge, vec_enc); 5680 break; 5681 case T_INT: 5682 evpcompressd(dst, mask, src, merge, vec_enc); 5683 break; 5684 case T_FLOAT: 5685 evcompressps(dst, mask, src, merge, vec_enc); 5686 break; 5687 case T_LONG: 5688 evpcompressq(dst, mask, src, merge, vec_enc); 5689 break; 5690 case T_DOUBLE: 5691 evcompresspd(dst, mask, src, merge, vec_enc); 5692 break; 5693 default: 5694 fatal("Unsupported type %s", type2name(bt)); 5695 break; 5696 } 5697 } else { 5698 assert(opcode == Op_ExpandV, ""); 5699 switch(bt) { 5700 case T_BYTE: 5701 evpexpandb(dst, mask, src, merge, vec_enc); 5702 break; 5703 case T_CHAR: 5704 case T_SHORT: 5705 evpexpandw(dst, mask, src, merge, vec_enc); 5706 break; 5707 case T_INT: 5708 evpexpandd(dst, mask, src, merge, vec_enc); 5709 break; 5710 case T_FLOAT: 5711 evexpandps(dst, mask, src, merge, vec_enc); 5712 break; 5713 case T_LONG: 5714 evpexpandq(dst, mask, src, merge, vec_enc); 5715 break; 5716 case T_DOUBLE: 5717 evexpandpd(dst, mask, src, merge, vec_enc); 5718 break; 5719 default: 5720 fatal("Unsupported type %s", type2name(bt)); 5721 break; 5722 } 5723 } 5724 } 5725 #endif 5726 5727 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5728 KRegister ktmp1, int vec_enc) { 5729 if (opcode == Op_SignumVD) { 5730 vsubpd(dst, zero, one, vec_enc); 5731 // dst = (src < 0) ? -1 : 1 5732 evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5733 evblendmpd(dst, ktmp1, one, dst, true, vec_enc); 5734 // if src is NaN, -0.0 or 0.0, return src. 5735 evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5736 evblendmpd(dst, ktmp1, dst, src, true, vec_enc); 5737 } else { 5738 assert(opcode == Op_SignumVF, ""); 5739 vsubps(dst, zero, one, vec_enc); 5740 // dst = (src < 0) ? -1 : 1 5741 evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5742 evblendmps(dst, ktmp1, one, dst, true, vec_enc); 5743 // if src is NaN, -0.0 or 0.0, return src. 5744 evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5745 evblendmps(dst, ktmp1, dst, src, true, vec_enc); 5746 } 5747 } 5748 5749 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5750 XMMRegister xtmp1, int vec_enc) { 5751 if (opcode == Op_SignumVD) { 5752 vsubpd(dst, zero, one, vec_enc); 5753 // dst = (src < 0) ? -1 : 1 5754 vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1); 5755 // if src is NaN, -0.0 or 0.0, return src. 5756 vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc); 5757 vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1); 5758 } else { 5759 assert(opcode == Op_SignumVF, ""); 5760 vsubps(dst, zero, one, vec_enc); 5761 // dst = (src < 0) ? -1 : 1 5762 vblendvps(dst, one, dst, src, vec_enc, true, xtmp1); 5763 // if src is NaN, -0.0 or 0.0, return src. 
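// (EQ_UQ is an unordered compare, so the mask covers the +/-0.0 lanes and
// the NaN lanes alike; the blend below leaves all of them as src.)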
5764 vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc); 5765 vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1); 5766 } 5767 } 5768 5769 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) { 5770 if (VM_Version::supports_avx512bw()) { 5771 if (mask_len > 32) { 5772 kmovql(dst, src); 5773 } else { 5774 kmovdl(dst, src); 5775 if (mask_len != 32) { 5776 kshiftrdl(dst, dst, 32 - mask_len); 5777 } 5778 } 5779 } else { 5780 assert(mask_len <= 16, ""); 5781 kmovwl(dst, src); 5782 if (mask_len != 16) { 5783 kshiftrwl(dst, dst, 16 - mask_len); 5784 } 5785 } 5786 } 5787 5788 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) { 5789 int lane_size = type2aelembytes(bt); 5790 bool is_LP64 = LP64_ONLY(true) NOT_LP64(false); 5791 if ((is_LP64 || lane_size < 8) && 5792 ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) || 5793 (is_subword_type(bt) && VM_Version::supports_avx512vlbw()))) { 5794 movptr(rtmp, imm32); 5795 switch(lane_size) { 5796 case 1 : evpbroadcastb(dst, rtmp, vec_enc); break; 5797 case 2 : evpbroadcastw(dst, rtmp, vec_enc); break; 5798 case 4 : evpbroadcastd(dst, rtmp, vec_enc); break; 5799 case 8 : evpbroadcastq(dst, rtmp, vec_enc); break; 5800 default : fatal("Unsupported lane size %d", lane_size); 5801 break; 5802 } 5803 } else { 5804 movptr(rtmp, imm32); 5805 LP64_ONLY(movq(dst, rtmp)) NOT_LP64(movdl(dst, rtmp)); 5806 switch(lane_size) { 5807 case 1 : vpbroadcastb(dst, dst, vec_enc); break; 5808 case 2 : vpbroadcastw(dst, dst, vec_enc); break; 5809 case 4 : vpbroadcastd(dst, dst, vec_enc); break; 5810 case 8 : vpbroadcastq(dst, dst, vec_enc); break; 5811 default : fatal("Unsupported lane size %d", lane_size); 5812 break; 5813 } 5814 } 5815 } 5816 5817 // 5818 // Following is the lookup table based popcount computation algorithm: 5819 // Index Bit set count 5820 // [ 0000 -> 0, 5821 // 0001 -> 1, 5822 // 0010 -> 1, 5823 // 0011 -> 2, 5824 // 0100 -> 1, 5825 // 0101 -> 2, 5826 // 0110 -> 2, 5827 // 0111 -> 3, 5828 // 1000 -> 1, 5829 // 1001 -> 2, 5830 // 1010 -> 2, 5831 // 1011 -> 3, 5832 // 1100 -> 2, 5833 // 1101 -> 3, // 1110 -> 3, 5834 // 1111 -> 4 ] 5835 // a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as 5836 // shuffle indices for lookup table access. 5837 // b. Right shift each byte of vector lane by 4 positions. 5838 // c. Count the number of 1s in 4 MSB bits of each byte. These bits are used as 5839 // shuffle indices for lookup table access. 5840 // d. Add the bitset count of upper and lower 4 bits of each byte. 5841 // e. Unpack double words to quad words and compute sum of absolute difference of bitset 5842 // count of all the bytes of a quadword. 5843 // f. Perform step e. for upper 128bit vector lane. 5844 // g. Pack the bitset count of quadwords back to double word. 5845 // h. Unpacking and packing operations are not needed for 64bit vector lane. 
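// A scalar model of steps a.-d. for a single byte (illustrative sketch only,
// not compiled into the VM):
//   static const uint8_t lut[16] = {0,1,1,2, 1,2,2,3, 1,2,2,3, 2,3,3,4};
//   uint8_t popcount_byte(uint8_t b) {
//     return lut[b & 0x0F] + lut[b >> 4];
//   }
// The vector code below performs both lookups with vpshufb against the same
// table held in xtmp2.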
5846 5847 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5848 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5849 assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), ""); 5850 vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc); 5851 vpsrlw(dst, src, 4, vec_enc); 5852 vpand(dst, dst, xtmp1, vec_enc); 5853 vpand(xtmp1, src, xtmp1, vec_enc); 5854 vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg); 5855 vpshufb(xtmp1, xtmp2, xtmp1, vec_enc); 5856 vpshufb(dst, xtmp2, dst, vec_enc); 5857 vpaddb(dst, dst, xtmp1, vec_enc); 5858 } 5859 5860 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5861 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5862 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5863 // Following code is as per steps e, f, g and h of the above algorithm. 5864 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5865 vpunpckhdq(dst, xtmp1, xtmp2, vec_enc); 5866 vpsadbw(dst, dst, xtmp2, vec_enc); 5867 vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc); 5868 vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc); 5869 vpackuswb(dst, xtmp1, dst, vec_enc); 5870 } 5871 5872 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5873 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5874 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5875 // Add the popcount of the upper and lower bytes of each word. 5876 vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc); 5877 vpsrlw(dst, xtmp1, 8, vec_enc); 5878 vpand(xtmp1, xtmp1, xtmp2, vec_enc); 5879 vpaddw(dst, dst, xtmp1, vec_enc); 5880 } 5881 5882 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5883 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5884 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5885 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5886 vpsadbw(dst, xtmp1, xtmp2, vec_enc); 5887 } 5888 5889 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5890 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5891 switch(bt) { 5892 case T_LONG: 5893 vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5894 break; 5895 case T_INT: 5896 vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5897 break; 5898 case T_CHAR: 5899 case T_SHORT: 5900 vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5901 break; 5902 case T_BYTE: 5903 case T_BOOLEAN: 5904 vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5905 break; 5906 default: 5907 fatal("Unsupported type %s", type2name(bt)); 5908 break; 5909 } 5910 } 5911 5912 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src, 5913 KRegister mask, bool merge, int vec_enc) { 5914 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 5915 switch(bt) { 5916 case T_LONG: 5917 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 5918 evpopcntq(dst, mask, src, merge, vec_enc); 5919 break; 5920 case T_INT: 5921 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 5922 evpopcntd(dst, mask, src, merge, vec_enc); 5923 break; 5924 case T_CHAR: 5925 case T_SHORT: 5926 assert(VM_Version::supports_avx512_bitalg(), ""); 5927 evpopcntw(dst, mask, src, merge, vec_enc); 5928 break; 5929 case T_BYTE: 5930 case T_BOOLEAN: 5931 assert(VM_Version::supports_avx512_bitalg(), ""); 5932 evpopcntb(dst, mask, 
src, merge, vec_enc); 5933 break; 5934 default: 5935 fatal("Unsupported type %s", type2name(bt)); 5936 break; 5937 } 5938 } 5939 5940 #ifndef _LP64 5941 void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) { 5942 assert(VM_Version::supports_avx512bw(), ""); 5943 kmovdl(tmp, src); 5944 kunpckdql(dst, tmp, tmp); 5945 } 5946 #endif 5947 5948 // The bit reversal algorithm first reverses the bits of each byte, followed by 5949 // a byte level reversal for multi-byte primitive types (short/int/long). 5950 // The algorithm performs a lookup table access to get the reverse bit sequence 5951 // corresponding to a 4 bit value. Thus a reverse bit sequence for a byte 5952 // is obtained by swapping the reverse bit sequences of upper and lower 5953 // nibble of a byte. 5954 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5955 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5956 if (VM_Version::supports_avx512vlbw()) { 5957 5958 // Get the reverse bit sequence of lower nibble of each byte. 5959 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg); 5960 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 5961 evpandq(dst, xtmp2, src, vec_enc); 5962 vpshufb(dst, xtmp1, dst, vec_enc); 5963 vpsllq(dst, dst, 4, vec_enc); 5964 5965 // Get the reverse bit sequence of upper nibble of each byte. 5966 vpandn(xtmp2, xtmp2, src, vec_enc); 5967 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 5968 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 5969 5970 // Perform logical OR operation between the left shifted reverse bit sequence of the lower nibble and 5971 // the right shifted reverse bit sequence of the upper nibble to obtain the reverse bit sequence of each byte. 5972 evporq(xtmp2, dst, xtmp2, vec_enc); 5973 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 5974 5975 } else if (vec_enc == Assembler::AVX_512bit) { 5976 // Shift based bit reversal. 5977 assert(bt == T_LONG || bt == T_INT, ""); 5978 5979 // Swap lower and upper nibble of each byte. 5980 vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc); 5981 5982 // Swap two least and most significant bits of each nibble. 5983 vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc); 5984 5985 // Swap adjacent pair of bits. 5986 evmovdqul(xtmp1, k0, dst, true, vec_enc); 5987 vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc); 5988 5989 evmovdqul(xtmp1, k0, dst, true, vec_enc); 5990 vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc); 5991 } else { 5992 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp); 5993 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 5994 5995 // Get the reverse bit sequence of lower nibble of each byte. 5996 vpand(dst, xtmp2, src, vec_enc); 5997 vpshufb(dst, xtmp1, dst, vec_enc); 5998 vpsllq(dst, dst, 4, vec_enc); 5999 6000 // Get the reverse bit sequence of upper nibble of each byte. 6001 vpandn(xtmp2, xtmp2, src, vec_enc); 6002 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 6003 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 6004 6005 // Perform logical OR operation between the left shifted reverse bit sequence of the lower nibble and 6006 // the right shifted reverse bit sequence of the upper nibble to obtain the reverse bit sequence of each byte. 
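// After the OR below, each byte of xtmp2 holds its bits fully reversed; the
// vector_reverse_byte call then performs the byte-level swap for multi-byte
// element types.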
6007 vpor(xtmp2, dst, xtmp2, vec_enc); 6008 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 6009 } 6010 } 6011 6012 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc, 6013 XMMRegister xtmp, Register rscratch) { 6014 assert(VM_Version::supports_gfni(), ""); 6015 assert(rscratch != noreg || always_reachable(mask), "missing"); 6016 6017 // Galois field instruction based bit reversal, based on the following algorithm: 6018 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 6019 vpbroadcastq(xtmp, mask, vec_enc, rscratch); 6020 vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc); 6021 vector_reverse_byte(bt, dst, xtmp, vec_enc); 6022 } 6023 6024 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src, 6025 XMMRegister xtmp1, Register rtmp, int vec_enc) { 6026 vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc); 6027 evpandq(dst, xtmp1, src, vec_enc); 6028 vpsllq(dst, dst, nbits, vec_enc); 6029 vpandn(xtmp1, xtmp1, src, vec_enc); 6030 vpsrlq(xtmp1, xtmp1, nbits, vec_enc); 6031 evporq(dst, dst, xtmp1, vec_enc); 6032 } 6033 6034 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6035 XMMRegister xtmp2, Register rtmp, int vec_enc) { 6036 // Shift based bit reversal. 6037 assert(VM_Version::supports_evex(), ""); 6038 switch(bt) { 6039 case T_LONG: 6040 // Swap upper and lower double word of each quad word. 6041 evprorq(xtmp1, k0, src, 32, true, vec_enc); 6042 evprord(xtmp1, k0, xtmp1, 16, true, vec_enc); 6043 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 6044 break; 6045 case T_INT: 6046 // Swap upper and lower word of each double word. 6047 evprord(xtmp1, k0, src, 16, true, vec_enc); 6048 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 6049 break; 6050 case T_CHAR: 6051 case T_SHORT: 6052 // Swap upper and lower byte of each word. 6053 vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc); 6054 break; 6055 case T_BYTE: 6056 evmovdquq(dst, k0, src, true, vec_enc); 6057 break; 6058 default: 6059 fatal("Unsupported type %s", type2name(bt)); 6060 break; 6061 } 6062 } 6063 6064 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) { 6065 if (bt == T_BYTE) { 6066 if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) { 6067 evmovdquq(dst, k0, src, true, vec_enc); 6068 } else { 6069 vmovdqu(dst, src); 6070 } 6071 return; 6072 } 6073 // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using 6074 // pre-computed shuffle indices. 
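// e.g. for T_INT the permutation constant holds 3,2,1,0, 7,6,5,4, ... so
// that the vpshufb below swaps the four bytes within every doubleword lane.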
6075 switch(bt) { 6076 case T_LONG: 6077 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg); 6078 break; 6079 case T_INT: 6080 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg); 6081 break; 6082 case T_CHAR: 6083 case T_SHORT: 6084 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg); 6085 break; 6086 default: 6087 fatal("Unsupported type %s", type2name(bt)); 6088 break; 6089 } 6090 vpshufb(dst, src, dst, vec_enc); 6091 } 6092 6093 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src, 6094 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 6095 KRegister ktmp, Register rtmp, bool merge, int vec_enc) { 6096 assert(is_integral_type(bt), ""); 6097 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 6098 assert(VM_Version::supports_avx512cd(), ""); 6099 switch(bt) { 6100 case T_LONG: 6101 evplzcntq(dst, ktmp, src, merge, vec_enc); 6102 break; 6103 case T_INT: 6104 evplzcntd(dst, ktmp, src, merge, vec_enc); 6105 break; 6106 case T_SHORT: 6107 vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc); 6108 vpunpcklwd(xtmp2, xtmp1, src, vec_enc); 6109 evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc); 6110 vpunpckhwd(dst, xtmp1, src, vec_enc); 6111 evplzcntd(dst, ktmp, dst, merge, vec_enc); 6112 vpackusdw(dst, xtmp2, dst, vec_enc); 6113 break; 6114 case T_BYTE: 6115 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 6116 // accessing the lookup table. 6117 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 6118 // accessing the lookup table. 6119 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 6120 assert(VM_Version::supports_avx512bw(), ""); 6121 evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp); 6122 vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc); 6123 vpand(xtmp2, dst, src, vec_enc); 6124 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 6125 vpsrlw(xtmp3, src, 4, vec_enc); 6126 vpand(xtmp3, dst, xtmp3, vec_enc); 6127 vpshufb(dst, xtmp1, xtmp3, vec_enc); 6128 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 6129 evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc); 6130 evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc); 6131 break; 6132 default: 6133 fatal("Unsupported type %s", type2name(bt)); 6134 break; 6135 } 6136 } 6137 6138 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6139 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6140 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp); 6141 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 6142 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 6143 // accessing the lookup table. 6144 vpand(dst, xtmp2, src, vec_enc); 6145 vpshufb(dst, xtmp1, dst, vec_enc); 6146 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 6147 // accessing the lookup table. 6148 vpsrlw(xtmp3, src, 4, vec_enc); 6149 vpand(xtmp3, xtmp2, xtmp3, vec_enc); 6150 vpshufb(xtmp2, xtmp1, xtmp3, vec_enc); 6151 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 
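// (The lookup table maps nibble value 0 to 4, so when the high nibble is
// zero the sum lut[high] + lut[low] naturally yields 4 + lzcnt(low nibble);
// otherwise lut[high] alone is the byte's count.)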
  vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
  vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
  vpaddb(dst, dst, xtmp2, vec_enc);
  vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
}

void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
  // Add zero counts of lower byte and upper byte of a word if
  // upper byte holds a zero value.
  vpsrlw(xtmp3, src, 8, vec_enc);
  // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
  vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
  vpsllw(xtmp2, dst, 8, vec_enc);
  vpaddw(xtmp2, xtmp2, dst, vec_enc);
  vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
  vpsrlw(dst, dst, 8, vec_enc);
}

void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                           XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
  // IEEE 754 floating point format normalizes the mantissa to a 1.x form, hence the
  // biased exponent of the converted value can be used to compute the leading zero
  // count as per the following formula:
  //   LZCNT = 31 - (biased_exp - 127)
  // Zero, max_int and negative source values need the special handling below.

  // Broadcast 0xFF
  vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);
  vpsrld(xtmp1, xtmp1, 24, vec_enc);

  // Extract biased exponent.
  vcvtdq2ps(dst, src, vec_enc);
  vpsrld(dst, dst, 23, vec_enc);
  vpand(dst, dst, xtmp1, vec_enc);

  // Broadcast 127.
  vpsrld(xtmp1, xtmp1, 1, vec_enc);
  // Exponent = biased_exp - 127
  vpsubd(dst, dst, xtmp1, vec_enc);

  // Exponent = Exponent + 1
  vpsrld(xtmp3, xtmp1, 6, vec_enc);
  vpaddd(dst, dst, xtmp3, vec_enc);

  // Replace a negative exponent with zero; the exponent is negative when the
  // corresponding src lane contains a zero value.
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  vblendvps(dst, dst, xtmp2, dst, vec_enc);

  // Rematerialize broadcast 32.
  vpslld(xtmp1, xtmp3, 5, vec_enc);
  // Exponent is 32 if corresponding source lane contains max_int value.
  vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
  // LZCNT = 32 - exponent
  vpsubd(dst, xtmp1, dst, vec_enc);

  // Replace LZCNT with a value 1 if corresponding source lane
  // contains max_int value.
  vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc);

  // Replace LZCNT with 0 if the source lane value is less than zero.
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  vblendvps(dst, dst, xtmp2, src, vec_enc);
}

void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                            XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
  // Add zero counts of lower word and upper word of a double word if
  // upper word holds a zero value.
  vpsrld(xtmp3, src, 16, vec_enc);
  // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
  vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc);
  vpslld(xtmp2, dst, 16, vec_enc);
  vpaddd(xtmp2, xtmp2, dst, vec_enc);
  vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
  vpsrld(dst, dst, 16, vec_enc);
  // Add zero counts of lower doubleword and upper doubleword of a
  // quadword if upper doubleword holds a zero value.
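  // Scalar sketch of this merge step (illustrative; hi32/lo32 denote the upper and
  // lower halves of a quadword):
  //   clz64(x) = (hi32(x) == 0) ? 32 + clz32(lo32(x)) : clz32(hi32(x))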
  vpsrlq(xtmp3, src, 32, vec_enc);
  vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc);
  vpsllq(xtmp2, dst, 32, vec_enc);
  vpaddq(xtmp2, xtmp2, dst, vec_enc);
  vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
  vpsrlq(dst, dst, 32, vec_enc);
}

void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
                                                       XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
                                                       Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "unexpected type");
  assert(vec_enc < Assembler::AVX_512bit, "");
  switch(bt) {
    case T_LONG:
      vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
      break;
    case T_INT:
      vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
      break;
    case T_SHORT:
      vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
      break;
    case T_BYTE:
      vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
}

void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
  switch(bt) {
    case T_BYTE:
      vpsubb(dst, src1, src2, vec_enc);
      break;
    case T_SHORT:
      vpsubw(dst, src1, src2, vec_enc);
      break;
    case T_INT:
      vpsubd(dst, src1, src2, vec_enc);
      break;
    case T_LONG:
      vpsubq(dst, src1, src2, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
}

// Trailing zero count computation is based on the leading zero count operation, as
// per the following identity. All AVX3 targets support the AVX512CD feature, which
// offers a direct vector instruction to compute the leading zero count.
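// A worked example (illustrative) of the identity below: for x = 0b01000 in an 8-bit
// lane, (x - 1) & ~x = 0b00111, whose CLZ is 5, giving CTZ = 8 - 5 = 3.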
// CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
                                                         XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "");
  // xtmp = -1
  vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
  // xtmp = xtmp + src
  vpadd(bt, xtmp4, xtmp4, src, vec_enc);
  // xtmp = xtmp & ~src
  vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
  vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
  vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
  vpsub(bt, dst, xtmp4, dst, vec_enc);
}

// Trailing zero count computation for AVX2 targets is based on the popcount
// operation, as per the following identity:
// CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                        XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "");
  // xtmp = 0
  vpxor(xtmp3, xtmp3, xtmp3, vec_enc);
  // xtmp = 0 - src
  vpsub(bt, xtmp3, xtmp3, src, vec_enc);
  // xtmp = xtmp | src
  vpor(xtmp3, xtmp3, src, vec_enc);
  vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
  vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
  vpsub(bt, dst, xtmp1, dst, vec_enc);
}

void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
  movl(rdx, rax);
  subl(rdx, divisor);
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rdx, rax);
  } else {
    notl(rdx);
    andl(rax, rdx);
  }
  shrl(rax, 31);
  bind(done);
}

void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath when divisor < 0:
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
  movl(rdx, rax);
  subl(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rax, rdx);
  } else {
    notl(rax);
    andl(rax, rdx);
  }
  sarl(rax, 31);
  andl(rax, divisor);
  subl(rdx, rax);
  bind(done);
}

void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
  Label done;
  Label neg_divisor_fastpath;

  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
  // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
  movl(rdx, rax);
  subl(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rax, rdx);
  } else {
    notl(rax);
    andl(rax, rdx);
  }
  movl(tmp, rax);
  shrl(rax, 31); // quotient
  sarl(tmp, 31);
  andl(tmp, divisor);
  subl(rdx, tmp); // remainder
  bind(done);
}

#ifdef _LP64
void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
                                 XMMRegister xtmp2, Register rtmp) {
  if (VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal, following the algorithm described at
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
    mov64(rtmp, 0x8040201008040201L);
    movq(xtmp1, src);
    movq(xtmp2, rtmp);
    gf2p8affineqb(xtmp1, xtmp2, 0);
    movq(dst, xtmp1);
  } else {
    // Swap even and odd numbered bits.
    movl(rtmp, src);
    andl(rtmp, 0x55555555);
    shll(rtmp, 1);
    movl(dst, src);
    andl(dst, 0xAAAAAAAA);
    shrl(dst, 1);
    orl(dst, rtmp);

    // Swap LSB and MSB 2 bits of each nibble.
    movl(rtmp, dst);
    andl(rtmp, 0x33333333);
    shll(rtmp, 2);
    andl(dst, 0xCCCCCCCC);
    shrl(dst, 2);
    orl(dst, rtmp);

    // Swap LSB and MSB 4 bits of each byte.
    movl(rtmp, dst);
    andl(rtmp, 0x0F0F0F0F);
    shll(rtmp, 4);
    andl(dst, 0xF0F0F0F0);
    shrl(dst, 4);
    orl(dst, rtmp);
  }
  bswapl(dst);
}

void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
                                 XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
  if (VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal, following the algorithm described at
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
    mov64(rtmp1, 0x8040201008040201L);
    movq(xtmp1, src);
    movq(xtmp2, rtmp1);
    gf2p8affineqb(xtmp1, xtmp2, 0);
    movq(dst, xtmp1);
  } else {
    // Swap even and odd numbered bits.
    movq(rtmp1, src);
    mov64(rtmp2, 0x5555555555555555L);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 1);
    movq(dst, src);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 1);
    orq(dst, rtmp1);

    // Swap LSB and MSB 2 bits of each nibble.
    movq(rtmp1, dst);
    mov64(rtmp2, 0x3333333333333333L);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 2);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 2);
    orq(dst, rtmp1);

    // Swap LSB and MSB 4 bits of each byte.
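    // For reference, an illustrative scalar sketch of the whole sequence (the three
    // masked swaps plus the final byte swap), cf. java.lang.Long.reverse:
    //   x = ((x & 0x5555555555555555L) << 1) | ((x >>> 1) & 0x5555555555555555L);
    //   x = ((x & 0x3333333333333333L) << 2) | ((x >>> 2) & 0x3333333333333333L);
    //   x = ((x & 0x0F0F0F0F0F0F0F0FL) << 4) | ((x >>> 4) & 0x0F0F0F0F0F0F0F0FL);
    //   x = Long.reverseBytes(x);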
    movq(rtmp1, dst);
    mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 4);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 4);
    orq(dst, rtmp1);
  }
  bswapq(dst);
}

void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divq(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
  movq(rdx, rax);
  subq(rdx, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rdx, rax);
  } else {
    notq(rdx);
    andq(rax, rdx);
  }
  shrq(rax, 63);
  bind(done);
}

void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorq(rdx, rdx);
  divq(divisor);
  jmp(done);
  bind(neg_divisor_fastpath);
  // Fastpath when divisor < 0:
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
  movq(rdx, rax);
  subq(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rax, rdx);
  } else {
    notq(rax);
    andq(rax, rdx);
  }
  sarq(rax, 63);
  andq(rax, divisor);
  subq(rdx, rax);
  bind(done);
}

void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorq(rdx, rdx);
  divq(divisor);
  jmp(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
  // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
  movq(rdx, rax);
  subq(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rax, rdx);
  } else {
    notq(rax);
    andq(rax, rdx);
  }
  movq(tmp, rax);
  shrq(rax, 63); // quotient
  sarq(tmp, 63);
  andq(tmp, divisor);
  subq(rdx, tmp); // remainder
  bind(done);
}
#endif

void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
                                        XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
                                        int vlen_enc) {
  assert(VM_Version::supports_avx512bw(), "");
  // Byte shuffles are inlane operations: vpshufb derives each in-lane position from
  // the lower 4 bits of the corresponding shuffle byte, so all shuffle indices are
  // effectively normalized to the range 0-15. As a consequence, indices that differ
  // by a multiple of 16 map to the same relative position within a 128-bit lane,
  // e.g. shuffle indices 16, 32 and 48 all select the 0th element of their
  // respective 128-bit lanes.
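  // For example (illustrative): shuffle index 37 = 2 * 16 + 5 keeps in-lane position
  // 5 (37 & 0xF), while the lane-range checks below select the third 128-bit lane
  // (indices 32-47) as its source.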
  movl(rtmp, 16);
  evpbroadcastb(xtmp1, rtmp, vlen_enc);

  // Compute a mask for the shuffle vector by comparing indices against the expression
  // INDEX < 16. Broadcast the first 128-bit lane across the entire vector, shuffle it
  // using the original shuffle indices, and move the shuffled bytes corresponding to a
  // true mask into the destination vector.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
  evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);

  // Perform the above steps with the lane comparison expression INDEX >= 16 && INDEX < 32,
  // broadcasting the second 128-bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
  vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Perform the above steps with the lane comparison expression INDEX >= 32 && INDEX < 48,
  // broadcasting the third 128-bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp2, Assembler::nlt, true, vlen_enc);
  vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Perform the above steps with the lane comparison expression INDEX >= 48 && INDEX < 64,
  // broadcasting the fourth 128-bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
  vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
}

void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
                                                   XMMRegister shuffle, XMMRegister src, int vlen_enc) {
  if (vlen_enc == AVX_128bit) {
    vpermilps(dst, src, shuffle, vlen_enc);
  } else if (bt == T_INT) {
    vpermd(dst, shuffle, src, vlen_enc);
  } else {
    assert(bt == T_FLOAT, "");
    vpermps(dst, shuffle, src, vlen_enc);
  }
}