/*
 * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "oops/methodData.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/output.hpp"
#include "opto/opcodes.hpp"
#include "opto/subnode.hpp"
#include "runtime/globals.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/checkedCast.hpp"
#include "utilities/globalDefinitions.hpp"
#include "utilities/powerOfTwo.hpp"
#include "utilities/sizes.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

// C2 compiled method's prolog code.
void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {

  // WARNING: Initial instruction MUST be 5 bytes or longer so that
  // NativeJump::patch_verified_entry will be able to patch out the entry
  // code safely. The push to verify stack depth is ok at 5 bytes,
  // the frame allocation can be either 3 or 6 bytes. So if we don't do
  // stack bang then we must use the 6 byte frame allocation even if
  // we have no frame. :-(
  assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");

  assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  // Remove word for return addr
  framesize -= wordSize;
  stack_bang_size -= wordSize;

  // Calls to C2R adapters often do not accept exceptional returns.
  // We require that their callers must bang for them.  But be careful, because
  // some VM calls (such as call site linkage) can use several kilobytes of
  // stack.  But the stack safety zone should account for that.
  // See bugs 4446381, 4468289, 4497237.
  if (stack_bang_size > 0) {
    generate_stack_overflow_check(stack_bang_size);

    // We always push rbp, so that on return to interpreter rbp, will be
    // restored correctly and we can correct the stack.
    push(rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
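    // (With -XX:+PreserveFramePointer, rbp stays dedicated to frame linkage for the
    //  benefit of external profilers and debuggers; otherwise C2 is free to reuse
    //  rbp as an ordinary allocatable register.)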
    if (PreserveFramePointer) {
      mov(rbp, rsp);
    }
    // Remove word for ebp
    framesize -= wordSize;

    // Create frame
    if (framesize) {
      subptr(rsp, framesize);
    }
  } else {
    // Create frame (force generation of a 4 byte immediate value)
    subptr_imm32(rsp, framesize);

    // Save RBP register now.
    framesize -= wordSize;
    movptr(Address(rsp, framesize), rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      movptr(rbp, rsp);
      if (framesize > 0) {
        addptr(rbp, framesize);
      }
    }
  }

  if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
    framesize -= wordSize;
    movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
  }

#ifndef _LP64
  // If method sets FPU control word do it now
  if (fp_mode_24b) {
    fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24()));
  }
  if (UseSSE >= 2 && VerifyFPU) {
    verify_FPU(0, "FPU stack must be clean on entry");
  }
#endif

#ifdef ASSERT
  if (VerifyStackAtCalls) {
    Label L;
    push(rax);
    mov(rax, rsp);
    andptr(rax, StackAlignmentInBytes-1);
    cmpptr(rax, StackAlignmentInBytes-wordSize);
    pop(rax);
    jcc(Assembler::equal, L);
    STOP("Stack is not properly aligned!");
    bind(L);
  }
#endif

  if (!is_stub) {
    BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
#ifdef _LP64
    if (BarrierSet::barrier_set()->barrier_set_nmethod() != nullptr) {
      // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
      Label dummy_slow_path;
      Label dummy_continuation;
      Label* slow_path = &dummy_slow_path;
      Label* continuation = &dummy_continuation;
      if (!Compile::current()->output()->in_scratch_emit_size()) {
        // Use real labels from actual stub when not emitting code for the purpose of measuring its size
        C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
        Compile::current()->output()->add_stub(stub);
        slow_path = &stub->entry();
        continuation = &stub->continuation();
      }
      bs->nmethod_entry_barrier(this, slow_path, continuation);
    }
#else
    // Don't bother with out-of-line nmethod entry barrier stub for x86_32.
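    // Passing null labels asks the barrier-set assembler to emit the whole barrier
    // inline rather than through a C2EntryBarrierStub (an assumption about the
    // 32-bit path; see BarrierSetAssembler::nmethod_entry_barrier).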
    bs->nmethod_entry_barrier(this, nullptr /* slow_path */, nullptr /* continuation */);
#endif
  }
}

inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
  switch (vlen_in_bytes) {
    case  4: // fall-through
    case  8: // fall-through
    case 16: return Assembler::AVX_128bit;
    case 32: return Assembler::AVX_256bit;
    case 64: return Assembler::AVX_512bit;

    default: {
      ShouldNotReachHere();
      return Assembler::AVX_NoVec;
    }
  }
}

#if INCLUDE_RTM_OPT

// Update rtm_counters based on abort status
// input: abort_status
//        rtm_counters (RTMLockingCounters*)
// flags are killed
void C2_MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {

  atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
  if (PrintPreciseRTMLockingStatistics) {
    for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
      Label check_abort;
      testl(abort_status, (1<<i));
      jccb(Assembler::equal, check_abort);
      atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
      bind(check_abort);
    }
  }
}

// Branch if (random & (count-1) != 0), count is 2^n
// tmp, scr and flags are killed
void C2_MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
  assert(tmp == rax, "");
  assert(scr == rdx, "");
  rdtsc(); // modifies EDX:EAX
  andptr(tmp, count-1);
  jccb(Assembler::notZero, brLabel);
}

// Perform abort ratio calculation, set no_rtm bit if high ratio
// input:  rtm_counters_Reg (RTMLockingCounters* address)
// tmpReg, rtm_counters_Reg and flags are killed
void C2_MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
                                                    Register rtm_counters_Reg,
                                                    RTMLockingCounters* rtm_counters,
                                                    Metadata* method_data) {
  Label L_done, L_check_always_rtm1, L_check_always_rtm2;

  if (RTMLockingCalculationDelay > 0) {
    // Delay calculation
    movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()));
    testptr(tmpReg, tmpReg);
    jccb(Assembler::equal, L_done);
  }
  // Abort ratio calculation only if abort_count > RTMAbortThreshold
  //   Aborted transactions = abort_count * 100
  //   All transactions = total_count * RTMTotalCountIncrRate
  //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)

  movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
  cmpptr(tmpReg, RTMAbortThreshold);
  jccb(Assembler::below, L_check_always_rtm2);
  imulptr(tmpReg, tmpReg, 100);

  Register scrReg = rtm_counters_Reg;
  movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
  imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
  imulptr(scrReg, scrReg, RTMAbortRatio);
  cmpptr(tmpReg, scrReg);
  jccb(Assembler::below, L_check_always_rtm1);
  if (method_data != nullptr) {
    // set rtm_state to "no rtm" in MDO
    mov_metadata(tmpReg, method_data);
    lock();
    orl(Address(tmpReg, MethodData::rtm_state_offset()), NoRTM);
  }
  jmpb(L_done);
  bind(L_check_always_rtm1);
  // Reload RTMLockingCounters* address
  lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
  bind(L_check_always_rtm2);
  movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
  cmpptr(tmpReg,
         RTMLockingThreshold / RTMTotalCountIncrRate);
  jccb(Assembler::below, L_done);
  if (method_data != nullptr) {
    // set rtm_state to "always rtm" in MDO
    mov_metadata(tmpReg, method_data);
    lock();
    orl(Address(tmpReg, MethodData::rtm_state_offset()), UseRTM);
  }
  bind(L_done);
}

// Update counters and perform abort ratio calculation
// input:  abort_status_Reg
// rtm_counters_Reg, flags are killed
void C2_MacroAssembler::rtm_profiling(Register abort_status_Reg,
                                      Register rtm_counters_Reg,
                                      RTMLockingCounters* rtm_counters,
                                      Metadata* method_data,
                                      bool profile_rtm) {

  assert(rtm_counters != nullptr, "should not be null when profiling RTM");
  // update rtm counters based on rax value at abort
  // reads abort_status_Reg, updates flags
  lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
  rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
  if (profile_rtm) {
    // Save abort status because abort_status_Reg is used by following code.
    if (RTMRetryCount > 0) {
      push(abort_status_Reg);
    }
    assert(rtm_counters != nullptr, "should not be null when profiling RTM");
    rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
    // restore abort status
    if (RTMRetryCount > 0) {
      pop(abort_status_Reg);
    }
  }
}

// Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
// inputs: retry_count_Reg
//       : abort_status_Reg
// output: retry_count_Reg decremented by 1
// flags are killed
void C2_MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
  Label doneRetry;
  assert(abort_status_Reg == rax, "");
  // The abort reason bits are in eax (see all states in rtmLocking.hpp)
  // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
  // if reason is in 0x6 and retry count != 0 then retry
  andptr(abort_status_Reg, 0x6);
  jccb(Assembler::zero, doneRetry);
  testl(retry_count_Reg, retry_count_Reg);
  jccb(Assembler::zero, doneRetry);
  pause();
  decrementl(retry_count_Reg);
  jmp(retryLabel);
  bind(doneRetry);
}

// Spin and retry if lock is busy,
// inputs: box_Reg (monitor address)
//       : retry_count_Reg
// output: retry_count_Reg decremented by 1
//       : clear z flag if retry count exceeded
// tmp_Reg, scr_Reg, flags are killed
void C2_MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
                                               Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
  Label SpinLoop, SpinExit, doneRetry;
  int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);

  testl(retry_count_Reg, retry_count_Reg);
  jccb(Assembler::zero, doneRetry);
  decrementl(retry_count_Reg);
  movptr(scr_Reg, RTMSpinLoopCount);

  bind(SpinLoop);
  pause();
  decrementl(scr_Reg);
  jccb(Assembler::lessEqual, SpinExit);
  movptr(tmp_Reg, Address(box_Reg, owner_offset));
  testptr(tmp_Reg, tmp_Reg);
  jccb(Assembler::notZero, SpinLoop);

  bind(SpinExit);
  jmp(retryLabel);
  bind(doneRetry);
  incrementl(retry_count_Reg); // clear z flag
}

// Use RTM for normal stack locks
// Input: objReg (object to lock)
void C2_MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
                                          Register retry_on_abort_count_Reg,
                                          RTMLockingCounters* stack_rtm_counters,
                                          Metadata* method_data, bool
                                          profile_rtm,
                                          Label& DONE_LABEL, Label& IsInflated) {
  assert(UseRTMForStackLocks, "why call this otherwise?");
  assert(tmpReg == rax, "");
  assert(scrReg == rdx, "");
  Label L_rtm_retry, L_decrement_retry, L_on_abort;

  if (RTMRetryCount > 0) {
    movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
    bind(L_rtm_retry);
  }
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
  testptr(tmpReg, markWord::monitor_value);  // inflated vs stack-locked|neutral
  jcc(Assembler::notZero, IsInflated);

  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    Label L_noincrement;
    if (RTMTotalCountIncrRate > 1) {
      // tmpReg, scrReg and flags are killed
      branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
    }
    assert(stack_rtm_counters != nullptr, "should not be null when profiling RTM");
    atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
    bind(L_noincrement);
  }
  xbegin(L_on_abort);
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));  // fetch markword
  andptr(tmpReg, markWord::lock_mask_in_place);                      // look at 2 lock bits
  cmpptr(tmpReg, markWord::unlocked_value);                          // bits = 01 unlocked
  jcc(Assembler::equal, DONE_LABEL);                                 // all done if unlocked

  Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
  if (UseRTMXendForLockBusy) {
    xend();
    movptr(abort_status_Reg, 0x2);  // Set the abort status to 2 (so we can retry)
    jmp(L_decrement_retry);
  }
  else {
    xabort(0);
  }
  bind(L_on_abort);
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
  }
  bind(L_decrement_retry);
  if (RTMRetryCount > 0) {
    // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
  }
}

// Use RTM for inflating locks
//  inputs: objReg (object to lock)
//          boxReg (on-stack box address (displaced header location) - KILLED)
//          tmpReg (ObjectMonitor address + markWord::monitor_value)
void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
                                             Register scrReg, Register retry_on_busy_count_Reg,
                                             Register retry_on_abort_count_Reg,
                                             RTMLockingCounters* rtm_counters,
                                             Metadata* method_data, bool profile_rtm,
                                             Label& DONE_LABEL) {
  assert(UseRTMLocking, "why call this otherwise?");
  assert(tmpReg == rax, "");
  assert(scrReg == rdx, "");
  Label L_rtm_retry, L_decrement_retry, L_on_abort;
  int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);

  movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
  movptr(boxReg, tmpReg); // Save ObjectMonitor address

  if (RTMRetryCount > 0) {
    movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
    movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
    bind(L_rtm_retry);
  }
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    Label L_noincrement;
    if (RTMTotalCountIncrRate > 1) {
      // tmpReg, scrReg and flags are killed
      branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
    }
    assert(rtm_counters != nullptr, "should not be null when profiling RTM");
    atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
    bind(L_noincrement);
  }
  xbegin(L_on_abort);
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
  movptr(tmpReg, Address(tmpReg, owner_offset));
  testptr(tmpReg, tmpReg);
  jcc(Assembler::zero, DONE_LABEL);
  if (UseRTMXendForLockBusy) {
    xend();
    jmp(L_decrement_retry);
  }
  else {
    xabort(0);
  }
  bind(L_on_abort);
  Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
  }
  if (RTMRetryCount > 0) {
    // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
  }

  movptr(tmpReg, Address(boxReg, owner_offset));
  testptr(tmpReg, tmpReg);
  jccb(Assembler::notZero, L_decrement_retry);

  // Appears unlocked - try to swing _owner from null to non-null.
  // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
#ifdef _LP64
  Register threadReg = r15_thread;
#else
  get_thread(scrReg);
  Register threadReg = scrReg;
#endif
  movptr(scrReg, Address(threadReg, JavaThread::lock_id_offset()));
  lock();
  cmpxchgptr(scrReg, Address(boxReg, owner_offset)); // Updates tmpReg

  if (RTMRetryCount > 0) {
    // success done else retry
    jccb(Assembler::equal, DONE_LABEL);
    bind(L_decrement_retry);
    // Spin and retry if lock is busy.
    rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
    jmp(DONE_LABEL);
  }
  else {
    bind(L_decrement_retry);
    jmp(DONE_LABEL);
  }
}

#endif //  INCLUDE_RTM_OPT

// fast_lock and fast_unlock used by C2

// Because the transitions from emitted code to the runtime
// monitorenter/exit helper stubs are so slow it's critical that
// we inline both the stack-locking fast path and the inflated fast path.
//
// See also: cmpFastLock and cmpFastUnlock.
//
// What follows is a specialized inline transliteration of the code
// in enter() and exit(). If we're concerned about I$ bloat another
// option would be to emit TrySlowEnter and TrySlowExit methods
// at startup-time.  These methods would accept arguments as
// (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
// indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
// In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
//
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods.  That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
// to park() or unpark() threads.
// We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) explicit barriers or fence operations.
//
// TODO:
//
// *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
//    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
//    Given TLAB allocation, Self is usually manifested in a register, so passing it into
//    the lock operators would typically be faster than reifying Self.
//
// *  Ideally I'd define the primitives as:
//       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
//       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
//    Unfortunately ADLC bugs prevent us from expressing the ideal form.
//    Instead, we're stuck with rather awkward and brittle register assignments below.
//    Furthermore the register assignments are overconstrained, possibly resulting in
//    sub-optimal code near the synchronization site.
//
// *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
//    Alternately, use a better sp-proximity test.
//
// *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
//    Either one is sufficient to uniquely identify a thread.
//    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
//
// *  Intrinsify notify() and notifyAll() for the common cases where the
//    object is locked by the calling thread but the waitlist is empty.
//    Avoid the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
//
// *  Use jccb and jmpb instead of jcc and jmp to improve code density.
//    But beware of excessive branch density on AMD Opterons.
//
// *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
//    or failure of the fast path.  If the fast path fails then we pass
//    control to the slow path, typically in C.  In fast_lock and
//    fast_unlock we often branch to DONE_LABEL, just to find that C2
//    will emit a conditional branch immediately after the node.
//    So we have branches to branches and lots of ICC.ZF games.
//    Instead, it might be better to have C2 pass a "FailureLabel"
//    into fast_lock and fast_unlock.  In the case of success, control
//    will drop through the node.  ICC.ZF is undefined at exit.
//    In the case of failure, the node will branch directly to the
//    FailureLabel


// obj: object to lock
// box: on-stack box address (displaced header location) - KILLED
// rax,: tmp -- KILLED
// scr: tmp -- KILLED
void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
                                  Register scrReg, Register cx1Reg, Register cx2Reg, Register thread,
                                  RTMLockingCounters* rtm_counters,
                                  RTMLockingCounters* stack_rtm_counters,
                                  Metadata* method_data,
                                  bool use_rtm, bool profile_rtm) {
  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
  // Ensure the register assignments are disjoint
  assert(tmpReg == rax, "");

  if (use_rtm) {
    assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
  } else {
    assert(cx1Reg == noreg, "");
    assert(cx2Reg == noreg, "");
    assert_different_registers(objReg, boxReg, tmpReg, scrReg);
  }

  // Possible cases that we'll encounter in fast_lock
  // ------------------------------------------------
  // * Inflated
  //    -- unlocked
  //    -- Locked
  //       = by self
  //       = by other
  // * neutral
  // * stack-locked
  //    -- by self
  //       = sp-proximity test hits
  //       = sp-proximity test generates false-negative
  //    -- by other
  //

  Label IsInflated, DONE_LABEL, COUNT;

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(tmpReg, objReg, scrReg);
    movl(tmpReg, Address(tmpReg, Klass::access_flags_offset()));
    testl(tmpReg, JVM_ACC_IS_VALUE_BASED_CLASS);
    jcc(Assembler::notZero, DONE_LABEL);
  }

#if INCLUDE_RTM_OPT
  if (UseRTMForStackLocks && use_rtm) {
    assert(LockingMode != LM_MONITOR, "LockingMode == 0 (LM_MONITOR) and +UseRTMForStackLocks are mutually exclusive");
    rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
                      stack_rtm_counters, method_data, profile_rtm,
                      DONE_LABEL, IsInflated);
  }
#endif // INCLUDE_RTM_OPT

  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));  // [FETCH]
  testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral
  jcc(Assembler::notZero, IsInflated);

  if (LockingMode == LM_MONITOR) {
    // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
    testptr(objReg, objReg);
  } else {
    assert(LockingMode == LM_LEGACY, "must be");
    // Attempt stack-locking ...
    orptr (tmpReg, markWord::unlocked_value);
    movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
    lock();
    cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Updates tmpReg
    jcc(Assembler::equal, COUNT);                // Success

    // Recursive locking.
    // The object is stack-locked: markword contains stack pointer to BasicLock.
    // Locked by current thread if difference with current SP is less than one page.
    subptr(tmpReg, rsp);
    // Next instruction sets ZFlag == 1 (Success) if difference is less than one page.
    andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - (int)os::vm_page_size())) );
    movptr(Address(boxReg, 0), tmpReg);
  }
  // After recursive stack locking attempt case
  jmp(DONE_LABEL);

  bind(IsInflated);
  // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value

#if INCLUDE_RTM_OPT
  // Use the same RTM locking code in 32- and 64-bit VM.
  if (use_rtm) {
    rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
                         rtm_counters, method_data, profile_rtm, DONE_LABEL);
  } else {
#endif // INCLUDE_RTM_OPT

#ifndef _LP64
  // The object is inflated.

  // boxReg refers to the on-stack BasicLock in the current frame.
  // We'd like to write:
  //   set box->_displaced_header = markWord::unused_mark().  Any non-0 value suffices.
  // This is convenient but results in an ST-before-CAS penalty.  The following CAS suffers
  // additional latency as we have another ST in the store buffer that must drain.

  // avoid ST-before-CAS
  // register juggle because we need tmpReg for cmpxchgptr below
  movptr(scrReg, boxReg);
  movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]

  // Optimistic form: consider XORL tmpReg,tmpReg
  movptr(tmpReg, NULL_WORD);

  // Appears unlocked - try to swing _owner from null to non-null.
  // Ideally, I'd manifest "Self" with get_thread and then attempt
  // to CAS the register containing thread id into m->Owner.
  // But we don't have enough registers, so instead we can either try to CAS
  // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
  // we later store thread id into m->Owner.  Transiently storing a stack address
  // (rsp or the address of the box) into m->owner is harmless.
  // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
  lock();
  cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  movptr(Address(scrReg, 0), 3);            // box->_displaced_header = 3
  // If we weren't able to swing _owner from null to the BasicLock
  // then take the slow path.
  jccb  (Assembler::notZero, DONE_LABEL);
  // update _owner from BasicLock to thread
  get_thread (scrReg);                      // beware: clobbers ICCs
  movptr(scrReg, Address(scrReg, JavaThread::lock_id_offset()));
  movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
  xorptr(boxReg, boxReg);                   // set icc.ZFlag = 1 to indicate success
  jmp(DONE_LABEL);

  // If the CAS fails we can either retry or pass control to the slow path.
  // We use the latter tactic.
  // Pass the CAS result in the icc.ZFlag into DONE_LABEL
  // If the CAS was successful ...
  //   Self has acquired the lock
  //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
  // Intentional fall-through into DONE_LABEL ...
#else // _LP64
  // Unconditionally set box->_displaced_header = markWord::unused_mark().
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));

  // It's inflated and we use scrReg for ObjectMonitor* in this section.
  movq(scrReg, tmpReg);
  xorq(tmpReg, tmpReg);
  movptr(boxReg, Address(r15_thread, JavaThread::lock_id_offset()));
  lock();
  cmpxchgptr(boxReg, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));

  // Propagate ICC.ZF from CAS above into DONE_LABEL.
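  // On CAS failure rax holds the owner value that was observed; it is compared
  // against this thread's lock id (in boxReg) below to detect recursive locking.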
  jccb(Assembler::equal, DONE_LABEL);       // CAS above succeeded; propagate ZF = 1 (success)

  cmpptr(boxReg, rax);                      // Check if we are already the owner (recursive lock)
  jccb(Assembler::notEqual, DONE_LABEL);    // If not recursive, ZF = 0 at this point (fail)
  incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
  jmp(DONE_LABEL);
#endif // _LP64
#if INCLUDE_RTM_OPT
  } // use_rtm()
#endif

  bind(COUNT);
  // Count monitors in fast path
  increment(Address(thread, JavaThread::held_monitor_count_offset()));
  xorl(tmpReg, tmpReg); // Set ZF == 1

  bind(DONE_LABEL);

  // At DONE_LABEL the icc ZFlag is set as follows ...
  // fast_unlock uses the same protocol.
  // ZFlag == 1 -> Success
  // ZFlag == 0 -> Failure - force control through the slow path
}

// obj: object to unlock
// box: box address (displaced header location), killed.  Must be EAX.
// tmp: killed, cannot be obj nor box.
//
// Some commentary on balanced locking:
//
// fast_lock and fast_unlock are emitted only for provably balanced lock sites.
// Methods that don't have provably balanced locking are forced to run in the
// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
// The interpreter provides two properties:
// I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired in the current activation (frame).  Recall that the
//      interpreter maintains an on-stack list of locks currently held by
//      a frame.
// I2:  If a method attempts to unlock an object that is not held by the
//      frame the interpreter throws IMSX.
//
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
// B() doesn't have provably balanced locking so it runs in the interpreter.
// Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
// is still locked by A().
//
// The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
// Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
// should not be unlocked by "normal" java-level locking and vice-versa.  The specification
// doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably, given that the spec legislates the JNI case as undefined, our implementation
// could reasonably *avoid* checking owner in fast_unlock().
// In the interest of performance we elide the m->Owner==Self check in unlock.
// A perfectly viable alternative is to elide the owner check except when
// Xcheck:jni is enabled.

void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, Register scrReg, bool use_rtm) {
  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
  assert(boxReg == rax, "");
  assert_different_registers(objReg, boxReg, tmpReg);

  Label DONE_LABEL, Stacked, COUNT;

#if INCLUDE_RTM_OPT
  if (UseRTMForStackLocks && use_rtm) {
    assert(LockingMode != LM_MONITOR, "LockingMode == 0 (LM_MONITOR) and +UseRTMForStackLocks are mutually exclusive");
    Label L_regular_unlock;
    movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
    andptr(tmpReg, markWord::lock_mask_in_place);                     // look at 2 lock bits
    cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 01 unlocked
    jccb(Assembler::notEqual, L_regular_unlock);                      // if !HLE RegularLock
    xend();                                                           // otherwise end...
    jmp(DONE_LABEL);                                                  // ... and we're done
    bind(L_regular_unlock);
  }
#endif

  if (LockingMode == LM_LEGACY) {
    cmpptr(Address(boxReg, 0), NULL_WORD);                            // Examine the displaced header
    jcc   (Assembler::zero, DONE_LABEL);                              // 0 indicates recursive stack-lock
  }
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));   // Examine the object's markword
  if (LockingMode != LM_MONITOR) {
    testptr(tmpReg, markWord::monitor_value);                         // Inflated?
    jcc(Assembler::zero, Stacked);
  }

  // It's inflated.
  // If the owner is ANONYMOUS, we need to fix it - in an outline stub.
  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t) ObjectMonitor::ANONYMOUS_OWNER);
#ifdef _LP64
  if (!Compile::current()->output()->in_scratch_emit_size()) {
    C2HandleAnonOMOwnerStub* stub = new (Compile::current()->comp_arena()) C2HandleAnonOMOwnerStub(tmpReg, boxReg);
    Compile::current()->output()->add_stub(stub);
    jcc(Assembler::equal, stub->entry());
    bind(stub->continuation());
  } else
#endif
  {
    // We can't easily implement this optimization on 32 bit because we don't have a thread register.
    // Call the slow-path instead.
    jcc(Assembler::notEqual, DONE_LABEL);
  }

#if INCLUDE_RTM_OPT
  if (use_rtm) {
    Label L_regular_inflated_unlock;
    int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
    movptr(boxReg, Address(tmpReg, owner_offset));
    testptr(boxReg, boxReg);
    jccb(Assembler::notZero, L_regular_inflated_unlock);
    xend();
    jmp(DONE_LABEL);
    bind(L_regular_inflated_unlock);
  }
#endif

  // Despite our balanced locking property we still check that m->_owner == Self
  // as java routines or native JNI code called by this thread might
  // have released the lock.
  // Refer to the comments in synchronizer.cpp for how we might encode extra
  // state in _succ so we can avoid fetching EntryList|cxq.
  //
  // If there's no contention try a 1-0 exit.  That is, exit without
  // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
  // we detect and recover from the race that the 1-0 exit admits.
  //
  // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
  // before it STs null into _owner, releasing the lock.  Updates
  // to data protected by the critical section must be visible before
  // we drop the lock (and thus before any other thread could acquire
  // the lock and observe the fields protected by the lock).
  // IA32's memory-model is SPO, so STs are ordered with respect to
  // each other and there's no need for an explicit barrier (fence).
  // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
#ifndef _LP64
  // Note that we could employ various encoding schemes to reduce
  // the number of loads below (currently 4) to just 2 or 3.
  // Refer to the comments in synchronizer.cpp.
  // In practice the chain of fetches doesn't seem to impact performance, however.
  xorptr(boxReg, boxReg);
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jccb  (Assembler::notZero, DONE_LABEL);
  movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
  jccb  (Assembler::notZero, DONE_LABEL);
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
  jmpb  (DONE_LABEL);
#else // _LP64
  // It's inflated
  Label CheckSucc, LNotRecursive, LSuccess, LGoSlowPath;

  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
  jccb(Assembler::equal, LNotRecursive);

  // Recursive inflated unlock
  decq(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  xorl(tmpReg, tmpReg); // Set ZF == 1
  jmp(DONE_LABEL);

  bind(LNotRecursive);

  movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
  jccb  (Assembler::notZero, CheckSucc);
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
  jmpb  (DONE_LABEL);

  // Try to avoid passing control into the slow_path ...
  bind  (CheckSucc);

  // The following optional optimization can be elided if necessary
  // Effectively: if (succ == null) goto slow path
  // The code reduces the window for a race, however,
  // and thus benefits performance.
  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
  jccb  (Assembler::zero, LGoSlowPath);

  xorptr(boxReg, boxReg);
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);

  // Memory barrier/fence
  // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
  // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
  // This is faster on Nehalem and AMD Shanghai/Barcelona.
  // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
  // We might also restructure (ST Owner=0;barrier;LD _Succ) to
  // (mov box,0; xchgq box, &m->Owner; LD _succ) .
  lock(); addl(Address(rsp, 0), 0);

  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
  jccb  (Assembler::notZero, LSuccess);

  // Rare inopportune interleaving - race.
  // The successor vanished in the small window above.
  // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
  // We need to ensure progress and succession.
  // Try to reacquire the lock.
  // If that fails then the new owner is responsible for succession and this
  // thread needs to take no further action and can exit via the fast path (success).
  // If the re-acquire succeeds then pass control into the slow path.
  // As implemented, this latter mode is horrible because we generate more
  // coherence traffic on the lock *and* artificially extend the critical section
  // length by virtue of passing control into the slow path.

  // box is really RAX -- the following CMPXCHG depends on that binding
  // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
  movptr(scrReg, Address(r15_thread, JavaThread::lock_id_offset()));
  lock();
  cmpxchgptr(scrReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // There's no successor so we tried to regrab the lock.
  // If that didn't work, then another thread grabbed the
  // lock so we're done (and exit was a success).
  jccb  (Assembler::notEqual, LSuccess);
  // Intentional fall-through into slow path

  bind  (LGoSlowPath);
  orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
  jmpb  (DONE_LABEL);

  bind  (LSuccess);
  testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
  jmpb  (DONE_LABEL);

#endif
  if (LockingMode == LM_LEGACY) {
    bind  (Stacked);
    movptr(tmpReg, Address (boxReg, 0));  // re-fetch
    lock();
    cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
    jccb(Assembler::notZero, DONE_LABEL);
    // Count monitors in fast path
#ifndef _LP64
    get_thread(tmpReg);
    decrementl(Address(tmpReg, JavaThread::held_monitor_count_offset()));
#else // _LP64
    decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
#endif
    xorl(tmpReg, tmpReg); // Set ZF == 1
  }

  // ZFlag == 1 -> Success
  // ZFlag == 0 -> Failure - force control through the slow path
  bind(DONE_LABEL);
}

void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register rax_reg,
                                              Register t, Register thread) {
  assert(LockingMode == LM_LIGHTWEIGHT, "must be");
  assert(rax_reg == rax, "Used for CAS");
  assert_different_registers(obj, box, rax_reg, t, thread);

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully.
  Label locked;
  // Finish fast lock unsuccessfully. MUST jump with ZF == 0
  Label slow_path;

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(rax_reg, obj, t);
    movl(rax_reg, Address(rax_reg, Klass::access_flags_offset()));
    testl(rax_reg, JVM_ACC_IS_VALUE_BASED_CLASS);
    jcc(Assembler::notZero, slow_path);
  }

  const Register mark = t;

  { // Lightweight Lock

    Label push;

    const Register top = box;

    // Load the mark.
    movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));

    // Prefetch top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    // Check for monitor (0b10).
    testptr(mark, markWord::monitor_value);
    jcc(Assembler::notZero, inflated);

    // Check if lock-stack is full.
    cmpl(top, LockStack::end_offset() - 1);
    jcc(Assembler::greater, slow_path);

    // Check if recursive.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    jccb(Assembler::equal, push);

    // Try to lock. Transition lock bits 0b01 => 0b00
    movptr(rax_reg, mark);
    orptr(rax_reg, markWord::unlocked_value);
    andptr(mark, ~(int32_t)markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    jcc(Assembler::notEqual, slow_path);

    bind(push);
    // After successful lock, push object on lock-stack.
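    // top still holds the byte offset of the first free lock-stack slot.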
    movptr(Address(thread, top), obj);
    addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
    xorl(rax_reg, rax_reg);
    jmpb(locked);
  }

  { // Handle inflated monitor.
    bind(inflated);

    const Register tagged_monitor = mark;

    // CAS owner (null => current thread).
    xorptr(rax_reg, rax_reg);
    movptr(box, Address(thread, JavaThread::lock_id_offset()));
    lock(); cmpxchgptr(box, Address(tagged_monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
    jccb(Assembler::equal, locked);

    // Check if recursive.
    cmpptr(box, rax_reg);
    jccb(Assembler::notEqual, slow_path);

    // Recursive.
    increment(Address(tagged_monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
    xorl(rax_reg, rax_reg);
  }

  bind(locked);
#ifdef ASSERT
  // Check that locked label is reached with ZF set.
  Label zf_correct;
  Label zf_bad_zero;
  jcc(Assembler::zero, zf_correct);
  jmp(zf_bad_zero);
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with ZF not set.
  jcc(Assembler::notZero, zf_correct);
  stop("Fast Lock ZF != 0");
  bind(zf_bad_zero);
  stop("Fast Lock ZF != 1");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}

void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register reg_rax, Register t1, Register t2, Register thread) {
  assert(LockingMode == LM_LIGHTWEIGHT, "must be");
  assert(reg_rax == rax, "Used for CAS");
  assert_different_registers(obj, reg_rax, t1, t2);

  // Handle inflated monitor.
  Label inflated, inflated_check_lock_stack;
  // Finish fast unlock successfully. MUST jump with ZF == 1
  Label unlocked;

  const Register mark = t1;
  const Register top = reg_rax;

  Label dummy;
  C2FastUnlockLightweightStub* stub = nullptr;

  if (!Compile::current()->output()->in_scratch_emit_size()) {
    stub = new (Compile::current()->comp_arena()) C2FastUnlockLightweightStub(obj, mark, reg_rax, t2, thread);
    Compile::current()->output()->add_stub(stub);
  }

  Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();
  Label& check_successor = stub == nullptr ? dummy : stub->check_successor();

  { // Lightweight Unlock

    // Load top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    // Prefetch mark.
    movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));

    // Check if obj is top of lock-stack.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    // Top of lock stack was not obj. Must be monitor.
    jcc(Assembler::notEqual, inflated_check_lock_stack);

    // Pop lock-stack.
    DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
    subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);

    // Check if recursive.
    cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
    jcc(Assembler::equal, unlocked);

    // We elide the monitor check, let the CAS fail instead.

    // Try to unlock.
    // Transition lock bits 0b00 => 0b01
    movptr(reg_rax, mark);
    andptr(reg_rax, ~(int32_t)markWord::lock_mask);
    orptr(mark, markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    jcc(Assembler::notEqual, push_and_slow_path);
    jmp(unlocked);
  }


  { // Handle inflated monitor.
    bind(inflated_check_lock_stack);
#ifdef ASSERT
    Label check_done;
    subl(top, oopSize);
    cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
    jcc(Assembler::below, check_done);
    cmpptr(obj, Address(thread, top));
    jccb(Assembler::notEqual, inflated_check_lock_stack);
    stop("Fast Unlock lock on stack");
    bind(check_done);
    testptr(mark, markWord::monitor_value);
    jccb(Assembler::notZero, inflated);
    stop("Fast Unlock not monitor");
#endif

    bind(inflated);

    // mark contains the tagged ObjectMonitor*.
    const Register monitor = mark;

#ifndef _LP64
    // Check if recursive.
    xorptr(reg_rax, reg_rax);
    orptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
    jcc(Assembler::notZero, check_successor);

    // Check if the entry lists are empty.
    movptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
    orptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
    jcc(Assembler::notZero, check_successor);

    // Release lock.
    movptr(Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
#else // _LP64
    Label recursive;

    // Check if recursive.
    cmpptr(Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
    jccb(Assembler::notEqual, recursive);

    // Check if the entry lists are empty.
    movptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
    orptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
    jcc(Assembler::notZero, check_successor);

    // Release lock.
    movptr(Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
    jmpb(unlocked);

    // Recursive unlock.
    bind(recursive);
    decrement(Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
    xorl(t1, t1);
#endif
  }

  bind(unlocked);
  if (stub != nullptr) {
    bind(stub->unlocked_continuation());
  }

#ifdef ASSERT
  // Check that unlocked label is reached with ZF set.
  Label zf_correct;
  jcc(Assembler::zero, zf_correct);
  stop("Fast Unlock ZF != 1");
#endif

  if (stub != nullptr) {
    bind(stub->slow_path_continuation());
  }
#ifdef ASSERT
  // Check that stub->continuation() label is reached with ZF not set.
  jccb(Assembler::notZero, zf_correct);
  stop("Fast Unlock ZF != 0");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}

//-------------------------------------------------------------------------------------------
// Generic instructions support for use in .ad files C2 code generation

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVD) {
    andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
  } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
  }
}

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
  if (opcode == Op_AbsVD) {
    vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
  } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
    vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVF) {
    andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
  } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
  if (opcode == Op_AbsVF) {
    vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
  } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
    vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
  }
}

void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
  assert(tmp == xnoreg || elem_bt == T_LONG, "unused");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      pminsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pminsw(dst, src);
    } else if (elem_bt == T_INT) {
      pminsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      movdqu(xmm0, dst);
      pcmpgtq(xmm0, src);
      blendvpd(dst, src);  // xmm0 as mask
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      pmaxsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pmaxsw(dst, src);
    } else if (elem_bt == T_INT) {
      pmaxsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      movdqu(xmm0, src);
      pcmpgtq(xmm0, dst);
      blendvpd(dst, src);  // xmm0 as mask
    }
  }
}

void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
                                 XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                 int vlen_enc) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      vpminsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpminsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpminsd(dst,
              src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpminsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src1, src2, dst, vlen_enc);
      }
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      vpmaxsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpmaxsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpmaxsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpmaxsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src2, src1, dst, vlen_enc);
      }
    }
  }
}

// Float/Double min max

void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
                                   XMMRegister dst, XMMRegister a, XMMRegister b,
                                   XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                   int vlen_enc) {
  assert(UseAVX > 0, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(a, tmp, atmp, btmp);
  assert_different_registers(b, tmp, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);

  /* Note on 'non-obvious' assembly sequence:
   *
   * While there are vminps/vmaxps instructions, there are two important differences between hardware
   * and Java on how they handle floats:
   *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
   *  b. NaN is not necessarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
   *
   * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
   *  a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
   *     (only useful when signs differ, noop otherwise)
   *  b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])

   * Following pseudo code describes the algorithm for max[FD] (Min algorithm is on similar lines):
   *   btmp = (b < +0.0) ? a : b
   *   atmp = (b < +0.0) ? b : a
   *   Tmp  = Max_Float(atmp , btmp)
   *   Res  = (atmp == NaN) ?
   *          atmp : Tmp
   */

  void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
  void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
  void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
  XMMRegister mask;

  if (!is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vminps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (!is_double_word && !is_min) {
    mask = b;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vmaxps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vminpd;
    vcmp = &MacroAssembler::vcmppd;
  } else {
    assert(is_double_word && !is_min, "sanity");
    mask = b;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vmaxpd;
    vcmp = &MacroAssembler::vcmppd;
  }

  // Make sure EnableX86ECoreOpts isn't disabled on register overlaps
  XMMRegister maxmin, scratch;
  if (dst == btmp) {
    maxmin = btmp;
    scratch = tmp;
  } else {
    maxmin = tmp;
    scratch = btmp;
  }

  bool precompute_mask = EnableX86ECoreOpts && UseAVX>1;
  if (precompute_mask && !is_double_word) {
    vpsrad(tmp, mask, 32, vlen_enc);
    mask = tmp;
  } else if (precompute_mask && is_double_word) {
    vpxor(tmp, tmp, tmp, vlen_enc);
    vpcmpgtq(tmp, tmp, mask, vlen_enc);
    mask = tmp;
  }

  (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);
  (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);
  (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
  (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
  (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch);
}

void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
                                    XMMRegister dst, XMMRegister a, XMMRegister b,
                                    KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
                                    int vlen_enc) {
  assert(UseAVX > 2, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(dst, a, atmp, btmp);
  assert_different_registers(dst, b, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);
  bool merge = true;

  if (!is_double_word && is_min) {
    evpmovd2m(ktmp, a, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vminps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (!is_double_word && !is_min) {
    evpmovd2m(ktmp, b, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (is_double_word && is_min) {
    evpmovq2m(ktmp, a, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
vminpd(dst, atmp, btmp, vlen_enc); 1442 evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1443 evmovdquq(dst, ktmp, atmp, merge, vlen_enc); 1444 } else { 1445 assert(is_double_word && !is_min, "sanity"); 1446 evpmovq2m(ktmp, b, vlen_enc); 1447 evblendmpd(atmp, ktmp, a, b, merge, vlen_enc); 1448 evblendmpd(btmp, ktmp, b, a, merge, vlen_enc); 1449 vmaxpd(dst, atmp, btmp, vlen_enc); 1450 evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1451 evmovdquq(dst, ktmp, atmp, merge, vlen_enc); 1452 } 1453 } 1454 1455 // Float/Double signum 1456 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) { 1457 assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity"); 1458 1459 Label DONE_LABEL; 1460 1461 if (opcode == Op_SignumF) { 1462 assert(UseSSE > 0, "required"); 1463 ucomiss(dst, zero); 1464 jcc(Assembler::equal, DONE_LABEL); // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument 1465 jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN 1466 movflt(dst, one); 1467 jcc(Assembler::above, DONE_LABEL); 1468 xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg); 1469 } else if (opcode == Op_SignumD) { 1470 assert(UseSSE > 1, "required"); 1471 ucomisd(dst, zero); 1472 jcc(Assembler::equal, DONE_LABEL); // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument 1473 jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN 1474 movdbl(dst, one); 1475 jcc(Assembler::above, DONE_LABEL); 1476 xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg); 1477 } 1478 1479 bind(DONE_LABEL); 1480 } 1481 1482 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) { 1483 if (sign) { 1484 pmovsxbw(dst, src); 1485 } else { 1486 pmovzxbw(dst, src); 1487 } 1488 } 1489 1490 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1491 if (sign) { 1492 vpmovsxbw(dst, src, vector_len); 1493 } else { 1494 vpmovzxbw(dst, src, vector_len); 1495 } 1496 } 1497 1498 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1499 if (sign) { 1500 vpmovsxbd(dst, src, vector_len); 1501 } else { 1502 vpmovzxbd(dst, src, vector_len); 1503 } 1504 } 1505 1506 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1507 if (sign) { 1508 vpmovsxwd(dst, src, vector_len); 1509 } else { 1510 vpmovzxwd(dst, src, vector_len); 1511 } 1512 } 1513 1514 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src, 1515 int shift, int vector_len) { 1516 if (opcode == Op_RotateLeftV) { 1517 if (etype == T_INT) { 1518 evprold(dst, src, shift, vector_len); 1519 } else { 1520 assert(etype == T_LONG, "expected type T_LONG"); 1521 evprolq(dst, src, shift, vector_len); 1522 } 1523 } else { 1524 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV"); 1525 if (etype == T_INT) { 1526 evprord(dst, src, shift, vector_len); 1527 } else { 1528 assert(etype == T_LONG, "expected type T_LONG"); 1529 evprorq(dst, src, shift, vector_len); 1530 } 1531 } 1532 } 1533 1534 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src, 1535 XMMRegister shift, int vector_len) { 1536 if (opcode == Op_RotateLeftV) { 1537 if (etype == T_INT) { 1538 evprolvd(dst, src, shift, vector_len); 1539 } else { 1540 assert(etype == 
T_LONG, "expected type T_LONG"); 1541 evprolvq(dst, src, shift, vector_len); 1542 } 1543 } else { 1544 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV"); 1545 if (etype == T_INT) { 1546 evprorvd(dst, src, shift, vector_len); 1547 } else { 1548 assert(etype == T_LONG, "expected type T_LONG"); 1549 evprorvq(dst, src, shift, vector_len); 1550 } 1551 } 1552 } 1553 1554 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) { 1555 if (opcode == Op_RShiftVI) { 1556 psrad(dst, shift); 1557 } else if (opcode == Op_LShiftVI) { 1558 pslld(dst, shift); 1559 } else { 1560 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI"); 1561 psrld(dst, shift); 1562 } 1563 } 1564 1565 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) { 1566 switch (opcode) { 1567 case Op_RShiftVI: psrad(dst, shift); break; 1568 case Op_LShiftVI: pslld(dst, shift); break; 1569 case Op_URShiftVI: psrld(dst, shift); break; 1570 1571 default: assert(false, "%s", NodeClassNames[opcode]); 1572 } 1573 } 1574 1575 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 1576 if (opcode == Op_RShiftVI) { 1577 vpsrad(dst, nds, shift, vector_len); 1578 } else if (opcode == Op_LShiftVI) { 1579 vpslld(dst, nds, shift, vector_len); 1580 } else { 1581 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI"); 1582 vpsrld(dst, nds, shift, vector_len); 1583 } 1584 } 1585 1586 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1587 switch (opcode) { 1588 case Op_RShiftVI: vpsrad(dst, src, shift, vlen_enc); break; 1589 case Op_LShiftVI: vpslld(dst, src, shift, vlen_enc); break; 1590 case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break; 1591 1592 default: assert(false, "%s", NodeClassNames[opcode]); 1593 } 1594 } 1595 1596 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) { 1597 switch (opcode) { 1598 case Op_RShiftVB: // fall-through 1599 case Op_RShiftVS: psraw(dst, shift); break; 1600 1601 case Op_LShiftVB: // fall-through 1602 case Op_LShiftVS: psllw(dst, shift); break; 1603 1604 case Op_URShiftVS: // fall-through 1605 case Op_URShiftVB: psrlw(dst, shift); break; 1606 1607 default: assert(false, "%s", NodeClassNames[opcode]); 1608 } 1609 } 1610 1611 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1612 switch (opcode) { 1613 case Op_RShiftVB: // fall-through 1614 case Op_RShiftVS: vpsraw(dst, src, shift, vlen_enc); break; 1615 1616 case Op_LShiftVB: // fall-through 1617 case Op_LShiftVS: vpsllw(dst, src, shift, vlen_enc); break; 1618 1619 case Op_URShiftVS: // fall-through 1620 case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break; 1621 1622 default: assert(false, "%s", NodeClassNames[opcode]); 1623 } 1624 } 1625 1626 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) { 1627 switch (opcode) { 1628 case Op_RShiftVL: psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems 1629 case Op_LShiftVL: psllq(dst, shift); break; 1630 case Op_URShiftVL: psrlq(dst, shift); break; 1631 1632 default: assert(false, "%s", NodeClassNames[opcode]); 1633 } 1634 } 1635 1636 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) { 1637 if (opcode == Op_RShiftVL) { 1638 psrlq(dst, shift); // using srl to implement sra on pre-avx512 systems 1639 } else if (opcode == Op_LShiftVL) { 1640
psllq(dst, shift); 1641 } else { 1642 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL"); 1643 psrlq(dst, shift); 1644 } 1645 } 1646 1647 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1648 switch (opcode) { 1649 case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break; 1650 case Op_LShiftVL: vpsllq(dst, src, shift, vlen_enc); break; 1651 case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break; 1652 1653 default: assert(false, "%s", NodeClassNames[opcode]); 1654 } 1655 } 1656 1657 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 1658 if (opcode == Op_RShiftVL) { 1659 evpsraq(dst, nds, shift, vector_len); 1660 } else if (opcode == Op_LShiftVL) { 1661 vpsllq(dst, nds, shift, vector_len); 1662 } else { 1663 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL"); 1664 vpsrlq(dst, nds, shift, vector_len); 1665 } 1666 } 1667 1668 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1669 switch (opcode) { 1670 case Op_RShiftVB: // fall-through 1671 case Op_RShiftVS: // fall-through 1672 case Op_RShiftVI: vpsravd(dst, src, shift, vlen_enc); break; 1673 1674 case Op_LShiftVB: // fall-through 1675 case Op_LShiftVS: // fall-through 1676 case Op_LShiftVI: vpsllvd(dst, src, shift, vlen_enc); break; 1677 1678 case Op_URShiftVB: // fall-through 1679 case Op_URShiftVS: // fall-through 1680 case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break; 1681 1682 default: assert(false, "%s", NodeClassNames[opcode]); 1683 } 1684 } 1685 1686 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1687 switch (opcode) { 1688 case Op_RShiftVB: // fall-through 1689 case Op_RShiftVS: evpsravw(dst, src, shift, vlen_enc); break; 1690 1691 case Op_LShiftVB: // fall-through 1692 case Op_LShiftVS: evpsllvw(dst, src, shift, vlen_enc); break; 1693 1694 case Op_URShiftVB: // fall-through 1695 case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break; 1696 1697 default: assert(false, "%s", NodeClassNames[opcode]); 1698 } 1699 } 1700 1701 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) { 1702 assert(UseAVX >= 2, "required"); 1703 switch (opcode) { 1704 case Op_RShiftVL: { 1705 if (UseAVX > 2) { 1706 assert(tmp == xnoreg, "not used"); 1707 if (!VM_Version::supports_avx512vl()) { 1708 vlen_enc = Assembler::AVX_512bit; 1709 } 1710 evpsravq(dst, src, shift, vlen_enc); 1711 } else { 1712 vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask())); 1713 vpsrlvq(dst, src, shift, vlen_enc); 1714 vpsrlvq(tmp, tmp, shift, vlen_enc); 1715 vpxor(dst, dst, tmp, vlen_enc); 1716 vpsubq(dst, dst, tmp, vlen_enc); 1717 } 1718 break; 1719 } 1720 case Op_LShiftVL: { 1721 assert(tmp == xnoreg, "not used"); 1722 vpsllvq(dst, src, shift, vlen_enc); 1723 break; 1724 } 1725 case Op_URShiftVL: { 1726 assert(tmp == xnoreg, "not used"); 1727 vpsrlvq(dst, src, shift, vlen_enc); 1728 break; 1729 } 1730 default: assert(false, "%s", NodeClassNames[opcode]); 1731 } 1732 } 1733 1734 // Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst 1735 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) { 1736 assert(opcode == Op_LShiftVB || 1737 opcode == Op_RShiftVB || 1738 
opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]); 1739 bool sign = (opcode != Op_URShiftVB); 1740 assert(vector_len == 0, "required"); 1741 vextendbd(sign, dst, src, 1); 1742 vpmovzxbd(vtmp, shift, 1); 1743 varshiftd(opcode, dst, dst, vtmp, 1); 1744 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg); 1745 vextracti128_high(vtmp, dst); 1746 vpackusdw(dst, dst, vtmp, 0); 1747 } 1748 1749 // Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst 1750 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) { 1751 assert(opcode == Op_LShiftVB || 1752 opcode == Op_RShiftVB || 1753 opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]); 1754 bool sign = (opcode != Op_URShiftVB); 1755 int ext_vector_len = vector_len + 1; 1756 vextendbw(sign, dst, src, ext_vector_len); 1757 vpmovzxbw(vtmp, shift, ext_vector_len); 1758 varshiftw(opcode, dst, dst, vtmp, ext_vector_len); 1759 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg); 1760 if (vector_len == 0) { 1761 vextracti128_high(vtmp, dst); 1762 vpackuswb(dst, dst, vtmp, vector_len); 1763 } else { 1764 vextracti64x4_high(vtmp, dst); 1765 vpackuswb(dst, dst, vtmp, vector_len); 1766 vpermq(dst, dst, 0xD8, vector_len); 1767 } 1768 } 1769 1770 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) { 1771 switch(typ) { 1772 case T_BYTE: 1773 pinsrb(dst, val, idx); 1774 break; 1775 case T_SHORT: 1776 pinsrw(dst, val, idx); 1777 break; 1778 case T_INT: 1779 pinsrd(dst, val, idx); 1780 break; 1781 case T_LONG: 1782 pinsrq(dst, val, idx); 1783 break; 1784 default: 1785 assert(false,"Should not reach here."); 1786 break; 1787 } 1788 } 1789 1790 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) { 1791 switch(typ) { 1792 case T_BYTE: 1793 vpinsrb(dst, src, val, idx); 1794 break; 1795 case T_SHORT: 1796 vpinsrw(dst, src, val, idx); 1797 break; 1798 case T_INT: 1799 vpinsrd(dst, src, val, idx); 1800 break; 1801 case T_LONG: 1802 vpinsrq(dst, src, val, idx); 1803 break; 1804 default: 1805 assert(false,"Should not reach here."); 1806 break; 1807 } 1808 } 1809 1810 #ifdef _LP64 1811 void C2_MacroAssembler::vgather8b_masked_offset(BasicType elem_bt, 1812 XMMRegister dst, Register base, 1813 Register idx_base, 1814 Register offset, Register mask, 1815 Register mask_idx, Register rtmp, 1816 int vlen_enc) { 1817 vpxor(dst, dst, dst, vlen_enc); 1818 if (elem_bt == T_SHORT) { 1819 for (int i = 0; i < 4; i++) { 1820 // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0 1821 Label skip_load; 1822 btq(mask, mask_idx); 1823 jccb(Assembler::carryClear, skip_load); 1824 movl(rtmp, Address(idx_base, i * 4)); 1825 if (offset != noreg) { 1826 addl(rtmp, offset); 1827 } 1828 pinsrw(dst, Address(base, rtmp, Address::times_2), i); 1829 bind(skip_load); 1830 incq(mask_idx); 1831 } 1832 } else { 1833 assert(elem_bt == T_BYTE, ""); 1834 for (int i = 0; i < 8; i++) { 1835 // dst[i] = mask[i] ? 
src[offset + idx_base[i]] : 0 1836 Label skip_load; 1837 btq(mask, mask_idx); 1838 jccb(Assembler::carryClear, skip_load); 1839 movl(rtmp, Address(idx_base, i * 4)); 1840 if (offset != noreg) { 1841 addl(rtmp, offset); 1842 } 1843 pinsrb(dst, Address(base, rtmp), i); 1844 bind(skip_load); 1845 incq(mask_idx); 1846 } 1847 } 1848 } 1849 #endif // _LP64 1850 1851 void C2_MacroAssembler::vgather8b_offset(BasicType elem_bt, XMMRegister dst, 1852 Register base, Register idx_base, 1853 Register offset, Register rtmp, 1854 int vlen_enc) { 1855 vpxor(dst, dst, dst, vlen_enc); 1856 if (elem_bt == T_SHORT) { 1857 for (int i = 0; i < 4; i++) { 1858 // dst[i] = src[offset + idx_base[i]] 1859 movl(rtmp, Address(idx_base, i * 4)); 1860 if (offset != noreg) { 1861 addl(rtmp, offset); 1862 } 1863 pinsrw(dst, Address(base, rtmp, Address::times_2), i); 1864 } 1865 } else { 1866 assert(elem_bt == T_BYTE, ""); 1867 for (int i = 0; i < 8; i++) { 1868 // dst[i] = src[offset + idx_base[i]] 1869 movl(rtmp, Address(idx_base, i * 4)); 1870 if (offset != noreg) { 1871 addl(rtmp, offset); 1872 } 1873 pinsrb(dst, Address(base, rtmp), i); 1874 } 1875 } 1876 } 1877 1878 /* 1879 * Gather using hybrid algorithm, first partially unroll scalar loop 1880 * to accumulate values from gather indices into a quad-word(64bit) slice. 1881 * A slice may hold 8 bytes or 4 short values. This is followed by a vector 1882 * permutation to place the slice into appropriate vector lane 1883 * locations in destination vector. Following pseudo code describes the 1884 * algorithm in detail: 1885 * 1886 * DST_VEC = ZERO_VEC 1887 * PERM_INDEX = {0, 1, 2, 3, 4, 5, 6, 7, 8..} 1888 * TWO_VEC = {2, 2, 2, 2, 2, 2, 2, 2, 2..} 1889 * FOREACH_ITER: 1890 * TMP_VEC_64 = PICK_SUB_WORDS_FROM_GATHER_INDICES 1891 * TEMP_PERM_VEC = PERMUTE TMP_VEC_64 PERM_INDEX 1892 * DST_VEC = DST_VEC OR TEMP_PERM_VEC 1893 * PERM_INDEX = PERM_INDEX - TWO_VEC 1894 * 1895 * With each iteration, doubleword permute indices (0,1) corresponding 1896 * to gathered quadword gets right shifted by two lane positions. 1897 * 1898 */ 1899 void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst, 1900 Register base, Register idx_base, 1901 Register offset, Register mask, 1902 XMMRegister xtmp1, XMMRegister xtmp2, 1903 XMMRegister temp_dst, Register rtmp, 1904 Register mask_idx, Register length, 1905 int vector_len, int vlen_enc) { 1906 Label GATHER8_LOOP; 1907 assert(is_subword_type(elem_ty), ""); 1908 movl(length, vector_len); 1909 vpxor(xtmp1, xtmp1, xtmp1, vlen_enc); // xtmp1 = {0, ...} 1910 vpxor(dst, dst, dst, vlen_enc); // dst = {0, ...} 1911 vallones(xtmp2, vlen_enc); 1912 vpsubd(xtmp2, xtmp1, xtmp2, vlen_enc); 1913 vpslld(xtmp2, xtmp2, 1, vlen_enc); // xtmp2 = {2, 2, ...} 1914 load_iota_indices(xtmp1, vector_len * type2aelembytes(elem_ty), T_INT); // xtmp1 = {0, 1, 2, ...} 1915 1916 bind(GATHER8_LOOP); 1917 // TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES 1918 if (mask == noreg) { 1919 vgather8b_offset(elem_ty, temp_dst, base, idx_base, offset, rtmp, vlen_enc); 1920 } else { 1921 LP64_ONLY(vgather8b_masked_offset(elem_ty, temp_dst, base, idx_base, offset, mask, mask_idx, rtmp, vlen_enc)); 1922 } 1923 // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1) 1924 vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? 
vlen_enc : Assembler::AVX_256bit); 1925 // PERM_INDEX(xtmp1) = PERM_INDEX(xtmp1) - TWO_VEC(xtmp2) 1926 vpsubd(xtmp1, xtmp1, xtmp2, vlen_enc); 1927 // DST_VEC = DST_VEC OR TEMP_PERM_VEC 1928 vpor(dst, dst, temp_dst, vlen_enc); 1929 addptr(idx_base, 32 >> (type2aelembytes(elem_ty) - 1)); 1930 subl(length, 8 >> (type2aelembytes(elem_ty) - 1)); 1931 jcc(Assembler::notEqual, GATHER8_LOOP); 1932 } 1933 1934 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) { 1935 switch(typ) { 1936 case T_INT: 1937 vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len); 1938 break; 1939 case T_FLOAT: 1940 vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len); 1941 break; 1942 case T_LONG: 1943 vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len); 1944 break; 1945 case T_DOUBLE: 1946 vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len); 1947 break; 1948 default: 1949 assert(false,"Should not reach here."); 1950 break; 1951 } 1952 } 1953 1954 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) { 1955 switch(typ) { 1956 case T_INT: 1957 evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len); 1958 break; 1959 case T_FLOAT: 1960 evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len); 1961 break; 1962 case T_LONG: 1963 evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len); 1964 break; 1965 case T_DOUBLE: 1966 evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len); 1967 break; 1968 default: 1969 assert(false,"Should not reach here."); 1970 break; 1971 } 1972 } 1973 1974 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) { 1975 switch(typ) { 1976 case T_INT: 1977 evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len); 1978 break; 1979 case T_FLOAT: 1980 evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len); 1981 break; 1982 case T_LONG: 1983 evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len); 1984 break; 1985 case T_DOUBLE: 1986 evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len); 1987 break; 1988 default: 1989 assert(false,"Should not reach here."); 1990 break; 1991 } 1992 } 1993 1994 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) { 1995 if (vlen_in_bytes <= 16) { 1996 pxor (dst, dst); 1997 psubb(dst, src); 1998 switch (elem_bt) { 1999 case T_BYTE: /* nothing to do */ break; 2000 case T_SHORT: pmovsxbw(dst, dst); break; 2001 case T_INT: pmovsxbd(dst, dst); break; 2002 case T_FLOAT: pmovsxbd(dst, dst); break; 2003 case T_LONG: pmovsxbq(dst, dst); break; 2004 case T_DOUBLE: pmovsxbq(dst, dst); break; 2005 2006 default: assert(false, "%s", type2name(elem_bt)); 2007 } 2008 } else { 2009 assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, ""); 2010 int vlen_enc = vector_length_encoding(vlen_in_bytes); 2011 2012 vpxor (dst, dst, dst, vlen_enc); 2013 vpsubb(dst, dst, src, is_legacy ? 
AVX_256bit : vlen_enc); 2014 2015 switch (elem_bt) { 2016 case T_BYTE: /* nothing to do */ break; 2017 case T_SHORT: vpmovsxbw(dst, dst, vlen_enc); break; 2018 case T_INT: vpmovsxbd(dst, dst, vlen_enc); break; 2019 case T_FLOAT: vpmovsxbd(dst, dst, vlen_enc); break; 2020 case T_LONG: vpmovsxbq(dst, dst, vlen_enc); break; 2021 case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break; 2022 2023 default: assert(false, "%s", type2name(elem_bt)); 2024 } 2025 } 2026 } 2027 2028 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) { 2029 if (novlbwdq) { 2030 vpmovsxbd(xtmp, src, vlen_enc); 2031 evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()), 2032 Assembler::eq, true, vlen_enc, noreg); 2033 } else { 2034 vpxor(xtmp, xtmp, xtmp, vlen_enc); 2035 vpsubb(xtmp, xtmp, src, vlen_enc); 2036 evpmovb2m(dst, xtmp, vlen_enc); 2037 } 2038 } 2039 2040 void C2_MacroAssembler::load_vector(XMMRegister dst, Address src, int vlen_in_bytes) { 2041 switch (vlen_in_bytes) { 2042 case 4: movdl(dst, src); break; 2043 case 8: movq(dst, src); break; 2044 case 16: movdqu(dst, src); break; 2045 case 32: vmovdqu(dst, src); break; 2046 case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break; 2047 default: ShouldNotReachHere(); 2048 } 2049 } 2050 2051 void C2_MacroAssembler::load_vector(XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) { 2052 assert(rscratch != noreg || always_reachable(src), "missing"); 2053 2054 if (reachable(src)) { 2055 load_vector(dst, as_Address(src), vlen_in_bytes); 2056 } else { 2057 lea(rscratch, src); 2058 load_vector(dst, Address(rscratch, 0), vlen_in_bytes); 2059 } 2060 } 2061 2062 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) { 2063 int vlen_enc = vector_length_encoding(vlen); 2064 if (VM_Version::supports_avx()) { 2065 if (bt == T_LONG) { 2066 if (VM_Version::supports_avx2()) { 2067 vpbroadcastq(dst, src, vlen_enc); 2068 } else { 2069 vmovddup(dst, src, vlen_enc); 2070 } 2071 } else if (bt == T_DOUBLE) { 2072 if (vlen_enc != Assembler::AVX_128bit) { 2073 vbroadcastsd(dst, src, vlen_enc, noreg); 2074 } else { 2075 vmovddup(dst, src, vlen_enc); 2076 } 2077 } else { 2078 if (VM_Version::supports_avx2() && is_integral_type(bt)) { 2079 vpbroadcastd(dst, src, vlen_enc); 2080 } else { 2081 vbroadcastss(dst, src, vlen_enc); 2082 } 2083 } 2084 } else if (VM_Version::supports_sse3()) { 2085 movddup(dst, src); 2086 } else { 2087 movq(dst, src); 2088 if (vlen == 16) { 2089 punpcklqdq(dst, dst); 2090 } 2091 } 2092 } 2093 2094 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) { 2095 // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64. 2096 int offset = exact_log2(type2aelembytes(bt)) << 6; 2097 if (is_floating_point_type(bt)) { 2098 offset += 128; 2099 } 2100 ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset); 2101 load_vector(dst, addr, vlen_in_bytes); 2102 } 2103 2104 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles. 
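//
// The integral reduce* helpers below share one shape: repeatedly fold the upper
// half of the vector onto the lower half (reduce_operation_256/reduce_operation_128),
// then combine the surviving lane with the scalar accumulator src1. The FP variants
// (reduceF/reduceD further down) instead fold the lanes into dst one at a time, in
// lane order. A minimal scalar sketch of the integral case follows; it is purely
// illustrative and is not code emitted by these helpers (OP stands for the
// operation selected by opcode, e.g. add/min/max/and/or/xor):
//
//   int reduce_int_sketch(int acc, int* lane, int n) {   // n = number of lanes
//     for (int width = n; width > 1; width /= 2) {        // halve until one lane left
//       for (int i = 0; i < width / 2; i++) {
//         lane[i] = OP(lane[i], lane[i + width / 2]);      // fold upper half onto lower
//       }
//     }
//     return OP(acc, lane[0]);                             // fold in the accumulator
//   }
//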
2105 2106 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) { 2107 int vector_len = Assembler::AVX_128bit; 2108 2109 switch (opcode) { 2110 case Op_AndReductionV: pand(dst, src); break; 2111 case Op_OrReductionV: por (dst, src); break; 2112 case Op_XorReductionV: pxor(dst, src); break; 2113 case Op_MinReductionV: 2114 switch (typ) { 2115 case T_BYTE: pminsb(dst, src); break; 2116 case T_SHORT: pminsw(dst, src); break; 2117 case T_INT: pminsd(dst, src); break; 2118 case T_LONG: assert(UseAVX > 2, "required"); 2119 vpminsq(dst, dst, src, Assembler::AVX_128bit); break; 2120 default: assert(false, "wrong type"); 2121 } 2122 break; 2123 case Op_MaxReductionV: 2124 switch (typ) { 2125 case T_BYTE: pmaxsb(dst, src); break; 2126 case T_SHORT: pmaxsw(dst, src); break; 2127 case T_INT: pmaxsd(dst, src); break; 2128 case T_LONG: assert(UseAVX > 2, "required"); 2129 vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break; 2130 default: assert(false, "wrong type"); 2131 } 2132 break; 2133 case Op_AddReductionVF: addss(dst, src); break; 2134 case Op_AddReductionVD: addsd(dst, src); break; 2135 case Op_AddReductionVI: 2136 switch (typ) { 2137 case T_BYTE: paddb(dst, src); break; 2138 case T_SHORT: paddw(dst, src); break; 2139 case T_INT: paddd(dst, src); break; 2140 default: assert(false, "wrong type"); 2141 } 2142 break; 2143 case Op_AddReductionVL: paddq(dst, src); break; 2144 case Op_MulReductionVF: mulss(dst, src); break; 2145 case Op_MulReductionVD: mulsd(dst, src); break; 2146 case Op_MulReductionVI: 2147 switch (typ) { 2148 case T_SHORT: pmullw(dst, src); break; 2149 case T_INT: pmulld(dst, src); break; 2150 default: assert(false, "wrong type"); 2151 } 2152 break; 2153 case Op_MulReductionVL: assert(UseAVX > 2, "required"); 2154 evpmullq(dst, dst, src, vector_len); break; 2155 default: assert(false, "wrong opcode"); 2156 } 2157 } 2158 2159 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 2160 int vector_len = Assembler::AVX_256bit; 2161 2162 switch (opcode) { 2163 case Op_AndReductionV: vpand(dst, src1, src2, vector_len); break; 2164 case Op_OrReductionV: vpor (dst, src1, src2, vector_len); break; 2165 case Op_XorReductionV: vpxor(dst, src1, src2, vector_len); break; 2166 case Op_MinReductionV: 2167 switch (typ) { 2168 case T_BYTE: vpminsb(dst, src1, src2, vector_len); break; 2169 case T_SHORT: vpminsw(dst, src1, src2, vector_len); break; 2170 case T_INT: vpminsd(dst, src1, src2, vector_len); break; 2171 case T_LONG: assert(UseAVX > 2, "required"); 2172 vpminsq(dst, src1, src2, vector_len); break; 2173 default: assert(false, "wrong type"); 2174 } 2175 break; 2176 case Op_MaxReductionV: 2177 switch (typ) { 2178 case T_BYTE: vpmaxsb(dst, src1, src2, vector_len); break; 2179 case T_SHORT: vpmaxsw(dst, src1, src2, vector_len); break; 2180 case T_INT: vpmaxsd(dst, src1, src2, vector_len); break; 2181 case T_LONG: assert(UseAVX > 2, "required"); 2182 vpmaxsq(dst, src1, src2, vector_len); break; 2183 default: assert(false, "wrong type"); 2184 } 2185 break; 2186 case Op_AddReductionVI: 2187 switch (typ) { 2188 case T_BYTE: vpaddb(dst, src1, src2, vector_len); break; 2189 case T_SHORT: vpaddw(dst, src1, src2, vector_len); break; 2190 case T_INT: vpaddd(dst, src1, src2, vector_len); break; 2191 default: assert(false, "wrong type"); 2192 } 2193 break; 2194 case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break; 2195 case Op_MulReductionVI: 2196 switch (typ) { 2197 
case T_SHORT: vpmullw(dst, src1, src2, vector_len); break; 2198 case T_INT: vpmulld(dst, src1, src2, vector_len); break; 2199 default: assert(false, "wrong type"); 2200 } 2201 break; 2202 case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break; 2203 default: assert(false, "wrong opcode"); 2204 } 2205 } 2206 2207 void C2_MacroAssembler::reduce_fp(int opcode, int vlen, 2208 XMMRegister dst, XMMRegister src, 2209 XMMRegister vtmp1, XMMRegister vtmp2) { 2210 switch (opcode) { 2211 case Op_AddReductionVF: 2212 case Op_MulReductionVF: 2213 reduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 2214 break; 2215 2216 case Op_AddReductionVD: 2217 case Op_MulReductionVD: 2218 reduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 2219 break; 2220 2221 default: assert(false, "wrong opcode"); 2222 } 2223 } 2224 2225 void C2_MacroAssembler::reduceB(int opcode, int vlen, 2226 Register dst, Register src1, XMMRegister src2, 2227 XMMRegister vtmp1, XMMRegister vtmp2) { 2228 switch (vlen) { 2229 case 8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2230 case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2231 case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2232 case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2233 2234 default: assert(false, "wrong vector length"); 2235 } 2236 } 2237 2238 void C2_MacroAssembler::mulreduceB(int opcode, int vlen, 2239 Register dst, Register src1, XMMRegister src2, 2240 XMMRegister vtmp1, XMMRegister vtmp2) { 2241 switch (vlen) { 2242 case 8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2243 case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2244 case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2245 case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2246 2247 default: assert(false, "wrong vector length"); 2248 } 2249 } 2250 2251 void C2_MacroAssembler::reduceS(int opcode, int vlen, 2252 Register dst, Register src1, XMMRegister src2, 2253 XMMRegister vtmp1, XMMRegister vtmp2) { 2254 switch (vlen) { 2255 case 4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2256 case 8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2257 case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2258 case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2259 2260 default: assert(false, "wrong vector length"); 2261 } 2262 } 2263 2264 void C2_MacroAssembler::reduceI(int opcode, int vlen, 2265 Register dst, Register src1, XMMRegister src2, 2266 XMMRegister vtmp1, XMMRegister vtmp2) { 2267 switch (vlen) { 2268 case 2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2269 case 4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2270 case 8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2271 case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2272 2273 default: assert(false, "wrong vector length"); 2274 } 2275 } 2276 2277 #ifdef _LP64 2278 void C2_MacroAssembler::reduceL(int opcode, int vlen, 2279 Register dst, Register src1, XMMRegister src2, 2280 XMMRegister vtmp1, XMMRegister vtmp2) { 2281 switch (vlen) { 2282 case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2283 case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2284 case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2285 2286 default: assert(false, "wrong vector length"); 2287 } 2288 } 2289 #endif // _LP64 2290 2291 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister 
src, XMMRegister vtmp1, XMMRegister vtmp2) { 2292 switch (vlen) { 2293 case 2: 2294 assert(vtmp2 == xnoreg, ""); 2295 reduce2F(opcode, dst, src, vtmp1); 2296 break; 2297 case 4: 2298 assert(vtmp2 == xnoreg, ""); 2299 reduce4F(opcode, dst, src, vtmp1); 2300 break; 2301 case 8: 2302 reduce8F(opcode, dst, src, vtmp1, vtmp2); 2303 break; 2304 case 16: 2305 reduce16F(opcode, dst, src, vtmp1, vtmp2); 2306 break; 2307 default: assert(false, "wrong vector length"); 2308 } 2309 } 2310 2311 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2312 switch (vlen) { 2313 case 2: 2314 assert(vtmp2 == xnoreg, ""); 2315 reduce2D(opcode, dst, src, vtmp1); 2316 break; 2317 case 4: 2318 reduce4D(opcode, dst, src, vtmp1, vtmp2); 2319 break; 2320 case 8: 2321 reduce8D(opcode, dst, src, vtmp1, vtmp2); 2322 break; 2323 default: assert(false, "wrong vector length"); 2324 } 2325 } 2326 2327 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2328 if (opcode == Op_AddReductionVI) { 2329 if (vtmp1 != src2) { 2330 movdqu(vtmp1, src2); 2331 } 2332 phaddd(vtmp1, vtmp1); 2333 } else { 2334 pshufd(vtmp1, src2, 0x1); 2335 reduce_operation_128(T_INT, opcode, vtmp1, src2); 2336 } 2337 movdl(vtmp2, src1); 2338 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2339 movdl(dst, vtmp1); 2340 } 2341 2342 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2343 if (opcode == Op_AddReductionVI) { 2344 if (vtmp1 != src2) { 2345 movdqu(vtmp1, src2); 2346 } 2347 phaddd(vtmp1, src2); 2348 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2349 } else { 2350 pshufd(vtmp2, src2, 0xE); 2351 reduce_operation_128(T_INT, opcode, vtmp2, src2); 2352 reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2353 } 2354 } 2355 2356 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2357 if (opcode == Op_AddReductionVI) { 2358 vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit); 2359 vextracti128_high(vtmp2, vtmp1); 2360 vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit); 2361 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2362 } else { 2363 vextracti128_high(vtmp1, src2); 2364 reduce_operation_128(T_INT, opcode, vtmp1, src2); 2365 reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2366 } 2367 } 2368 2369 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2370 vextracti64x4_high(vtmp2, src2); 2371 reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2); 2372 reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2373 } 2374 2375 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2376 pshufd(vtmp2, src2, 0x1); 2377 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 2378 movdqu(vtmp1, vtmp2); 2379 psrldq(vtmp1, 2); 2380 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 2381 movdqu(vtmp2, vtmp1); 2382 psrldq(vtmp2, 1); 2383 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 2384 movdl(vtmp2, src1); 2385 pmovsxbd(vtmp1, vtmp1); 2386 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2387 pextrb(dst, vtmp1, 0x0); 2388 movsbl(dst, dst); 2389 } 2390 2391 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2392 
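    // reduce16B: pshufd with 0xE moves the upper 8 bytes of src2 into the lower
    // quadword of vtmp1; combining the two and handing the result to reduce8B
    // leaves only an 8-byte reduction to finish.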
pshufd(vtmp1, src2, 0xE); 2393 reduce_operation_128(T_BYTE, opcode, vtmp1, src2); 2394 reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2395 } 2396 2397 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2398 vextracti128_high(vtmp2, src2); 2399 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 2400 reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2401 } 2402 2403 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2404 vextracti64x4_high(vtmp1, src2); 2405 reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2); 2406 reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2407 } 2408 2409 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2410 pmovsxbw(vtmp2, src2); 2411 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2412 } 2413 2414 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2415 if (UseAVX > 1) { 2416 int vector_len = Assembler::AVX_256bit; 2417 vpmovsxbw(vtmp1, src2, vector_len); 2418 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2419 } else { 2420 pmovsxbw(vtmp2, src2); 2421 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2422 pshufd(vtmp2, src2, 0x1); 2423 pmovsxbw(vtmp2, src2); 2424 reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2425 } 2426 } 2427 2428 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2429 if (UseAVX > 2 && VM_Version::supports_avx512bw()) { 2430 int vector_len = Assembler::AVX_512bit; 2431 vpmovsxbw(vtmp1, src2, vector_len); 2432 reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2433 } else { 2434 assert(UseAVX >= 2,"Should not reach here."); 2435 mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); 2436 vextracti128_high(vtmp2, src2); 2437 mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2438 } 2439 } 2440 2441 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2442 mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); 2443 vextracti64x4_high(vtmp2, src2); 2444 mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2445 } 2446 2447 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2448 if (opcode == Op_AddReductionVI) { 2449 if (vtmp1 != src2) { 2450 movdqu(vtmp1, src2); 2451 } 2452 phaddw(vtmp1, vtmp1); 2453 phaddw(vtmp1, vtmp1); 2454 } else { 2455 pshufd(vtmp2, src2, 0x1); 2456 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2457 movdqu(vtmp1, vtmp2); 2458 psrldq(vtmp1, 2); 2459 reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2); 2460 } 2461 movdl(vtmp2, src1); 2462 pmovsxwd(vtmp1, vtmp1); 2463 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2464 pextrw(dst, vtmp1, 0x0); 2465 movswl(dst, dst); 2466 } 2467 2468 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2469 if (opcode == Op_AddReductionVI) { 2470 if (vtmp1 != src2) { 2471 movdqu(vtmp1, src2); 2472 } 2473 phaddw(vtmp1, src2); 2474 } else { 2475 pshufd(vtmp1, src2, 0xE); 2476 reduce_operation_128(T_SHORT, opcode, vtmp1, src2); 2477 } 2478 reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2479 } 2480 2481 void 
C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2482 if (opcode == Op_AddReductionVI) { 2483 int vector_len = Assembler::AVX_256bit; 2484 vphaddw(vtmp2, src2, src2, vector_len); 2485 vpermq(vtmp2, vtmp2, 0xD8, vector_len); 2486 } else { 2487 vextracti128_high(vtmp2, src2); 2488 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2489 } 2490 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2491 } 2492 2493 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2494 int vector_len = Assembler::AVX_256bit; 2495 vextracti64x4_high(vtmp1, src2); 2496 reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2); 2497 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2498 } 2499 2500 #ifdef _LP64 2501 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2502 pshufd(vtmp2, src2, 0xE); 2503 reduce_operation_128(T_LONG, opcode, vtmp2, src2); 2504 movdq(vtmp1, src1); 2505 reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2); 2506 movdq(dst, vtmp1); 2507 } 2508 2509 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2510 vextracti128_high(vtmp1, src2); 2511 reduce_operation_128(T_LONG, opcode, vtmp1, src2); 2512 reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2513 } 2514 2515 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2516 vextracti64x4_high(vtmp2, src2); 2517 reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2); 2518 reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2519 } 2520 2521 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) { 2522 mov64(temp, -1L); 2523 bzhiq(temp, temp, len); 2524 kmovql(dst, temp); 2525 } 2526 #endif // _LP64 2527 2528 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2529 reduce_operation_128(T_FLOAT, opcode, dst, src); 2530 pshufd(vtmp, src, 0x1); 2531 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2532 } 2533 2534 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2535 reduce2F(opcode, dst, src, vtmp); 2536 pshufd(vtmp, src, 0x2); 2537 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2538 pshufd(vtmp, src, 0x3); 2539 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2540 } 2541 2542 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2543 reduce4F(opcode, dst, src, vtmp2); 2544 vextractf128_high(vtmp2, src); 2545 reduce4F(opcode, dst, vtmp2, vtmp1); 2546 } 2547 2548 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2549 reduce8F(opcode, dst, src, vtmp1, vtmp2); 2550 vextracti64x4_high(vtmp1, src); 2551 reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2); 2552 } 2553 2554 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2555 reduce_operation_128(T_DOUBLE, opcode, dst, src); 2556 pshufd(vtmp, src, 0xE); 2557 reduce_operation_128(T_DOUBLE, opcode, dst, vtmp); 2558 } 2559 2560 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2561 reduce2D(opcode, dst, src, vtmp2); 2562 vextractf128_high(vtmp2, src); 2563 
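    // vtmp2 now holds the upper two doubles of src; fold them into the scalar
    // result already accumulated in dst.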
reduce2D(opcode, dst, vtmp2, vtmp1); 2564 } 2565 2566 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2567 reduce4D(opcode, dst, src, vtmp1, vtmp2); 2568 vextracti64x4_high(vtmp1, src); 2569 reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2); 2570 } 2571 2572 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) { 2573 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2574 } 2575 2576 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) { 2577 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2578 } 2579 2580 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask, 2581 int vec_enc) { 2582 switch(elem_bt) { 2583 case T_INT: 2584 case T_FLOAT: 2585 vmaskmovps(dst, src, mask, vec_enc); 2586 break; 2587 case T_LONG: 2588 case T_DOUBLE: 2589 vmaskmovpd(dst, src, mask, vec_enc); 2590 break; 2591 default: 2592 fatal("Unsupported type %s", type2name(elem_bt)); 2593 break; 2594 } 2595 } 2596 2597 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask, 2598 int vec_enc) { 2599 switch(elem_bt) { 2600 case T_INT: 2601 case T_FLOAT: 2602 vmaskmovps(dst, src, mask, vec_enc); 2603 break; 2604 case T_LONG: 2605 case T_DOUBLE: 2606 vmaskmovpd(dst, src, mask, vec_enc); 2607 break; 2608 default: 2609 fatal("Unsupported type %s", type2name(elem_bt)); 2610 break; 2611 } 2612 } 2613 2614 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid, 2615 XMMRegister dst, XMMRegister src, 2616 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2617 XMMRegister xmm_0, XMMRegister xmm_1) { 2618 const int permconst[] = {1, 14}; 2619 XMMRegister wsrc = src; 2620 XMMRegister wdst = xmm_0; 2621 XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1; 2622 2623 int vlen_enc = Assembler::AVX_128bit; 2624 if (vlen == 16) { 2625 vlen_enc = Assembler::AVX_256bit; 2626 } 2627 2628 for (int i = log2(vlen) - 1; i >=0; i--) { 2629 if (i == 0 && !is_dst_valid) { 2630 wdst = dst; 2631 } 2632 if (i == 3) { 2633 vextracti64x4_high(wtmp, wsrc); 2634 } else if (i == 2) { 2635 vextracti128_high(wtmp, wsrc); 2636 } else { // i = [0,1] 2637 vpermilps(wtmp, wsrc, permconst[i], vlen_enc); 2638 } 2639 vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2640 wsrc = wdst; 2641 vlen_enc = Assembler::AVX_128bit; 2642 } 2643 if (is_dst_valid) { 2644 vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2645 } 2646 } 2647 2648 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src, 2649 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2650 XMMRegister xmm_0, XMMRegister xmm_1) { 2651 XMMRegister wsrc = src; 2652 XMMRegister wdst = xmm_0; 2653 XMMRegister wtmp = (xmm_1 == xnoreg) ? 
xmm_0: xmm_1; 2654 int vlen_enc = Assembler::AVX_128bit; 2655 if (vlen == 8) { 2656 vlen_enc = Assembler::AVX_256bit; 2657 } 2658 for (int i = log2(vlen) - 1; i >=0; i--) { 2659 if (i == 0 && !is_dst_valid) { 2660 wdst = dst; 2661 } 2662 if (i == 1) { 2663 vextracti128_high(wtmp, wsrc); 2664 } else if (i == 2) { 2665 vextracti64x4_high(wtmp, wsrc); 2666 } else { 2667 assert(i == 0, "%d", i); 2668 vpermilpd(wtmp, wsrc, 1, vlen_enc); 2669 } 2670 vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2671 wsrc = wdst; 2672 vlen_enc = Assembler::AVX_128bit; 2673 } 2674 if (is_dst_valid) { 2675 vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2676 } 2677 } 2678 2679 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) { 2680 switch (bt) { 2681 case T_BYTE: pextrb(dst, src, idx); break; 2682 case T_SHORT: pextrw(dst, src, idx); break; 2683 case T_INT: pextrd(dst, src, idx); break; 2684 case T_LONG: pextrq(dst, src, idx); break; 2685 2686 default: 2687 assert(false,"Should not reach here."); 2688 break; 2689 } 2690 } 2691 2692 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) { 2693 int esize = type2aelembytes(typ); 2694 int elem_per_lane = 16/esize; 2695 int lane = elemindex / elem_per_lane; 2696 int eindex = elemindex % elem_per_lane; 2697 2698 if (lane >= 2) { 2699 assert(UseAVX > 2, "required"); 2700 vextractf32x4(dst, src, lane & 3); 2701 return dst; 2702 } else if (lane > 0) { 2703 assert(UseAVX > 0, "required"); 2704 vextractf128(dst, src, lane); 2705 return dst; 2706 } else { 2707 return src; 2708 } 2709 } 2710 2711 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) { 2712 if (typ == T_BYTE) { 2713 movsbl(dst, dst); 2714 } else if (typ == T_SHORT) { 2715 movswl(dst, dst); 2716 } 2717 } 2718 2719 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) { 2720 int esize = type2aelembytes(typ); 2721 int elem_per_lane = 16/esize; 2722 int eindex = elemindex % elem_per_lane; 2723 assert(is_integral_type(typ),"required"); 2724 2725 if (eindex == 0) { 2726 if (typ == T_LONG) { 2727 movq(dst, src); 2728 } else { 2729 movdl(dst, src); 2730 movsxl(typ, dst); 2731 } 2732 } else { 2733 extract(typ, dst, src, eindex); 2734 movsxl(typ, dst); 2735 } 2736 } 2737 2738 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) { 2739 int esize = type2aelembytes(typ); 2740 int elem_per_lane = 16/esize; 2741 int eindex = elemindex % elem_per_lane; 2742 assert((typ == T_FLOAT || typ == T_DOUBLE),"required"); 2743 2744 if (eindex == 0) { 2745 movq(dst, src); 2746 } else { 2747 if (typ == T_FLOAT) { 2748 if (UseAVX == 0) { 2749 movdqu(dst, src); 2750 shufps(dst, dst, eindex); 2751 } else { 2752 vshufps(dst, src, src, eindex, Assembler::AVX_128bit); 2753 } 2754 } else { 2755 if (UseAVX == 0) { 2756 movdqu(dst, src); 2757 psrldq(dst, eindex*esize); 2758 } else { 2759 vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit); 2760 } 2761 movq(dst, dst); 2762 } 2763 } 2764 // Zero upper bits 2765 if (typ == T_FLOAT) { 2766 if (UseAVX == 0) { 2767 assert(vtmp != xnoreg, "required."); 2768 movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg); 2769 pand(dst, vtmp); 2770 } else { 2771 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg); 2772 } 2773 } 2774 } 2775 2776 void 
C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) { 2777 switch(typ) { 2778 case T_BYTE: 2779 case T_BOOLEAN: 2780 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2781 break; 2782 case T_SHORT: 2783 case T_CHAR: 2784 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2785 break; 2786 case T_INT: 2787 case T_FLOAT: 2788 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2789 break; 2790 case T_LONG: 2791 case T_DOUBLE: 2792 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2793 break; 2794 default: 2795 assert(false,"Should not reach here."); 2796 break; 2797 } 2798 } 2799 2800 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) { 2801 assert(rscratch != noreg || always_reachable(src2), "missing"); 2802 2803 switch(typ) { 2804 case T_BOOLEAN: 2805 case T_BYTE: 2806 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2807 break; 2808 case T_CHAR: 2809 case T_SHORT: 2810 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2811 break; 2812 case T_INT: 2813 case T_FLOAT: 2814 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2815 break; 2816 case T_LONG: 2817 case T_DOUBLE: 2818 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2819 break; 2820 default: 2821 assert(false,"Should not reach here."); 2822 break; 2823 } 2824 } 2825 2826 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) { 2827 switch(typ) { 2828 case T_BYTE: 2829 evpblendmb(dst, kmask, src1, src2, merge, vector_len); 2830 break; 2831 case T_SHORT: 2832 evpblendmw(dst, kmask, src1, src2, merge, vector_len); 2833 break; 2834 case T_INT: 2835 case T_FLOAT: 2836 evpblendmd(dst, kmask, src1, src2, merge, vector_len); 2837 break; 2838 case T_LONG: 2839 case T_DOUBLE: 2840 evpblendmq(dst, kmask, src1, src2, merge, vector_len); 2841 break; 2842 default: 2843 assert(false,"Should not reach here."); 2844 break; 2845 } 2846 } 2847 2848 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) { 2849 assert(vlen_in_bytes <= 32, ""); 2850 int esize = type2aelembytes(bt); 2851 if (vlen_in_bytes == 32) { 2852 assert(vtmp == xnoreg, "required."); 2853 if (esize >= 4) { 2854 vtestps(src1, src2, AVX_256bit); 2855 } else { 2856 vptest(src1, src2, AVX_256bit); 2857 } 2858 return; 2859 } 2860 if (vlen_in_bytes < 16) { 2861 // Duplicate the lower part to fill the whole register, 2862 // Don't need to do so for src2 2863 assert(vtmp != xnoreg, "required"); 2864 int shuffle_imm = (vlen_in_bytes == 4) ? 
0x00 : 0x04; 2865 pshufd(vtmp, src1, shuffle_imm); 2866 } else { 2867 assert(vtmp == xnoreg, "required"); 2868 vtmp = src1; 2869 } 2870 if (esize >= 4 && VM_Version::supports_avx()) { 2871 vtestps(vtmp, src2, AVX_128bit); 2872 } else { 2873 ptest(vtmp, src2); 2874 } 2875 } 2876 2877 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) { 2878 assert(UseAVX >= 2, "required"); 2879 #ifdef ASSERT 2880 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2881 bool is_bw_supported = VM_Version::supports_avx512bw(); 2882 if (is_bw && !is_bw_supported) { 2883 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2884 assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16), 2885 "XMM register should be 0-15"); 2886 } 2887 #endif // ASSERT 2888 switch (elem_bt) { 2889 case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return; 2890 case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return; 2891 case T_INT: vpaddd(dst, src1, src2, vlen_enc); return; 2892 case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return; 2893 case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return; 2894 case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return; 2895 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2896 } 2897 } 2898 2899 #ifdef _LP64 2900 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) { 2901 assert(UseAVX >= 2, "required"); 2902 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2903 bool is_vl = vlen_enc != Assembler::AVX_512bit; 2904 if ((UseAVX > 2) && 2905 (!is_bw || VM_Version::supports_avx512bw()) && 2906 (!is_vl || VM_Version::supports_avx512vl())) { 2907 switch (elem_bt) { 2908 case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return; 2909 case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return; 2910 case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return; 2911 case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return; 2912 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2913 } 2914 } else { 2915 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2916 assert((dst->encoding() < 16),"XMM register should be 0-15"); 2917 switch (elem_bt) { 2918 case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return; 2919 case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return; 2920 case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return; 2921 case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return; 2922 case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return; 2923 case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return; 2924 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2925 } 2926 } 2927 } 2928 #endif 2929 2930 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) { 2931 switch (to_elem_bt) { 2932 case T_SHORT: 2933 vpmovsxbw(dst, src, vlen_enc); 2934 break; 2935 case T_INT: 2936 vpmovsxbd(dst, src, vlen_enc); 2937 break; 2938 case T_FLOAT: 2939 vpmovsxbd(dst, src, vlen_enc); 2940 vcvtdq2ps(dst, dst, vlen_enc); 2941 break; 2942 case T_LONG: 2943 vpmovsxbq(dst, src, vlen_enc); 2944 break; 2945 case T_DOUBLE: { 2946 int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? 
Assembler::AVX_256bit : Assembler::AVX_128bit; 2947 vpmovsxbd(dst, src, mid_vlen_enc); 2948 vcvtdq2pd(dst, dst, vlen_enc); 2949 break; 2950 } 2951 default: 2952 fatal("Unsupported type %s", type2name(to_elem_bt)); 2953 break; 2954 } 2955 } 2956 2957 //------------------------------------------------------------------------------------------- 2958 2959 // IndexOf for constant substrings with size >= 8 chars 2960 // which don't need to be loaded through the stack. 2961 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2, 2962 Register cnt1, Register cnt2, 2963 int int_cnt2, Register result, 2964 XMMRegister vec, Register tmp, 2965 int ae) { 2966 ShortBranchVerifier sbv(this); 2967 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2968 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2969 2970 // This method uses the pcmpestri instruction with bound registers 2971 // inputs: 2972 // xmm - substring 2973 // rax - substring length (elements count) 2974 // mem - scanned string 2975 // rdx - string length (elements count) 2976 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2977 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2978 // outputs: 2979 // rcx - matched index in string 2980 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2981 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2982 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2983 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2984 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1; 2985 2986 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, 2987 RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR, 2988 MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE; 2989 2990 // Note, inline_string_indexOf() generates checks: 2991 // if (substr.count > string.count) return -1; 2992 // if (substr.count == 0) return 0; 2993 assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars"); 2994 2995 // Load substring. 2996 if (ae == StrIntrinsicNode::UL) { 2997 pmovzxbw(vec, Address(str2, 0)); 2998 } else { 2999 movdqu(vec, Address(str2, 0)); 3000 } 3001 movl(cnt2, int_cnt2); 3002 movptr(result, str1); // string addr 3003 3004 if (int_cnt2 > stride) { 3005 jmpb(SCAN_TO_SUBSTR); 3006 3007 // Reload substr for rescan; this code 3008 // is executed only for large substrings (> 8 chars) 3009 bind(RELOAD_SUBSTR); 3010 if (ae == StrIntrinsicNode::UL) { 3011 pmovzxbw(vec, Address(str2, 0)); 3012 } else { 3013 movdqu(vec, Address(str2, 0)); 3014 } 3015 negptr(cnt2); // Jumped here with negative cnt2, convert to positive 3016 3017 bind(RELOAD_STR); 3018 // We came here after the beginning of the substring was 3019 // matched but the rest of it was not, so we need to search 3020 // again. Start from the next element after the previous match. 3021 3022 // cnt2 is the number of remaining substring elements and 3023 // cnt1 is the number of remaining string elements when cmp failed.
3024 // Restored cnt1 = cnt1 - cnt2 + int_cnt2
3025 subl(cnt1, cnt2);
3026 addl(cnt1, int_cnt2);
3027 movl(cnt2, int_cnt2); // Now restore cnt2
3028
3029 decrementl(cnt1); // Shift to next element
3030 cmpl(cnt1, cnt2);
3031 jcc(Assembler::negative, RET_NOT_FOUND); // Left less than substring
3032
3033 addptr(result, (1<<scale1));
3034
3035 } // (int_cnt2 > 8)
3036
3037 // Scan string for start of substr in 16-byte vectors
3038 bind(SCAN_TO_SUBSTR);
3039 pcmpestri(vec, Address(result, 0), mode);
3040 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1
3041 subl(cnt1, stride);
3042 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
3043 cmpl(cnt1, cnt2);
3044 jccb(Assembler::negative, RET_NOT_FOUND); // Left less than substring
3045 addptr(result, 16);
3046 jmpb(SCAN_TO_SUBSTR);
3047
3048 // Found a potential substr
3049 bind(FOUND_CANDIDATE);
3050 // Matched whole vector if first element matched (tmp(rcx) == 0).
3051 if (int_cnt2 == stride) {
3052 jccb(Assembler::overflow, RET_FOUND); // OF == 1
3053 } else { // int_cnt2 > 8
3054 jccb(Assembler::overflow, FOUND_SUBSTR);
3055 }
3056 // After pcmpestri tmp(rcx) contains matched element index
3057 // Compute start addr of substr
3058 lea(result, Address(result, tmp, scale1));
3059
3060 // Make sure string is still long enough
3061 subl(cnt1, tmp);
3062 cmpl(cnt1, cnt2);
3063 if (int_cnt2 == stride) {
3064 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
3065 } else { // int_cnt2 > 8
3066 jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
3067 }
3068 // Left less than substring.
3069
3070 bind(RET_NOT_FOUND);
3071 movl(result, -1);
3072 jmp(EXIT);
3073
3074 if (int_cnt2 > stride) {
3075 // This code is optimized for the case when the whole substring
3076 // is matched if its head is matched.
3077 bind(MATCH_SUBSTR_HEAD);
3078 pcmpestri(vec, Address(result, 0), mode);
3079 // Reload only the string if it does not match
3080 jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
3081
3082 Label CONT_SCAN_SUBSTR;
3083 // Compare the rest of substring (> 8 chars).
3084 bind(FOUND_SUBSTR);
3085 // First 8 chars are already matched.
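// From here cnt2 is turned into the negative offset -(int_cnt2 - stride); the
// SCAN_SUBSTR loop below steps it up towards zero and uses it to address the
// remaining substring vectors backwards from the substring tail.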
3086 negptr(cnt2); 3087 addptr(cnt2, stride); 3088 3089 bind(SCAN_SUBSTR); 3090 subl(cnt1, stride); 3091 cmpl(cnt2, -stride); // Do not read beyond substring 3092 jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR); 3093 // Back-up strings to avoid reading beyond substring: 3094 // cnt1 = cnt1 - cnt2 + 8 3095 addl(cnt1, cnt2); // cnt2 is negative 3096 addl(cnt1, stride); 3097 movl(cnt2, stride); negptr(cnt2); 3098 bind(CONT_SCAN_SUBSTR); 3099 if (int_cnt2 < (int)G) { 3100 int tail_off1 = int_cnt2<<scale1; 3101 int tail_off2 = int_cnt2<<scale2; 3102 if (ae == StrIntrinsicNode::UL) { 3103 pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2)); 3104 } else { 3105 movdqu(vec, Address(str2, cnt2, scale2, tail_off2)); 3106 } 3107 pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode); 3108 } else { 3109 // calculate index in register to avoid integer overflow (int_cnt2*2) 3110 movl(tmp, int_cnt2); 3111 addptr(tmp, cnt2); 3112 if (ae == StrIntrinsicNode::UL) { 3113 pmovzxbw(vec, Address(str2, tmp, scale2, 0)); 3114 } else { 3115 movdqu(vec, Address(str2, tmp, scale2, 0)); 3116 } 3117 pcmpestri(vec, Address(result, tmp, scale1, 0), mode); 3118 } 3119 // Need to reload strings pointers if not matched whole vector 3120 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 3121 addptr(cnt2, stride); 3122 jcc(Assembler::negative, SCAN_SUBSTR); 3123 // Fall through if found full substring 3124 3125 } // (int_cnt2 > 8) 3126 3127 bind(RET_FOUND); 3128 // Found result if we matched full small substring. 3129 // Compute substr offset 3130 subptr(result, str1); 3131 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3132 shrl(result, 1); // index 3133 } 3134 bind(EXIT); 3135 3136 } // string_indexofC8 3137 3138 // Small strings are loaded through stack if they cross page boundary. 3139 void C2_MacroAssembler::string_indexof(Register str1, Register str2, 3140 Register cnt1, Register cnt2, 3141 int int_cnt2, Register result, 3142 XMMRegister vec, Register tmp, 3143 int ae) { 3144 ShortBranchVerifier sbv(this); 3145 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3146 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 3147 3148 // 3149 // int_cnt2 is length of small (< 8 chars) constant substring 3150 // or (-1) for non constant substring in which case its length 3151 // is in cnt2 register. 3152 // 3153 // Note, inline_string_indexOf() generates checks: 3154 // if (substr.count > string.count) return -1; 3155 // if (substr.count == 0) return 0; 3156 // 3157 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 3158 assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0"); 3159 // This method uses the pcmpestri instruction with bound registers 3160 // inputs: 3161 // xmm - substring 3162 // rax - substring length (elements count) 3163 // mem - scanned string 3164 // rdx - string length (elements count) 3165 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 3166 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 3167 // outputs: 3168 // rcx - matched index in string 3169 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 3170 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 3171 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 3172 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? 
Address::times_1 : scale1; 3173 3174 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR, 3175 RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR, 3176 FOUND_CANDIDATE; 3177 3178 { //======================================================== 3179 // We don't know where these strings are located 3180 // and we can't read beyond them. Load them through stack. 3181 Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR; 3182 3183 movptr(tmp, rsp); // save old SP 3184 3185 if (int_cnt2 > 0) { // small (< 8 chars) constant substring 3186 if (int_cnt2 == (1>>scale2)) { // One byte 3187 assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding"); 3188 load_unsigned_byte(result, Address(str2, 0)); 3189 movdl(vec, result); // move 32 bits 3190 } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) { // Three bytes 3191 // Not enough header space in 32-bit VM: 12+3 = 15. 3192 movl(result, Address(str2, -1)); 3193 shrl(result, 8); 3194 movdl(vec, result); // move 32 bits 3195 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) { // One char 3196 load_unsigned_short(result, Address(str2, 0)); 3197 movdl(vec, result); // move 32 bits 3198 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars 3199 movdl(vec, Address(str2, 0)); // move 32 bits 3200 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars 3201 movq(vec, Address(str2, 0)); // move 64 bits 3202 } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7}) 3203 // Array header size is 12 bytes in 32-bit VM 3204 // + 6 bytes for 3 chars == 18 bytes, 3205 // enough space to load vec and shift. 3206 assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity"); 3207 if (ae == StrIntrinsicNode::UL) { 3208 int tail_off = int_cnt2-8; 3209 pmovzxbw(vec, Address(str2, tail_off)); 3210 psrldq(vec, -2*tail_off); 3211 } 3212 else { 3213 int tail_off = int_cnt2*(1<<scale2); 3214 movdqu(vec, Address(str2, tail_off-16)); 3215 psrldq(vec, 16-tail_off); 3216 } 3217 } 3218 } else { // not constant substring 3219 cmpl(cnt2, stride); 3220 jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough 3221 3222 // We can read beyond string if srt+16 does not cross page boundary 3223 // since heaps are aligned and mapped by pages. 3224 assert(os::vm_page_size() < (int)G, "default page should be small"); 3225 movl(result, str2); // We need only low 32 bits 3226 andl(result, ((int)os::vm_page_size()-1)); 3227 cmpl(result, ((int)os::vm_page_size()-16)); 3228 jccb(Assembler::belowEqual, CHECK_STR); 3229 3230 // Move small strings to stack to allow load 16 bytes into vec. 3231 subptr(rsp, 16); 3232 int stk_offset = wordSize-(1<<scale2); 3233 push(cnt2); 3234 3235 bind(COPY_SUBSTR); 3236 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) { 3237 load_unsigned_byte(result, Address(str2, cnt2, scale2, -1)); 3238 movb(Address(rsp, cnt2, scale2, stk_offset), result); 3239 } else if (ae == StrIntrinsicNode::UU) { 3240 load_unsigned_short(result, Address(str2, cnt2, scale2, -2)); 3241 movw(Address(rsp, cnt2, scale2, stk_offset), result); 3242 } 3243 decrement(cnt2); 3244 jccb(Assembler::notZero, COPY_SUBSTR); 3245 3246 pop(cnt2); 3247 movptr(str2, rsp); // New substring address 3248 } // non constant 3249 3250 bind(CHECK_STR); 3251 cmpl(cnt1, stride); 3252 jccb(Assembler::aboveEqual, BIG_STRINGS); 3253 3254 // Check cross page boundary. 
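// Rationale for the check below: if (str1 & (page_size - 1)) <= page_size - 16,
// a 16-byte load starting at str1 cannot cross into the next page, so copying
// the string to the stack can be skipped. E.g. with 4K pages any address whose
// low 12 bits are at most 0xFF0 can safely be read 16 bytes at a time.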
3255 movl(result, str1); // We need only low 32 bits 3256 andl(result, ((int)os::vm_page_size()-1)); 3257 cmpl(result, ((int)os::vm_page_size()-16)); 3258 jccb(Assembler::belowEqual, BIG_STRINGS); 3259 3260 subptr(rsp, 16); 3261 int stk_offset = -(1<<scale1); 3262 if (int_cnt2 < 0) { // not constant 3263 push(cnt2); 3264 stk_offset += wordSize; 3265 } 3266 movl(cnt2, cnt1); 3267 3268 bind(COPY_STR); 3269 if (ae == StrIntrinsicNode::LL) { 3270 load_unsigned_byte(result, Address(str1, cnt2, scale1, -1)); 3271 movb(Address(rsp, cnt2, scale1, stk_offset), result); 3272 } else { 3273 load_unsigned_short(result, Address(str1, cnt2, scale1, -2)); 3274 movw(Address(rsp, cnt2, scale1, stk_offset), result); 3275 } 3276 decrement(cnt2); 3277 jccb(Assembler::notZero, COPY_STR); 3278 3279 if (int_cnt2 < 0) { // not constant 3280 pop(cnt2); 3281 } 3282 movptr(str1, rsp); // New string address 3283 3284 bind(BIG_STRINGS); 3285 // Load substring. 3286 if (int_cnt2 < 0) { // -1 3287 if (ae == StrIntrinsicNode::UL) { 3288 pmovzxbw(vec, Address(str2, 0)); 3289 } else { 3290 movdqu(vec, Address(str2, 0)); 3291 } 3292 push(cnt2); // substr count 3293 push(str2); // substr addr 3294 push(str1); // string addr 3295 } else { 3296 // Small (< 8 chars) constant substrings are loaded already. 3297 movl(cnt2, int_cnt2); 3298 } 3299 push(tmp); // original SP 3300 3301 } // Finished loading 3302 3303 //======================================================== 3304 // Start search 3305 // 3306 3307 movptr(result, str1); // string addr 3308 3309 if (int_cnt2 < 0) { // Only for non constant substring 3310 jmpb(SCAN_TO_SUBSTR); 3311 3312 // SP saved at sp+0 3313 // String saved at sp+1*wordSize 3314 // Substr saved at sp+2*wordSize 3315 // Substr count saved at sp+3*wordSize 3316 3317 // Reload substr for rescan, this code 3318 // is executed only for large substrings (> 8 chars) 3319 bind(RELOAD_SUBSTR); 3320 movptr(str2, Address(rsp, 2*wordSize)); 3321 movl(cnt2, Address(rsp, 3*wordSize)); 3322 if (ae == StrIntrinsicNode::UL) { 3323 pmovzxbw(vec, Address(str2, 0)); 3324 } else { 3325 movdqu(vec, Address(str2, 0)); 3326 } 3327 // We came here after the beginning of the substring was 3328 // matched but the rest of it was not so we need to search 3329 // again. Start from the next element after the previous match. 3330 subptr(str1, result); // Restore counter 3331 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3332 shrl(str1, 1); 3333 } 3334 addl(cnt1, str1); 3335 decrementl(cnt1); // Shift to next element 3336 cmpl(cnt1, cnt2); 3337 jcc(Assembler::negative, RET_NOT_FOUND); // Left less then substring 3338 3339 addptr(result, (1<<scale1)); 3340 } // non constant 3341 3342 // Scan string for start of substr in 16-byte vectors 3343 bind(SCAN_TO_SUBSTR); 3344 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 3345 pcmpestri(vec, Address(result, 0), mode); 3346 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 3347 subl(cnt1, stride); 3348 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 3349 cmpl(cnt1, cnt2); 3350 jccb(Assembler::negative, RET_NOT_FOUND); // Left less then substring 3351 addptr(result, 16); 3352 3353 bind(ADJUST_STR); 3354 cmpl(cnt1, stride); // Do not read beyond string 3355 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 3356 // Back-up string to avoid reading beyond string. 
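// That is, result is moved to result + (cnt1 << scale1) - 16 so that the final
// 16-byte read ends exactly at the last string element; cnt1 is then set to a
// full stride for the rescan.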
3357 lea(result, Address(result, cnt1, scale1, -16)); 3358 movl(cnt1, stride); 3359 jmpb(SCAN_TO_SUBSTR); 3360 3361 // Found a potential substr 3362 bind(FOUND_CANDIDATE); 3363 // After pcmpestri tmp(rcx) contains matched element index 3364 3365 // Make sure string is still long enough 3366 subl(cnt1, tmp); 3367 cmpl(cnt1, cnt2); 3368 jccb(Assembler::greaterEqual, FOUND_SUBSTR); 3369 // Left less then substring. 3370 3371 bind(RET_NOT_FOUND); 3372 movl(result, -1); 3373 jmp(CLEANUP); 3374 3375 bind(FOUND_SUBSTR); 3376 // Compute start addr of substr 3377 lea(result, Address(result, tmp, scale1)); 3378 if (int_cnt2 > 0) { // Constant substring 3379 // Repeat search for small substring (< 8 chars) 3380 // from new point without reloading substring. 3381 // Have to check that we don't read beyond string. 3382 cmpl(tmp, stride-int_cnt2); 3383 jccb(Assembler::greater, ADJUST_STR); 3384 // Fall through if matched whole substring. 3385 } else { // non constant 3386 assert(int_cnt2 == -1, "should be != 0"); 3387 3388 addl(tmp, cnt2); 3389 // Found result if we matched whole substring. 3390 cmpl(tmp, stride); 3391 jcc(Assembler::lessEqual, RET_FOUND); 3392 3393 // Repeat search for small substring (<= 8 chars) 3394 // from new point 'str1' without reloading substring. 3395 cmpl(cnt2, stride); 3396 // Have to check that we don't read beyond string. 3397 jccb(Assembler::lessEqual, ADJUST_STR); 3398 3399 Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG; 3400 // Compare the rest of substring (> 8 chars). 3401 movptr(str1, result); 3402 3403 cmpl(tmp, cnt2); 3404 // First 8 chars are already matched. 3405 jccb(Assembler::equal, CHECK_NEXT); 3406 3407 bind(SCAN_SUBSTR); 3408 pcmpestri(vec, Address(str1, 0), mode); 3409 // Need to reload strings pointers if not matched whole vector 3410 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 3411 3412 bind(CHECK_NEXT); 3413 subl(cnt2, stride); 3414 jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring 3415 addptr(str1, 16); 3416 if (ae == StrIntrinsicNode::UL) { 3417 addptr(str2, 8); 3418 } else { 3419 addptr(str2, 16); 3420 } 3421 subl(cnt1, stride); 3422 cmpl(cnt2, stride); // Do not read beyond substring 3423 jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR); 3424 // Back-up strings to avoid reading beyond substring. 
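// Both pointers are shifted so that the final vector read ends at the last
// substring element; the counters are fixed up to match
// (cnt1 = cnt1 - cnt2 + stride, cnt2 = stride).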
3425 3426 if (ae == StrIntrinsicNode::UL) { 3427 lea(str2, Address(str2, cnt2, scale2, -8)); 3428 lea(str1, Address(str1, cnt2, scale1, -16)); 3429 } else { 3430 lea(str2, Address(str2, cnt2, scale2, -16)); 3431 lea(str1, Address(str1, cnt2, scale1, -16)); 3432 } 3433 subl(cnt1, cnt2); 3434 movl(cnt2, stride); 3435 addl(cnt1, stride); 3436 bind(CONT_SCAN_SUBSTR); 3437 if (ae == StrIntrinsicNode::UL) { 3438 pmovzxbw(vec, Address(str2, 0)); 3439 } else { 3440 movdqu(vec, Address(str2, 0)); 3441 } 3442 jmp(SCAN_SUBSTR); 3443 3444 bind(RET_FOUND_LONG); 3445 movptr(str1, Address(rsp, wordSize)); 3446 } // non constant 3447 3448 bind(RET_FOUND); 3449 // Compute substr offset 3450 subptr(result, str1); 3451 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3452 shrl(result, 1); // index 3453 } 3454 bind(CLEANUP); 3455 pop(rsp); // restore SP 3456 3457 } // string_indexof 3458 3459 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3460 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3461 ShortBranchVerifier sbv(this); 3462 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3463 3464 int stride = 8; 3465 3466 Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP, 3467 SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP, 3468 RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT, 3469 FOUND_SEQ_CHAR, DONE_LABEL; 3470 3471 movptr(result, str1); 3472 if (UseAVX >= 2) { 3473 cmpl(cnt1, stride); 3474 jcc(Assembler::less, SCAN_TO_CHAR); 3475 cmpl(cnt1, 2*stride); 3476 jcc(Assembler::less, SCAN_TO_8_CHAR_INIT); 3477 movdl(vec1, ch); 3478 vpbroadcastw(vec1, vec1, Assembler::AVX_256bit); 3479 vpxor(vec2, vec2); 3480 movl(tmp, cnt1); 3481 andl(tmp, 0xFFFFFFF0); //vector count (in chars) 3482 andl(cnt1,0x0000000F); //tail count (in chars) 3483 3484 bind(SCAN_TO_16_CHAR_LOOP); 3485 vmovdqu(vec3, Address(result, 0)); 3486 vpcmpeqw(vec3, vec3, vec1, 1); 3487 vptest(vec2, vec3); 3488 jcc(Assembler::carryClear, FOUND_CHAR); 3489 addptr(result, 32); 3490 subl(tmp, 2*stride); 3491 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); 3492 jmp(SCAN_TO_8_CHAR); 3493 bind(SCAN_TO_8_CHAR_INIT); 3494 movdl(vec1, ch); 3495 pshuflw(vec1, vec1, 0x00); 3496 pshufd(vec1, vec1, 0); 3497 pxor(vec2, vec2); 3498 } 3499 bind(SCAN_TO_8_CHAR); 3500 cmpl(cnt1, stride); 3501 jcc(Assembler::less, SCAN_TO_CHAR); 3502 if (UseAVX < 2) { 3503 movdl(vec1, ch); 3504 pshuflw(vec1, vec1, 0x00); 3505 pshufd(vec1, vec1, 0); 3506 pxor(vec2, vec2); 3507 } 3508 movl(tmp, cnt1); 3509 andl(tmp, 0xFFFFFFF8); //vector count (in chars) 3510 andl(cnt1,0x00000007); //tail count (in chars) 3511 3512 bind(SCAN_TO_8_CHAR_LOOP); 3513 movdqu(vec3, Address(result, 0)); 3514 pcmpeqw(vec3, vec1); 3515 ptest(vec2, vec3); 3516 jcc(Assembler::carryClear, FOUND_CHAR); 3517 addptr(result, 16); 3518 subl(tmp, stride); 3519 jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP); 3520 bind(SCAN_TO_CHAR); 3521 testl(cnt1, cnt1); 3522 jcc(Assembler::zero, RET_NOT_FOUND); 3523 bind(SCAN_TO_CHAR_LOOP); 3524 load_unsigned_short(tmp, Address(result, 0)); 3525 cmpl(ch, tmp); 3526 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3527 addptr(result, 2); 3528 subl(cnt1, 1); 3529 jccb(Assembler::zero, RET_NOT_FOUND); 3530 jmp(SCAN_TO_CHAR_LOOP); 3531 3532 bind(RET_NOT_FOUND); 3533 movl(result, -1); 3534 jmpb(DONE_LABEL); 3535 3536 bind(FOUND_CHAR); 3537 if (UseAVX >= 2) { 3538 vpmovmskb(tmp, vec3); 3539 } else { 3540 pmovmskb(tmp, vec3); 3541 } 3542 bsfl(ch, tmp); 3543 addptr(result, ch); 3544 3545 bind(FOUND_SEQ_CHAR); 3546 
subptr(result, str1); 3547 shrl(result, 1); 3548 3549 bind(DONE_LABEL); 3550 } // string_indexof_char 3551 3552 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3553 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3554 ShortBranchVerifier sbv(this); 3555 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3556 3557 int stride = 16; 3558 3559 Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP, 3560 SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP, 3561 RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT, 3562 FOUND_SEQ_CHAR, DONE_LABEL; 3563 3564 movptr(result, str1); 3565 if (UseAVX >= 2) { 3566 cmpl(cnt1, stride); 3567 jcc(Assembler::less, SCAN_TO_CHAR_INIT); 3568 cmpl(cnt1, stride*2); 3569 jcc(Assembler::less, SCAN_TO_16_CHAR_INIT); 3570 movdl(vec1, ch); 3571 vpbroadcastb(vec1, vec1, Assembler::AVX_256bit); 3572 vpxor(vec2, vec2); 3573 movl(tmp, cnt1); 3574 andl(tmp, 0xFFFFFFE0); //vector count (in chars) 3575 andl(cnt1,0x0000001F); //tail count (in chars) 3576 3577 bind(SCAN_TO_32_CHAR_LOOP); 3578 vmovdqu(vec3, Address(result, 0)); 3579 vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit); 3580 vptest(vec2, vec3); 3581 jcc(Assembler::carryClear, FOUND_CHAR); 3582 addptr(result, 32); 3583 subl(tmp, stride*2); 3584 jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP); 3585 jmp(SCAN_TO_16_CHAR); 3586 3587 bind(SCAN_TO_16_CHAR_INIT); 3588 movdl(vec1, ch); 3589 pxor(vec2, vec2); 3590 pshufb(vec1, vec2); 3591 } 3592 3593 bind(SCAN_TO_16_CHAR); 3594 cmpl(cnt1, stride); 3595 jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left 3596 if (UseAVX < 2) { 3597 movdl(vec1, ch); 3598 pxor(vec2, vec2); 3599 pshufb(vec1, vec2); 3600 } 3601 movl(tmp, cnt1); 3602 andl(tmp, 0xFFFFFFF0); //vector count (in bytes) 3603 andl(cnt1,0x0000000F); //tail count (in bytes) 3604 3605 bind(SCAN_TO_16_CHAR_LOOP); 3606 movdqu(vec3, Address(result, 0)); 3607 pcmpeqb(vec3, vec1); 3608 ptest(vec2, vec3); 3609 jcc(Assembler::carryClear, FOUND_CHAR); 3610 addptr(result, 16); 3611 subl(tmp, stride); 3612 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items... 
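// Note on the (v)ptest checks above: vec2 is all zeroes, so CF is set only when
// the pcmpeqb result in vec3 is all zero; 'carryClear' therefore means at least
// one byte matched the search char. Fewer than 16 bytes remain past this point,
// so fall through to the scalar byte-at-a-time tail scan.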
3613 3614 bind(SCAN_TO_CHAR_INIT); 3615 testl(cnt1, cnt1); 3616 jcc(Assembler::zero, RET_NOT_FOUND); 3617 bind(SCAN_TO_CHAR_LOOP); 3618 load_unsigned_byte(tmp, Address(result, 0)); 3619 cmpl(ch, tmp); 3620 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3621 addptr(result, 1); 3622 subl(cnt1, 1); 3623 jccb(Assembler::zero, RET_NOT_FOUND); 3624 jmp(SCAN_TO_CHAR_LOOP); 3625 3626 bind(RET_NOT_FOUND); 3627 movl(result, -1); 3628 jmpb(DONE_LABEL); 3629 3630 bind(FOUND_CHAR); 3631 if (UseAVX >= 2) { 3632 vpmovmskb(tmp, vec3); 3633 } else { 3634 pmovmskb(tmp, vec3); 3635 } 3636 bsfl(ch, tmp); 3637 addptr(result, ch); 3638 3639 bind(FOUND_SEQ_CHAR); 3640 subptr(result, str1); 3641 3642 bind(DONE_LABEL); 3643 } // stringL_indexof_char 3644 3645 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) { 3646 switch (eltype) { 3647 case T_BOOLEAN: return sizeof(jboolean); 3648 case T_BYTE: return sizeof(jbyte); 3649 case T_SHORT: return sizeof(jshort); 3650 case T_CHAR: return sizeof(jchar); 3651 case T_INT: return sizeof(jint); 3652 default: 3653 ShouldNotReachHere(); 3654 return -1; 3655 } 3656 } 3657 3658 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) { 3659 switch (eltype) { 3660 // T_BOOLEAN used as surrogate for unsigned byte 3661 case T_BOOLEAN: movzbl(dst, src); break; 3662 case T_BYTE: movsbl(dst, src); break; 3663 case T_SHORT: movswl(dst, src); break; 3664 case T_CHAR: movzwl(dst, src); break; 3665 case T_INT: movl(dst, src); break; 3666 default: 3667 ShouldNotReachHere(); 3668 } 3669 } 3670 3671 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) { 3672 load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8); 3673 } 3674 3675 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) { 3676 load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8); 3677 } 3678 3679 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) { 3680 const int vlen = Assembler::AVX_256bit; 3681 switch (eltype) { 3682 case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3683 case T_BYTE: vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3684 case T_SHORT: vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3685 case T_CHAR: vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3686 case T_INT: 3687 // do nothing 3688 break; 3689 default: 3690 ShouldNotReachHere(); 3691 } 3692 } 3693 3694 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result, 3695 Register index, Register tmp2, Register tmp3, XMMRegister vnext, 3696 XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3, 3697 XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3, 3698 XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3, 3699 BasicType eltype) { 3700 ShortBranchVerifier sbv(this); 3701 assert(UseAVX >= 2, "AVX2 intrinsics are required"); 3702 assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3); 3703 assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3); 3704 3705 Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN, 3706 SHORT_UNROLLED_LOOP_EXIT, 3707 UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME, 3708 UNROLLED_VECTOR_LOOP_BEGIN, 3709 END; 3710 switch (eltype) { 3711 case T_BOOLEAN: 
BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break; 3712 case T_CHAR: BLOCK_COMMENT("arrays_hashcode(char) {"); break; 3713 case T_BYTE: BLOCK_COMMENT("arrays_hashcode(byte) {"); break; 3714 case T_SHORT: BLOCK_COMMENT("arrays_hashcode(short) {"); break; 3715 case T_INT: BLOCK_COMMENT("arrays_hashcode(int) {"); break; 3716 default: BLOCK_COMMENT("arrays_hashcode {"); break; 3717 } 3718 3719 // For "renaming" for readibility of the code 3720 const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 }, 3721 vresult[] = { vresult0, vresult1, vresult2, vresult3 }, 3722 vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 }; 3723 3724 const int elsize = arrays_hashcode_elsize(eltype); 3725 3726 /* 3727 if (cnt1 >= 2) { 3728 if (cnt1 >= 32) { 3729 UNROLLED VECTOR LOOP 3730 } 3731 UNROLLED SCALAR LOOP 3732 } 3733 SINGLE SCALAR 3734 */ 3735 3736 cmpl(cnt1, 32); 3737 jcc(Assembler::less, SHORT_UNROLLED_BEGIN); 3738 3739 // cnt1 >= 32 && generate_vectorized_loop 3740 xorl(index, index); 3741 3742 // vresult = IntVector.zero(I256); 3743 for (int idx = 0; idx < 4; idx++) { 3744 vpxor(vresult[idx], vresult[idx]); 3745 } 3746 // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]); 3747 Register bound = tmp2; 3748 Register next = tmp3; 3749 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint)))); 3750 movl(next, Address(tmp2, 0)); 3751 movdl(vnext, next); 3752 vpbroadcastd(vnext, vnext, Assembler::AVX_256bit); 3753 3754 // index = 0; 3755 // bound = cnt1 & ~(32 - 1); 3756 movl(bound, cnt1); 3757 andl(bound, ~(32 - 1)); 3758 // for (; index < bound; index += 32) { 3759 bind(UNROLLED_VECTOR_LOOP_BEGIN); 3760 // result *= next; 3761 imull(result, next); 3762 // loop fission to upfront the cost of fetching from memory, OOO execution 3763 // can then hopefully do a better job of prefetching 3764 for (int idx = 0; idx < 4; idx++) { 3765 arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype); 3766 } 3767 // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7]; 3768 for (int idx = 0; idx < 4; idx++) { 3769 vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit); 3770 arrays_hashcode_elvcast(vtmp[idx], eltype); 3771 vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit); 3772 } 3773 // index += 32; 3774 addl(index, 32); 3775 // index < bound; 3776 cmpl(index, bound); 3777 jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN); 3778 // } 3779 3780 lea(ary1, Address(ary1, bound, Address::times(elsize))); 3781 subl(cnt1, bound); 3782 // release bound 3783 3784 // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1); 3785 for (int idx = 0; idx < 4; idx++) { 3786 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint)))); 3787 arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT); 3788 vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit); 3789 } 3790 // result += vresult.reduceLanes(ADD); 3791 for (int idx = 0; idx < 4; idx++) { 3792 reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]); 3793 } 3794 3795 // } else if (cnt1 < 32) { 3796 3797 bind(SHORT_UNROLLED_BEGIN); 3798 // int i = 1; 3799 movl(index, 1); 3800 cmpl(index, cnt1); 3801 jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT); 3802 3803 // for (; i < cnt1 ; i += 2) { 3804 bind(SHORT_UNROLLED_LOOP_BEGIN); 3805 movl(tmp3, 961); 3806 imull(result, tmp3); 3807 
arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3808 movl(tmp3, tmp2); 3809 shll(tmp3, 5); 3810 subl(tmp3, tmp2); 3811 addl(result, tmp3); 3812 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype); 3813 addl(result, tmp3); 3814 addl(index, 2); 3815 cmpl(index, cnt1); 3816 jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN); 3817 3818 // } 3819 // if (i >= cnt1) { 3820 bind(SHORT_UNROLLED_LOOP_EXIT); 3821 jccb(Assembler::greater, END); 3822 movl(tmp2, result); 3823 shll(result, 5); 3824 subl(result, tmp2); 3825 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3826 addl(result, tmp3); 3827 // } 3828 bind(END); 3829 3830 BLOCK_COMMENT("} // arrays_hashcode"); 3831 3832 } // arrays_hashcode 3833 3834 // helper function for string_compare 3835 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2, 3836 Address::ScaleFactor scale, Address::ScaleFactor scale1, 3837 Address::ScaleFactor scale2, Register index, int ae) { 3838 if (ae == StrIntrinsicNode::LL) { 3839 load_unsigned_byte(elem1, Address(str1, index, scale, 0)); 3840 load_unsigned_byte(elem2, Address(str2, index, scale, 0)); 3841 } else if (ae == StrIntrinsicNode::UU) { 3842 load_unsigned_short(elem1, Address(str1, index, scale, 0)); 3843 load_unsigned_short(elem2, Address(str2, index, scale, 0)); 3844 } else { 3845 load_unsigned_byte(elem1, Address(str1, index, scale1, 0)); 3846 load_unsigned_short(elem2, Address(str2, index, scale2, 0)); 3847 } 3848 } 3849 3850 // Compare strings, used for char[] and byte[]. 3851 void C2_MacroAssembler::string_compare(Register str1, Register str2, 3852 Register cnt1, Register cnt2, Register result, 3853 XMMRegister vec1, int ae, KRegister mask) { 3854 ShortBranchVerifier sbv(this); 3855 Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL; 3856 Label COMPARE_WIDE_VECTORS_LOOP_FAILED; // used only _LP64 && AVX3 3857 int stride, stride2, adr_stride, adr_stride1, adr_stride2; 3858 int stride2x2 = 0x40; 3859 Address::ScaleFactor scale = Address::no_scale; 3860 Address::ScaleFactor scale1 = Address::no_scale; 3861 Address::ScaleFactor scale2 = Address::no_scale; 3862 3863 if (ae != StrIntrinsicNode::LL) { 3864 stride2x2 = 0x20; 3865 } 3866 3867 if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) { 3868 shrl(cnt2, 1); 3869 } 3870 // Compute the minimum of the string lengths and the 3871 // difference of the string lengths (stack). 3872 // Do the conditional move stuff 3873 movl(result, cnt1); 3874 subl(cnt1, cnt2); 3875 push(cnt1); 3876 cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2) 3877 3878 // Is the minimum length zero? 
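// If it is, the length difference pushed above is the whole answer and
// LENGTH_DIFF_LABEL below simply pops and returns it.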
3879 testl(cnt2, cnt2); 3880 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3881 if (ae == StrIntrinsicNode::LL) { 3882 // Load first bytes 3883 load_unsigned_byte(result, Address(str1, 0)); // result = str1[0] 3884 load_unsigned_byte(cnt1, Address(str2, 0)); // cnt1 = str2[0] 3885 } else if (ae == StrIntrinsicNode::UU) { 3886 // Load first characters 3887 load_unsigned_short(result, Address(str1, 0)); 3888 load_unsigned_short(cnt1, Address(str2, 0)); 3889 } else { 3890 load_unsigned_byte(result, Address(str1, 0)); 3891 load_unsigned_short(cnt1, Address(str2, 0)); 3892 } 3893 subl(result, cnt1); 3894 jcc(Assembler::notZero, POP_LABEL); 3895 3896 if (ae == StrIntrinsicNode::UU) { 3897 // Divide length by 2 to get number of chars 3898 shrl(cnt2, 1); 3899 } 3900 cmpl(cnt2, 1); 3901 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3902 3903 // Check if the strings start at the same location and setup scale and stride 3904 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3905 cmpptr(str1, str2); 3906 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3907 if (ae == StrIntrinsicNode::LL) { 3908 scale = Address::times_1; 3909 stride = 16; 3910 } else { 3911 scale = Address::times_2; 3912 stride = 8; 3913 } 3914 } else { 3915 scale1 = Address::times_1; 3916 scale2 = Address::times_2; 3917 // scale not used 3918 stride = 8; 3919 } 3920 3921 if (UseAVX >= 2 && UseSSE42Intrinsics) { 3922 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR; 3923 Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR; 3924 Label COMPARE_WIDE_VECTORS_LOOP_AVX2; 3925 Label COMPARE_TAIL_LONG; 3926 Label COMPARE_WIDE_VECTORS_LOOP_AVX3; // used only _LP64 && AVX3 3927 3928 int pcmpmask = 0x19; 3929 if (ae == StrIntrinsicNode::LL) { 3930 pcmpmask &= ~0x01; 3931 } 3932 3933 // Setup to compare 16-chars (32-bytes) vectors, 3934 // start from first character again because it has aligned address. 3935 if (ae == StrIntrinsicNode::LL) { 3936 stride2 = 32; 3937 } else { 3938 stride2 = 16; 3939 } 3940 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3941 adr_stride = stride << scale; 3942 } else { 3943 adr_stride1 = 8; //stride << scale1; 3944 adr_stride2 = 16; //stride << scale2; 3945 } 3946 3947 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3948 // rax and rdx are used by pcmpestri as elements counters 3949 movl(result, cnt2); 3950 andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count 3951 jcc(Assembler::zero, COMPARE_TAIL_LONG); 3952 3953 // fast path : compare first 2 8-char vectors. 
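// With pcmpmask = 0x19 (0x18 for LL) pcmpestri does a negated equal-each
// compare: CF is set on a mismatch and rcx receives the index of the first
// differing element, which is why 'below' branches to COMPARE_INDEX_CHAR.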
3954 bind(COMPARE_16_CHARS); 3955 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3956 movdqu(vec1, Address(str1, 0)); 3957 } else { 3958 pmovzxbw(vec1, Address(str1, 0)); 3959 } 3960 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3961 jccb(Assembler::below, COMPARE_INDEX_CHAR); 3962 3963 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3964 movdqu(vec1, Address(str1, adr_stride)); 3965 pcmpestri(vec1, Address(str2, adr_stride), pcmpmask); 3966 } else { 3967 pmovzxbw(vec1, Address(str1, adr_stride1)); 3968 pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask); 3969 } 3970 jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS); 3971 addl(cnt1, stride); 3972 3973 // Compare the characters at index in cnt1 3974 bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character 3975 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 3976 subl(result, cnt2); 3977 jmp(POP_LABEL); 3978 3979 // Setup the registers to start vector comparison loop 3980 bind(COMPARE_WIDE_VECTORS); 3981 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3982 lea(str1, Address(str1, result, scale)); 3983 lea(str2, Address(str2, result, scale)); 3984 } else { 3985 lea(str1, Address(str1, result, scale1)); 3986 lea(str2, Address(str2, result, scale2)); 3987 } 3988 subl(result, stride2); 3989 subl(cnt2, stride2); 3990 jcc(Assembler::zero, COMPARE_WIDE_TAIL); 3991 negptr(result); 3992 3993 // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest) 3994 bind(COMPARE_WIDE_VECTORS_LOOP); 3995 3996 #ifdef _LP64 3997 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 3998 cmpl(cnt2, stride2x2); 3999 jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2); 4000 testl(cnt2, stride2x2-1); // cnt2 holds the vector count 4001 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2); // means we cannot subtract by 0x40 4002 4003 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 4004 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 4005 evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit); 4006 evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 4007 } else { 4008 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit); 4009 evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 4010 } 4011 kortestql(mask, mask); 4012 jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED); // miscompare 4013 addptr(result, stride2x2); // update since we already compared at this addr 4014 subl(cnt2, stride2x2); // and sub the size too 4015 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3); 4016 4017 vpxor(vec1, vec1); 4018 jmpb(COMPARE_WIDE_TAIL); 4019 }//if (VM_Version::supports_avx512vlbw()) 4020 #endif // _LP64 4021 4022 4023 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 4024 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 4025 vmovdqu(vec1, Address(str1, result, scale)); 4026 vpxor(vec1, Address(str2, result, scale)); 4027 } else { 4028 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit); 4029 vpxor(vec1, Address(str2, result, scale2)); 4030 } 4031 vptest(vec1, vec1); 4032 jcc(Assembler::notZero, VECTOR_NOT_EQUAL); 4033 addptr(result, stride2); 4034 subl(cnt2, stride2); 4035 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP); 4036 // clean upper bits of YMM registers 
4037 vpxor(vec1, vec1); 4038 4039 // compare wide vectors tail 4040 bind(COMPARE_WIDE_TAIL); 4041 testptr(result, result); 4042 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 4043 4044 movl(result, stride2); 4045 movl(cnt2, result); 4046 negptr(result); 4047 jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2); 4048 4049 // Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors. 4050 bind(VECTOR_NOT_EQUAL); 4051 // clean upper bits of YMM registers 4052 vpxor(vec1, vec1); 4053 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 4054 lea(str1, Address(str1, result, scale)); 4055 lea(str2, Address(str2, result, scale)); 4056 } else { 4057 lea(str1, Address(str1, result, scale1)); 4058 lea(str2, Address(str2, result, scale2)); 4059 } 4060 jmp(COMPARE_16_CHARS); 4061 4062 // Compare tail chars, length between 1 to 15 chars 4063 bind(COMPARE_TAIL_LONG); 4064 movl(cnt2, result); 4065 cmpl(cnt2, stride); 4066 jcc(Assembler::less, COMPARE_SMALL_STR); 4067 4068 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 4069 movdqu(vec1, Address(str1, 0)); 4070 } else { 4071 pmovzxbw(vec1, Address(str1, 0)); 4072 } 4073 pcmpestri(vec1, Address(str2, 0), pcmpmask); 4074 jcc(Assembler::below, COMPARE_INDEX_CHAR); 4075 subptr(cnt2, stride); 4076 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 4077 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 4078 lea(str1, Address(str1, result, scale)); 4079 lea(str2, Address(str2, result, scale)); 4080 } else { 4081 lea(str1, Address(str1, result, scale1)); 4082 lea(str2, Address(str2, result, scale2)); 4083 } 4084 negptr(cnt2); 4085 jmpb(WHILE_HEAD_LABEL); 4086 4087 bind(COMPARE_SMALL_STR); 4088 } else if (UseSSE42Intrinsics) { 4089 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL; 4090 int pcmpmask = 0x19; 4091 // Setup to compare 8-char (16-byte) vectors, 4092 // start from first character again because it has aligned address. 
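// As in the AVX2 path above, result keeps the full remaining element count
// while cnt2 is rounded down to a multiple of the stride to drive the
// wide-vector loop; COMPARE_TAIL handles counts smaller than one stride.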
4093 movl(result, cnt2); 4094 andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count 4095 if (ae == StrIntrinsicNode::LL) { 4096 pcmpmask &= ~0x01; 4097 } 4098 jcc(Assembler::zero, COMPARE_TAIL); 4099 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 4100 lea(str1, Address(str1, result, scale)); 4101 lea(str2, Address(str2, result, scale)); 4102 } else { 4103 lea(str1, Address(str1, result, scale1)); 4104 lea(str2, Address(str2, result, scale2)); 4105 } 4106 negptr(result); 4107 4108 // pcmpestri 4109 // inputs: 4110 // vec1- substring 4111 // rax - negative string length (elements count) 4112 // mem - scanned string 4113 // rdx - string length (elements count) 4114 // pcmpmask - cmp mode: 11000 (string compare with negated result) 4115 // + 00 (unsigned bytes) or + 01 (unsigned shorts) 4116 // outputs: 4117 // rcx - first mismatched element index 4118 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 4119 4120 bind(COMPARE_WIDE_VECTORS); 4121 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 4122 movdqu(vec1, Address(str1, result, scale)); 4123 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 4124 } else { 4125 pmovzxbw(vec1, Address(str1, result, scale1)); 4126 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 4127 } 4128 // After pcmpestri cnt1(rcx) contains mismatched element index 4129 4130 jccb(Assembler::below, VECTOR_NOT_EQUAL); // CF==1 4131 addptr(result, stride); 4132 subptr(cnt2, stride); 4133 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 4134 4135 // compare wide vectors tail 4136 testptr(result, result); 4137 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 4138 4139 movl(cnt2, stride); 4140 movl(result, stride); 4141 negptr(result); 4142 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 4143 movdqu(vec1, Address(str1, result, scale)); 4144 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 4145 } else { 4146 pmovzxbw(vec1, Address(str1, result, scale1)); 4147 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 4148 } 4149 jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL); 4150 4151 // Mismatched characters in the vectors 4152 bind(VECTOR_NOT_EQUAL); 4153 addptr(cnt1, result); 4154 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 4155 subl(result, cnt2); 4156 jmpb(POP_LABEL); 4157 4158 bind(COMPARE_TAIL); // limit is zero 4159 movl(cnt2, result); 4160 // Fallthru to tail compare 4161 } 4162 // Shift str2 and str1 to the end of the arrays, negate min 4163 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 4164 lea(str1, Address(str1, cnt2, scale)); 4165 lea(str2, Address(str2, cnt2, scale)); 4166 } else { 4167 lea(str1, Address(str1, cnt2, scale1)); 4168 lea(str2, Address(str2, cnt2, scale2)); 4169 } 4170 decrementl(cnt2); // first character was compared already 4171 negptr(cnt2); 4172 4173 // Compare the rest of the elements 4174 bind(WHILE_HEAD_LABEL); 4175 load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae); 4176 subl(result, cnt1); 4177 jccb(Assembler::notZero, POP_LABEL); 4178 increment(cnt2); 4179 jccb(Assembler::notZero, WHILE_HEAD_LABEL); 4180 4181 // Strings are equal up to min length. Return the length difference. 
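// This fall-through is the "return len1 - len2" step of the scalar equivalent
// (illustrative sketch only, not the generated code):
//   int min = Math.min(len1, len2);
//   for (int i = 0; i < min; i++) {
//     if (s1[i] != s2[i]) return s1[i] - s2[i];
//   }
//   return len1 - len2;
// e.g. equal-prefix inputs of lengths 3 and 5 return -2 here.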
4182 bind(LENGTH_DIFF_LABEL); 4183 pop(result); 4184 if (ae == StrIntrinsicNode::UU) { 4185 // Divide diff by 2 to get number of chars 4186 sarl(result, 1); 4187 } 4188 jmpb(DONE_LABEL); 4189 4190 #ifdef _LP64 4191 if (VM_Version::supports_avx512vlbw()) { 4192 4193 bind(COMPARE_WIDE_VECTORS_LOOP_FAILED); 4194 4195 kmovql(cnt1, mask); 4196 notq(cnt1); 4197 bsfq(cnt2, cnt1); 4198 if (ae != StrIntrinsicNode::LL) { 4199 // Divide diff by 2 to get number of chars 4200 sarl(cnt2, 1); 4201 } 4202 addq(result, cnt2); 4203 if (ae == StrIntrinsicNode::LL) { 4204 load_unsigned_byte(cnt1, Address(str2, result)); 4205 load_unsigned_byte(result, Address(str1, result)); 4206 } else if (ae == StrIntrinsicNode::UU) { 4207 load_unsigned_short(cnt1, Address(str2, result, scale)); 4208 load_unsigned_short(result, Address(str1, result, scale)); 4209 } else { 4210 load_unsigned_short(cnt1, Address(str2, result, scale2)); 4211 load_unsigned_byte(result, Address(str1, result, scale1)); 4212 } 4213 subl(result, cnt1); 4214 jmpb(POP_LABEL); 4215 }//if (VM_Version::supports_avx512vlbw()) 4216 #endif // _LP64 4217 4218 // Discard the stored length difference 4219 bind(POP_LABEL); 4220 pop(cnt1); 4221 4222 // That's it 4223 bind(DONE_LABEL); 4224 if(ae == StrIntrinsicNode::UL) { 4225 negl(result); 4226 } 4227 4228 } 4229 4230 // Search for Non-ASCII character (Negative byte value) in a byte array, 4231 // return the index of the first such character, otherwise the length 4232 // of the array segment searched. 4233 // ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java 4234 // @IntrinsicCandidate 4235 // public static int countPositives(byte[] ba, int off, int len) { 4236 // for (int i = off; i < off + len; i++) { 4237 // if (ba[i] < 0) { 4238 // return i - off; 4239 // } 4240 // } 4241 // return len; 4242 // } 4243 void C2_MacroAssembler::count_positives(Register ary1, Register len, 4244 Register result, Register tmp1, 4245 XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) { 4246 // rsi: byte array 4247 // rcx: len 4248 // rax: result 4249 ShortBranchVerifier sbv(this); 4250 assert_different_registers(ary1, len, result, tmp1); 4251 assert_different_registers(vec1, vec2); 4252 Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE; 4253 4254 movl(result, len); // copy 4255 // len == 0 4256 testl(len, len); 4257 jcc(Assembler::zero, DONE); 4258 4259 if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512 4260 VM_Version::supports_avx512vlbw() && 4261 VM_Version::supports_bmi2()) { 4262 4263 Label test_64_loop, test_tail, BREAK_LOOP; 4264 movl(tmp1, len); 4265 vpxor(vec2, vec2, vec2, Assembler::AVX_512bit); 4266 4267 andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F 4268 andl(len, 0xffffffc0); // vector count (in chars) 4269 jccb(Assembler::zero, test_tail); 4270 4271 lea(ary1, Address(ary1, len, Address::times_1)); 4272 negptr(len); 4273 4274 bind(test_64_loop); 4275 // Check whether our 64 elements of size byte contain negatives 4276 evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit); 4277 kortestql(mask1, mask1); 4278 jcc(Assembler::notZero, BREAK_LOOP); 4279 4280 addptr(len, 64); 4281 jccb(Assembler::notZero, test_64_loop); 4282 4283 bind(test_tail); 4284 // bail out when there is nothing to be done 4285 testl(tmp1, -1); 4286 jcc(Assembler::zero, DONE); 4287 4288 4289 // check the tail for absense of negatives 4290 // ~(~0 << len) applied up to two times (for 32-bit scenario) 4291 #ifdef _LP64 4292 { 4293 
Register tmp3_aliased = len;
4294 mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
4295 shlxq(tmp3_aliased, tmp3_aliased, tmp1);
4296 notq(tmp3_aliased);
4297 kmovql(mask2, tmp3_aliased);
4298 }
4299 #else
4300 Label k_init;
4301 jmp(k_init);
4302
4303 // We cannot read 64 bits from a general purpose register here, thus we move
4304 // the data required to compose 64 1's to the instruction stream.
4305 // We emit a 64-byte wide series of elements from 0..63 which is later used
4306 // as compare targets against the tail count contained in the tmp1 register.
4307 // The result is a k register holding tmp1 consecutive 1's, counting from the
4308 // least significant bit.
4309 address tmp = pc();
4310 emit_int64(0x0706050403020100);
4311 emit_int64(0x0F0E0D0C0B0A0908);
4312 emit_int64(0x1716151413121110);
4313 emit_int64(0x1F1E1D1C1B1A1918);
4314 emit_int64(0x2726252423222120);
4315 emit_int64(0x2F2E2D2C2B2A2928);
4316 emit_int64(0x3736353433323130);
4317 emit_int64(0x3F3E3D3C3B3A3938);
4318
4319 bind(k_init);
4320 lea(len, InternalAddress(tmp));
4321 // create mask to test for negative byte inside a vector
4322 evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
4323 evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit);
4324
4325 #endif
4326 evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
4327 ktestq(mask1, mask2);
4328 jcc(Assembler::zero, DONE);
4329
4330 // do a full check for negative bytes in the tail
4331 movl(len, tmp1); // tmp1 holds the low 6 bits of the original len
4332 // ary1 already pointing to the right place
4333 jmpb(TAIL_START);
4334
4335 bind(BREAK_LOOP);
4336 // At least one byte in the last 64-byte block was negative.
4337 // Set up to look at the last 64 bytes as if they were a tail
4338 lea(ary1, Address(ary1, len, Address::times_1));
4339 addptr(result, len);
4340 // Ignore the very last byte: if all others are positive,
4341 // it must be negative, so we can skip right to the 2+1 byte
4342 // end comparison at this point
4343 orl(result, 63);
4344 movl(len, 63);
4345 // Fallthru to tail compare
4346 } else {
4347
4348 if (UseAVX >= 2 && UseSSE >= 2) {
4349 // With AVX2, use 32-byte vector compare
4350 Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4351
4352 // Compare 32-byte vectors
4353 testl(len, 0xffffffe0); // vector count (in bytes)
4354 jccb(Assembler::zero, TAIL_START);
4355
4356 andl(len, 0xffffffe0);
4357 lea(ary1, Address(ary1, len, Address::times_1));
4358 negptr(len);
4359
4360 movl(tmp1, 0x80808080); // create mask to test for negative bytes in the vector
4361 movdl(vec2, tmp1);
4362 vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
4363
4364 bind(COMPARE_WIDE_VECTORS);
4365 vmovdqu(vec1, Address(ary1, len, Address::times_1));
4366 vptest(vec1, vec2);
4367 jccb(Assembler::notZero, BREAK_LOOP);
4368 addptr(len, 32);
4369 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4370
4371 testl(result, 0x0000001f); // any bytes remaining?
4372 jcc(Assembler::zero, DONE);
4373
4374 // Quick test using the already prepared vector mask
4375 movl(len, result);
4376 andl(len, 0x0000001f);
4377 vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
4378 vptest(vec1, vec2);
4379 jcc(Assembler::zero, DONE);
4380 // There are negative bytes; jump to the tail to determine exactly where
4381 jmpb(TAIL_START);
4382
4383 bind(BREAK_LOOP);
4384 // At least one byte in the last 32-byte vector is negative.
4385 // Set up to look at the last 32 bytes as if they were a tail 4386 lea(ary1, Address(ary1, len, Address::times_1)); 4387 addptr(result, len); 4388 // Ignore the very last byte: if all others are positive, 4389 // it must be negative, so we can skip right to the 2+1 byte 4390 // end comparison at this point 4391 orl(result, 31); 4392 movl(len, 31); 4393 // Fallthru to tail compare 4394 } else if (UseSSE42Intrinsics) { 4395 // With SSE4.2, use double quad vector compare 4396 Label COMPARE_WIDE_VECTORS, BREAK_LOOP; 4397 4398 // Compare 16-byte vectors 4399 testl(len, 0xfffffff0); // vector count (in bytes) 4400 jcc(Assembler::zero, TAIL_START); 4401 4402 andl(len, 0xfffffff0); 4403 lea(ary1, Address(ary1, len, Address::times_1)); 4404 negptr(len); 4405 4406 movl(tmp1, 0x80808080); 4407 movdl(vec2, tmp1); 4408 pshufd(vec2, vec2, 0); 4409 4410 bind(COMPARE_WIDE_VECTORS); 4411 movdqu(vec1, Address(ary1, len, Address::times_1)); 4412 ptest(vec1, vec2); 4413 jccb(Assembler::notZero, BREAK_LOOP); 4414 addptr(len, 16); 4415 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 4416 4417 testl(result, 0x0000000f); // len is zero, any bytes remaining? 4418 jcc(Assembler::zero, DONE); 4419 4420 // Quick test using the already prepared vector mask 4421 movl(len, result); 4422 andl(len, 0x0000000f); // tail count (in bytes) 4423 movdqu(vec1, Address(ary1, len, Address::times_1, -16)); 4424 ptest(vec1, vec2); 4425 jcc(Assembler::zero, DONE); 4426 jmpb(TAIL_START); 4427 4428 bind(BREAK_LOOP); 4429 // At least one byte in the last 16-byte vector is negative. 4430 // Set up and look at the last 16 bytes as if they were a tail 4431 lea(ary1, Address(ary1, len, Address::times_1)); 4432 addptr(result, len); 4433 // Ignore the very last byte: if all others are positive, 4434 // it must be negative, so we can skip right to the 2+1 byte 4435 // end comparison at this point 4436 orl(result, 15); 4437 movl(len, 15); 4438 // Fallthru to tail compare 4439 } 4440 } 4441 4442 bind(TAIL_START); 4443 // Compare 4-byte vectors 4444 andl(len, 0xfffffffc); // vector count (in bytes) 4445 jccb(Assembler::zero, COMPARE_CHAR); 4446 4447 lea(ary1, Address(ary1, len, Address::times_1)); 4448 negptr(len); 4449 4450 bind(COMPARE_VECTORS); 4451 movl(tmp1, Address(ary1, len, Address::times_1)); 4452 andl(tmp1, 0x80808080); 4453 jccb(Assembler::notZero, TAIL_ADJUST); 4454 addptr(len, 4); 4455 jccb(Assembler::notZero, COMPARE_VECTORS); 4456 4457 // Compare trailing char (final 2-3 bytes), if any 4458 bind(COMPARE_CHAR); 4459 4460 testl(result, 0x2); // tail char 4461 jccb(Assembler::zero, COMPARE_BYTE); 4462 load_unsigned_short(tmp1, Address(ary1, 0)); 4463 andl(tmp1, 0x00008080); 4464 jccb(Assembler::notZero, CHAR_ADJUST); 4465 lea(ary1, Address(ary1, 2)); 4466 4467 bind(COMPARE_BYTE); 4468 testl(result, 0x1); // tail byte 4469 jccb(Assembler::zero, DONE); 4470 load_unsigned_byte(tmp1, Address(ary1, 0)); 4471 testl(tmp1, 0x00000080); 4472 jccb(Assembler::zero, DONE); 4473 subptr(result, 1); 4474 jmpb(DONE); 4475 4476 bind(TAIL_ADJUST); 4477 // there are negative bits in the last 4 byte block. 4478 // Adjust result and check the next three bytes 4479 addptr(result, len); 4480 orl(result, 3); 4481 lea(ary1, Address(ary1, len, Address::times_1)); 4482 jmpb(COMPARE_CHAR); 4483 4484 bind(CHAR_ADJUST); 4485 // We are looking at a char + optional byte tail, and found that one 4486 // of the bytes in the char is negative. Adjust the result, check the 4487 // first byte and readjust if needed. 
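// tmp1 still holds the sign bits (0x8080 mask) of the two bytes loaded at
// COMPARE_CHAR: if bit 7 is set, the lower-addressed byte is the negative one
// and the masked-down result is already the right index; otherwise the negative
// byte is the second one and result must be bumped by one.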
4488 andl(result, 0xfffffffc); 4489 testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first 4490 jccb(Assembler::notZero, DONE); 4491 addptr(result, 1); 4492 4493 // That's it 4494 bind(DONE); 4495 if (UseAVX >= 2 && UseSSE >= 2) { 4496 // clean upper bits of YMM registers 4497 vpxor(vec1, vec1); 4498 vpxor(vec2, vec2); 4499 } 4500 } 4501 4502 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings. 4503 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2, 4504 Register limit, Register result, Register chr, 4505 XMMRegister vec1, XMMRegister vec2, bool is_char, KRegister mask) { 4506 ShortBranchVerifier sbv(this); 4507 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE; 4508 4509 int length_offset = arrayOopDesc::length_offset_in_bytes(); 4510 int base_offset = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE); 4511 4512 if (is_array_equ) { 4513 // Check the input args 4514 cmpoop(ary1, ary2); 4515 jcc(Assembler::equal, TRUE_LABEL); 4516 4517 // Need additional checks for arrays_equals. 4518 testptr(ary1, ary1); 4519 jcc(Assembler::zero, FALSE_LABEL); 4520 testptr(ary2, ary2); 4521 jcc(Assembler::zero, FALSE_LABEL); 4522 4523 // Check the lengths 4524 movl(limit, Address(ary1, length_offset)); 4525 cmpl(limit, Address(ary2, length_offset)); 4526 jcc(Assembler::notEqual, FALSE_LABEL); 4527 } 4528 4529 // count == 0 4530 testl(limit, limit); 4531 jcc(Assembler::zero, TRUE_LABEL); 4532 4533 if (is_array_equ) { 4534 // Load array address 4535 lea(ary1, Address(ary1, base_offset)); 4536 lea(ary2, Address(ary2, base_offset)); 4537 } 4538 4539 if (is_array_equ && is_char) { 4540 // arrays_equals when used for char[]. 4541 shll(limit, 1); // byte count != 0 4542 } 4543 movl(result, limit); // copy 4544 4545 if (UseAVX >= 2) { 4546 // With AVX2, use 32-byte vector compare 4547 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 4548 4549 // Compare 32-byte vectors 4550 andl(result, 0x0000001f); // tail count (in bytes) 4551 andl(limit, 0xffffffe0); // vector count (in bytes) 4552 jcc(Assembler::zero, COMPARE_TAIL); 4553 4554 lea(ary1, Address(ary1, limit, Address::times_1)); 4555 lea(ary2, Address(ary2, limit, Address::times_1)); 4556 negptr(limit); 4557 4558 #ifdef _LP64 4559 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 4560 Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3; 4561 4562 cmpl(limit, -64); 4563 jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2); 4564 4565 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 4566 4567 evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit); 4568 evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit); 4569 kortestql(mask, mask); 4570 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4571 addptr(limit, 64); // update since we already compared at this addr 4572 cmpl(limit, -64); 4573 jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3); 4574 4575 // At this point we may still need to compare -limit+result bytes. 4576 // We could execute the next two instruction and just continue via non-wide path: 4577 // cmpl(limit, 0); 4578 // jcc(Assembler::equal, COMPARE_TAIL); // true 4579 // But since we stopped at the points ary{1,2}+limit which are 4580 // not farther than 64 bytes from the ends of arrays ary{1,2}+result 4581 // (|limit| <= 32 and result < 32), 4582 // we may just compare the last 64 bytes. 
4583 // 4584 addptr(result, -64); // it is safe, bc we just came from this area 4585 evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit); 4586 evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit); 4587 kortestql(mask, mask); 4588 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4589 4590 jmp(TRUE_LABEL); 4591 4592 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 4593 4594 }//if (VM_Version::supports_avx512vlbw()) 4595 #endif //_LP64 4596 bind(COMPARE_WIDE_VECTORS); 4597 vmovdqu(vec1, Address(ary1, limit, Address::times_1)); 4598 vmovdqu(vec2, Address(ary2, limit, Address::times_1)); 4599 vpxor(vec1, vec2); 4600 4601 vptest(vec1, vec1); 4602 jcc(Assembler::notZero, FALSE_LABEL); 4603 addptr(limit, 32); 4604 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4605 4606 testl(result, result); 4607 jcc(Assembler::zero, TRUE_LABEL); 4608 4609 vmovdqu(vec1, Address(ary1, result, Address::times_1, -32)); 4610 vmovdqu(vec2, Address(ary2, result, Address::times_1, -32)); 4611 vpxor(vec1, vec2); 4612 4613 vptest(vec1, vec1); 4614 jccb(Assembler::notZero, FALSE_LABEL); 4615 jmpb(TRUE_LABEL); 4616 4617 bind(COMPARE_TAIL); // limit is zero 4618 movl(limit, result); 4619 // Fallthru to tail compare 4620 } else if (UseSSE42Intrinsics) { 4621 // With SSE4.2, use double quad vector compare 4622 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 4623 4624 // Compare 16-byte vectors 4625 andl(result, 0x0000000f); // tail count (in bytes) 4626 andl(limit, 0xfffffff0); // vector count (in bytes) 4627 jcc(Assembler::zero, COMPARE_TAIL); 4628 4629 lea(ary1, Address(ary1, limit, Address::times_1)); 4630 lea(ary2, Address(ary2, limit, Address::times_1)); 4631 negptr(limit); 4632 4633 bind(COMPARE_WIDE_VECTORS); 4634 movdqu(vec1, Address(ary1, limit, Address::times_1)); 4635 movdqu(vec2, Address(ary2, limit, Address::times_1)); 4636 pxor(vec1, vec2); 4637 4638 ptest(vec1, vec1); 4639 jcc(Assembler::notZero, FALSE_LABEL); 4640 addptr(limit, 16); 4641 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4642 4643 testl(result, result); 4644 jcc(Assembler::zero, TRUE_LABEL); 4645 4646 movdqu(vec1, Address(ary1, result, Address::times_1, -16)); 4647 movdqu(vec2, Address(ary2, result, Address::times_1, -16)); 4648 pxor(vec1, vec2); 4649 4650 ptest(vec1, vec1); 4651 jccb(Assembler::notZero, FALSE_LABEL); 4652 jmpb(TRUE_LABEL); 4653 4654 bind(COMPARE_TAIL); // limit is zero 4655 movl(limit, result); 4656 // Fallthru to tail compare 4657 } 4658 4659 // Compare 4-byte vectors 4660 andl(limit, 0xfffffffc); // vector count (in bytes) 4661 jccb(Assembler::zero, COMPARE_CHAR); 4662 4663 lea(ary1, Address(ary1, limit, Address::times_1)); 4664 lea(ary2, Address(ary2, limit, Address::times_1)); 4665 negptr(limit); 4666 4667 bind(COMPARE_VECTORS); 4668 movl(chr, Address(ary1, limit, Address::times_1)); 4669 cmpl(chr, Address(ary2, limit, Address::times_1)); 4670 jccb(Assembler::notEqual, FALSE_LABEL); 4671 addptr(limit, 4); 4672 jcc(Assembler::notZero, COMPARE_VECTORS); 4673 4674 // Compare trailing char (final 2 bytes), if any 4675 bind(COMPARE_CHAR); 4676 testl(result, 0x2); // tail char 4677 jccb(Assembler::zero, COMPARE_BYTE); 4678 load_unsigned_short(chr, Address(ary1, 0)); 4679 load_unsigned_short(limit, Address(ary2, 0)); 4680 cmpl(chr, limit); 4681 jccb(Assembler::notEqual, FALSE_LABEL); 4682 4683 if (is_array_equ && is_char) { 4684 bind(COMPARE_BYTE); 4685 } else { 4686 lea(ary1, Address(ary1, 2)); 4687 lea(ary2, Address(ary2, 2)); 4688 4689 bind(COMPARE_BYTE); 4690 testl(result, 0x1); 
// tail byte 4691 jccb(Assembler::zero, TRUE_LABEL); 4692 load_unsigned_byte(chr, Address(ary1, 0)); 4693 load_unsigned_byte(limit, Address(ary2, 0)); 4694 cmpl(chr, limit); 4695 jccb(Assembler::notEqual, FALSE_LABEL); 4696 } 4697 bind(TRUE_LABEL); 4698 movl(result, 1); // return true 4699 jmpb(DONE); 4700 4701 bind(FALSE_LABEL); 4702 xorl(result, result); // return false 4703 4704 // That's it 4705 bind(DONE); 4706 if (UseAVX >= 2) { 4707 // clean upper bits of YMM registers 4708 vpxor(vec1, vec1); 4709 vpxor(vec2, vec2); 4710 } 4711 } 4712 4713 #ifdef _LP64 4714 4715 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) { 4716 #define __ masm. 4717 Register dst = stub.data<0>(); 4718 XMMRegister src = stub.data<1>(); 4719 address target = stub.data<2>(); 4720 __ bind(stub.entry()); 4721 __ subptr(rsp, 8); 4722 __ movdbl(Address(rsp), src); 4723 __ call(RuntimeAddress(target)); 4724 __ pop(dst); 4725 __ jmp(stub.continuation()); 4726 #undef __ 4727 } 4728 4729 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) { 4730 assert(dst_bt == T_INT || dst_bt == T_LONG, ""); 4731 assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, ""); 4732 4733 address slowpath_target; 4734 if (dst_bt == T_INT) { 4735 if (src_bt == T_FLOAT) { 4736 cvttss2sil(dst, src); 4737 cmpl(dst, 0x80000000); 4738 slowpath_target = StubRoutines::x86::f2i_fixup(); 4739 } else { 4740 cvttsd2sil(dst, src); 4741 cmpl(dst, 0x80000000); 4742 slowpath_target = StubRoutines::x86::d2i_fixup(); 4743 } 4744 } else { 4745 if (src_bt == T_FLOAT) { 4746 cvttss2siq(dst, src); 4747 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip())); 4748 slowpath_target = StubRoutines::x86::f2l_fixup(); 4749 } else { 4750 cvttsd2siq(dst, src); 4751 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip())); 4752 slowpath_target = StubRoutines::x86::d2l_fixup(); 4753 } 4754 } 4755 4756 auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, 23, convertF2I_slowpath); 4757 jcc(Assembler::equal, stub->entry()); 4758 bind(stub->continuation()); 4759 } 4760 4761 #endif // _LP64 4762 4763 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4764 XMMRegister src1, int imm8, bool merge, int vlen_enc) { 4765 switch(ideal_opc) { 4766 case Op_LShiftVS: 4767 Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break; 4768 case Op_LShiftVI: 4769 Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break; 4770 case Op_LShiftVL: 4771 Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break; 4772 case Op_RShiftVS: 4773 Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break; 4774 case Op_RShiftVI: 4775 Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break; 4776 case Op_RShiftVL: 4777 Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break; 4778 case Op_URShiftVS: 4779 Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break; 4780 case Op_URShiftVI: 4781 Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break; 4782 case Op_URShiftVL: 4783 Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break; 4784 case Op_RotateRightV: 4785 evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4786 case Op_RotateLeftV: 4787 evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4788 default: 4789 fatal("Unsupported masked operation"); break; 4790 } 4791 } 4792 4793 void 
C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4794 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc, 4795 bool is_varshift) { 4796 switch (ideal_opc) { 4797 case Op_AddVB: 4798 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4799 case Op_AddVS: 4800 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4801 case Op_AddVI: 4802 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4803 case Op_AddVL: 4804 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4805 case Op_AddVF: 4806 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4807 case Op_AddVD: 4808 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4809 case Op_SubVB: 4810 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4811 case Op_SubVS: 4812 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4813 case Op_SubVI: 4814 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4815 case Op_SubVL: 4816 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4817 case Op_SubVF: 4818 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4819 case Op_SubVD: 4820 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4821 case Op_MulVS: 4822 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4823 case Op_MulVI: 4824 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4825 case Op_MulVL: 4826 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4827 case Op_MulVF: 4828 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4829 case Op_MulVD: 4830 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4831 case Op_DivVF: 4832 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4833 case Op_DivVD: 4834 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4835 case Op_SqrtVF: 4836 evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break; 4837 case Op_SqrtVD: 4838 evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break; 4839 case Op_AbsVB: 4840 evpabsb(dst, mask, src2, merge, vlen_enc); break; 4841 case Op_AbsVS: 4842 evpabsw(dst, mask, src2, merge, vlen_enc); break; 4843 case Op_AbsVI: 4844 evpabsd(dst, mask, src2, merge, vlen_enc); break; 4845 case Op_AbsVL: 4846 evpabsq(dst, mask, src2, merge, vlen_enc); break; 4847 case Op_FmaVF: 4848 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4849 case Op_FmaVD: 4850 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4851 case Op_VectorRearrange: 4852 evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break; 4853 case Op_LShiftVS: 4854 evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4855 case Op_LShiftVI: 4856 evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4857 case Op_LShiftVL: 4858 evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4859 case Op_RShiftVS: 4860 evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4861 case Op_RShiftVI: 4862 evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4863 case Op_RShiftVL: 4864 evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4865 case Op_URShiftVS: 4866 evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4867 case Op_URShiftVI: 4868 evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4869 case Op_URShiftVL: 4870 evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4871 case Op_RotateLeftV: 4872 evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4873 case Op_RotateRightV: 4874 evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break; 
4875 case Op_MaxV: 4876 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4877 case Op_MinV: 4878 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4879 case Op_XorV: 4880 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4881 case Op_OrV: 4882 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4883 case Op_AndV: 4884 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4885 default: 4886 fatal("Unsupported masked operation"); break; 4887 } 4888 } 4889 4890 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4891 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4892 switch (ideal_opc) { 4893 case Op_AddVB: 4894 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4895 case Op_AddVS: 4896 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4897 case Op_AddVI: 4898 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4899 case Op_AddVL: 4900 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4901 case Op_AddVF: 4902 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4903 case Op_AddVD: 4904 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4905 case Op_SubVB: 4906 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4907 case Op_SubVS: 4908 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4909 case Op_SubVI: 4910 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4911 case Op_SubVL: 4912 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4913 case Op_SubVF: 4914 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4915 case Op_SubVD: 4916 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4917 case Op_MulVS: 4918 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4919 case Op_MulVI: 4920 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4921 case Op_MulVL: 4922 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4923 case Op_MulVF: 4924 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4925 case Op_MulVD: 4926 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4927 case Op_DivVF: 4928 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4929 case Op_DivVD: 4930 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4931 case Op_FmaVF: 4932 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4933 case Op_FmaVD: 4934 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4935 case Op_MaxV: 4936 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4937 case Op_MinV: 4938 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4939 case Op_XorV: 4940 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4941 case Op_OrV: 4942 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4943 case Op_AndV: 4944 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4945 default: 4946 fatal("Unsupported masked operation"); break; 4947 } 4948 } 4949 4950 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst, 4951 KRegister src1, KRegister src2) { 4952 BasicType etype = T_ILLEGAL; 4953 switch(mask_len) { 4954 case 2: 4955 case 4: 4956 case 8: etype = T_BYTE; break; 4957 case 16: etype = T_SHORT; break; 4958 case 32: etype = T_INT; break; 4959 case 64: etype = T_LONG; break; 4960 default: fatal("Unsupported type"); break; 4961 } 4962 assert(etype != T_ILLEGAL, ""); 4963 switch(ideal_opc) { 4964 case Op_AndVMask: 4965 kand(etype, dst, src1, src2); break; 4966 case Op_OrVMask: 4967 kor(etype, dst, src1, src2); break; 4968 case Op_XorVMask: 
4969 kxor(etype, dst, src1, src2); break; 4970 default: 4971 fatal("Unsupported masked operation"); break; 4972 } 4973 } 4974 4975 /* 4976 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 4977 * If src is NaN, the result is 0. 4978 * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE, 4979 * the result is equal to the value of Integer.MIN_VALUE. 4980 * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE, 4981 * the result is equal to the value of Integer.MAX_VALUE. 4982 */ 4983 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4984 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 4985 Register rscratch, AddressLiteral float_sign_flip, 4986 int vec_enc) { 4987 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4988 Label done; 4989 vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch); 4990 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 4991 vptest(xtmp2, xtmp2, vec_enc); 4992 jccb(Assembler::equal, done); 4993 4994 vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc); 4995 vpxor(xtmp1, xtmp1, xtmp4, vec_enc); 4996 4997 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 4998 vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc); 4999 vblendvps(dst, dst, xtmp4, xtmp3, vec_enc); 5000 5001 // Recompute the mask for remaining special value. 5002 vpxor(xtmp2, xtmp2, xtmp3, vec_enc); 5003 // Extract SRC values corresponding to TRUE mask lanes. 5004 vpand(xtmp4, xtmp2, src, vec_enc); 5005 // Flip mask bits so that MSB bit of MASK lanes corresponding to +ve special 5006 // values are set. 5007 vpxor(xtmp3, xtmp2, xtmp4, vec_enc); 5008 5009 vblendvps(dst, dst, xtmp1, xtmp3, vec_enc); 5010 bind(done); 5011 } 5012 5013 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5014 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 5015 Register rscratch, AddressLiteral float_sign_flip, 5016 int vec_enc) { 5017 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 5018 Label done; 5019 evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 5020 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 5021 kortestwl(ktmp1, ktmp1); 5022 jccb(Assembler::equal, done); 5023 5024 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5025 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 5026 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 5027 5028 kxorwl(ktmp1, ktmp1, ktmp2); 5029 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 5030 vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 5031 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 5032 bind(done); 5033 } 5034 5035 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5036 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 5037 Register rscratch, AddressLiteral double_sign_flip, 5038 int vec_enc) { 5039 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing"); 5040 5041 Label done; 5042 evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch); 5043 Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc); 5044 kortestwl(ktmp1, ktmp1); 5045 jccb(Assembler::equal, done); 5046 5047 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5048 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 5049 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc); 5050 5051 kxorwl(ktmp1, ktmp1, ktmp2); 
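// ktmp1 now selects the special-value lanes that are not NaN; the compare below keeps only those
// whose source lane is non-negative (NLT_UQ), so they can be overwritten with the max value further down.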
5052 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 5053 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 5054 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc); 5055 bind(done); 5056 } 5057 5058 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5059 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 5060 Register rscratch, AddressLiteral float_sign_flip, 5061 int vec_enc) { 5062 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 5063 Label done; 5064 evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 5065 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 5066 kortestwl(ktmp1, ktmp1); 5067 jccb(Assembler::equal, done); 5068 5069 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5070 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 5071 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 5072 5073 kxorwl(ktmp1, ktmp1, ktmp2); 5074 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 5075 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 5076 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 5077 bind(done); 5078 } 5079 5080 /* 5081 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 5082 * If src is NaN, the result is 0. 5083 * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE, 5084 * the result is equal to the value of Long.MIN_VALUE. 5085 * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE, 5086 * the result is equal to the value of Long.MAX_VALUE. 5087 */ 5088 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5089 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 5090 Register rscratch, AddressLiteral double_sign_flip, 5091 int vec_enc) { 5092 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing"); 5093 5094 Label done; 5095 evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch); 5096 evpcmpeqq(ktmp1, xtmp1, dst, vec_enc); 5097 kortestwl(ktmp1, ktmp1); 5098 jccb(Assembler::equal, done); 5099 5100 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5101 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 5102 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc); 5103 5104 kxorwl(ktmp1, ktmp1, ktmp2); 5105 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 5106 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 5107 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc); 5108 bind(done); 5109 } 5110 5111 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero, 5112 XMMRegister xtmp, int index, int vec_enc) { 5113 assert(vec_enc < Assembler::AVX_512bit, ""); 5114 if (vec_enc == Assembler::AVX_256bit) { 5115 vextractf128_high(xtmp, src); 5116 vshufps(dst, src, xtmp, index, vec_enc); 5117 } else { 5118 vshufps(dst, src, zero, index, vec_enc); 5119 } 5120 } 5121 5122 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 5123 XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch, 5124 AddressLiteral float_sign_flip, int src_vec_enc) { 5125 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 5126 5127 Label done; 5128 // Compare the destination lanes with float_sign_flip 5129 // value to get mask for all special values. 
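// (Assumes float_sign_flip holds 0x80000000 in every lane, i.e. the "integer indefinite" value that
// vcvttpd2dq produces for NaN and out-of-range inputs.)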
5130 movdqu(xtmp1, float_sign_flip, rscratch);
5131 vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
5132 ptest(xtmp2, xtmp2);
5133 jccb(Assembler::equal, done);
5134
5135 // Flip float_sign_flip to get max integer value.
5136 vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
5137 pxor(xtmp1, xtmp4);
5138
5139 // Set destination lanes corresponding to unordered source lanes to zero.
5140 vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
5141 vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);
5142
5143 // Shuffle the mask vector and pack the lower double word from each quadword lane.
5144 vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5145 vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);
5146
5147 // Recompute the mask for the remaining special values.
5148 pxor(xtmp2, xtmp3);
5149 // Extract the mask corresponding to non-negative source lanes.
5150 vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);
5151
5152 // Shuffle the mask vector and pack the lower double word from each quadword lane.
5153 vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
5154 pand(xtmp3, xtmp2);
5155
5156 // Replace destination lanes holding the special value (0x80000000) with max int
5157 // if the corresponding source lane holds a +ve value.
5158 vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
5159 bind(done);
5160 }
5161
5162
5163 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
5164 XMMRegister xtmp, Register rscratch, int vec_enc) {
5165 switch(to_elem_bt) {
5166 case T_SHORT:
5167 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
5168 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
5169 vpackusdw(dst, dst, zero, vec_enc);
5170 if (vec_enc == Assembler::AVX_256bit) {
5171 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5172 }
5173 break;
5174 case T_BYTE:
5175 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
5176 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
5177 vpackusdw(dst, dst, zero, vec_enc);
5178 if (vec_enc == Assembler::AVX_256bit) {
5179 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
5180 }
5181 vpackuswb(dst, dst, zero, vec_enc);
5182 break;
5183 default: assert(false, "%s", type2name(to_elem_bt));
5184 }
5185 }
5186
5187 /*
5188 * Algorithm for vector D2L and F2I conversions:-
5189 * a) Perform vector D2L/F2I cast.
5190 * b) Choose the fast path if none of the result vector lanes contains the 0x80000000 value;
5191 * that value signifies that the source value could have been one of the special floating point
5192 * values (NaN, -Inf, Inf, Max, -Min).
5193 * c) Set the destination to zero if the source is a NaN value.
5194 * d) Replace 0x80000000 with MaxInt if the source lane contains a +ve value.
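* e.g. (illustrative) a float lane holding 3.5e9f converts to 0x80000000 in step a); step c) would
* zero it if the source were NaN, and step d) rewrites it to Integer.MAX_VALUE because the source
* lane is positive and ordered.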
5195 */ 5196 5197 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5198 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 5199 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 5200 int to_elem_sz = type2aelembytes(to_elem_bt); 5201 assert(to_elem_sz <= 4, ""); 5202 vcvttps2dq(dst, src, vec_enc); 5203 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc); 5204 if (to_elem_sz < 4) { 5205 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 5206 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc); 5207 } 5208 } 5209 5210 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5211 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip, 5212 Register rscratch, int vec_enc) { 5213 int to_elem_sz = type2aelembytes(to_elem_bt); 5214 assert(to_elem_sz <= 4, ""); 5215 vcvttps2dq(dst, src, vec_enc); 5216 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc); 5217 switch(to_elem_bt) { 5218 case T_INT: 5219 break; 5220 case T_SHORT: 5221 evpmovdw(dst, dst, vec_enc); 5222 break; 5223 case T_BYTE: 5224 evpmovdb(dst, dst, vec_enc); 5225 break; 5226 default: assert(false, "%s", type2name(to_elem_bt)); 5227 } 5228 } 5229 5230 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 5231 KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip, 5232 Register rscratch, int vec_enc) { 5233 evcvttps2qq(dst, src, vec_enc); 5234 vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc); 5235 } 5236 5237 // Handling for downcasting from double to integer or sub-word types on AVX2. 5238 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5239 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, 5240 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 5241 int to_elem_sz = type2aelembytes(to_elem_bt); 5242 assert(to_elem_sz < 8, ""); 5243 vcvttpd2dq(dst, src, vec_enc); 5244 vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch, 5245 float_sign_flip, vec_enc); 5246 if (to_elem_sz < 4) { 5247 // xtmp4 holds all zero lanes. 
5248 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit); 5249 } 5250 } 5251 5252 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, 5253 XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, 5254 KRegister ktmp2, AddressLiteral sign_flip, 5255 Register rscratch, int vec_enc) { 5256 if (VM_Version::supports_avx512dq()) { 5257 evcvttpd2qq(dst, src, vec_enc); 5258 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 5259 switch(to_elem_bt) { 5260 case T_LONG: 5261 break; 5262 case T_INT: 5263 evpmovsqd(dst, dst, vec_enc); 5264 break; 5265 case T_SHORT: 5266 evpmovsqd(dst, dst, vec_enc); 5267 evpmovdw(dst, dst, vec_enc); 5268 break; 5269 case T_BYTE: 5270 evpmovsqd(dst, dst, vec_enc); 5271 evpmovdb(dst, dst, vec_enc); 5272 break; 5273 default: assert(false, "%s", type2name(to_elem_bt)); 5274 } 5275 } else { 5276 assert(type2aelembytes(to_elem_bt) <= 4, ""); 5277 vcvttpd2dq(dst, src, vec_enc); 5278 vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 5279 switch(to_elem_bt) { 5280 case T_INT: 5281 break; 5282 case T_SHORT: 5283 evpmovdw(dst, dst, vec_enc); 5284 break; 5285 case T_BYTE: 5286 evpmovdb(dst, dst, vec_enc); 5287 break; 5288 default: assert(false, "%s", type2name(to_elem_bt)); 5289 } 5290 } 5291 } 5292 5293 #ifdef _LP64 5294 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src, 5295 AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5296 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 5297 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5298 // and re-instantiate original MXCSR.RC mode after that. 5299 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5300 5301 mov64(tmp, julong_cast(0.5L)); 5302 evpbroadcastq(xtmp1, tmp, vec_enc); 5303 vaddpd(xtmp1, src , xtmp1, vec_enc); 5304 evcvtpd2qq(dst, xtmp1, vec_enc); 5305 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 5306 double_sign_flip, vec_enc);; 5307 5308 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5309 } 5310 5311 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src, 5312 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5313 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 5314 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5315 // and re-instantiate original MXCSR.RC mode after that. 
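// For example (illustrative): under round-toward-minus-infinity, 2.5f + 0.5f = 3.0f rounds to 3 and
// -2.5f + 0.5f = -2.0f rounds to -2, matching Math.round's round-half-up behaviour, while
// 0.49f + 0.5f = 0.99f still rounds down to 0.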
5316 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5317 5318 movl(tmp, jint_cast(0.5)); 5319 movq(xtmp1, tmp); 5320 vbroadcastss(xtmp1, xtmp1, vec_enc); 5321 vaddps(xtmp1, src , xtmp1, vec_enc); 5322 vcvtps2dq(dst, xtmp1, vec_enc); 5323 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 5324 float_sign_flip, vec_enc); 5325 5326 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5327 } 5328 5329 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src, 5330 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5331 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) { 5332 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5333 // and re-instantiate original MXCSR.RC mode after that. 5334 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5335 5336 movl(tmp, jint_cast(0.5)); 5337 movq(xtmp1, tmp); 5338 vbroadcastss(xtmp1, xtmp1, vec_enc); 5339 vaddps(xtmp1, src , xtmp1, vec_enc); 5340 vcvtps2dq(dst, xtmp1, vec_enc); 5341 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc); 5342 5343 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5344 } 5345 #endif // _LP64 5346 5347 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 5348 BasicType from_elem_bt, BasicType to_elem_bt) { 5349 switch (from_elem_bt) { 5350 case T_BYTE: 5351 switch (to_elem_bt) { 5352 case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break; 5353 case T_INT: vpmovzxbd(dst, src, vlen_enc); break; 5354 case T_LONG: vpmovzxbq(dst, src, vlen_enc); break; 5355 default: ShouldNotReachHere(); 5356 } 5357 break; 5358 case T_SHORT: 5359 switch (to_elem_bt) { 5360 case T_INT: vpmovzxwd(dst, src, vlen_enc); break; 5361 case T_LONG: vpmovzxwq(dst, src, vlen_enc); break; 5362 default: ShouldNotReachHere(); 5363 } 5364 break; 5365 case T_INT: 5366 assert(to_elem_bt == T_LONG, ""); 5367 vpmovzxdq(dst, src, vlen_enc); 5368 break; 5369 default: 5370 ShouldNotReachHere(); 5371 } 5372 } 5373 5374 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 5375 BasicType from_elem_bt, BasicType to_elem_bt) { 5376 switch (from_elem_bt) { 5377 case T_BYTE: 5378 switch (to_elem_bt) { 5379 case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break; 5380 case T_INT: vpmovsxbd(dst, src, vlen_enc); break; 5381 case T_LONG: vpmovsxbq(dst, src, vlen_enc); break; 5382 default: ShouldNotReachHere(); 5383 } 5384 break; 5385 case T_SHORT: 5386 switch (to_elem_bt) { 5387 case T_INT: vpmovsxwd(dst, src, vlen_enc); break; 5388 case T_LONG: vpmovsxwq(dst, src, vlen_enc); break; 5389 default: ShouldNotReachHere(); 5390 } 5391 break; 5392 case T_INT: 5393 assert(to_elem_bt == T_LONG, ""); 5394 vpmovsxdq(dst, src, vlen_enc); 5395 break; 5396 default: 5397 ShouldNotReachHere(); 5398 } 5399 } 5400 5401 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src, 5402 BasicType dst_bt, BasicType src_bt, int vlen) { 5403 int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen); 5404 assert(vlen_enc != AVX_512bit, ""); 5405 5406 int dst_bt_size = type2aelembytes(dst_bt); 5407 int src_bt_size = type2aelembytes(src_bt); 5408 if (dst_bt_size > src_bt_size) { 5409 switch (dst_bt_size / src_bt_size) { 5410 case 2: vpmovsxbw(dst, src, vlen_enc); break; 5411 case 4: vpmovsxbd(dst, src, 
vlen_enc); break; 5412 case 8: vpmovsxbq(dst, src, vlen_enc); break; 5413 default: ShouldNotReachHere(); 5414 } 5415 } else { 5416 assert(dst_bt_size < src_bt_size, ""); 5417 switch (src_bt_size / dst_bt_size) { 5418 case 2: { 5419 if (vlen_enc == AVX_128bit) { 5420 vpacksswb(dst, src, src, vlen_enc); 5421 } else { 5422 vpacksswb(dst, src, src, vlen_enc); 5423 vpermq(dst, dst, 0x08, vlen_enc); 5424 } 5425 break; 5426 } 5427 case 4: { 5428 if (vlen_enc == AVX_128bit) { 5429 vpackssdw(dst, src, src, vlen_enc); 5430 vpacksswb(dst, dst, dst, vlen_enc); 5431 } else { 5432 vpackssdw(dst, src, src, vlen_enc); 5433 vpermq(dst, dst, 0x08, vlen_enc); 5434 vpacksswb(dst, dst, dst, AVX_128bit); 5435 } 5436 break; 5437 } 5438 case 8: { 5439 if (vlen_enc == AVX_128bit) { 5440 vpshufd(dst, src, 0x08, vlen_enc); 5441 vpackssdw(dst, dst, dst, vlen_enc); 5442 vpacksswb(dst, dst, dst, vlen_enc); 5443 } else { 5444 vpshufd(dst, src, 0x08, vlen_enc); 5445 vpermq(dst, dst, 0x08, vlen_enc); 5446 vpackssdw(dst, dst, dst, AVX_128bit); 5447 vpacksswb(dst, dst, dst, AVX_128bit); 5448 } 5449 break; 5450 } 5451 default: ShouldNotReachHere(); 5452 } 5453 } 5454 } 5455 5456 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3, 5457 bool merge, BasicType bt, int vlen_enc) { 5458 if (bt == T_INT) { 5459 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 5460 } else { 5461 assert(bt == T_LONG, ""); 5462 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5463 } 5464 } 5465 5466 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3, 5467 bool merge, BasicType bt, int vlen_enc) { 5468 if (bt == T_INT) { 5469 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 5470 } else { 5471 assert(bt == T_LONG, ""); 5472 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5473 } 5474 } 5475 5476 #ifdef _LP64 5477 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1, 5478 Register rtmp2, XMMRegister xtmp, int mask_len, 5479 int vec_enc) { 5480 int index = 0; 5481 int vindex = 0; 5482 mov64(rtmp1, 0x0101010101010101L); 5483 pdepq(rtmp1, src, rtmp1); 5484 if (mask_len > 8) { 5485 movq(rtmp2, src); 5486 vpxor(xtmp, xtmp, xtmp, vec_enc); 5487 movq(xtmp, rtmp1); 5488 } 5489 movq(dst, rtmp1); 5490 5491 mask_len -= 8; 5492 while (mask_len > 0) { 5493 assert ((mask_len & 0x7) == 0, "mask must be multiple of 8"); 5494 index++; 5495 if ((index % 2) == 0) { 5496 pxor(xtmp, xtmp); 5497 } 5498 mov64(rtmp1, 0x0101010101010101L); 5499 shrq(rtmp2, 8); 5500 pdepq(rtmp1, rtmp2, rtmp1); 5501 pinsrq(xtmp, rtmp1, index % 2); 5502 vindex = index / 2; 5503 if (vindex) { 5504 // Write entire 16 byte vector when both 64 bit 5505 // lanes are update to save redundant instructions. 
5506 if (index % 2) { 5507 vinsertf128(dst, dst, xtmp, vindex); 5508 } 5509 } else { 5510 vmovdqu(dst, xtmp); 5511 } 5512 mask_len -= 8; 5513 } 5514 } 5515 5516 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) { 5517 switch(opc) { 5518 case Op_VectorMaskTrueCount: 5519 popcntq(dst, tmp); 5520 break; 5521 case Op_VectorMaskLastTrue: 5522 if (VM_Version::supports_lzcnt()) { 5523 lzcntq(tmp, tmp); 5524 movl(dst, 63); 5525 subl(dst, tmp); 5526 } else { 5527 movl(dst, -1); 5528 bsrq(tmp, tmp); 5529 cmov32(Assembler::notZero, dst, tmp); 5530 } 5531 break; 5532 case Op_VectorMaskFirstTrue: 5533 if (VM_Version::supports_bmi1()) { 5534 if (masklen < 32) { 5535 orl(tmp, 1 << masklen); 5536 tzcntl(dst, tmp); 5537 } else if (masklen == 32) { 5538 tzcntl(dst, tmp); 5539 } else { 5540 assert(masklen == 64, ""); 5541 tzcntq(dst, tmp); 5542 } 5543 } else { 5544 if (masklen < 32) { 5545 orl(tmp, 1 << masklen); 5546 bsfl(dst, tmp); 5547 } else { 5548 assert(masklen == 32 || masklen == 64, ""); 5549 movl(dst, masklen); 5550 if (masklen == 32) { 5551 bsfl(tmp, tmp); 5552 } else { 5553 bsfq(tmp, tmp); 5554 } 5555 cmov32(Assembler::notZero, dst, tmp); 5556 } 5557 } 5558 break; 5559 case Op_VectorMaskToLong: 5560 assert(dst == tmp, "Dst and tmp should be the same for toLong operations"); 5561 break; 5562 default: assert(false, "Unhandled mask operation"); 5563 } 5564 } 5565 5566 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp, 5567 int masklen, int masksize, int vec_enc) { 5568 assert(VM_Version::supports_popcnt(), ""); 5569 5570 if(VM_Version::supports_avx512bw()) { 5571 kmovql(tmp, mask); 5572 } else { 5573 assert(masklen <= 16, ""); 5574 kmovwl(tmp, mask); 5575 } 5576 5577 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5578 // operations needs to be clipped. 5579 if (masksize < 16 && opc != Op_VectorMaskFirstTrue) { 5580 andq(tmp, (1 << masklen) - 1); 5581 } 5582 5583 vector_mask_operation_helper(opc, dst, tmp, masklen); 5584 } 5585 5586 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp, 5587 Register tmp, int masklen, BasicType bt, int vec_enc) { 5588 assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) || 5589 (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), ""); 5590 assert(VM_Version::supports_popcnt(), ""); 5591 5592 bool need_clip = false; 5593 switch(bt) { 5594 case T_BOOLEAN: 5595 // While masks of other types contain 0, -1; boolean masks contain lane values of 0, 1 5596 vpxor(xtmp, xtmp, xtmp, vec_enc); 5597 vpsubb(xtmp, xtmp, mask, vec_enc); 5598 vpmovmskb(tmp, xtmp, vec_enc); 5599 need_clip = masklen < 16; 5600 break; 5601 case T_BYTE: 5602 vpmovmskb(tmp, mask, vec_enc); 5603 need_clip = masklen < 16; 5604 break; 5605 case T_SHORT: 5606 vpacksswb(xtmp, mask, mask, vec_enc); 5607 if (masklen >= 16) { 5608 vpermpd(xtmp, xtmp, 8, vec_enc); 5609 } 5610 vpmovmskb(tmp, xtmp, Assembler::AVX_128bit); 5611 need_clip = masklen < 16; 5612 break; 5613 case T_INT: 5614 case T_FLOAT: 5615 vmovmskps(tmp, mask, vec_enc); 5616 need_clip = masklen < 4; 5617 break; 5618 case T_LONG: 5619 case T_DOUBLE: 5620 vmovmskpd(tmp, mask, vec_enc); 5621 need_clip = masklen < 2; 5622 break; 5623 default: assert(false, "Unhandled type, %s", type2name(bt)); 5624 } 5625 5626 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5627 // operations needs to be clipped. 
5628 if (need_clip && opc != Op_VectorMaskFirstTrue) {
5629 // need_clip implies masklen < 32
5630 andq(tmp, (1 << masklen) - 1);
5631 }
5632
5633 vector_mask_operation_helper(opc, dst, tmp, masklen);
5634 }
5635
5636 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
5637 Register rtmp2, int mask_len) {
5638 kmov(rtmp1, src);
5639 andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));
5640 mov64(rtmp2, -1L);
5641 pextq(rtmp2, rtmp2, rtmp1);
5642 kmov(dst, rtmp2);
5643 }
5644
5645 #ifdef _LP64
5646 void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src,
5647 XMMRegister mask, Register rtmp, Register rscratch,
5648 XMMRegister permv, XMMRegister xtmp, BasicType bt,
5649 int vec_enc) {
5650 assert(type2aelembytes(bt) >= 4, "");
5651 assert(opcode == Op_CompressV || opcode == Op_ExpandV, "");
5652 address compress_perm_table = nullptr;
5653 address expand_perm_table = nullptr;
5654 if (type2aelembytes(bt) == 8) {
5655 compress_perm_table = StubRoutines::x86::compress_perm_table64();
5656 expand_perm_table = StubRoutines::x86::expand_perm_table64();
5657 vmovmskpd(rtmp, mask, vec_enc);
5658 } else {
5659 compress_perm_table = StubRoutines::x86::compress_perm_table32();
5660 expand_perm_table = StubRoutines::x86::expand_perm_table32();
5661 vmovmskps(rtmp, mask, vec_enc);
5662 }
5663 shlq(rtmp, 5); // for 32 byte permute row.
5664 if (opcode == Op_CompressV) {
5665 lea(rscratch, ExternalAddress(compress_perm_table));
5666 } else {
5667 lea(rscratch, ExternalAddress(expand_perm_table));
5668 }
5669 addptr(rtmp, rscratch);
5670 vmovdqu(permv, Address(rtmp));
5671 vpermps(dst, permv, src, Assembler::AVX_256bit);
5672 vpxor(xtmp, xtmp, xtmp, vec_enc);
5673 // Blend the result with the zero vector using the permute mask: each column entry
5674 // in a permute table row contains either a valid permute index or a -1 (default)
5675 // value, so the same row can also serve as a blending mask after
5676 // compressing/expanding the source vector lanes.
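// For example (illustrative), a compress of int lanes under mask 0b0101 would use a row such as
// {0, 2, -1, -1, ...}: lanes 0 and 2 are packed to the front, and the -1 columns (sign bit set)
// select lanes from the zero vector in the blend below.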
5677 vblendvps(dst, dst, xtmp, permv, vec_enc, false, permv); 5678 } 5679 #endif 5680 5681 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask, 5682 bool merge, BasicType bt, int vec_enc) { 5683 if (opcode == Op_CompressV) { 5684 switch(bt) { 5685 case T_BYTE: 5686 evpcompressb(dst, mask, src, merge, vec_enc); 5687 break; 5688 case T_CHAR: 5689 case T_SHORT: 5690 evpcompressw(dst, mask, src, merge, vec_enc); 5691 break; 5692 case T_INT: 5693 evpcompressd(dst, mask, src, merge, vec_enc); 5694 break; 5695 case T_FLOAT: 5696 evcompressps(dst, mask, src, merge, vec_enc); 5697 break; 5698 case T_LONG: 5699 evpcompressq(dst, mask, src, merge, vec_enc); 5700 break; 5701 case T_DOUBLE: 5702 evcompresspd(dst, mask, src, merge, vec_enc); 5703 break; 5704 default: 5705 fatal("Unsupported type %s", type2name(bt)); 5706 break; 5707 } 5708 } else { 5709 assert(opcode == Op_ExpandV, ""); 5710 switch(bt) { 5711 case T_BYTE: 5712 evpexpandb(dst, mask, src, merge, vec_enc); 5713 break; 5714 case T_CHAR: 5715 case T_SHORT: 5716 evpexpandw(dst, mask, src, merge, vec_enc); 5717 break; 5718 case T_INT: 5719 evpexpandd(dst, mask, src, merge, vec_enc); 5720 break; 5721 case T_FLOAT: 5722 evexpandps(dst, mask, src, merge, vec_enc); 5723 break; 5724 case T_LONG: 5725 evpexpandq(dst, mask, src, merge, vec_enc); 5726 break; 5727 case T_DOUBLE: 5728 evexpandpd(dst, mask, src, merge, vec_enc); 5729 break; 5730 default: 5731 fatal("Unsupported type %s", type2name(bt)); 5732 break; 5733 } 5734 } 5735 } 5736 #endif 5737 5738 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5739 KRegister ktmp1, int vec_enc) { 5740 if (opcode == Op_SignumVD) { 5741 vsubpd(dst, zero, one, vec_enc); 5742 // if src < 0 ? -1 : 1 5743 evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5744 evblendmpd(dst, ktmp1, one, dst, true, vec_enc); 5745 // if src == NaN, -0.0 or 0.0 return src. 5746 evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5747 evblendmpd(dst, ktmp1, dst, src, true, vec_enc); 5748 } else { 5749 assert(opcode == Op_SignumVF, ""); 5750 vsubps(dst, zero, one, vec_enc); 5751 // if src < 0 ? -1 : 1 5752 evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5753 evblendmps(dst, ktmp1, one, dst, true, vec_enc); 5754 // if src == NaN, -0.0 or 0.0 return src. 5755 evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5756 evblendmps(dst, ktmp1, dst, src, true, vec_enc); 5757 } 5758 } 5759 5760 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5761 XMMRegister xtmp1, int vec_enc) { 5762 if (opcode == Op_SignumVD) { 5763 vsubpd(dst, zero, one, vec_enc); 5764 // if src < 0 ? -1 : 1 5765 vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1); 5766 // if src == NaN, -0.0 or 0.0 return src. 5767 vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc); 5768 vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1); 5769 } else { 5770 assert(opcode == Op_SignumVF, ""); 5771 vsubps(dst, zero, one, vec_enc); 5772 // if src < 0 ? -1 : 1 5773 vblendvps(dst, one, dst, src, vec_enc, true, xtmp1); 5774 // if src == NaN, -0.0 or 0.0 return src. 
5775 vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5776 vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
5777 }
5778 }
5779
5780 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
5781 if (VM_Version::supports_avx512bw()) {
5782 if (mask_len > 32) {
5783 kmovql(dst, src);
5784 } else {
5785 kmovdl(dst, src);
5786 if (mask_len != 32) {
5787 kshiftrdl(dst, dst, 32 - mask_len);
5788 }
5789 }
5790 } else {
5791 assert(mask_len <= 16, "");
5792 kmovwl(dst, src);
5793 if (mask_len != 16) {
5794 kshiftrwl(dst, dst, 16 - mask_len);
5795 }
5796 }
5797 }
5798
5799 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
5800 int lane_size = type2aelembytes(bt);
5801 bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
5802 if ((is_LP64 || lane_size < 8) &&
5803 ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
5804 (is_subword_type(bt) && VM_Version::supports_avx512vlbw()))) {
5805 movptr(rtmp, imm32);
5806 switch(lane_size) {
5807 case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
5808 case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
5809 case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
5810 case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
5811 default : fatal("Unsupported lane size %d", lane_size);
5812 break;
5813 }
5814 } else {
5815 movptr(rtmp, imm32);
5816 LP64_ONLY(movq(dst, rtmp)) NOT_LP64(movdl(dst, rtmp));
5817 switch(lane_size) {
5818 case 1 : vpbroadcastb(dst, dst, vec_enc); break;
5819 case 2 : vpbroadcastw(dst, dst, vec_enc); break;
5820 case 4 : vpbroadcastd(dst, dst, vec_enc); break;
5821 case 8 : vpbroadcastq(dst, dst, vec_enc); break;
5822 default : fatal("Unsupported lane size %d", lane_size);
5823 break;
5824 }
5825 }
5826 }
5827
5828 //
5829 // Following is the lookup table based popcount computation algorithm:-
5830 // Index Bit set count
5831 // [ 0000 -> 0,
5832 // 0001 -> 1,
5833 // 0010 -> 1,
5834 // 0011 -> 2,
5835 // 0100 -> 1,
5836 // 0101 -> 2,
5837 // 0110 -> 2,
5838 // 0111 -> 3,
5839 // 1000 -> 1,
5840 // 1001 -> 2,
5841 // 1010 -> 2,
5842 // 1011 -> 3,
5843 // 1100 -> 2,
5844 // 1101 -> 3,
5845 // 1110 -> 3, 1111 -> 4 ]
5846 // a. Count the number of 1s in the 4 LSB bits of each byte. These bits are used as
5847 // shuffle indices for lookup table access.
5848 // b. Right shift each byte of the vector lane by 4 positions.
5849 // c. Count the number of 1s in the 4 MSB bits of each byte. These bits are used as
5850 // shuffle indices for lookup table access.
5851 // d. Add the bitset count of the upper and lower 4 bits of each byte.
5852 // e. Unpack double words to quad words and compute the sum of absolute differences of the bitset
5853 // counts of all the bytes of a quadword.
5854 // f. Perform step e. for the upper 128bit vector lane.
5855 // g. Pack the bitset count of quadwords back to double word.
5856 // h. Unpacking and packing operations are not needed for a 64bit vector lane.
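//
// A scalar sketch of steps a.-d. (illustrative only; these names are not part of the VM sources):
//   static const uint8_t LUT[16] = { 0,1,1,2, 1,2,2,3, 1,2,2,3, 2,3,3,4 };
//   static inline int popcount_byte(uint8_t b) { return LUT[b & 0x0F] + LUT[b >> 4]; }
// The vector code below performs the same two table lookups per byte with vpshufb and adds the results.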
5857 5858 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5859 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5860 assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), ""); 5861 vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc); 5862 vpsrlw(dst, src, 4, vec_enc); 5863 vpand(dst, dst, xtmp1, vec_enc); 5864 vpand(xtmp1, src, xtmp1, vec_enc); 5865 vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg); 5866 vpshufb(xtmp1, xtmp2, xtmp1, vec_enc); 5867 vpshufb(dst, xtmp2, dst, vec_enc); 5868 vpaddb(dst, dst, xtmp1, vec_enc); 5869 } 5870 5871 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5872 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5873 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5874 // Following code is as per steps e,f,g and h of above algorithm. 5875 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5876 vpunpckhdq(dst, xtmp1, xtmp2, vec_enc); 5877 vpsadbw(dst, dst, xtmp2, vec_enc); 5878 vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc); 5879 vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc); 5880 vpackuswb(dst, xtmp1, dst, vec_enc); 5881 } 5882 5883 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5884 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5885 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5886 // Add the popcount of upper and lower bytes of word. 5887 vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc); 5888 vpsrlw(dst, xtmp1, 8, vec_enc); 5889 vpand(xtmp1, xtmp1, xtmp2, vec_enc); 5890 vpaddw(dst, dst, xtmp1, vec_enc); 5891 } 5892 5893 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5894 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5895 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5896 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5897 vpsadbw(dst, xtmp1, xtmp2, vec_enc); 5898 } 5899 5900 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5901 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5902 switch(bt) { 5903 case T_LONG: 5904 vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5905 break; 5906 case T_INT: 5907 vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5908 break; 5909 case T_CHAR: 5910 case T_SHORT: 5911 vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5912 break; 5913 case T_BYTE: 5914 case T_BOOLEAN: 5915 vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5916 break; 5917 default: 5918 fatal("Unsupported type %s", type2name(bt)); 5919 break; 5920 } 5921 } 5922 5923 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src, 5924 KRegister mask, bool merge, int vec_enc) { 5925 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 5926 switch(bt) { 5927 case T_LONG: 5928 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 5929 evpopcntq(dst, mask, src, merge, vec_enc); 5930 break; 5931 case T_INT: 5932 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 5933 evpopcntd(dst, mask, src, merge, vec_enc); 5934 break; 5935 case T_CHAR: 5936 case T_SHORT: 5937 assert(VM_Version::supports_avx512_bitalg(), ""); 5938 evpopcntw(dst, mask, src, merge, vec_enc); 5939 break; 5940 case T_BYTE: 5941 case T_BOOLEAN: 5942 assert(VM_Version::supports_avx512_bitalg(), ""); 5943 evpopcntb(dst, mask, 
src, merge, vec_enc); 5944 break; 5945 default: 5946 fatal("Unsupported type %s", type2name(bt)); 5947 break; 5948 } 5949 } 5950 5951 #ifndef _LP64 5952 void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) { 5953 assert(VM_Version::supports_avx512bw(), ""); 5954 kmovdl(tmp, src); 5955 kunpckdql(dst, tmp, tmp); 5956 } 5957 #endif 5958 5959 // Bit reversal algorithm first reverses the bits of each byte followed by 5960 // a byte level reversal for multi-byte primitive types (short/int/long). 5961 // Algorithm performs a lookup table access to get reverse bit sequence 5962 // corresponding to a 4 bit value. Thus a reverse bit sequence for a byte 5963 // is obtained by swapping the reverse bit sequences of upper and lower 5964 // nibble of a byte. 5965 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5966 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5967 if (VM_Version::supports_avx512vlbw()) { 5968 5969 // Get the reverse bit sequence of lower nibble of each byte. 5970 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg); 5971 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 5972 evpandq(dst, xtmp2, src, vec_enc); 5973 vpshufb(dst, xtmp1, dst, vec_enc); 5974 vpsllq(dst, dst, 4, vec_enc); 5975 5976 // Get the reverse bit sequence of upper nibble of each byte. 5977 vpandn(xtmp2, xtmp2, src, vec_enc); 5978 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 5979 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 5980 5981 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 5982 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 5983 evporq(xtmp2, dst, xtmp2, vec_enc); 5984 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 5985 5986 } else if(vec_enc == Assembler::AVX_512bit) { 5987 // Shift based bit reversal. 5988 assert(bt == T_LONG || bt == T_INT, ""); 5989 5990 // Swap lower and upper nibble of each byte. 5991 vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc); 5992 5993 // Swap two least and most significant bits of each nibble. 5994 vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc); 5995 5996 // Swap adjacent pair of bits. 5997 evmovdqul(xtmp1, k0, dst, true, vec_enc); 5998 vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc); 5999 6000 evmovdqul(xtmp1, k0, dst, true, vec_enc); 6001 vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc); 6002 } else { 6003 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp); 6004 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 6005 6006 // Get the reverse bit sequence of lower nibble of each byte. 6007 vpand(dst, xtmp2, src, vec_enc); 6008 vpshufb(dst, xtmp1, dst, vec_enc); 6009 vpsllq(dst, dst, 4, vec_enc); 6010 6011 // Get the reverse bit sequence of upper nibble of each byte. 6012 vpandn(xtmp2, xtmp2, src, vec_enc); 6013 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 6014 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 6015 6016 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 6017 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 
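// For example (illustrative): for the source byte 0b11010010, the lower nibble 0010 reverses to 0100
// and is shifted into the upper half (0100'0000), the upper nibble 1101 reverses to 1011 in the lower
// half, and the OR below yields 0b01001011, the bit-reversed byte.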
6018 vpor(xtmp2, dst, xtmp2, vec_enc); 6019 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 6020 } 6021 } 6022 6023 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc, 6024 XMMRegister xtmp, Register rscratch) { 6025 assert(VM_Version::supports_gfni(), ""); 6026 assert(rscratch != noreg || always_reachable(mask), "missing"); 6027 6028 // Galois field instruction based bit reversal based on following algorithm. 6029 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 6030 vpbroadcastq(xtmp, mask, vec_enc, rscratch); 6031 vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc); 6032 vector_reverse_byte(bt, dst, xtmp, vec_enc); 6033 } 6034 6035 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src, 6036 XMMRegister xtmp1, Register rtmp, int vec_enc) { 6037 vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc); 6038 evpandq(dst, xtmp1, src, vec_enc); 6039 vpsllq(dst, dst, nbits, vec_enc); 6040 vpandn(xtmp1, xtmp1, src, vec_enc); 6041 vpsrlq(xtmp1, xtmp1, nbits, vec_enc); 6042 evporq(dst, dst, xtmp1, vec_enc); 6043 } 6044 6045 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6046 XMMRegister xtmp2, Register rtmp, int vec_enc) { 6047 // Shift based bit reversal. 6048 assert(VM_Version::supports_evex(), ""); 6049 switch(bt) { 6050 case T_LONG: 6051 // Swap upper and lower double word of each quad word. 6052 evprorq(xtmp1, k0, src, 32, true, vec_enc); 6053 evprord(xtmp1, k0, xtmp1, 16, true, vec_enc); 6054 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 6055 break; 6056 case T_INT: 6057 // Swap upper and lower word of each double word. 6058 evprord(xtmp1, k0, src, 16, true, vec_enc); 6059 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 6060 break; 6061 case T_CHAR: 6062 case T_SHORT: 6063 // Swap upper and lower byte of each word. 6064 vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc); 6065 break; 6066 case T_BYTE: 6067 evmovdquq(dst, k0, src, true, vec_enc); 6068 break; 6069 default: 6070 fatal("Unsupported type %s", type2name(bt)); 6071 break; 6072 } 6073 } 6074 6075 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) { 6076 if (bt == T_BYTE) { 6077 if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) { 6078 evmovdquq(dst, k0, src, true, vec_enc); 6079 } else { 6080 vmovdqu(dst, src); 6081 } 6082 return; 6083 } 6084 // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using 6085 // pre-computed shuffle indices. 
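// For T_INT, for instance, the permutation is expected to map each 4-byte group {b0,b1,b2,b3} to
// {b3,b2,b1,b0}; the exact shuffle masks live in the StubRoutines tables loaded below.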
6086 switch(bt) { 6087 case T_LONG: 6088 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg); 6089 break; 6090 case T_INT: 6091 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg); 6092 break; 6093 case T_CHAR: 6094 case T_SHORT: 6095 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg); 6096 break; 6097 default: 6098 fatal("Unsupported type %s", type2name(bt)); 6099 break; 6100 } 6101 vpshufb(dst, src, dst, vec_enc); 6102 } 6103 6104 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src, 6105 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 6106 KRegister ktmp, Register rtmp, bool merge, int vec_enc) { 6107 assert(is_integral_type(bt), ""); 6108 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 6109 assert(VM_Version::supports_avx512cd(), ""); 6110 switch(bt) { 6111 case T_LONG: 6112 evplzcntq(dst, ktmp, src, merge, vec_enc); 6113 break; 6114 case T_INT: 6115 evplzcntd(dst, ktmp, src, merge, vec_enc); 6116 break; 6117 case T_SHORT: 6118 vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc); 6119 vpunpcklwd(xtmp2, xtmp1, src, vec_enc); 6120 evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc); 6121 vpunpckhwd(dst, xtmp1, src, vec_enc); 6122 evplzcntd(dst, ktmp, dst, merge, vec_enc); 6123 vpackusdw(dst, xtmp2, dst, vec_enc); 6124 break; 6125 case T_BYTE: 6126 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 6127 // accessing the lookup table. 6128 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 6129 // accessing the lookup table. 6130 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 6131 assert(VM_Version::supports_avx512bw(), ""); 6132 evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp); 6133 vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc); 6134 vpand(xtmp2, dst, src, vec_enc); 6135 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 6136 vpsrlw(xtmp3, src, 4, vec_enc); 6137 vpand(xtmp3, dst, xtmp3, vec_enc); 6138 vpshufb(dst, xtmp1, xtmp3, vec_enc); 6139 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 6140 evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc); 6141 evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc); 6142 break; 6143 default: 6144 fatal("Unsupported type %s", type2name(bt)); 6145 break; 6146 } 6147 } 6148 6149 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6150 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6151 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp); 6152 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 6153 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 6154 // accessing the lookup table. 6155 vpand(dst, xtmp2, src, vec_enc); 6156 vpshufb(dst, xtmp1, dst, vec_enc); 6157 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 6158 // accessing the lookup table. 6159 vpsrlw(xtmp3, src, 4, vec_enc); 6160 vpand(xtmp3, xtmp2, xtmp3, vec_enc); 6161 vpshufb(xtmp2, xtmp1, xtmp3, vec_enc); 6162 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 
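// For example (illustrative): for the byte 0b00010110 the 4 MSB bits are 0001, so the result is their
// leading zero count of 3; for 0b00000110 the 4 MSB bits are all zero, so their count of 4 is added to
// the 1 leading zero of the 4 LSB bits, giving 5.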
  vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
  vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
  vpaddb(dst, dst, xtmp2, vec_enc);
  vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
}

void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
  // Add zero counts of lower byte and upper byte of a word if
  // upper byte holds a zero value.
  vpsrlw(xtmp3, src, 8, vec_enc);
  // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
  vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
  vpsllw(xtmp2, dst, 8, vec_enc);
  vpaddw(xtmp2, xtmp2, dst, vec_enc);
  vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
  vpsrlw(dst, dst, 8, vec_enc);
}

void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                           XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
  // The IEEE 754 floating point format represents the mantissa in normalized 1.x form,
  // so the biased exponent can be used to compute the leading zero count as per the
  // following formula:
  // LZCNT = 32 - (biased_exp - 127)
  // Special handling has been introduced for zero, Max_Int and negative source values.

  // Broadcast 0xFF
  vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);
  vpsrld(xtmp1, xtmp1, 24, vec_enc);

  // Extract biased exponent.
  vcvtdq2ps(dst, src, vec_enc);
  vpsrld(dst, dst, 23, vec_enc);
  vpand(dst, dst, xtmp1, vec_enc);

  // Broadcast 127.
  vpsrld(xtmp1, xtmp1, 1, vec_enc);
  // Exponent = biased_exp - 127
  vpsubd(dst, dst, xtmp1, vec_enc);

  // Exponent = Exponent + 1
  vpsrld(xtmp3, xtmp1, 6, vec_enc);
  vpaddd(dst, dst, xtmp3, vec_enc);

  // Replace negative exponent with zero; the exponent is negative when the src
  // lane contains a zero value.
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  vblendvps(dst, dst, xtmp2, dst, vec_enc);

  // Rematerialize broadcast 32.
  vpslld(xtmp1, xtmp3, 5, vec_enc);
  // Exponent is 32 if corresponding source lane contains max_int value.
  vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
  // LZCNT = 32 - exponent
  vpsubd(dst, xtmp1, dst, vec_enc);

  // Replace LZCNT with 1 if the corresponding source lane contains the
  // max_int value.
  vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc);

  // Replace LZCNT with 0 if source lane value is less than zero.
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  vblendvps(dst, dst, xtmp2, src, vec_enc);
}

void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                            XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
  // Add zero counts of lower word and upper word of a double word if
  // upper word holds a zero value.
  vpsrld(xtmp3, src, 16, vec_enc);
  // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
  vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc);
  vpslld(xtmp2, dst, 16, vec_enc);
  vpaddd(xtmp2, xtmp2, dst, vec_enc);
  vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
  vpsrld(dst, dst, 16, vec_enc);
  // Add zero counts of lower doubleword and upper doubleword of a
  // quadword if upper doubleword holds a zero value.
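  // Scalar sketch (illustration only) of the combining step used here and in
  // vector_count_leading_zeros_short_avx above: a CLZ over n bit halves is
  // widened to 2n bits via
  //   clz2n(x) = (hi == 0) ? n + clzn(lo) : clzn(hi);
  // where hi/lo are the upper/lower n bit halves of x.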
  vpsrlq(xtmp3, src, 32, vec_enc);
  vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc);
  vpsllq(xtmp2, dst, 32, vec_enc);
  vpaddq(xtmp2, xtmp2, dst, vec_enc);
  vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
  vpsrlq(dst, dst, 32, vec_enc);
}

void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
                                                       XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
                                                       Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "unexpected type");
  assert(vec_enc < Assembler::AVX_512bit, "");
  switch(bt) {
    case T_LONG:
      vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
      break;
    case T_INT:
      vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
      break;
    case T_SHORT:
      vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
      break;
    case T_BYTE:
      vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
}

void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
  switch(bt) {
    case T_BYTE:
      vpsubb(dst, src1, src2, vec_enc);
      break;
    case T_SHORT:
      vpsubw(dst, src1, src2, vec_enc);
      break;
    case T_INT:
      vpsubd(dst, src1, src2, vec_enc);
      break;
    case T_LONG:
      vpsubq(dst, src1, src2, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
}

// Trailing zero count computation is based on the leading zero count operation as per
// the following equation. All AVX3 targets support the AVX512CD feature, which offers
// a direct vector instruction to compute the leading zero count.
// CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
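// For illustration (not part of the generated code): with x = 0b...01000 we get
// (x - 1) & ~x = 0b...00111, so CLZ((x - 1) & ~x) = PRIM_TYPE_WIDTH - 3 and CTZ(x) = 3.
// For x == 0, (x - 1) & ~x is all ones, CLZ of that is 0, and CTZ(x) = PRIM_TYPE_WIDTH.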
void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
                                                         XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "");
  // xtmp4 = -1
  vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
  // xtmp4 = xtmp4 + src
  vpadd(bt, xtmp4, xtmp4, src, vec_enc);
  // xtmp4 = xtmp4 & ~src
  vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
  vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
  vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
  vpsub(bt, dst, xtmp4, dst, vec_enc);
}

// Trailing zero count computation for AVX2 targets is based on the popcount operation as per the following equation:
// CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                        XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "");
  // xtmp3 = 0
  vpxor(xtmp3, xtmp3, xtmp3, vec_enc);
  // xtmp3 = 0 - src
  vpsub(bt, xtmp3, xtmp3, src, vec_enc);
  // xtmp3 = xtmp3 | src
  vpor(xtmp3, xtmp3, src, vec_enc);
  vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
  vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
  vpsub(bt, dst, xtmp1, dst, vec_enc);
}

void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
  movl(rdx, rax);
  subl(rdx, divisor);
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rdx, rax);
  } else {
    notl(rdx);
    andl(rax, rdx);
  }
  shrl(rax, 31);
  bind(done);
}

void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath when divisor < 0:
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
  movl(rdx, rax);
  subl(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rax, rdx);
  } else {
    notl(rax);
    andl(rax, rdx);
  }
  sarl(rax, 31);
  andl(rax, divisor);
  subl(rdx, rax);
  bind(done);
}

void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
  Label done;
  Label neg_divisor_fastpath;

  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
  // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
  movl(rdx, rax);
  subl(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rax, rdx);
  } else {
    notl(rax);
    andl(rax, rdx);
  }
  movl(tmp, rax);
  shrl(rax, 31); // quotient
  sarl(tmp, 31);
  andl(tmp, divisor);
  subl(rdx, tmp); // remainder
  bind(done);
}

#ifdef _LP64
void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
                                 XMMRegister xtmp2, Register rtmp) {
  if (VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal; see the algorithm described at:
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
    mov64(rtmp, 0x8040201008040201L);
    movq(xtmp1, src);
    movq(xtmp2, rtmp);
    gf2p8affineqb(xtmp1, xtmp2, 0);
    movq(dst, xtmp1);
  } else {
    // Swap even and odd numbered bits.
    movl(rtmp, src);
    andl(rtmp, 0x55555555);
    shll(rtmp, 1);
    movl(dst, src);
    andl(dst, 0xAAAAAAAA);
    shrl(dst, 1);
    orl(dst, rtmp);

    // Swap LSB and MSB 2 bits of each nibble.
    movl(rtmp, dst);
    andl(rtmp, 0x33333333);
    shll(rtmp, 2);
    andl(dst, 0xCCCCCCCC);
    shrl(dst, 2);
    orl(dst, rtmp);

    // Swap LSB and MSB 4 bits of each byte.
    movl(rtmp, dst);
    andl(rtmp, 0x0F0F0F0F);
    shll(rtmp, 4);
    andl(dst, 0xF0F0F0F0);
    shrl(dst, 4);
    orl(dst, rtmp);
  }
  bswapl(dst);
}

void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
                                 XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
  if (VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal; see the algorithm described at:
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
    mov64(rtmp1, 0x8040201008040201L);
    movq(xtmp1, src);
    movq(xtmp2, rtmp1);
    gf2p8affineqb(xtmp1, xtmp2, 0);
    movq(dst, xtmp1);
  } else {
    // Swap even and odd numbered bits.
    movq(rtmp1, src);
    mov64(rtmp2, 0x5555555555555555L);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 1);
    movq(dst, src);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 1);
    orq(dst, rtmp1);

    // Swap LSB and MSB 2 bits of each nibble.
    movq(rtmp1, dst);
    mov64(rtmp2, 0x3333333333333333L);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 2);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 2);
    orq(dst, rtmp1);

    // Swap LSB and MSB 4 bits of each byte.
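    // Scalar sketch (illustration only) of the three swap steps performed in this
    // branch; together with the final bswapq they give a full 64 bit bit reversal:
    //   x = ((x & 0x5555555555555555) << 1) | ((x >> 1) & 0x5555555555555555);
    //   x = ((x & 0x3333333333333333) << 2) | ((x >> 2) & 0x3333333333333333);
    //   x = ((x & 0x0F0F0F0F0F0F0F0F) << 4) | ((x >> 4) & 0x0F0F0F0F0F0F0F0F);
    //   x = byte_swap(x);   // hypothetical scalar equivalent of the bswapq below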
    movq(rtmp1, dst);
    mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 4);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 4);
    orq(dst, rtmp1);
  }
  bswapq(dst);
}

void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divq(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
  movq(rdx, rax);
  subq(rdx, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rdx, rax);
  } else {
    notq(rdx);
    andq(rax, rdx);
  }
  shrq(rax, 63);
  bind(done);
}

void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorq(rdx, rdx);
  divq(divisor);
  jmp(done);
  bind(neg_divisor_fastpath);
  // Fastpath when divisor < 0:
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
  movq(rdx, rax);
  subq(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rax, rdx);
  } else {
    notq(rax);
    andq(rax, rdx);
  }
  sarq(rax, 63);
  andq(rax, divisor);
  subq(rdx, rax);
  bind(done);
}

void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorq(rdx, rdx);
  divq(divisor);
  jmp(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
  // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
  movq(rdx, rax);
  subq(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rax, rdx);
  } else {
    notq(rax);
    andq(rax, rdx);
  }
  movq(tmp, rax);
  shrq(rax, 63); // quotient
  sarq(tmp, 63);
  andq(tmp, divisor);
  subq(rdx, tmp); // remainder
  bind(done);
}
#endif

void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
                                        XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
                                        int vlen_enc) {
  assert(VM_Version::supports_avx512bw(), "");
  // Byte shuffles are in-lane operations and indices are determined using the
  // lower 4 bits of each shuffle lane, thus all shuffle indices are
  // normalized to the index range 0-15. This ensures that shuffle indices which
  // are multiples of 16 (e.g. 16, 32 and 48) map to the same relative position
  // within their respective 128 bit lanes.
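  // Scalar sketch (illustration only) of the operation assembled below, for a
  // 512 bit vector of 64 bytes:
  //   for (int i = 0; i < 64; i++) {
  //     dst[i] = src[shuffle[i]];   // shuffle indices assumed to be in the range 0..63
  //   }
  // Each of the four rounds below handles the indices that select one 128 bit
  // source lane, by broadcasting that lane and merging the shuffle result under a mask.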
  movl(rtmp, 16);
  evpbroadcastb(xtmp1, rtmp, vlen_enc);

  // Compute a mask for the shuffle vector by comparing indices with the expression INDEX < 16.
  // Broadcast the first 128 bit lane across the entire vector, shuffle the vector lanes using
  // the original shuffle indices and move the shuffled lanes corresponding to true
  // mask bits to the destination vector.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
  evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);

  // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
  // and broadcasting the second 128 bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
  vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
  // and broadcasting the third 128 bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp2, Assembler::nlt, true, vlen_enc);
  vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
  // and broadcasting the fourth 128 bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
  vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
}

void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
                                                   XMMRegister shuffle, XMMRegister src, int vlen_enc) {
  if (vlen_enc == AVX_128bit) {
    vpermilps(dst, src, shuffle, vlen_enc);
  } else if (bt == T_INT) {
    vpermd(dst, shuffle, src, vlen_enc);
  } else {
    assert(bt == T_FLOAT, "");
    vpermps(dst, shuffle, src, vlen_enc);
  }
}
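// Scalar sketch (illustration only) of the divisor < 0 fastpath used by the
// udiv*/umod*/udivmod* helpers above. When the divisor, viewed as unsigned, has
// its sign bit set, the unsigned quotient can only be 0 or 1, so conceptually:
//   q = (dividend_unsigned >= divisor_unsigned) ? 1 : 0;
//   r = dividend - (q ? divisor : 0);
// which the generated code evaluates branch-free as
//   q = (dividend & ~(dividend - divisor)) >>> (WIDTH - 1)
// where WIDTH is Integer.SIZE or Long.SIZE (see Hacker's Delight, 2nd ed, section 9.3).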