/*
 * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "oops/methodData.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/output.hpp"
#include "opto/opcodes.hpp"
#include "opto/subnode.hpp"
#include "runtime/globals.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/checkedCast.hpp"
#include "utilities/globalDefinitions.hpp"
#include "utilities/powerOfTwo.hpp"
#include "utilities/sizes.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

// C2 compiled method's prolog code.
void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {

  // WARNING: Initial instruction MUST be 5 bytes or longer so that
  // NativeJump::patch_verified_entry will be able to patch out the entry
  // code safely. The push to verify stack depth is ok at 5 bytes,
  // the frame allocation can be either 3 or 6 bytes. So if we don't do
  // stack bang then we must use the 6 byte frame allocation even if
  // we have no frame. :-(
  assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");

  assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  // Remove word for return addr
  framesize -= wordSize;
  stack_bang_size -= wordSize;

  // Calls to C2R adapters often do not accept exceptional returns.
  // We require that their callers must bang for them. But be careful, because
  // some VM calls (such as call site linkage) can use several kilobytes of
  // stack. But the stack safety zone should account for that.
  // See bugs 4446381, 4468289, 4497237.
  if (stack_bang_size > 0) {
    generate_stack_overflow_check(stack_bang_size);

    // We always push rbp so that on return to the interpreter rbp will be
    // restored correctly and we can correct the stack.
    push(rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      mov(rbp, rsp);
    }
    // Remove word for ebp
    framesize -= wordSize;

    // Create frame
    if (framesize) {
      subptr(rsp, framesize);
    }
  } else {
    // Create frame (force generation of a 4 byte immediate value)
    subptr_imm32(rsp, framesize);

    // Save RBP register now.
    framesize -= wordSize;
    movptr(Address(rsp, framesize), rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      movptr(rbp, rsp);
      if (framesize > 0) {
        addptr(rbp, framesize);
      }
    }
  }

  if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
    framesize -= wordSize;
    movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
  }

#ifndef _LP64
  // If method sets FPU control word do it now
  if (fp_mode_24b) {
    fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24()));
  }
  if (UseSSE >= 2 && VerifyFPU) {
    verify_FPU(0, "FPU stack must be clean on entry");
  }
#endif

#ifdef ASSERT
  if (VerifyStackAtCalls) {
    Label L;
    push(rax);
    mov(rax, rsp);
    andptr(rax, StackAlignmentInBytes-1);
    cmpptr(rax, StackAlignmentInBytes-wordSize);
    pop(rax);
    jcc(Assembler::equal, L);
    STOP("Stack is not properly aligned!");
    bind(L);
  }
#endif

  if (!is_stub) {
    BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
#ifdef _LP64
    if (BarrierSet::barrier_set()->barrier_set_nmethod() != nullptr) {
      // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
      Label dummy_slow_path;
      Label dummy_continuation;
      Label* slow_path = &dummy_slow_path;
      Label* continuation = &dummy_continuation;
      if (!Compile::current()->output()->in_scratch_emit_size()) {
        // Use real labels from actual stub when not emitting code for the purpose of measuring its size
        C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
        Compile::current()->output()->add_stub(stub);
        slow_path = &stub->entry();
        continuation = &stub->continuation();
      }
      bs->nmethod_entry_barrier(this, slow_path, continuation);
    }
#else
    // Don't bother with out-of-line nmethod entry barrier stub for x86_32.
    bs->nmethod_entry_barrier(this, nullptr /* slow_path */, nullptr /* continuation */);
#endif
  }
}
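
// Sketch of the frame laid out by verified_entry above (illustrative only,
// derived from the code; offsets shown high-to-low):
//
//   [return address]           <- pushed by the caller's call
//   [saved rbp]                <- push(rbp) or movptr(Address(rsp, framesize), rbp)
//   [0xbadb100d cookie]        <- only if VerifyStackAtCalls
//   [... rest of the frame ...]
//   <- rsp after the prolog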

inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
  switch (vlen_in_bytes) {
    case  4: // fall-through
    case  8: // fall-through
    case 16: return Assembler::AVX_128bit;
    case 32: return Assembler::AVX_256bit;
    case 64: return Assembler::AVX_512bit;

    default: {
      ShouldNotReachHere();
      return Assembler::AVX_NoVec;
    }
  }
}

#if INCLUDE_RTM_OPT

// Update rtm_counters based on abort status
// input: abort_status
//        rtm_counters (RTMLockingCounters*)
// flags are killed
void C2_MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {

  atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
  if (PrintPreciseRTMLockingStatistics) {
    for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
      Label check_abort;
      testl(abort_status, (1<<i));
      jccb(Assembler::equal, check_abort);
      atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
      bind(check_abort);
    }
  }
}

// Branch if (random & (count-1) != 0), count is 2^n
// tmp, scr and flags are killed
void C2_MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
  assert(tmp == rax, "");
  assert(scr == rdx, "");
  rdtsc(); // modifies EDX:EAX
  andptr(tmp, count-1);
  jccb(Assembler::notZero, brLabel);
}

// Perform abort ratio calculation, set no_rtm bit if high ratio
// input:  rtm_counters_Reg (RTMLockingCounters* address)
// tmpReg, rtm_counters_Reg and flags are killed
void C2_MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
                                                    Register rtm_counters_Reg,
                                                    RTMLockingCounters* rtm_counters,
                                                    Metadata* method_data) {
  Label L_done, L_check_always_rtm1, L_check_always_rtm2;

  if (RTMLockingCalculationDelay > 0) {
    // Delay calculation
    movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()));
    testptr(tmpReg, tmpReg);
    jccb(Assembler::equal, L_done);
  }
  // Abort ratio calculation only if abort_count > RTMAbortThreshold
  //   Aborted transactions = abort_count * 100
  //   All transactions = total_count * RTMTotalCountIncrRate
  //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)

  movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
  cmpptr(tmpReg, RTMAbortThreshold);
  jccb(Assembler::below, L_check_always_rtm2);
  imulptr(tmpReg, tmpReg, 100);

  Register scrReg = rtm_counters_Reg;
  movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
  imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
  imulptr(scrReg, scrReg, RTMAbortRatio);
  cmpptr(tmpReg, scrReg);
  jccb(Assembler::below, L_check_always_rtm1);
  if (method_data != nullptr) {
    // set rtm_state to "no rtm" in MDO
    mov_metadata(tmpReg, method_data);
    lock();
    orl(Address(tmpReg, MethodData::rtm_state_offset()), NoRTM);
  }
  jmpb(L_done);
  bind(L_check_always_rtm1);
  // Reload RTMLockingCounters* address
  lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
  bind(L_check_always_rtm2);
  movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
  cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
  jccb(Assembler::below, L_done);
  if (method_data != nullptr) {
    // set rtm_state to "always rtm" in MDO
    mov_metadata(tmpReg, method_data);
    lock();
    orl(Address(tmpReg, MethodData::rtm_state_offset()), UseRTM);
  }
  bind(L_done);
}
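
// Worked example for the calculation above (illustrative, assuming the
// default flag values RTMAbortThreshold=1000, RTMTotalCountIncrRate=64,
// RTMAbortRatio=50, RTMLockingThreshold=10000): with abort_count=1500 and
// total_count=40, we compare 1500*100 = 150000 against 40*64*50 = 128000;
// aborts exceed the allowed ratio, so the no_rtm bit is set in the MDO.
// Otherwise, once total_count reaches 10000/64 the "always rtm" state is set.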

// Update counters and perform abort ratio calculation
// input: abort_status_Reg
// rtm_counters_Reg, flags are killed
void C2_MacroAssembler::rtm_profiling(Register abort_status_Reg,
                                      Register rtm_counters_Reg,
                                      RTMLockingCounters* rtm_counters,
                                      Metadata* method_data,
                                      bool profile_rtm) {

  assert(rtm_counters != nullptr, "should not be null when profiling RTM");
  // update rtm counters based on rax value at abort
  // reads abort_status_Reg, updates flags
  lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
  rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
  if (profile_rtm) {
    // Save abort status because abort_status_Reg is used by following code.
    if (RTMRetryCount > 0) {
      push(abort_status_Reg);
    }
    assert(rtm_counters != nullptr, "should not be null when profiling RTM");
    rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
    // restore abort status
    if (RTMRetryCount > 0) {
      pop(abort_status_Reg);
    }
  }
}

// Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
// inputs: retry_count_Reg
//       : abort_status_Reg
// output: retry_count_Reg decremented by 1
// flags are killed
void C2_MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
  Label doneRetry;
  assert(abort_status_Reg == rax, "");
  // The abort reason bits are in eax (see all states in rtmLocking.hpp)
  // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
  // if reason is in 0x6 and retry count != 0 then retry
  andptr(abort_status_Reg, 0x6);
  jccb(Assembler::zero, doneRetry);
  testl(retry_count_Reg, retry_count_Reg);
  jccb(Assembler::zero, doneRetry);
  pause();
  decrementl(retry_count_Reg);
  jmp(retryLabel);
  bind(doneRetry);
}

// Spin and retry if lock is busy,
// inputs: box_Reg (monitor address)
//       : retry_count_Reg
// output: retry_count_Reg decremented by 1
//       : clear z flag if retry count exceeded
// tmp_Reg, scr_Reg, flags are killed
void C2_MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
                                               Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
  Label SpinLoop, SpinExit, doneRetry;
  int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);

  testl(retry_count_Reg, retry_count_Reg);
  jccb(Assembler::zero, doneRetry);
  decrementl(retry_count_Reg);
  movptr(scr_Reg, RTMSpinLoopCount);

  bind(SpinLoop);
  pause();
  decrementl(scr_Reg);
  jccb(Assembler::lessEqual, SpinExit);
  movptr(tmp_Reg, Address(box_Reg, owner_offset));
  testptr(tmp_Reg, tmp_Reg);
  jccb(Assembler::notZero, SpinLoop);

  bind(SpinExit);
  jmp(retryLabel);
  bind(doneRetry);
  incrementl(retry_count_Reg); // clear z flag
}

// Use RTM for normal stack locks
// Input: objReg (object to lock)
void C2_MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
                                          Register retry_on_abort_count_Reg,
                                          RTMLockingCounters* stack_rtm_counters,
                                          Metadata* method_data, bool profile_rtm,
                                          Label& DONE_LABEL, Label& IsInflated) {
  assert(UseRTMForStackLocks, "why call this otherwise?");
  assert(tmpReg == rax, "");
  assert(scrReg == rdx, "");
  Label L_rtm_retry, L_decrement_retry, L_on_abort;

  if (RTMRetryCount > 0) {
    movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
    bind(L_rtm_retry);
  }
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
  testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral
  jcc(Assembler::notZero, IsInflated);

  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    Label L_noincrement;
    if (RTMTotalCountIncrRate > 1) {
      // tmpReg, scrReg and flags are killed
      branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
    }
    assert(stack_rtm_counters != nullptr, "should not be null when profiling RTM");
    atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
    bind(L_noincrement);
  }
  xbegin(L_on_abort);
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
  andptr(tmpReg, markWord::lock_mask_in_place);                     // look at 2 lock bits
  cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 01 unlocked
  jcc(Assembler::equal, DONE_LABEL);                                // all done if unlocked

  Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
  if (UseRTMXendForLockBusy) {
    xend();
    movptr(abort_status_Reg, 0x2); // Set the abort status to 2 (so we can retry)
    jmp(L_decrement_retry);
  }
  else {
    xabort(0);
  }
  bind(L_on_abort);
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
  }
  bind(L_decrement_retry);
  if (RTMRetryCount > 0) {
    // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
  }
}

// Use RTM for inflating locks
// inputs: objReg (object to lock)
//         boxReg (on-stack box address (displaced header location) - KILLED)
//         tmpReg (ObjectMonitor address + markWord::monitor_value)
void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
                                             Register scrReg, Register retry_on_busy_count_Reg,
                                             Register retry_on_abort_count_Reg,
                                             RTMLockingCounters* rtm_counters,
                                             Metadata* method_data, bool profile_rtm,
                                             Label& DONE_LABEL) {
  assert(UseRTMLocking, "why call this otherwise?");
  assert(tmpReg == rax, "");
  assert(scrReg == rdx, "");
  Label L_rtm_retry, L_decrement_retry, L_on_abort;
  int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);

  movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
  movptr(boxReg, tmpReg); // Save ObjectMonitor address

  if (RTMRetryCount > 0) {
    movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
    movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
    bind(L_rtm_retry);
  }
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    Label L_noincrement;
    if (RTMTotalCountIncrRate > 1) {
      // tmpReg, scrReg and flags are killed
      branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
    }
    assert(rtm_counters != nullptr, "should not be null when profiling RTM");
    atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
    bind(L_noincrement);
  }
  xbegin(L_on_abort);
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
  movptr(tmpReg, Address(tmpReg, owner_offset));
  testptr(tmpReg, tmpReg);
  jcc(Assembler::zero, DONE_LABEL);
  if (UseRTMXendForLockBusy) {
    xend();
    jmp(L_decrement_retry);
  }
  else {
    xabort(0);
  }
  bind(L_on_abort);
  Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
  }
  if (RTMRetryCount > 0) {
    // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
  }

  movptr(tmpReg, Address(boxReg, owner_offset));
  testptr(tmpReg, tmpReg);
  jccb(Assembler::notZero, L_decrement_retry);

  // Appears unlocked - try to swing _owner from null to non-null.
  // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
#ifdef _LP64
  Register threadReg = r15_thread;
#else
  get_thread(scrReg);
  Register threadReg = scrReg;
#endif
  lock();
  cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg

  if (RTMRetryCount > 0) {
    // success done else retry
    jccb(Assembler::equal, DONE_LABEL);
    bind(L_decrement_retry);
    // Spin and retry if lock is busy.
    rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
  }
  else {
    bind(L_decrement_retry);
  }
}

#endif // INCLUDE_RTM_OPT

// fast_lock and fast_unlock used by C2

// Because the transitions from emitted code to the runtime
// monitorenter/exit helper stubs are so slow it's critical that
// we inline both the stack-locking fast path and the inflated fast path.
//
// See also: cmpFastLock and cmpFastUnlock.
//
// What follows is a specialized inline transliteration of the code
// in enter() and exit(). If we're concerned about I$ bloat another
// option would be to emit TrySlowEnter and TrySlowExit methods
// at startup-time. These methods would accept arguments as
// (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
// indications in the icc.ZFlag. fast_lock and fast_unlock would simply
// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
// In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
//
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods. That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
// to park() or unpark() threads. We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) explicit barriers or fence operations.
//
// TODO:
//
// *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
//    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
//    Given TLAB allocation, Self is usually manifested in a register, so passing it into
//    the lock operators would typically be faster than reifying Self.
//
// *  Ideally I'd define the primitives as:
//       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
//       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
//    Unfortunately ADLC bugs prevent us from expressing the ideal form.
//    Instead, we're stuck with the rather awkward and brittle register assignments below.
//    Furthermore the register assignments are overconstrained, possibly resulting in
//    sub-optimal code near the synchronization site.
//
// *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
//    Alternately, use a better sp-proximity test.
//
// *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
//    Either one is sufficient to uniquely identify a thread.
//    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
//
// *  Intrinsify notify() and notifyAll() for the common cases where the
//    object is locked by the calling thread but the waitlist is empty.
//    Avoid the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
//
// *  Use jccb and jmpb instead of jcc and jmp to improve code density.
//    But beware of excessive branch density on AMD Opterons.
//
// *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
//    or failure of the fast path. If the fast path fails then we pass
//    control to the slow path, typically in C. In fast_lock and
//    fast_unlock we often branch to DONE_LABEL, just to find that C2
//    will emit a conditional branch immediately after the node.
//    So we have branches to branches and lots of ICC.ZF games.
//    Instead, it might be better to have C2 pass a "FailureLabel"
//    into fast_lock and fast_unlock. In the case of success, control
//    will drop through the node. ICC.ZF is undefined at exit.
//    In the case of failure, the node will branch directly to the
//    FailureLabel.


// obj: object to lock
// box: on-stack box address (displaced header location) - KILLED
// rax: tmp -- KILLED
// scr: tmp -- KILLED
void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
                                  Register scrReg, Register cx1Reg, Register cx2Reg, Register thread,
                                  RTMLockingCounters* rtm_counters,
                                  RTMLockingCounters* stack_rtm_counters,
                                  Metadata* method_data,
                                  bool use_rtm, bool profile_rtm) {
  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
  // Ensure the register assignments are disjoint
  assert(tmpReg == rax, "");

  if (use_rtm) {
    assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
  } else {
    assert(cx1Reg == noreg, "");
    assert(cx2Reg == noreg, "");
    assert_different_registers(objReg, boxReg, tmpReg, scrReg);
  }

  // Possible cases that we'll encounter in fast_lock
  // ------------------------------------------------
  // * Inflated
  //    -- unlocked
  //    -- Locked
  //       = by self
  //       = by other
  // * neutral
  // * stack-locked
  //    -- by self
  //       = sp-proximity test hits
  //       = sp-proximity test generates false-negative
  //    -- by other
  //

  Label IsInflated, DONE_LABEL, NO_COUNT, COUNT;

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(tmpReg, objReg, scrReg);
    movl(tmpReg, Address(tmpReg, Klass::access_flags_offset()));
    testl(tmpReg, JVM_ACC_IS_VALUE_BASED_CLASS);
    jcc(Assembler::notZero, DONE_LABEL);
  }

#if INCLUDE_RTM_OPT
  if (UseRTMForStackLocks && use_rtm) {
    assert(LockingMode != LM_MONITOR, "LockingMode == 0 (LM_MONITOR) and +UseRTMForStackLocks are mutually exclusive");
    rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
                      stack_rtm_counters, method_data, profile_rtm,
                      DONE_LABEL, IsInflated);
  }
#endif // INCLUDE_RTM_OPT

  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // [FETCH]
  testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral
  jcc(Assembler::notZero, IsInflated);

  if (LockingMode == LM_MONITOR) {
    // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
    testptr(objReg, objReg);
  } else {
    assert(LockingMode == LM_LEGACY, "must be");
    // Attempt stack-locking ...
    orptr (tmpReg, markWord::unlocked_value);
    movptr(Address(boxReg, 0), tmpReg); // Anticipate successful CAS
    lock();
    cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Updates tmpReg
    jcc(Assembler::equal, COUNT); // Success

    // Recursive locking.
    // The object is stack-locked: markword contains stack pointer to BasicLock.
    // Locked by current thread if difference with current SP is less than one page.
    subptr(tmpReg, rsp);
    // Next instruction sets ZFlag == 1 (Success) if difference is less than one page.
    andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - (int)os::vm_page_size())) );
    movptr(Address(boxReg, 0), tmpReg);
  }
  jmp(DONE_LABEL);

  bind(IsInflated);
  // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value

#if INCLUDE_RTM_OPT
  // Use the same RTM locking code in 32- and 64-bit VM.
  if (use_rtm) {
    rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
                         rtm_counters, method_data, profile_rtm, DONE_LABEL);
  } else {
#endif // INCLUDE_RTM_OPT

#ifndef _LP64
  // The object is inflated.

  // boxReg refers to the on-stack BasicLock in the current frame.
  // We'd like to write:
  //   set box->_displaced_header = markWord::unused_mark(). Any non-0 value suffices.
  // This is convenient but results in a ST-before-CAS penalty. The following CAS suffers
  // additional latency as we have another ST in the store buffer that must drain.

  // avoid ST-before-CAS
  // register juggle because we need tmpReg for cmpxchgptr below
  movptr(scrReg, boxReg);
  movptr(boxReg, tmpReg); // consider: LEA box, [tmp-2]

  // Optimistic form: consider XORL tmpReg,tmpReg
  movptr(tmpReg, NULL_WORD);

  // Appears unlocked - try to swing _owner from null to non-null.
  // Ideally, I'd manifest "Self" with get_thread and then attempt
  // to CAS the register containing Self into m->Owner.
  // But we don't have enough registers, so instead we can either try to CAS
  // rsp or the address of the box (in scr) into &m->owner. If the CAS succeeds
  // we later store "Self" into m->Owner. Transiently storing a stack address
  // (rsp or the address of the box) into m->owner is harmless.
  // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
  lock();
  cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  movptr(Address(scrReg, 0), 3); // box->_displaced_header = 3
  // If we weren't able to swing _owner from null to the BasicLock
  // then take the slow path.
  jccb(Assembler::notZero, NO_COUNT);
  // update _owner from BasicLock to thread
  get_thread(scrReg); // beware: clobbers ICCs
  movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
  xorptr(boxReg, boxReg); // set icc.ZFlag = 1 to indicate success

  // If the CAS fails we can either retry or pass control to the slow path.
  // We use the latter tactic.
  // Pass the CAS result in the icc.ZFlag into DONE_LABEL
  // If the CAS was successful ...
  //   Self has acquired the lock
  //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
  // Intentional fall-through into DONE_LABEL ...
#else // _LP64
  // It's inflated and we use scrReg for ObjectMonitor* in this section.
  movq(scrReg, tmpReg);
  xorq(tmpReg, tmpReg);
  lock();
  cmpxchgptr(thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // Unconditionally set box->_displaced_header = markWord::unused_mark().
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
  // Propagate ICC.ZF from CAS above into DONE_LABEL.
  jccb(Assembler::equal, COUNT);       // CAS above succeeded; propagate ZF = 1 (success)

  cmpptr(thread, rax);                 // Check if we are already the owner (recursive lock)
  jccb(Assembler::notEqual, NO_COUNT); // If not recursive, ZF = 0 at this point (fail)
  incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
#endif // _LP64
#if INCLUDE_RTM_OPT
  } // use_rtm()
#endif
  bind(DONE_LABEL);

  // ZFlag == 1 count in fast path
  // ZFlag == 0 count in slow path
  jccb(Assembler::notZero, NO_COUNT); // jump if ZFlag == 0

  bind(COUNT);
  // Count monitors in fast path
  increment(Address(thread, JavaThread::held_monitor_count_offset()));

  xorl(tmpReg, tmpReg); // Set ZF == 1

  bind(NO_COUNT);

  // At NO_COUNT the icc ZFlag is set as follows ...
  // fast_unlock uses the same protocol.
  // ZFlag == 1 -> Success
  // ZFlag == 0 -> Failure - force control through the slow path
}

// obj: object to unlock
// box: box address (displaced header location), killed. Must be EAX.
// tmp: killed, cannot be obj nor box.
//
// Some commentary on balanced locking:
//
// fast_lock and fast_unlock are emitted only for provably balanced lock sites.
// Methods that don't have provably balanced locking are forced to run in the
// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
// The interpreter provides two properties:
// I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired in the current activation (frame). Recall that the
//      interpreter maintains an on-stack list of locks currently held by
//      a frame.
// I2:  If a method attempts to unlock an object that is not held by the
//      frame the interpreter throws IMSX.
//
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
// B() doesn't have provably balanced locking so it runs in the interpreter.
// Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O
// is still locked by A().
//
// The only other source of unbalanced locking would be JNI. The "Java Native Interface:
// Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
// should not be unlocked by "normal" java-level locking and vice-versa. The specification
// doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably, given that the spec legislates the JNI case as undefined, our implementation
// could reasonably *avoid* checking owner in fast_unlock().
// In the interest of performance we elide the m->Owner == Self check in unlock.
// A perfectly viable alternative is to elide the owner check except when
// Xcheck:jni is enabled.
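
// Illustrative example (not from the original source): a provably balanced
// site that C2 compiles with fast_lock/fast_unlock is the bytecode javac
// emits for
//
//   synchronized (obj) { work(); }
//
// i.e. a matched monitorenter/monitorexit pair, with an exception handler
// that also unlocks. JNI MonitorEnter/MonitorExit, by contrast, need not be
// balanced within a frame and always goes through the runtime.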

void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
  assert(boxReg == rax, "");
  assert_different_registers(objReg, boxReg, tmpReg);

  Label DONE_LABEL, Stacked, COUNT, NO_COUNT;

#if INCLUDE_RTM_OPT
  if (UseRTMForStackLocks && use_rtm) {
    assert(LockingMode != LM_MONITOR, "LockingMode == 0 (LM_MONITOR) and +UseRTMForStackLocks are mutually exclusive");
    Label L_regular_unlock;
    movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
    andptr(tmpReg, markWord::lock_mask_in_place);                     // look at 2 lock bits
    cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 01 unlocked
    jccb(Assembler::notEqual, L_regular_unlock);                      // if !HLE RegularLock
    xend();                                                           // otherwise end...
    jmp(DONE_LABEL);                                                  // ... and we're done
    bind(L_regular_unlock);
  }
#endif

  if (LockingMode == LM_LEGACY) {
    cmpptr(Address(boxReg, 0), NULL_WORD); // Examine the displaced header
    jcc(Assembler::zero, COUNT);           // 0 indicates recursive stack-lock
  }
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Examine the object's markword
  if (LockingMode != LM_MONITOR) {
    testptr(tmpReg, markWord::monitor_value); // Inflated?
    jcc(Assembler::zero, Stacked);
  }

  // It's inflated.

#if INCLUDE_RTM_OPT
  if (use_rtm) {
    Label L_regular_inflated_unlock;
    int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
    movptr(boxReg, Address(tmpReg, owner_offset));
    testptr(boxReg, boxReg);
    jccb(Assembler::notZero, L_regular_inflated_unlock);
    xend();
    jmp(DONE_LABEL);
    bind(L_regular_inflated_unlock);
  }
#endif

  // Despite our balanced locking property we still check that m->_owner == Self
  // as java routines or native JNI code called by this thread might
  // have released the lock.
  // Refer to the comments in synchronizer.cpp for how we might encode extra
  // state in _succ so we can avoid fetching EntryList|cxq.
  //
  // If there's no contention try a 1-0 exit. That is, exit without
  // a costly MEMBAR or CAS. See synchronizer.cpp for details on how
  // we detect and recover from the race that the 1-0 exit admits.
  //
  // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
  // before it STs null into _owner, releasing the lock. Updates
  // to data protected by the critical section must be visible before
  // we drop the lock (and thus before any other thread could acquire
  // the lock and observe the fields protected by the lock).
  // IA32's memory-model is SPO, so STs are ordered with respect to
  // each other and there's no need for an explicit barrier (fence).
  // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
#ifndef _LP64
  // Note that we could employ various encoding schemes to reduce
  // the number of loads below (currently 4) to just 2 or 3.
  // Refer to the comments in synchronizer.cpp.
  // In practice the chain of fetches doesn't seem to impact performance, however.
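  // The 1-0 exit below, in outline: a non-zero _recursions or a non-empty
  // (EntryList|cxq) forces the slow path (ZF=0 at DONE_LABEL); otherwise we
  // simply store null into _owner to release the lock.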
  xorptr(boxReg, boxReg);
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jccb(Assembler::notZero, DONE_LABEL);
  movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
  jccb(Assembler::notZero, DONE_LABEL);
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
  jmpb(DONE_LABEL);
#else // _LP64
  // It's inflated
  Label CheckSucc, LNotRecursive, LSuccess, LGoSlowPath;

  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
  jccb(Assembler::equal, LNotRecursive);

  // Recursive inflated unlock
  decq(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jmpb(LSuccess);

  bind(LNotRecursive);
  movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
  jccb(Assembler::notZero, CheckSucc);
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
  jmpb(DONE_LABEL);

  // Try to avoid passing control into the slow_path ...
  bind(CheckSucc);

  // The following optional optimization can be elided if necessary
  // Effectively: if (succ == null) goto slow path
  // The code reduces the window for a race, however,
  // and thus benefits performance.
  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
  jccb(Assembler::zero, LGoSlowPath);

  xorptr(boxReg, boxReg);
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);

  // Memory barrier/fence
  // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
  // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
  // This is faster on Nehalem and AMD Shanghai/Barcelona.
  // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
  // We might also restructure (ST Owner=0;barrier;LD _Succ) to
  // (mov box,0; xchgq box, &m->Owner; LD _succ) .
  lock(); addl(Address(rsp, 0), 0);

  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
  jccb(Assembler::notZero, LSuccess);

  // Rare inopportune interleaving - race.
  // The successor vanished in the small window above.
  // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
  // We need to ensure progress and succession.
  // Try to reacquire the lock.
  // If that fails then the new owner is responsible for succession and this
  // thread needs to take no further action and can exit via the fast path (success).
  // If the re-acquire succeeds then pass control into the slow path.
  // As implemented, this latter mode is horrible because we generated more
  // coherence traffic on the lock *and* artificially extended the critical section
  // length by virtue of passing control into the slow path.

  // box is really RAX -- the following CMPXCHG depends on that binding
  // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
  lock();
  cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // There's no successor so we tried to regrab the lock.
  // If that didn't work, then another thread grabbed the
  // lock so we're done (and exit was a success).
  jccb(Assembler::notEqual, LSuccess);
  // Intentional fall-through into slow path

  bind(LGoSlowPath);
  orl(boxReg, 1);   // set ICC.ZF=0 to indicate failure
  jmpb(DONE_LABEL);

  bind(LSuccess);
  testl(boxReg, 0); // set ICC.ZF=1 to indicate success
  jmpb(DONE_LABEL);

#endif
  if (LockingMode == LM_LEGACY) {
    bind(Stacked);
    movptr(tmpReg, Address(boxReg, 0)); // re-fetch
    lock();
    cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
    // Intentional fall-thru into DONE_LABEL
  }

  bind(DONE_LABEL);

  // ZFlag == 1 count in fast path
  // ZFlag == 0 count in slow path
  jccb(Assembler::notZero, NO_COUNT);

  bind(COUNT);
  // Count monitors in fast path
#ifndef _LP64
  get_thread(tmpReg);
  decrementl(Address(tmpReg, JavaThread::held_monitor_count_offset()));
#else // _LP64
  decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
#endif

  xorl(tmpReg, tmpReg); // Set ZF == 1

  bind(NO_COUNT);
}

void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register rax_reg,
                                              Register t, Register thread) {
  assert(LockingMode == LM_LIGHTWEIGHT, "must be");
  assert(rax_reg == rax, "Used for CAS");
  assert_different_registers(obj, box, rax_reg, t, thread);

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. ZF value is irrelevant.
  Label locked;
  // Finish fast lock unsuccessfully. MUST jump with ZF == 0
  Label slow_path;

  // Clear box. TODO[OMWorld]: Is this necessary? May also defer this to not write twice.
  movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), 0);

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(rax_reg, obj, t);
    movl(rax_reg, Address(rax_reg, Klass::access_flags_offset()));
    testl(rax_reg, JVM_ACC_IS_VALUE_BASED_CLASS);
    jcc(Assembler::notZero, slow_path);
  }

  const Register mark = t;

  { // Lightweight Lock

    Label push;

    const Register top = rax_reg;

    // Load the mark.
    movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));

    // Prefetch top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    // Check for monitor (0b10).
    testptr(mark, markWord::monitor_value);
    jcc(Assembler::notZero, inflated);

    // Check if lock-stack is full.
    cmpl(top, LockStack::end_offset() - 1);
    jcc(Assembler::greater, slow_path);

    // Check if recursive.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    jccb(Assembler::equal, push);

    // Try to lock. Transition lock bits 0b01 => 0b00
    movptr(rax_reg, mark);
    orptr(rax_reg, markWord::unlocked_value);
    andptr(mark, ~(int32_t)markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    jcc(Assembler::notEqual, slow_path);

    bind(push);
    // After successful lock, push object on lock-stack.
    // TODO[OMWorld]: Was prepush better?
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
    movptr(Address(thread, top), obj);
    addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
    jmpb(locked);
  }

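  // Mark word lock bits, for reference (see markWord.hpp): 0b01 = unlocked
  // (neutral), 0b00 = fast-locked, 0b10 = inflated monitor. The CAS above
  // attempts the 0b01 -> 0b00 transition; any other mark word value makes
  // the CAS fail and we take the slow path.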
  { // Handle inflated monitor.
    bind(inflated);

    const Register monitor = t;

    if (!OMUseC2Cache) {
      jmp(slow_path);
    } else {
      if (OMCacheHitRate) increment(Address(thread, JavaThread::lock_lookup_offset()));

      // Fetch ObjectMonitor* from the cache or take the slow-path.
      Label monitor_found;

      // Load cache address
      lea(t, Address(thread, JavaThread::om_cache_oops_offset()));

      const int num_unrolled = MIN2(OMC2UnrollCacheEntries, OMCacheSize);
      for (int i = 0; i < num_unrolled; i++) {
        cmpptr(obj, Address(t));
        jccb(Assembler::equal, monitor_found);
        if (i + 1 != num_unrolled) {
          increment(t, in_bytes(OMCache::oop_to_oop_difference()));
        }
      }

      if (num_unrolled == 0 || (OMC2UnrollCacheLookupLoopTail && num_unrolled != OMCacheSize)) {
        if (num_unrolled != 0) {
          // Loop after unrolling, advance iterator.
          increment(t, in_bytes(OMCache::oop_to_oop_difference()));
        }

        Label loop;

        // Search for obj in cache.
        bind(loop);

        // Check for match.
        cmpptr(obj, Address(t));
        jccb(Assembler::equal, monitor_found);

        // Search until null encountered, guaranteed _null_sentinel at end.
        cmpptr(Address(t), 1);
        jcc(Assembler::below, slow_path); // 0 check, but with ZF=0 when *t == 0
        increment(t, in_bytes(OMCache::oop_to_oop_difference()));
        jmpb(loop);
      } else {
        jmp(slow_path);
      }

      // Cache hit.
      bind(monitor_found);
      movptr(monitor, Address(t, OMCache::oop_to_monitor_difference()));
      if (OMCacheHitRate) increment(Address(thread, JavaThread::lock_hit_offset()));

      Label monitor_locked;
      // Lock the monitor.
      Label recursion;
      if (OMRecursiveFastPath) {
        // Check owner for recursion first.
        cmpptr(thread, Address(monitor, ObjectMonitor::owner_offset()));
        jccb(Assembler::equal, recursion);
      }

      // CAS owner (null => current thread).
      xorptr(rax, rax);
      lock(); cmpxchgptr(thread, Address(monitor, ObjectMonitor::owner_offset()));
      jccb(Assembler::equal, monitor_locked);

      if (OMRecursiveFastPath) {
        // Recursion already checked.
        jmpb(slow_path);
      } else {
        // Check if recursive.
        cmpptr(thread, rax);
        jccb(Assembler::notEqual, slow_path);
      }

      // Recursive.
      bind(recursion);
      increment(Address(monitor, ObjectMonitor::recursions_offset()));

      bind(monitor_locked);
      // Cache the monitor for unlock
      movptr(Address(box, BasicLock::object_monitor_cache_offset_in_bytes()), monitor);
    }
  }

  bind(locked);
  increment(Address(thread, JavaThread::held_monitor_count_offset()));
  // Set ZF = 1
  xorl(rax_reg, rax_reg);

#ifdef ASSERT
  // Check that locked label is reached with ZF set.
  Label zf_correct;
  Label zf_bad_zero;
  jcc(Assembler::zero, zf_correct);
  jmp(zf_bad_zero);
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with ZF not set.
  jcc(Assembler::notZero, zf_correct);
  stop("Fast Lock ZF != 0");
  bind(zf_bad_zero);
  stop("Fast Lock ZF != 1");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}
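
// Per-thread lock-stack, for reference (see lockStack.hpp): a small array of
// oops inside JavaThread, where lock_stack_top_offset() holds the byte offset
// of the next free slot. fast_lock_lightweight above pushes obj on success;
// fast_unlock_lightweight below expects obj at the top and pops it, and a
// duplicate entry immediately below the top marks a recursive acquisition.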

void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register reg_rax, Register t, Register thread) {
  assert(LockingMode == LM_LIGHTWEIGHT, "must be");
  assert(reg_rax == rax, "Used for CAS");
  assert_different_registers(obj, reg_rax, t);

  // Handle inflated monitor.
  Label inflated, inflated_check_lock_stack;
  // Finish fast unlock successfully. MUST jump with ZF == 1
  Label unlocked;

  // Assume success.
  decrement(Address(thread, JavaThread::held_monitor_count_offset()));

  const Register mark = t;
  const Register monitor = t;
  const Register top = t;
  const Register box = reg_rax;

  Label dummy;
  C2FastUnlockLightweightStub* stub = nullptr;

  if (!Compile::current()->output()->in_scratch_emit_size()) {
    stub = new (Compile::current()->comp_arena()) C2FastUnlockLightweightStub(obj, mark, reg_rax, thread);
    Compile::current()->output()->add_stub(stub);
  }

  Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();
  Label& check_successor = stub == nullptr ? dummy : stub->check_successor();
  Label& slow_path = stub == nullptr ? dummy : stub->slow_path();

  { // Lightweight Unlock

    // Load top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    // Check if obj is top of lock-stack.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    // Top of lock stack was not obj. Must be monitor.
    jcc(Assembler::notEqual, inflated_check_lock_stack);

    // Pop lock-stack.
    DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
    subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);

    // Check if recursive.
    cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
    jcc(Assembler::equal, unlocked);

    // We elide the monitor check, let the CAS fail instead.

    // Load mark.
    movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));

    // Try to unlock. Transition lock bits 0b00 => 0b01
    movptr(reg_rax, mark);
    andptr(reg_rax, ~(int32_t)markWord::lock_mask);
    orptr(mark, markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    jcc(Assembler::notEqual, push_and_slow_path);
    jmp(unlocked);
  }


  { // Handle inflated monitor.
    bind(inflated_check_lock_stack);
#ifdef ASSERT
    Label check_done;
    subl(top, oopSize);
    cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
    jcc(Assembler::below, check_done);
    cmpptr(obj, Address(thread, top));
    jccb(Assembler::notEqual, inflated_check_lock_stack);
    stop("Fast Unlock lock on stack");
    bind(check_done);
    const Register mark = t;
    movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    testptr(mark, markWord::monitor_value);
    jccb(Assembler::notZero, inflated);
    stop("Fast Unlock not monitor");
#endif

    bind(inflated);

    if (!OMUseC2Cache) {
      jmp(slow_path);
    } else {
      if (OMCacheHitRate) increment(Address(thread, JavaThread::unlock_lookup_offset()));
      movptr(monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
      // TODO[OMWorld]: Figure out the correctness surrounding the owner field here. Obj is not on the lock stack
      // but this means this thread must have locked on the inflated monitor at some point. So it
      // should not be anonymous.
      cmpptr(monitor, 2);
      jcc(Assembler::below, slow_path);

      if (OMCacheHitRate) increment(Address(thread, JavaThread::unlock_hit_offset()));

      Label recursive;

      // Check if recursive.
      cmpptr(Address(monitor, ObjectMonitor::recursions_offset()), 0);
      jccb(Assembler::notEqual, recursive);

      // Check if the entry lists are empty.
      movptr(reg_rax, Address(monitor, ObjectMonitor::cxq_offset()));
      orptr(reg_rax, Address(monitor, ObjectMonitor::EntryList_offset()));
      jcc(Assembler::notZero, check_successor);

      // Release lock.
      movptr(Address(monitor, ObjectMonitor::owner_offset()), NULL_WORD);
      jmpb(unlocked);

      // Recursive unlock.
      bind(recursive);
      decrement(Address(monitor, ObjectMonitor::recursions_offset()));
      xorl(t, t);
    }
  }

  bind(unlocked);
  if (stub != nullptr) {
    bind(stub->unlocked_continuation());
  }

#ifdef ASSERT
  // Check that unlocked label is reached with ZF set.
  Label zf_correct;
  jcc(Assembler::zero, zf_correct);
  stop("Fast Unlock ZF != 1");
#endif

  if (stub != nullptr) {
    bind(stub->slow_path_continuation());
  }
#ifdef ASSERT
  // Check that stub->continuation() label is reached with ZF not set.
  jccb(Assembler::notZero, zf_correct);
  stop("Fast Unlock ZF != 0");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}

//-------------------------------------------------------------------------------------------
// Generic instructions support for use in .ad files C2 code generation
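
// Note on the abs/neg implementations below (summary, not from the original
// source): both operate on the IEEE-754 sign bit. AND with a sign *mask*
// (all bits set except each lane's sign bit) clears the sign, yielding abs();
// XOR with a sign *flip* constant (only the sign bit set per lane) toggles
// it, yielding neg(). The ExternalAddress constants are broadcast copies of
// those bit patterns.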

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVD) {
    andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
  } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
  }
}

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
  if (opcode == Op_AbsVD) {
    vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
  } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
    vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVF) {
    andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
  } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
  if (opcode == Op_AbsVF) {
    vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
  } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
    vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
  }
}

void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
  assert(tmp == xnoreg || elem_bt == T_LONG, "unused");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      pminsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pminsw(dst, src);
    } else if (elem_bt == T_INT) {
      pminsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      movdqu(xmm0, dst);
      pcmpgtq(xmm0, src);
      blendvpd(dst, src); // xmm0 as mask
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      pmaxsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pmaxsw(dst, src);
    } else if (elem_bt == T_INT) {
      pmaxsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      movdqu(xmm0, src);
      pcmpgtq(xmm0, dst);
      blendvpd(dst, src); // xmm0 as mask
    }
  }
}

void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
                                 XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                 int vlen_enc) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      vpminsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpminsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpminsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpminsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src1, src2, dst, vlen_enc);
      }
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      vpmaxsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpmaxsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpmaxsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpmaxsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src2, src1, dst, vlen_enc);
      }
    }
  }
}

// Float/Double min max

void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
                                   XMMRegister dst, XMMRegister a, XMMRegister b,
                                   XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                   int vlen_enc) {
  assert(UseAVX > 0, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(a, tmp, atmp, btmp);
  assert_different_registers(b, tmp, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);

  /* Note on 'non-obvious' assembly sequence:
   *
   * While there are vminps/vmaxps instructions, there are two important differences between hardware
   * and Java on how they handle floats:
   *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
   *  b. NaN is not necessarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
   *
   * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
   *  a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
   *     (only useful when signs differ, noop otherwise)
   *  b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])
   *
   * Following pseudo code describes the algorithm for max[FD] (Min algorithm is on similar lines):
   *   btmp = (b < +0.0) ? a : b
   *   atmp = (b < +0.0) ? b : a
   *   Tmp  = Max_Float(atmp, btmp)
   *   Res  = (atmp == NaN) ? atmp : Tmp
   */
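
  // Concrete cases the biasing handles (Java semantics per Math.max/min):
  // max(-0.0, +0.0) must be +0.0, and max(x, NaN) as well as max(NaN, x) must
  // be NaN, while raw vmaxps/vmaxpd would return the second operand in both
  // situations.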
  void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
  void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
  void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
  XMMRegister mask;

  if (!is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vminps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (!is_double_word && !is_min) {
    mask = b;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vmaxps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vminpd;
    vcmp = &MacroAssembler::vcmppd;
  } else {
    assert(is_double_word && !is_min, "sanity");
    mask = b;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vmaxpd;
    vcmp = &MacroAssembler::vcmppd;
  }

  // Make sure EnableX86ECoreOpts isn't disabled on register overlaps
  XMMRegister maxmin, scratch;
  if (dst == btmp) {
    maxmin = btmp;
    scratch = tmp;
  } else {
    maxmin = tmp;
    scratch = btmp;
  }

  bool precompute_mask = EnableX86ECoreOpts && UseAVX > 1;
  if (precompute_mask && !is_double_word) {
    vpsrad(tmp, mask, 32, vlen_enc);
    mask = tmp;
  } else if (precompute_mask && is_double_word) {
    vpxor(tmp, tmp, tmp, vlen_enc);
    vpcmpgtq(tmp, tmp, mask, vlen_enc);
    mask = tmp;
  }

  (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);
  (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);
  (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
  (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
  (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch);
}

void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
                                    XMMRegister dst, XMMRegister a, XMMRegister b,
                                    KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
                                    int vlen_enc) {
  assert(UseAVX > 2, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(dst, a, atmp, btmp);
  assert_different_registers(dst, b, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);
  bool merge = true;

  if (!is_double_word && is_min) {
    evpmovd2m(ktmp, a, vlen_enc);
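    // evpmovd2m/evpmovq2m copy each lane's most significant (sign) bit into
    // the k-register, so ktmp selects the lanes whose operand is negative,
    // serving the same role as the precomputed blend mask in vminmax_fp above.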
1489 evblendmps(atmp, ktmp, a, b, merge, vlen_enc); 1490 evblendmps(btmp, ktmp, b, a, merge, vlen_enc); 1491 vminps(dst, atmp, btmp, vlen_enc); 1492 evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1493 evmovdqul(dst, ktmp, atmp, merge, vlen_enc); 1494 } else if (!is_double_word && !is_min) { 1495 evpmovd2m(ktmp, b, vlen_enc); 1496 evblendmps(atmp, ktmp, a, b, merge, vlen_enc); 1497 evblendmps(btmp, ktmp, b, a, merge, vlen_enc); 1498 vmaxps(dst, atmp, btmp, vlen_enc); 1499 evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1500 evmovdqul(dst, ktmp, atmp, merge, vlen_enc); 1501 } else if (is_double_word && is_min) { 1502 evpmovq2m(ktmp, a, vlen_enc); 1503 evblendmpd(atmp, ktmp, a, b, merge, vlen_enc); 1504 evblendmpd(btmp, ktmp, b, a, merge, vlen_enc); 1505 vminpd(dst, atmp, btmp, vlen_enc); 1506 evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1507 evmovdquq(dst, ktmp, atmp, merge, vlen_enc); 1508 } else { 1509 assert(is_double_word && !is_min, "sanity"); 1510 evpmovq2m(ktmp, b, vlen_enc); 1511 evblendmpd(atmp, ktmp, a, b, merge, vlen_enc); 1512 evblendmpd(btmp, ktmp, b, a, merge, vlen_enc); 1513 vmaxpd(dst, atmp, btmp, vlen_enc); 1514 evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1515 evmovdquq(dst, ktmp, atmp, merge, vlen_enc); 1516 } 1517 } 1518 1519 // Float/Double signum 1520 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) { 1521 assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity"); 1522 1523 Label DONE_LABEL; 1524 1525 if (opcode == Op_SignumF) { 1526 assert(UseSSE > 0, "required"); 1527 ucomiss(dst, zero); 1528 jcc(Assembler::equal, DONE_LABEL); // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument 1529 jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN 1530 movflt(dst, one); 1531 jcc(Assembler::above, DONE_LABEL); 1532 xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg); 1533 } else if (opcode == Op_SignumD) { 1534 assert(UseSSE > 1, "required"); 1535 ucomisd(dst, zero); 1536 jcc(Assembler::equal, DONE_LABEL); // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument 1537 jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN 1538 movdbl(dst, one); 1539 jcc(Assembler::above, DONE_LABEL); 1540 xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg); 1541 } 1542 1543 bind(DONE_LABEL); 1544 } 1545 1546 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) { 1547 if (sign) { 1548 pmovsxbw(dst, src); 1549 } else { 1550 pmovzxbw(dst, src); 1551 } 1552 } 1553 1554 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1555 if (sign) { 1556 vpmovsxbw(dst, src, vector_len); 1557 } else { 1558 vpmovzxbw(dst, src, vector_len); 1559 } 1560 } 1561 1562 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1563 if (sign) { 1564 vpmovsxbd(dst, src, vector_len); 1565 } else { 1566 vpmovzxbd(dst, src, vector_len); 1567 } 1568 } 1569 1570 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1571 if (sign) { 1572 vpmovsxwd(dst, src, vector_len); 1573 } else { 1574 vpmovzxwd(dst, src, vector_len); 1575 } 1576 } 1577 1578 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src, 1579 int shift, int vector_len) { 1580 if (opcode == 
Op_RotateLeftV) { 1581 if (etype == T_INT) { 1582 evprold(dst, src, shift, vector_len); 1583 } else { 1584 assert(etype == T_LONG, "expected type T_LONG"); 1585 evprolq(dst, src, shift, vector_len); 1586 } 1587 } else { 1588 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV"); 1589 if (etype == T_INT) { 1590 evprord(dst, src, shift, vector_len); 1591 } else { 1592 assert(etype == T_LONG, "expected type T_LONG"); 1593 evprorq(dst, src, shift, vector_len); 1594 } 1595 } 1596 } 1597 1598 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src, 1599 XMMRegister shift, int vector_len) { 1600 if (opcode == Op_RotateLeftV) { 1601 if (etype == T_INT) { 1602 evprolvd(dst, src, shift, vector_len); 1603 } else { 1604 assert(etype == T_LONG, "expected type T_LONG"); 1605 evprolvq(dst, src, shift, vector_len); 1606 } 1607 } else { 1608 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV"); 1609 if (etype == T_INT) { 1610 evprorvd(dst, src, shift, vector_len); 1611 } else { 1612 assert(etype == T_LONG, "expected type T_LONG"); 1613 evprorvq(dst, src, shift, vector_len); 1614 } 1615 } 1616 } 1617 1618 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) { 1619 if (opcode == Op_RShiftVI) { 1620 psrad(dst, shift); 1621 } else if (opcode == Op_LShiftVI) { 1622 pslld(dst, shift); 1623 } else { 1624 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI"); 1625 psrld(dst, shift); 1626 } 1627 } 1628 1629 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) { 1630 switch (opcode) { 1631 case Op_RShiftVI: psrad(dst, shift); break; 1632 case Op_LShiftVI: pslld(dst, shift); break; 1633 case Op_URShiftVI: psrld(dst, shift); break; 1634 1635 default: assert(false, "%s", NodeClassNames[opcode]); 1636 } 1637 } 1638 1639 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 1640 if (opcode == Op_RShiftVI) { 1641 vpsrad(dst, nds, shift, vector_len); 1642 } else if (opcode == Op_LShiftVI) { 1643 vpslld(dst, nds, shift, vector_len); 1644 } else { 1645 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI"); 1646 vpsrld(dst, nds, shift, vector_len); 1647 } 1648 } 1649 1650 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1651 switch (opcode) { 1652 case Op_RShiftVI: vpsrad(dst, src, shift, vlen_enc); break; 1653 case Op_LShiftVI: vpslld(dst, src, shift, vlen_enc); break; 1654 case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break; 1655 1656 default: assert(false, "%s", NodeClassNames[opcode]); 1657 } 1658 } 1659 1660 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) { 1661 switch (opcode) { 1662 case Op_RShiftVB: // fall-through 1663 case Op_RShiftVS: psraw(dst, shift); break; 1664 1665 case Op_LShiftVB: // fall-through 1666 case Op_LShiftVS: psllw(dst, shift); break; 1667 1668 case Op_URShiftVS: // fall-through 1669 case Op_URShiftVB: psrlw(dst, shift); break; 1670 1671 default: assert(false, "%s", NodeClassNames[opcode]); 1672 } 1673 } 1674 1675 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1676 switch (opcode) { 1677 case Op_RShiftVB: // fall-through 1678 case Op_RShiftVS: vpsraw(dst, src, shift, vlen_enc); break; 1679 1680 case Op_LShiftVB: // fall-through 1681 case Op_LShiftVS: vpsllw(dst, src, shift, vlen_enc); break; 1682 1683 case Op_URShiftVS: // 
fall-through 1684 case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break; 1685 1686 default: assert(false, "%s", NodeClassNames[opcode]); 1687 } 1688 } 1689 1690 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) { 1691 switch (opcode) { 1692 case Op_RShiftVL: psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems 1693 case Op_LShiftVL: psllq(dst, shift); break; 1694 case Op_URShiftVL: psrlq(dst, shift); break; 1695 1696 default: assert(false, "%s", NodeClassNames[opcode]); 1697 } 1698 } 1699 1700 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) { 1701 if (opcode == Op_RShiftVL) { 1702 psrlq(dst, shift); // using srl to implement sra on pre-avx512 systems 1703 } else if (opcode == Op_LShiftVL) { 1704 psllq(dst, shift); 1705 } else { 1706 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL"); 1707 psrlq(dst, shift); 1708 } 1709 } 1710 1711 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1712 switch (opcode) { 1713 case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break; 1714 case Op_LShiftVL: vpsllq(dst, src, shift, vlen_enc); break; 1715 case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break; 1716 1717 default: assert(false, "%s", NodeClassNames[opcode]); 1718 } 1719 } 1720 1721 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 1722 if (opcode == Op_RShiftVL) { 1723 evpsraq(dst, nds, shift, vector_len); 1724 } else if (opcode == Op_LShiftVL) { 1725 vpsllq(dst, nds, shift, vector_len); 1726 } else { 1727 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL"); 1728 vpsrlq(dst, nds, shift, vector_len); 1729 } 1730 } 1731 1732 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1733 switch (opcode) { 1734 case Op_RShiftVB: // fall-through 1735 case Op_RShiftVS: // fall-through 1736 case Op_RShiftVI: vpsravd(dst, src, shift, vlen_enc); break; 1737 1738 case Op_LShiftVB: // fall-through 1739 case Op_LShiftVS: // fall-through 1740 case Op_LShiftVI: vpsllvd(dst, src, shift, vlen_enc); break; 1741 1742 case Op_URShiftVB: // fall-through 1743 case Op_URShiftVS: // fall-through 1744 case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break; 1745 1746 default: assert(false, "%s", NodeClassNames[opcode]); 1747 } 1748 } 1749 1750 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1751 switch (opcode) { 1752 case Op_RShiftVB: // fall-through 1753 case Op_RShiftVS: evpsravw(dst, src, shift, vlen_enc); break; 1754 1755 case Op_LShiftVB: // fall-through 1756 case Op_LShiftVS: evpsllvw(dst, src, shift, vlen_enc); break; 1757 1758 case Op_URShiftVB: // fall-through 1759 case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break; 1760 1761 default: assert(false, "%s", NodeClassNames[opcode]); 1762 } 1763 } 1764 1765 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) { 1766 assert(UseAVX >= 2, "required"); 1767 switch (opcode) { 1768 case Op_RShiftVL: { 1769 if (UseAVX > 2) { 1770 assert(tmp == xnoreg, "not used"); 1771 if (!VM_Version::supports_avx512vl()) { 1772 vlen_enc = Assembler::AVX_512bit; 1773 } 1774 evpsravq(dst, src, shift, vlen_enc); 1775 } else { 1776 vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask())); 1777 vpsrlvq(dst, src, shift,
vlen_enc); 1778 vpsrlvq(tmp, tmp, shift, vlen_enc); 1779 vpxor(dst, dst, tmp, vlen_enc); 1780 vpsubq(dst, dst, tmp, vlen_enc); 1781 } 1782 break; 1783 } 1784 case Op_LShiftVL: { 1785 assert(tmp == xnoreg, "not used"); 1786 vpsllvq(dst, src, shift, vlen_enc); 1787 break; 1788 } 1789 case Op_URShiftVL: { 1790 assert(tmp == xnoreg, "not used"); 1791 vpsrlvq(dst, src, shift, vlen_enc); 1792 break; 1793 } 1794 default: assert(false, "%s", NodeClassNames[opcode]); 1795 } 1796 } 1797 1798 // Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst 1799 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) { 1800 assert(opcode == Op_LShiftVB || 1801 opcode == Op_RShiftVB || 1802 opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]); 1803 bool sign = (opcode != Op_URShiftVB); 1804 assert(vector_len == 0, "required"); 1805 vextendbd(sign, dst, src, 1); 1806 vpmovzxbd(vtmp, shift, 1); 1807 varshiftd(opcode, dst, dst, vtmp, 1); 1808 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg); 1809 vextracti128_high(vtmp, dst); 1810 vpackusdw(dst, dst, vtmp, 0); 1811 } 1812 1813 // Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst 1814 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) { 1815 assert(opcode == Op_LShiftVB || 1816 opcode == Op_RShiftVB || 1817 opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]); 1818 bool sign = (opcode != Op_URShiftVB); 1819 int ext_vector_len = vector_len + 1; 1820 vextendbw(sign, dst, src, ext_vector_len); 1821 vpmovzxbw(vtmp, shift, ext_vector_len); 1822 varshiftw(opcode, dst, dst, vtmp, ext_vector_len); 1823 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg); 1824 if (vector_len == 0) { 1825 vextracti128_high(vtmp, dst); 1826 vpackuswb(dst, dst, vtmp, vector_len); 1827 } else { 1828 vextracti64x4_high(vtmp, dst); 1829 vpackuswb(dst, dst, vtmp, vector_len); 1830 vpermq(dst, dst, 0xD8, vector_len); 1831 } 1832 } 1833 1834 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) { 1835 switch(typ) { 1836 case T_BYTE: 1837 pinsrb(dst, val, idx); 1838 break; 1839 case T_SHORT: 1840 pinsrw(dst, val, idx); 1841 break; 1842 case T_INT: 1843 pinsrd(dst, val, idx); 1844 break; 1845 case T_LONG: 1846 pinsrq(dst, val, idx); 1847 break; 1848 default: 1849 assert(false,"Should not reach here."); 1850 break; 1851 } 1852 } 1853 1854 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) { 1855 switch(typ) { 1856 case T_BYTE: 1857 vpinsrb(dst, src, val, idx); 1858 break; 1859 case T_SHORT: 1860 vpinsrw(dst, src, val, idx); 1861 break; 1862 case T_INT: 1863 vpinsrd(dst, src, val, idx); 1864 break; 1865 case T_LONG: 1866 vpinsrq(dst, src, val, idx); 1867 break; 1868 default: 1869 assert(false,"Should not reach here."); 1870 break; 1871 } 1872 } 1873 1874 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) { 1875 switch(typ) { 1876 case T_INT: 1877 vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len); 1878 break; 1879 case T_FLOAT: 1880 vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len); 1881 break; 1882 case T_LONG: 1883 vpgatherdq(dst, Address(base, idx, Address::times_8), mask, 
vector_len); 1884 break; 1885 case T_DOUBLE: 1886 vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len); 1887 break; 1888 default: 1889 assert(false,"Should not reach here."); 1890 break; 1891 } 1892 } 1893 1894 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) { 1895 switch(typ) { 1896 case T_INT: 1897 evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len); 1898 break; 1899 case T_FLOAT: 1900 evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len); 1901 break; 1902 case T_LONG: 1903 evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len); 1904 break; 1905 case T_DOUBLE: 1906 evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len); 1907 break; 1908 default: 1909 assert(false,"Should not reach here."); 1910 break; 1911 } 1912 } 1913 1914 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) { 1915 switch(typ) { 1916 case T_INT: 1917 evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len); 1918 break; 1919 case T_FLOAT: 1920 evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len); 1921 break; 1922 case T_LONG: 1923 evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len); 1924 break; 1925 case T_DOUBLE: 1926 evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len); 1927 break; 1928 default: 1929 assert(false,"Should not reach here."); 1930 break; 1931 } 1932 } 1933 1934 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) { 1935 if (vlen_in_bytes <= 16) { 1936 pxor (dst, dst); 1937 psubb(dst, src); 1938 switch (elem_bt) { 1939 case T_BYTE: /* nothing to do */ break; 1940 case T_SHORT: pmovsxbw(dst, dst); break; 1941 case T_INT: pmovsxbd(dst, dst); break; 1942 case T_FLOAT: pmovsxbd(dst, dst); break; 1943 case T_LONG: pmovsxbq(dst, dst); break; 1944 case T_DOUBLE: pmovsxbq(dst, dst); break; 1945 1946 default: assert(false, "%s", type2name(elem_bt)); 1947 } 1948 } else { 1949 assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, ""); 1950 int vlen_enc = vector_length_encoding(vlen_in_bytes); 1951 1952 vpxor (dst, dst, dst, vlen_enc); 1953 vpsubb(dst, dst, src, is_legacy ? 
AVX_256bit : vlen_enc); 1954 1955 switch (elem_bt) { 1956 case T_BYTE: /* nothing to do */ break; 1957 case T_SHORT: vpmovsxbw(dst, dst, vlen_enc); break; 1958 case T_INT: vpmovsxbd(dst, dst, vlen_enc); break; 1959 case T_FLOAT: vpmovsxbd(dst, dst, vlen_enc); break; 1960 case T_LONG: vpmovsxbq(dst, dst, vlen_enc); break; 1961 case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break; 1962 1963 default: assert(false, "%s", type2name(elem_bt)); 1964 } 1965 } 1966 } 1967 1968 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) { 1969 if (novlbwdq) { 1970 vpmovsxbd(xtmp, src, vlen_enc); 1971 evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()), 1972 Assembler::eq, true, vlen_enc, noreg); 1973 } else { 1974 vpxor(xtmp, xtmp, xtmp, vlen_enc); 1975 vpsubb(xtmp, xtmp, src, vlen_enc); 1976 evpmovb2m(dst, xtmp, vlen_enc); 1977 } 1978 } 1979 1980 void C2_MacroAssembler::load_vector(XMMRegister dst, Address src, int vlen_in_bytes) { 1981 switch (vlen_in_bytes) { 1982 case 4: movdl(dst, src); break; 1983 case 8: movq(dst, src); break; 1984 case 16: movdqu(dst, src); break; 1985 case 32: vmovdqu(dst, src); break; 1986 case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break; 1987 default: ShouldNotReachHere(); 1988 } 1989 } 1990 1991 void C2_MacroAssembler::load_vector(XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) { 1992 assert(rscratch != noreg || always_reachable(src), "missing"); 1993 1994 if (reachable(src)) { 1995 load_vector(dst, as_Address(src), vlen_in_bytes); 1996 } else { 1997 lea(rscratch, src); 1998 load_vector(dst, Address(rscratch, 0), vlen_in_bytes); 1999 } 2000 } 2001 2002 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) { 2003 int vlen_enc = vector_length_encoding(vlen); 2004 if (VM_Version::supports_avx()) { 2005 if (bt == T_LONG) { 2006 if (VM_Version::supports_avx2()) { 2007 vpbroadcastq(dst, src, vlen_enc); 2008 } else { 2009 vmovddup(dst, src, vlen_enc); 2010 } 2011 } else if (bt == T_DOUBLE) { 2012 if (vlen_enc != Assembler::AVX_128bit) { 2013 vbroadcastsd(dst, src, vlen_enc, noreg); 2014 } else { 2015 vmovddup(dst, src, vlen_enc); 2016 } 2017 } else { 2018 if (VM_Version::supports_avx2() && is_integral_type(bt)) { 2019 vpbroadcastd(dst, src, vlen_enc); 2020 } else { 2021 vbroadcastss(dst, src, vlen_enc); 2022 } 2023 } 2024 } else if (VM_Version::supports_sse3()) { 2025 movddup(dst, src); 2026 } else { 2027 movq(dst, src); 2028 if (vlen == 16) { 2029 punpcklqdq(dst, dst); 2030 } 2031 } 2032 } 2033 2034 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) { 2035 // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64. 2036 int offset = exact_log2(type2aelembytes(bt)) << 6; 2037 if (is_floating_point_type(bt)) { 2038 offset += 128; 2039 } 2040 ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset); 2041 load_vector(dst, addr, vlen_in_bytes); 2042 } 2043 2044 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles. 
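//
// Each reducer below folds a scalar input (src1) together with every lane of a vector
// (src2), leaving the scalar result in dst. Scalar reference for the T_INT add case
// (an illustrative sketch only; the lanes[] array is a stand-in for the vector
// register, which the real code never spills):
//
//   int add_reduce_int(int src1, const int lanes[], int n) {
//     int acc = src1;
//     for (int i = 0; i < n; i++) {
//       acc += lanes[i];
//     }
//     return acc;
//   }
//
// The vector versions compute the same value in log2(n) steps by repeatedly folding
// the upper half of the register onto the lower half and recursing on the narrower case.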
2045 2046 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) { 2047 int vector_len = Assembler::AVX_128bit; 2048 2049 switch (opcode) { 2050 case Op_AndReductionV: pand(dst, src); break; 2051 case Op_OrReductionV: por (dst, src); break; 2052 case Op_XorReductionV: pxor(dst, src); break; 2053 case Op_MinReductionV: 2054 switch (typ) { 2055 case T_BYTE: pminsb(dst, src); break; 2056 case T_SHORT: pminsw(dst, src); break; 2057 case T_INT: pminsd(dst, src); break; 2058 case T_LONG: assert(UseAVX > 2, "required"); 2059 vpminsq(dst, dst, src, Assembler::AVX_128bit); break; 2060 default: assert(false, "wrong type"); 2061 } 2062 break; 2063 case Op_MaxReductionV: 2064 switch (typ) { 2065 case T_BYTE: pmaxsb(dst, src); break; 2066 case T_SHORT: pmaxsw(dst, src); break; 2067 case T_INT: pmaxsd(dst, src); break; 2068 case T_LONG: assert(UseAVX > 2, "required"); 2069 vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break; 2070 default: assert(false, "wrong type"); 2071 } 2072 break; 2073 case Op_AddReductionVF: addss(dst, src); break; 2074 case Op_AddReductionVD: addsd(dst, src); break; 2075 case Op_AddReductionVI: 2076 switch (typ) { 2077 case T_BYTE: paddb(dst, src); break; 2078 case T_SHORT: paddw(dst, src); break; 2079 case T_INT: paddd(dst, src); break; 2080 default: assert(false, "wrong type"); 2081 } 2082 break; 2083 case Op_AddReductionVL: paddq(dst, src); break; 2084 case Op_MulReductionVF: mulss(dst, src); break; 2085 case Op_MulReductionVD: mulsd(dst, src); break; 2086 case Op_MulReductionVI: 2087 switch (typ) { 2088 case T_SHORT: pmullw(dst, src); break; 2089 case T_INT: pmulld(dst, src); break; 2090 default: assert(false, "wrong type"); 2091 } 2092 break; 2093 case Op_MulReductionVL: assert(UseAVX > 2, "required"); 2094 evpmullq(dst, dst, src, vector_len); break; 2095 default: assert(false, "wrong opcode"); 2096 } 2097 } 2098 2099 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 2100 int vector_len = Assembler::AVX_256bit; 2101 2102 switch (opcode) { 2103 case Op_AndReductionV: vpand(dst, src1, src2, vector_len); break; 2104 case Op_OrReductionV: vpor (dst, src1, src2, vector_len); break; 2105 case Op_XorReductionV: vpxor(dst, src1, src2, vector_len); break; 2106 case Op_MinReductionV: 2107 switch (typ) { 2108 case T_BYTE: vpminsb(dst, src1, src2, vector_len); break; 2109 case T_SHORT: vpminsw(dst, src1, src2, vector_len); break; 2110 case T_INT: vpminsd(dst, src1, src2, vector_len); break; 2111 case T_LONG: assert(UseAVX > 2, "required"); 2112 vpminsq(dst, src1, src2, vector_len); break; 2113 default: assert(false, "wrong type"); 2114 } 2115 break; 2116 case Op_MaxReductionV: 2117 switch (typ) { 2118 case T_BYTE: vpmaxsb(dst, src1, src2, vector_len); break; 2119 case T_SHORT: vpmaxsw(dst, src1, src2, vector_len); break; 2120 case T_INT: vpmaxsd(dst, src1, src2, vector_len); break; 2121 case T_LONG: assert(UseAVX > 2, "required"); 2122 vpmaxsq(dst, src1, src2, vector_len); break; 2123 default: assert(false, "wrong type"); 2124 } 2125 break; 2126 case Op_AddReductionVI: 2127 switch (typ) { 2128 case T_BYTE: vpaddb(dst, src1, src2, vector_len); break; 2129 case T_SHORT: vpaddw(dst, src1, src2, vector_len); break; 2130 case T_INT: vpaddd(dst, src1, src2, vector_len); break; 2131 default: assert(false, "wrong type"); 2132 } 2133 break; 2134 case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break; 2135 case Op_MulReductionVI: 2136 switch (typ) { 2137 
case T_SHORT: vpmullw(dst, src1, src2, vector_len); break; 2138 case T_INT: vpmulld(dst, src1, src2, vector_len); break; 2139 default: assert(false, "wrong type"); 2140 } 2141 break; 2142 case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break; 2143 default: assert(false, "wrong opcode"); 2144 } 2145 } 2146 2147 void C2_MacroAssembler::reduce_fp(int opcode, int vlen, 2148 XMMRegister dst, XMMRegister src, 2149 XMMRegister vtmp1, XMMRegister vtmp2) { 2150 switch (opcode) { 2151 case Op_AddReductionVF: 2152 case Op_MulReductionVF: 2153 reduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 2154 break; 2155 2156 case Op_AddReductionVD: 2157 case Op_MulReductionVD: 2158 reduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 2159 break; 2160 2161 default: assert(false, "wrong opcode"); 2162 } 2163 } 2164 2165 void C2_MacroAssembler::reduceB(int opcode, int vlen, 2166 Register dst, Register src1, XMMRegister src2, 2167 XMMRegister vtmp1, XMMRegister vtmp2) { 2168 switch (vlen) { 2169 case 8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2170 case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2171 case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2172 case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2173 2174 default: assert(false, "wrong vector length"); 2175 } 2176 } 2177 2178 void C2_MacroAssembler::mulreduceB(int opcode, int vlen, 2179 Register dst, Register src1, XMMRegister src2, 2180 XMMRegister vtmp1, XMMRegister vtmp2) { 2181 switch (vlen) { 2182 case 8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2183 case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2184 case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2185 case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2186 2187 default: assert(false, "wrong vector length"); 2188 } 2189 } 2190 2191 void C2_MacroAssembler::reduceS(int opcode, int vlen, 2192 Register dst, Register src1, XMMRegister src2, 2193 XMMRegister vtmp1, XMMRegister vtmp2) { 2194 switch (vlen) { 2195 case 4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2196 case 8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2197 case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2198 case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2199 2200 default: assert(false, "wrong vector length"); 2201 } 2202 } 2203 2204 void C2_MacroAssembler::reduceI(int opcode, int vlen, 2205 Register dst, Register src1, XMMRegister src2, 2206 XMMRegister vtmp1, XMMRegister vtmp2) { 2207 switch (vlen) { 2208 case 2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2209 case 4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2210 case 8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2211 case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2212 2213 default: assert(false, "wrong vector length"); 2214 } 2215 } 2216 2217 #ifdef _LP64 2218 void C2_MacroAssembler::reduceL(int opcode, int vlen, 2219 Register dst, Register src1, XMMRegister src2, 2220 XMMRegister vtmp1, XMMRegister vtmp2) { 2221 switch (vlen) { 2222 case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2223 case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2224 case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2225 2226 default: assert(false, "wrong vector length"); 2227 } 2228 } 2229 #endif // _LP64 2230 2231 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister 
src, XMMRegister vtmp1, XMMRegister vtmp2) { 2232 switch (vlen) { 2233 case 2: 2234 assert(vtmp2 == xnoreg, ""); 2235 reduce2F(opcode, dst, src, vtmp1); 2236 break; 2237 case 4: 2238 assert(vtmp2 == xnoreg, ""); 2239 reduce4F(opcode, dst, src, vtmp1); 2240 break; 2241 case 8: 2242 reduce8F(opcode, dst, src, vtmp1, vtmp2); 2243 break; 2244 case 16: 2245 reduce16F(opcode, dst, src, vtmp1, vtmp2); 2246 break; 2247 default: assert(false, "wrong vector length"); 2248 } 2249 } 2250 2251 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2252 switch (vlen) { 2253 case 2: 2254 assert(vtmp2 == xnoreg, ""); 2255 reduce2D(opcode, dst, src, vtmp1); 2256 break; 2257 case 4: 2258 reduce4D(opcode, dst, src, vtmp1, vtmp2); 2259 break; 2260 case 8: 2261 reduce8D(opcode, dst, src, vtmp1, vtmp2); 2262 break; 2263 default: assert(false, "wrong vector length"); 2264 } 2265 } 2266 2267 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2268 if (opcode == Op_AddReductionVI) { 2269 if (vtmp1 != src2) { 2270 movdqu(vtmp1, src2); 2271 } 2272 phaddd(vtmp1, vtmp1); 2273 } else { 2274 pshufd(vtmp1, src2, 0x1); 2275 reduce_operation_128(T_INT, opcode, vtmp1, src2); 2276 } 2277 movdl(vtmp2, src1); 2278 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2279 movdl(dst, vtmp1); 2280 } 2281 2282 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2283 if (opcode == Op_AddReductionVI) { 2284 if (vtmp1 != src2) { 2285 movdqu(vtmp1, src2); 2286 } 2287 phaddd(vtmp1, src2); 2288 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2289 } else { 2290 pshufd(vtmp2, src2, 0xE); 2291 reduce_operation_128(T_INT, opcode, vtmp2, src2); 2292 reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2293 } 2294 } 2295 2296 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2297 if (opcode == Op_AddReductionVI) { 2298 vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit); 2299 vextracti128_high(vtmp2, vtmp1); 2300 vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit); 2301 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2302 } else { 2303 vextracti128_high(vtmp1, src2); 2304 reduce_operation_128(T_INT, opcode, vtmp1, src2); 2305 reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2306 } 2307 } 2308 2309 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2310 vextracti64x4_high(vtmp2, src2); 2311 reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2); 2312 reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2313 } 2314 2315 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2316 pshufd(vtmp2, src2, 0x1); 2317 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 2318 movdqu(vtmp1, vtmp2); 2319 psrldq(vtmp1, 2); 2320 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 2321 movdqu(vtmp2, vtmp1); 2322 psrldq(vtmp2, 1); 2323 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 2324 movdl(vtmp2, src1); 2325 pmovsxbd(vtmp1, vtmp1); 2326 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2327 pextrb(dst, vtmp1, 0x0); 2328 movsbl(dst, dst); 2329 } 2330 2331 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2332 
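// Fold the upper 8 bytes onto the lower 8: pshufd with immediate 0xE moves the high
// qword down, the 128-bit byte operation combines the halves, and the 8-byte reducer
// finishes the job. The wider byte reducers below repeat this halving pattern.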
pshufd(vtmp1, src2, 0xE); 2333 reduce_operation_128(T_BYTE, opcode, vtmp1, src2); 2334 reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2335 } 2336 2337 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2338 vextracti128_high(vtmp2, src2); 2339 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 2340 reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2341 } 2342 2343 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2344 vextracti64x4_high(vtmp1, src2); 2345 reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2); 2346 reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2347 } 2348 2349 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2350 pmovsxbw(vtmp2, src2); 2351 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2352 } 2353 2354 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2355 if (UseAVX > 1) { 2356 int vector_len = Assembler::AVX_256bit; 2357 vpmovsxbw(vtmp1, src2, vector_len); 2358 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2359 } else { 2360 pmovsxbw(vtmp2, src2); 2361 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2362 pshufd(vtmp2, src2, 0xE); 2363 pmovsxbw(vtmp2, vtmp2); 2364 reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2365 } 2366 } 2367 2368 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2369 if (UseAVX > 2 && VM_Version::supports_avx512bw()) { 2370 int vector_len = Assembler::AVX_512bit; 2371 vpmovsxbw(vtmp1, src2, vector_len); 2372 reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2373 } else { 2374 assert(UseAVX >= 2,"Should not reach here."); 2375 mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); 2376 vextracti128_high(vtmp2, src2); 2377 mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2378 } 2379 } 2380 2381 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2382 mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); 2383 vextracti64x4_high(vtmp2, src2); 2384 mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2385 } 2386 2387 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2388 if (opcode == Op_AddReductionVI) { 2389 if (vtmp1 != src2) { 2390 movdqu(vtmp1, src2); 2391 } 2392 phaddw(vtmp1, vtmp1); 2393 phaddw(vtmp1, vtmp1); 2394 } else { 2395 pshufd(vtmp2, src2, 0x1); 2396 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2397 movdqu(vtmp1, vtmp2); 2398 psrldq(vtmp1, 2); 2399 reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2); 2400 } 2401 movdl(vtmp2, src1); 2402 pmovsxwd(vtmp1, vtmp1); 2403 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2404 pextrw(dst, vtmp1, 0x0); 2405 movswl(dst, dst); 2406 } 2407 2408 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2409 if (opcode == Op_AddReductionVI) { 2410 if (vtmp1 != src2) { 2411 movdqu(vtmp1, src2); 2412 } 2413 phaddw(vtmp1, src2); 2414 } else { 2415 pshufd(vtmp1, src2, 0xE); 2416 reduce_operation_128(T_SHORT, opcode, vtmp1, src2); 2417 } 2418 reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2419 } 2420 2421 void
C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2422 if (opcode == Op_AddReductionVI) { 2423 int vector_len = Assembler::AVX_256bit; 2424 vphaddw(vtmp2, src2, src2, vector_len); 2425 vpermq(vtmp2, vtmp2, 0xD8, vector_len); 2426 } else { 2427 vextracti128_high(vtmp2, src2); 2428 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2429 } 2430 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2431 } 2432 2433 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2434 int vector_len = Assembler::AVX_256bit; 2435 vextracti64x4_high(vtmp1, src2); 2436 reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2); 2437 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2438 } 2439 2440 #ifdef _LP64 2441 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2442 pshufd(vtmp2, src2, 0xE); 2443 reduce_operation_128(T_LONG, opcode, vtmp2, src2); 2444 movdq(vtmp1, src1); 2445 reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2); 2446 movdq(dst, vtmp1); 2447 } 2448 2449 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2450 vextracti128_high(vtmp1, src2); 2451 reduce_operation_128(T_LONG, opcode, vtmp1, src2); 2452 reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2453 } 2454 2455 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2456 vextracti64x4_high(vtmp2, src2); 2457 reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2); 2458 reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2459 } 2460 2461 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) { 2462 mov64(temp, -1L); 2463 bzhiq(temp, temp, len); 2464 kmovql(dst, temp); 2465 } 2466 #endif // _LP64 2467 2468 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2469 reduce_operation_128(T_FLOAT, opcode, dst, src); 2470 pshufd(vtmp, src, 0x1); 2471 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2472 } 2473 2474 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2475 reduce2F(opcode, dst, src, vtmp); 2476 pshufd(vtmp, src, 0x2); 2477 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2478 pshufd(vtmp, src, 0x3); 2479 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2480 } 2481 2482 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2483 reduce4F(opcode, dst, src, vtmp2); 2484 vextractf128_high(vtmp2, src); 2485 reduce4F(opcode, dst, vtmp2, vtmp1); 2486 } 2487 2488 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2489 reduce8F(opcode, dst, src, vtmp1, vtmp2); 2490 vextracti64x4_high(vtmp1, src); 2491 reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2); 2492 } 2493 2494 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2495 reduce_operation_128(T_DOUBLE, opcode, dst, src); 2496 pshufd(vtmp, src, 0xE); 2497 reduce_operation_128(T_DOUBLE, opcode, dst, vtmp); 2498 } 2499 2500 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2501 reduce2D(opcode, dst, src, vtmp2); 2502 vextractf128_high(vtmp2, src); 2503 
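// vtmp2 now holds elements 2 and 3 (the upper 128 bits of src); fold them into the
// running result in dst with one more two-element reduction.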
reduce2D(opcode, dst, vtmp2, vtmp1); 2504 } 2505 2506 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2507 reduce4D(opcode, dst, src, vtmp1, vtmp2); 2508 vextracti64x4_high(vtmp1, src); 2509 reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2); 2510 } 2511 2512 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) { 2513 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2514 } 2515 2516 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) { 2517 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2518 } 2519 2520 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask, 2521 int vec_enc) { 2522 switch(elem_bt) { 2523 case T_INT: 2524 case T_FLOAT: 2525 vmaskmovps(dst, src, mask, vec_enc); 2526 break; 2527 case T_LONG: 2528 case T_DOUBLE: 2529 vmaskmovpd(dst, src, mask, vec_enc); 2530 break; 2531 default: 2532 fatal("Unsupported type %s", type2name(elem_bt)); 2533 break; 2534 } 2535 } 2536 2537 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask, 2538 int vec_enc) { 2539 switch(elem_bt) { 2540 case T_INT: 2541 case T_FLOAT: 2542 vmaskmovps(dst, src, mask, vec_enc); 2543 break; 2544 case T_LONG: 2545 case T_DOUBLE: 2546 vmaskmovpd(dst, src, mask, vec_enc); 2547 break; 2548 default: 2549 fatal("Unsupported type %s", type2name(elem_bt)); 2550 break; 2551 } 2552 } 2553 2554 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid, 2555 XMMRegister dst, XMMRegister src, 2556 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2557 XMMRegister xmm_0, XMMRegister xmm_1) { 2558 const int permconst[] = {1, 14}; 2559 XMMRegister wsrc = src; 2560 XMMRegister wdst = xmm_0; 2561 XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1; 2562 2563 int vlen_enc = Assembler::AVX_128bit; 2564 if (vlen == 16) { 2565 vlen_enc = Assembler::AVX_256bit; 2566 } 2567 2568 for (int i = log2(vlen) - 1; i >=0; i--) { 2569 if (i == 0 && !is_dst_valid) { 2570 wdst = dst; 2571 } 2572 if (i == 3) { 2573 vextracti64x4_high(wtmp, wsrc); 2574 } else if (i == 2) { 2575 vextracti128_high(wtmp, wsrc); 2576 } else { // i = [0,1] 2577 vpermilps(wtmp, wsrc, permconst[i], vlen_enc); 2578 } 2579 vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2580 wsrc = wdst; 2581 vlen_enc = Assembler::AVX_128bit; 2582 } 2583 if (is_dst_valid) { 2584 vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2585 } 2586 } 2587 2588 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src, 2589 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2590 XMMRegister xmm_0, XMMRegister xmm_1) { 2591 XMMRegister wsrc = src; 2592 XMMRegister wdst = xmm_0; 2593 XMMRegister wtmp = (xmm_1 == xnoreg) ? 
xmm_0: xmm_1; 2594 int vlen_enc = Assembler::AVX_128bit; 2595 if (vlen == 8) { 2596 vlen_enc = Assembler::AVX_256bit; 2597 } 2598 for (int i = log2(vlen) - 1; i >=0; i--) { 2599 if (i == 0 && !is_dst_valid) { 2600 wdst = dst; 2601 } 2602 if (i == 1) { 2603 vextracti128_high(wtmp, wsrc); 2604 } else if (i == 2) { 2605 vextracti64x4_high(wtmp, wsrc); 2606 } else { 2607 assert(i == 0, "%d", i); 2608 vpermilpd(wtmp, wsrc, 1, vlen_enc); 2609 } 2610 vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2611 wsrc = wdst; 2612 vlen_enc = Assembler::AVX_128bit; 2613 } 2614 if (is_dst_valid) { 2615 vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2616 } 2617 } 2618 2619 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) { 2620 switch (bt) { 2621 case T_BYTE: pextrb(dst, src, idx); break; 2622 case T_SHORT: pextrw(dst, src, idx); break; 2623 case T_INT: pextrd(dst, src, idx); break; 2624 case T_LONG: pextrq(dst, src, idx); break; 2625 2626 default: 2627 assert(false,"Should not reach here."); 2628 break; 2629 } 2630 } 2631 2632 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) { 2633 int esize = type2aelembytes(typ); 2634 int elem_per_lane = 16/esize; 2635 int lane = elemindex / elem_per_lane; 2636 int eindex = elemindex % elem_per_lane; 2637 2638 if (lane >= 2) { 2639 assert(UseAVX > 2, "required"); 2640 vextractf32x4(dst, src, lane & 3); 2641 return dst; 2642 } else if (lane > 0) { 2643 assert(UseAVX > 0, "required"); 2644 vextractf128(dst, src, lane); 2645 return dst; 2646 } else { 2647 return src; 2648 } 2649 } 2650 2651 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) { 2652 if (typ == T_BYTE) { 2653 movsbl(dst, dst); 2654 } else if (typ == T_SHORT) { 2655 movswl(dst, dst); 2656 } 2657 } 2658 2659 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) { 2660 int esize = type2aelembytes(typ); 2661 int elem_per_lane = 16/esize; 2662 int eindex = elemindex % elem_per_lane; 2663 assert(is_integral_type(typ),"required"); 2664 2665 if (eindex == 0) { 2666 if (typ == T_LONG) { 2667 movq(dst, src); 2668 } else { 2669 movdl(dst, src); 2670 movsxl(typ, dst); 2671 } 2672 } else { 2673 extract(typ, dst, src, eindex); 2674 movsxl(typ, dst); 2675 } 2676 } 2677 2678 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) { 2679 int esize = type2aelembytes(typ); 2680 int elem_per_lane = 16/esize; 2681 int eindex = elemindex % elem_per_lane; 2682 assert((typ == T_FLOAT || typ == T_DOUBLE),"required"); 2683 2684 if (eindex == 0) { 2685 movq(dst, src); 2686 } else { 2687 if (typ == T_FLOAT) { 2688 if (UseAVX == 0) { 2689 movdqu(dst, src); 2690 shufps(dst, dst, eindex); 2691 } else { 2692 vshufps(dst, src, src, eindex, Assembler::AVX_128bit); 2693 } 2694 } else { 2695 if (UseAVX == 0) { 2696 movdqu(dst, src); 2697 psrldq(dst, eindex*esize); 2698 } else { 2699 vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit); 2700 } 2701 movq(dst, dst); 2702 } 2703 } 2704 // Zero upper bits 2705 if (typ == T_FLOAT) { 2706 if (UseAVX == 0) { 2707 assert(vtmp != xnoreg, "required."); 2708 movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg); 2709 pand(dst, vtmp); 2710 } else { 2711 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg); 2712 } 2713 } 2714 } 2715 2716 void 
C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) { 2717 switch(typ) { 2718 case T_BYTE: 2719 case T_BOOLEAN: 2720 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2721 break; 2722 case T_SHORT: 2723 case T_CHAR: 2724 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2725 break; 2726 case T_INT: 2727 case T_FLOAT: 2728 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2729 break; 2730 case T_LONG: 2731 case T_DOUBLE: 2732 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2733 break; 2734 default: 2735 assert(false,"Should not reach here."); 2736 break; 2737 } 2738 } 2739 2740 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) { 2741 assert(rscratch != noreg || always_reachable(src2), "missing"); 2742 2743 switch(typ) { 2744 case T_BOOLEAN: 2745 case T_BYTE: 2746 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2747 break; 2748 case T_CHAR: 2749 case T_SHORT: 2750 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2751 break; 2752 case T_INT: 2753 case T_FLOAT: 2754 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2755 break; 2756 case T_LONG: 2757 case T_DOUBLE: 2758 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2759 break; 2760 default: 2761 assert(false,"Should not reach here."); 2762 break; 2763 } 2764 } 2765 2766 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) { 2767 switch(typ) { 2768 case T_BYTE: 2769 evpblendmb(dst, kmask, src1, src2, merge, vector_len); 2770 break; 2771 case T_SHORT: 2772 evpblendmw(dst, kmask, src1, src2, merge, vector_len); 2773 break; 2774 case T_INT: 2775 case T_FLOAT: 2776 evpblendmd(dst, kmask, src1, src2, merge, vector_len); 2777 break; 2778 case T_LONG: 2779 case T_DOUBLE: 2780 evpblendmq(dst, kmask, src1, src2, merge, vector_len); 2781 break; 2782 default: 2783 assert(false,"Should not reach here."); 2784 break; 2785 } 2786 } 2787 2788 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) { 2789 assert(vlen_in_bytes <= 32, ""); 2790 int esize = type2aelembytes(bt); 2791 if (vlen_in_bytes == 32) { 2792 assert(vtmp == xnoreg, "required."); 2793 if (esize >= 4) { 2794 vtestps(src1, src2, AVX_256bit); 2795 } else { 2796 vptest(src1, src2, AVX_256bit); 2797 } 2798 return; 2799 } 2800 if (vlen_in_bytes < 16) { 2801 // Duplicate the lower part to fill the whole register, 2802 // Don't need to do so for src2 2803 assert(vtmp != xnoreg, "required"); 2804 int shuffle_imm = (vlen_in_bytes == 4) ? 
0x00 : 0x04; 2805 pshufd(vtmp, src1, shuffle_imm); 2806 } else { 2807 assert(vtmp == xnoreg, "required"); 2808 vtmp = src1; 2809 } 2810 if (esize >= 4 && VM_Version::supports_avx()) { 2811 vtestps(vtmp, src2, AVX_128bit); 2812 } else { 2813 ptest(vtmp, src2); 2814 } 2815 } 2816 2817 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) { 2818 assert(UseAVX >= 2, "required"); 2819 #ifdef ASSERT 2820 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2821 bool is_bw_supported = VM_Version::supports_avx512bw(); 2822 if (is_bw && !is_bw_supported) { 2823 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2824 assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16), 2825 "XMM register should be 0-15"); 2826 } 2827 #endif // ASSERT 2828 switch (elem_bt) { 2829 case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return; 2830 case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return; 2831 case T_INT: vpaddd(dst, src1, src2, vlen_enc); return; 2832 case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return; 2833 case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return; 2834 case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return; 2835 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2836 } 2837 } 2838 2839 #ifdef _LP64 2840 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) { 2841 assert(UseAVX >= 2, "required"); 2842 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2843 bool is_vl = vlen_enc != Assembler::AVX_512bit; 2844 if ((UseAVX > 2) && 2845 (!is_bw || VM_Version::supports_avx512bw()) && 2846 (!is_vl || VM_Version::supports_avx512vl())) { 2847 switch (elem_bt) { 2848 case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return; 2849 case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return; 2850 case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return; 2851 case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return; 2852 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2853 } 2854 } else { 2855 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2856 assert((dst->encoding() < 16),"XMM register should be 0-15"); 2857 switch (elem_bt) { 2858 case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return; 2859 case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return; 2860 case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return; 2861 case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return; 2862 case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return; 2863 case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return; 2864 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2865 } 2866 } 2867 } 2868 #endif 2869 2870 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) { 2871 switch (to_elem_bt) { 2872 case T_SHORT: 2873 vpmovsxbw(dst, src, vlen_enc); 2874 break; 2875 case T_INT: 2876 vpmovsxbd(dst, src, vlen_enc); 2877 break; 2878 case T_FLOAT: 2879 vpmovsxbd(dst, src, vlen_enc); 2880 vcvtdq2ps(dst, dst, vlen_enc); 2881 break; 2882 case T_LONG: 2883 vpmovsxbq(dst, src, vlen_enc); 2884 break; 2885 case T_DOUBLE: { 2886 int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? 
Assembler::AVX_256bit : Assembler::AVX_128bit; 2887 vpmovsxbd(dst, src, mid_vlen_enc); 2888 vcvtdq2pd(dst, dst, vlen_enc); 2889 break; 2890 } 2891 default: 2892 fatal("Unsupported type %s", type2name(to_elem_bt)); 2893 break; 2894 } 2895 } 2896 2897 //------------------------------------------------------------------------------------------- 2898 2899 // IndexOf for constant substrings with size >= 8 chars 2900 // which don't need to be loaded through stack. 2901 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2, 2902 Register cnt1, Register cnt2, 2903 int int_cnt2, Register result, 2904 XMMRegister vec, Register tmp, 2905 int ae) { 2906 ShortBranchVerifier sbv(this); 2907 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2908 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2909 2910 // This method uses the pcmpestri instruction with bound registers 2911 // inputs: 2912 // xmm - substring 2913 // rax - substring length (elements count) 2914 // mem - scanned string 2915 // rdx - string length (elements count) 2916 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2917 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2918 // outputs: 2919 // rcx - matched index in string 2920 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2921 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2922 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2923 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2924 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1; 2925 2926 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, 2927 RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR, 2928 MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE; 2929 2930 // Note, inline_string_indexOf() generates checks: 2931 // if (substr.count > string.count) return -1; 2932 // if (substr.count == 0) return 0; 2933 assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars"); 2934 2935 // Load substring. 2936 if (ae == StrIntrinsicNode::UL) { 2937 pmovzxbw(vec, Address(str2, 0)); 2938 } else { 2939 movdqu(vec, Address(str2, 0)); 2940 } 2941 movl(cnt2, int_cnt2); 2942 movptr(result, str1); // string addr 2943 2944 if (int_cnt2 > stride) { 2945 jmpb(SCAN_TO_SUBSTR); 2946 2947 // Reload substr for rescan, this code 2948 // is executed only for large substrings (> 8 chars) 2949 bind(RELOAD_SUBSTR); 2950 if (ae == StrIntrinsicNode::UL) { 2951 pmovzxbw(vec, Address(str2, 0)); 2952 } else { 2953 movdqu(vec, Address(str2, 0)); 2954 } 2955 negptr(cnt2); // Jumped here with negative cnt2, convert to positive 2956 2957 bind(RELOAD_STR); 2958 // We came here after the beginning of the substring was 2959 // matched but the rest of it was not so we need to search 2960 // again. Start from the next element after the previous match. 2961 2962 // cnt2 is number of substring remaining elements and 2963 // cnt1 is number of string remaining elements when cmp failed.
2964 // Restored cnt1 = cnt1 - cnt2 + int_cnt2 2965 subl(cnt1, cnt2); 2966 addl(cnt1, int_cnt2); 2967 movl(cnt2, int_cnt2); // Now restore cnt2 2968 2969 decrementl(cnt1); // Shift to next element 2970 cmpl(cnt1, cnt2); 2971 jcc(Assembler::negative, RET_NOT_FOUND); // Left less than substring 2972 2973 addptr(result, (1<<scale1)); 2974 2975 } // (int_cnt2 > 8) 2976 2977 // Scan string for start of substr in 16-byte vectors 2978 bind(SCAN_TO_SUBSTR); 2979 pcmpestri(vec, Address(result, 0), mode); 2980 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 2981 subl(cnt1, stride); 2982 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 2983 cmpl(cnt1, cnt2); 2984 jccb(Assembler::negative, RET_NOT_FOUND); // Left less than substring 2985 addptr(result, 16); 2986 jmpb(SCAN_TO_SUBSTR); 2987 2988 // Found a potential substr 2989 bind(FOUND_CANDIDATE); 2990 // Matched whole vector if first element matched (tmp(rcx) == 0). 2991 if (int_cnt2 == stride) { 2992 jccb(Assembler::overflow, RET_FOUND); // OF == 1 2993 } else { // int_cnt2 > 8 2994 jccb(Assembler::overflow, FOUND_SUBSTR); 2995 } 2996 // After pcmpestri tmp(rcx) contains matched element index 2997 // Compute start addr of substr 2998 lea(result, Address(result, tmp, scale1)); 2999 3000 // Make sure string is still long enough 3001 subl(cnt1, tmp); 3002 cmpl(cnt1, cnt2); 3003 if (int_cnt2 == stride) { 3004 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 3005 } else { // int_cnt2 > 8 3006 jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD); 3007 } 3008 // Left less than substring. 3009 3010 bind(RET_NOT_FOUND); 3011 movl(result, -1); 3012 jmp(EXIT); 3013 3014 if (int_cnt2 > stride) { 3015 // This code is optimized for the case when whole substring 3016 // is matched if its head is matched. 3017 bind(MATCH_SUBSTR_HEAD); 3018 pcmpestri(vec, Address(result, 0), mode); 3019 // Reload only string if does not match 3020 jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0 3021 3022 Label CONT_SCAN_SUBSTR; 3023 // Compare the rest of substring (> 8 chars). 3024 bind(FOUND_SUBSTR); 3025 // First 8 chars are already matched.
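// Turn cnt2 into a negative index relative to the substring tail: cnt2 = stride - cnt2.
// With illustrative numbers, int_cnt2 == 12 and stride == 8 give cnt2 == -4, so the tail
// compare starts at element int_cnt2 + cnt2 == 8, right after the already-matched head.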
3026     negptr(cnt2);
3027     addptr(cnt2, stride);
3028
3029     bind(SCAN_SUBSTR);
3030     subl(cnt1, stride);
3031     cmpl(cnt2, -stride); // Do not read beyond substring
3032     jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
3033     // Back-up strings to avoid reading beyond substring:
3034     // cnt1 = cnt1 - cnt2 + 8
3035     addl(cnt1, cnt2); // cnt2 is negative
3036     addl(cnt1, stride);
3037     movl(cnt2, stride); negptr(cnt2);
3038     bind(CONT_SCAN_SUBSTR);
3039     if (int_cnt2 < (int)G) {
3040       int tail_off1 = int_cnt2<<scale1;
3041       int tail_off2 = int_cnt2<<scale2;
3042       if (ae == StrIntrinsicNode::UL) {
3043         pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
3044       } else {
3045         movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
3046       }
3047       pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
3048     } else {
3049       // calculate index in register to avoid integer overflow (int_cnt2*2)
3050       movl(tmp, int_cnt2);
3051       addptr(tmp, cnt2);
3052       if (ae == StrIntrinsicNode::UL) {
3053         pmovzxbw(vec, Address(str2, tmp, scale2, 0));
3054       } else {
3055         movdqu(vec, Address(str2, tmp, scale2, 0));
3056       }
3057       pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
3058     }
3059     // Need to reload string pointers if not matched whole vector
3060     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
3061     addptr(cnt2, stride);
3062     jcc(Assembler::negative, SCAN_SUBSTR);
3063     // Fall through if found full substring
3064
3065   } // (int_cnt2 > 8)
3066
3067   bind(RET_FOUND);
3068   // Found result if we matched full small substring.
3069   // Compute substr offset
3070   subptr(result, str1);
3071   if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3072     shrl(result, 1); // index
3073   }
3074   bind(EXIT);
3075
3076 } // string_indexofC8
3077
3078 // Small strings are loaded through the stack if they cross a page boundary.
3079 void C2_MacroAssembler::string_indexof(Register str1, Register str2,
3080                                        Register cnt1, Register cnt2,
3081                                        int int_cnt2,  Register result,
3082                                        XMMRegister vec, Register tmp,
3083                                        int ae) {
3084   ShortBranchVerifier sbv(this);
3085   assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
3086   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
3087
3088   //
3089   // int_cnt2 is the length of a small (< 8 chars) constant substring
3090   // or (-1) for a non-constant substring, in which case its length
3091   // is in the cnt2 register.
3092   //
3093   // Note, inline_string_indexOf() generates checks:
3094   // if (substr.count > string.count) return -1;
3095   // if (substr.count == 0) return 0;
3096   //
3097   int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
3098   assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
3099   // This method uses the pcmpestri instruction with bound registers
3100   //   inputs:
3101   //     xmm - substring
3102   //     rax - substring length (elements count)
3103   //     mem - scanned string
3104   //     rdx - string length (elements count)
3105   //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
3106   //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
3107   //   outputs:
3108   //     rcx - matched index in string
3109   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
3110   int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
3111   Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
3112   Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
3113
3114   Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
3115         RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
3116         FOUND_CANDIDATE;
3117
3118   { //========================================================
3119     // We don't know where these strings are located
3120     // and we can't read beyond them. Load them through the stack.
3121     Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
3122
3123     movptr(tmp, rsp); // save old SP
3124
3125     if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
3126       if (int_cnt2 == (1>>scale2)) { // One byte
3127         assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
3128         load_unsigned_byte(result, Address(str2, 0));
3129         movdl(vec, result); // move 32 bits
3130       } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
3131         // Not enough header space in 32-bit VM: 12+3 = 15.
3132         movl(result, Address(str2, -1));
3133         shrl(result, 8);
3134         movdl(vec, result); // move 32 bits
3135       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) { // One char
3136         load_unsigned_short(result, Address(str2, 0));
3137         movdl(vec, result); // move 32 bits
3138       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
3139         movdl(vec, Address(str2, 0)); // move 32 bits
3140       } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
3141         movq(vec, Address(str2, 0));  // move 64 bits
3142       } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 = {2, ..., 7})
3143         // Array header size is 12 bytes in 32-bit VM
3144         // + 6 bytes for 3 chars == 18 bytes,
3145         // enough space to load vec and shift.
3146         assert(HeapWordSize*TypeArrayKlass::header_size() >= 12, "sanity");
3147         if (ae == StrIntrinsicNode::UL) {
3148           int tail_off = int_cnt2-8;
3149           pmovzxbw(vec, Address(str2, tail_off));
3150           psrldq(vec, -2*tail_off);
3151         }
3152         else {
3153           int tail_off = int_cnt2*(1<<scale2);
3154           movdqu(vec, Address(str2, tail_off-16));
3155           psrldq(vec, 16-tail_off);
3156         }
3157       }
3158     } else { // not constant substring
3159       cmpl(cnt2, stride);
3160       jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
3161
3162       // We can read beyond the string if str2+16 does not cross a page boundary
3163       // since heaps are aligned and mapped by pages.
3164       assert(os::vm_page_size() < (int)G, "default page should be small");
3165       movl(result, str2); // We need only low 32 bits
3166       andl(result, ((int)os::vm_page_size()-1));
3167       cmpl(result, ((int)os::vm_page_size()-16));
3168       jccb(Assembler::belowEqual, CHECK_STR);
3169
3170       // Move small strings to the stack to allow loading 16 bytes into vec.
3171       subptr(rsp, 16);
3172       int stk_offset = wordSize-(1<<scale2);
3173       push(cnt2);
3174
3175       bind(COPY_SUBSTR);
3176       if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
3177         load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
3178         movb(Address(rsp, cnt2, scale2, stk_offset), result);
3179       } else if (ae == StrIntrinsicNode::UU) {
3180         load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
3181         movw(Address(rsp, cnt2, scale2, stk_offset), result);
3182       }
3183       decrement(cnt2);
3184       jccb(Assembler::notZero, COPY_SUBSTR);
3185
3186       pop(cnt2);
3187       movptr(str2, rsp);  // New substring address
3188     } // non constant
3189
3190     bind(CHECK_STR);
3191     cmpl(cnt1, stride);
3192     jccb(Assembler::aboveEqual, BIG_STRINGS);
3193
3194     // Check cross page boundary.
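    // The three instructions below implement, in effect, the check
    // (str1 & (page_size - 1)) > page_size - 16: a 16-byte load starting
    // at str1 would spill onto the next page. A sketch of the same
    // predicate in Java-like terms (base and PAGE are stand-ins for the
    // VM values used here):
    //
    //   long off = base & (PAGE - 1);
    //   boolean safeToOverread = off <= PAGE - 16; // load stays on this page
    //
    // Heap pages are mapped, so an over-read that stays within the current
    // page cannot fault even if it passes the end of the string.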
3195     movl(result, str1); // We need only low 32 bits
3196     andl(result, ((int)os::vm_page_size()-1));
3197     cmpl(result, ((int)os::vm_page_size()-16));
3198     jccb(Assembler::belowEqual, BIG_STRINGS);
3199
3200     subptr(rsp, 16);
3201     int stk_offset = -(1<<scale1);
3202     if (int_cnt2 < 0) { // not constant
3203       push(cnt2);
3204       stk_offset += wordSize;
3205     }
3206     movl(cnt2, cnt1);
3207
3208     bind(COPY_STR);
3209     if (ae == StrIntrinsicNode::LL) {
3210       load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
3211       movb(Address(rsp, cnt2, scale1, stk_offset), result);
3212     } else {
3213       load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
3214       movw(Address(rsp, cnt2, scale1, stk_offset), result);
3215     }
3216     decrement(cnt2);
3217     jccb(Assembler::notZero, COPY_STR);
3218
3219     if (int_cnt2 < 0) { // not constant
3220       pop(cnt2);
3221     }
3222     movptr(str1, rsp);  // New string address
3223
3224     bind(BIG_STRINGS);
3225     // Load substring.
3226     if (int_cnt2 < 0) { // -1
3227       if (ae == StrIntrinsicNode::UL) {
3228         pmovzxbw(vec, Address(str2, 0));
3229       } else {
3230         movdqu(vec, Address(str2, 0));
3231       }
3232       push(cnt2); // substr count
3233       push(str2); // substr addr
3234       push(str1); // string addr
3235     } else {
3236       // Small (< 8 chars) constant substrings are loaded already.
3237       movl(cnt2, int_cnt2);
3238     }
3239     push(tmp);  // original SP
3240
3241   } // Finished loading
3242
3243   //========================================================
3244   // Start search
3245   //
3246
3247   movptr(result, str1); // string addr
3248
3249   if (int_cnt2 < 0) {  // Only for non constant substring
3250     jmpb(SCAN_TO_SUBSTR);
3251
3252     // SP saved at sp+0
3253     // String saved at sp+1*wordSize
3254     // Substr saved at sp+2*wordSize
3255     // Substr count saved at sp+3*wordSize
3256
3257     // Reload substr for rescan, this code
3258     // is executed only for large substrings (> 8 chars)
3259     bind(RELOAD_SUBSTR);
3260     movptr(str2, Address(rsp, 2*wordSize));
3261     movl(cnt2, Address(rsp, 3*wordSize));
3262     if (ae == StrIntrinsicNode::UL) {
3263       pmovzxbw(vec, Address(str2, 0));
3264     } else {
3265       movdqu(vec, Address(str2, 0));
3266     }
3267     // We came here after the beginning of the substring was
3268     // matched but the rest of it was not, so we need to search
3269     // again. Start from the next element after the previous match.
3270     subptr(str1, result); // Restore counter
3271     if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
3272       shrl(str1, 1);
3273     }
3274     addl(cnt1, str1);
3275     decrementl(cnt1);   // Shift to next element
3276     cmpl(cnt1, cnt2);
3277     jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
3278
3279     addptr(result, (1<<scale1));
3280   } // non constant
3281
3282   // Scan string for start of substr in 16-byte vectors
3283   bind(SCAN_TO_SUBSTR);
3284   assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
3285   pcmpestri(vec, Address(result, 0), mode);
3286   jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
3287   subl(cnt1, stride);
3288   jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
3289   cmpl(cnt1, cnt2);
3290   jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
3291   addptr(result, 16);
3292
3293   bind(ADJUST_STR);
3294   cmpl(cnt1, stride); // Do not read beyond string
3295   jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
3296   // Back-up string to avoid reading beyond string.
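  // Sketch of the adjustment below: result is moved to
  // result + cnt1*element_size - 16, so the next 16-byte load ends exactly
  // at the string's last element, and cnt1 is set to a full stride since
  // that is what remains in the window. Some already-scanned elements may
  // be compared again; that is harmless because any candidate found is
  // fully re-verified.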
3297   lea(result, Address(result, cnt1, scale1, -16));
3298   movl(cnt1, stride);
3299   jmpb(SCAN_TO_SUBSTR);
3300
3301   // Found a potential substr
3302   bind(FOUND_CANDIDATE);
3303   // After pcmpestri tmp(rcx) contains matched element index
3304
3305   // Make sure string is still long enough
3306   subl(cnt1, tmp);
3307   cmpl(cnt1, cnt2);
3308   jccb(Assembler::greaterEqual, FOUND_SUBSTR);
3309   // Left less than substring.
3310
3311   bind(RET_NOT_FOUND);
3312   movl(result, -1);
3313   jmp(CLEANUP);
3314
3315   bind(FOUND_SUBSTR);
3316   // Compute start addr of substr
3317   lea(result, Address(result, tmp, scale1));
3318   if (int_cnt2 > 0) { // Constant substring
3319     // Repeat search for small substring (< 8 chars)
3320     // from new point without reloading substring.
3321     // Have to check that we don't read beyond string.
3322     cmpl(tmp, stride-int_cnt2);
3323     jccb(Assembler::greater, ADJUST_STR);
3324     // Fall through if matched whole substring.
3325   } else { // non constant
3326     assert(int_cnt2 == -1, "should be != 0");
3327
3328     addl(tmp, cnt2);
3329     // Found result if we matched whole substring.
3330     cmpl(tmp, stride);
3331     jcc(Assembler::lessEqual, RET_FOUND);
3332
3333     // Repeat search for small substring (<= 8 chars)
3334     // from new point 'str1' without reloading substring.
3335     cmpl(cnt2, stride);
3336     // Have to check that we don't read beyond string.
3337     jccb(Assembler::lessEqual, ADJUST_STR);
3338
3339     Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
3340     // Compare the rest of substring (> 8 chars).
3341     movptr(str1, result);
3342
3343     cmpl(tmp, cnt2);
3344     // First 8 chars are already matched.
3345     jccb(Assembler::equal, CHECK_NEXT);
3346
3347     bind(SCAN_SUBSTR);
3348     pcmpestri(vec, Address(str1, 0), mode);
3349     // Need to reload string pointers if not matched whole vector
3350     jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
3351
3352     bind(CHECK_NEXT);
3353     subl(cnt2, stride);
3354     jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
3355     addptr(str1, 16);
3356     if (ae == StrIntrinsicNode::UL) {
3357       addptr(str2, 8);
3358     } else {
3359       addptr(str2, 16);
3360     }
3361     subl(cnt1, stride);
3362     cmpl(cnt2, stride); // Do not read beyond substring
3363     jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
3364     // Back-up strings to avoid reading beyond substring.
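    // Sketch of the rebias below: with cnt2 (< stride) substring elements
    // left, both pointers are moved back by (stride - cnt2) elements so the
    // final 16-byte load (8 bytes for the byte-sized substring under UL)
    // ends exactly at the substring's last element; the counters are
    // rebiased to match (cnt1 += stride - cnt2, cnt2 = stride).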
3365 3366 if (ae == StrIntrinsicNode::UL) { 3367 lea(str2, Address(str2, cnt2, scale2, -8)); 3368 lea(str1, Address(str1, cnt2, scale1, -16)); 3369 } else { 3370 lea(str2, Address(str2, cnt2, scale2, -16)); 3371 lea(str1, Address(str1, cnt2, scale1, -16)); 3372 } 3373 subl(cnt1, cnt2); 3374 movl(cnt2, stride); 3375 addl(cnt1, stride); 3376 bind(CONT_SCAN_SUBSTR); 3377 if (ae == StrIntrinsicNode::UL) { 3378 pmovzxbw(vec, Address(str2, 0)); 3379 } else { 3380 movdqu(vec, Address(str2, 0)); 3381 } 3382 jmp(SCAN_SUBSTR); 3383 3384 bind(RET_FOUND_LONG); 3385 movptr(str1, Address(rsp, wordSize)); 3386 } // non constant 3387 3388 bind(RET_FOUND); 3389 // Compute substr offset 3390 subptr(result, str1); 3391 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3392 shrl(result, 1); // index 3393 } 3394 bind(CLEANUP); 3395 pop(rsp); // restore SP 3396 3397 } // string_indexof 3398 3399 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3400 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3401 ShortBranchVerifier sbv(this); 3402 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3403 3404 int stride = 8; 3405 3406 Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP, 3407 SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP, 3408 RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT, 3409 FOUND_SEQ_CHAR, DONE_LABEL; 3410 3411 movptr(result, str1); 3412 if (UseAVX >= 2) { 3413 cmpl(cnt1, stride); 3414 jcc(Assembler::less, SCAN_TO_CHAR); 3415 cmpl(cnt1, 2*stride); 3416 jcc(Assembler::less, SCAN_TO_8_CHAR_INIT); 3417 movdl(vec1, ch); 3418 vpbroadcastw(vec1, vec1, Assembler::AVX_256bit); 3419 vpxor(vec2, vec2); 3420 movl(tmp, cnt1); 3421 andl(tmp, 0xFFFFFFF0); //vector count (in chars) 3422 andl(cnt1,0x0000000F); //tail count (in chars) 3423 3424 bind(SCAN_TO_16_CHAR_LOOP); 3425 vmovdqu(vec3, Address(result, 0)); 3426 vpcmpeqw(vec3, vec3, vec1, 1); 3427 vptest(vec2, vec3); 3428 jcc(Assembler::carryClear, FOUND_CHAR); 3429 addptr(result, 32); 3430 subl(tmp, 2*stride); 3431 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); 3432 jmp(SCAN_TO_8_CHAR); 3433 bind(SCAN_TO_8_CHAR_INIT); 3434 movdl(vec1, ch); 3435 pshuflw(vec1, vec1, 0x00); 3436 pshufd(vec1, vec1, 0); 3437 pxor(vec2, vec2); 3438 } 3439 bind(SCAN_TO_8_CHAR); 3440 cmpl(cnt1, stride); 3441 jcc(Assembler::less, SCAN_TO_CHAR); 3442 if (UseAVX < 2) { 3443 movdl(vec1, ch); 3444 pshuflw(vec1, vec1, 0x00); 3445 pshufd(vec1, vec1, 0); 3446 pxor(vec2, vec2); 3447 } 3448 movl(tmp, cnt1); 3449 andl(tmp, 0xFFFFFFF8); //vector count (in chars) 3450 andl(cnt1,0x00000007); //tail count (in chars) 3451 3452 bind(SCAN_TO_8_CHAR_LOOP); 3453 movdqu(vec3, Address(result, 0)); 3454 pcmpeqw(vec3, vec1); 3455 ptest(vec2, vec3); 3456 jcc(Assembler::carryClear, FOUND_CHAR); 3457 addptr(result, 16); 3458 subl(tmp, stride); 3459 jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP); 3460 bind(SCAN_TO_CHAR); 3461 testl(cnt1, cnt1); 3462 jcc(Assembler::zero, RET_NOT_FOUND); 3463 bind(SCAN_TO_CHAR_LOOP); 3464 load_unsigned_short(tmp, Address(result, 0)); 3465 cmpl(ch, tmp); 3466 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3467 addptr(result, 2); 3468 subl(cnt1, 1); 3469 jccb(Assembler::zero, RET_NOT_FOUND); 3470 jmp(SCAN_TO_CHAR_LOOP); 3471 3472 bind(RET_NOT_FOUND); 3473 movl(result, -1); 3474 jmpb(DONE_LABEL); 3475 3476 bind(FOUND_CHAR); 3477 if (UseAVX >= 2) { 3478 vpmovmskb(tmp, vec3); 3479 } else { 3480 pmovmskb(tmp, vec3); 3481 } 3482 bsfl(ch, tmp); 3483 addptr(result, ch); 3484 3485 bind(FOUND_SEQ_CHAR); 3486 
subptr(result, str1); 3487 shrl(result, 1); 3488 3489 bind(DONE_LABEL); 3490 } // string_indexof_char 3491 3492 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3493 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3494 ShortBranchVerifier sbv(this); 3495 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3496 3497 int stride = 16; 3498 3499 Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP, 3500 SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP, 3501 RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT, 3502 FOUND_SEQ_CHAR, DONE_LABEL; 3503 3504 movptr(result, str1); 3505 if (UseAVX >= 2) { 3506 cmpl(cnt1, stride); 3507 jcc(Assembler::less, SCAN_TO_CHAR_INIT); 3508 cmpl(cnt1, stride*2); 3509 jcc(Assembler::less, SCAN_TO_16_CHAR_INIT); 3510 movdl(vec1, ch); 3511 vpbroadcastb(vec1, vec1, Assembler::AVX_256bit); 3512 vpxor(vec2, vec2); 3513 movl(tmp, cnt1); 3514 andl(tmp, 0xFFFFFFE0); //vector count (in chars) 3515 andl(cnt1,0x0000001F); //tail count (in chars) 3516 3517 bind(SCAN_TO_32_CHAR_LOOP); 3518 vmovdqu(vec3, Address(result, 0)); 3519 vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit); 3520 vptest(vec2, vec3); 3521 jcc(Assembler::carryClear, FOUND_CHAR); 3522 addptr(result, 32); 3523 subl(tmp, stride*2); 3524 jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP); 3525 jmp(SCAN_TO_16_CHAR); 3526 3527 bind(SCAN_TO_16_CHAR_INIT); 3528 movdl(vec1, ch); 3529 pxor(vec2, vec2); 3530 pshufb(vec1, vec2); 3531 } 3532 3533 bind(SCAN_TO_16_CHAR); 3534 cmpl(cnt1, stride); 3535 jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left 3536 if (UseAVX < 2) { 3537 movdl(vec1, ch); 3538 pxor(vec2, vec2); 3539 pshufb(vec1, vec2); 3540 } 3541 movl(tmp, cnt1); 3542 andl(tmp, 0xFFFFFFF0); //vector count (in bytes) 3543 andl(cnt1,0x0000000F); //tail count (in bytes) 3544 3545 bind(SCAN_TO_16_CHAR_LOOP); 3546 movdqu(vec3, Address(result, 0)); 3547 pcmpeqb(vec3, vec1); 3548 ptest(vec2, vec3); 3549 jcc(Assembler::carryClear, FOUND_CHAR); 3550 addptr(result, 16); 3551 subl(tmp, stride); 3552 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items... 
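  // The scalar tail below compares the remaining cnt1 (< 16) bytes one at
  // a time. For reference, the whole intrinsic computes the Java-level
  // result sketched here (a paraphrase of the semantics, not JDK library
  // source; the names are illustrative only):
  //
  //   static int indexOfLatin1(byte[] a, int len, int ch) {
  //     for (int i = 0; i < len; i++) {
  //       if ((a[i] & 0xff) == ch) return i;
  //     }
  //     return -1;
  //   }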
3553 3554 bind(SCAN_TO_CHAR_INIT); 3555 testl(cnt1, cnt1); 3556 jcc(Assembler::zero, RET_NOT_FOUND); 3557 bind(SCAN_TO_CHAR_LOOP); 3558 load_unsigned_byte(tmp, Address(result, 0)); 3559 cmpl(ch, tmp); 3560 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3561 addptr(result, 1); 3562 subl(cnt1, 1); 3563 jccb(Assembler::zero, RET_NOT_FOUND); 3564 jmp(SCAN_TO_CHAR_LOOP); 3565 3566 bind(RET_NOT_FOUND); 3567 movl(result, -1); 3568 jmpb(DONE_LABEL); 3569 3570 bind(FOUND_CHAR); 3571 if (UseAVX >= 2) { 3572 vpmovmskb(tmp, vec3); 3573 } else { 3574 pmovmskb(tmp, vec3); 3575 } 3576 bsfl(ch, tmp); 3577 addptr(result, ch); 3578 3579 bind(FOUND_SEQ_CHAR); 3580 subptr(result, str1); 3581 3582 bind(DONE_LABEL); 3583 } // stringL_indexof_char 3584 3585 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) { 3586 switch (eltype) { 3587 case T_BOOLEAN: return sizeof(jboolean); 3588 case T_BYTE: return sizeof(jbyte); 3589 case T_SHORT: return sizeof(jshort); 3590 case T_CHAR: return sizeof(jchar); 3591 case T_INT: return sizeof(jint); 3592 default: 3593 ShouldNotReachHere(); 3594 return -1; 3595 } 3596 } 3597 3598 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) { 3599 switch (eltype) { 3600 // T_BOOLEAN used as surrogate for unsigned byte 3601 case T_BOOLEAN: movzbl(dst, src); break; 3602 case T_BYTE: movsbl(dst, src); break; 3603 case T_SHORT: movswl(dst, src); break; 3604 case T_CHAR: movzwl(dst, src); break; 3605 case T_INT: movl(dst, src); break; 3606 default: 3607 ShouldNotReachHere(); 3608 } 3609 } 3610 3611 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) { 3612 load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8); 3613 } 3614 3615 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) { 3616 load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8); 3617 } 3618 3619 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) { 3620 const int vlen = Assembler::AVX_256bit; 3621 switch (eltype) { 3622 case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3623 case T_BYTE: vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3624 case T_SHORT: vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3625 case T_CHAR: vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3626 case T_INT: 3627 // do nothing 3628 break; 3629 default: 3630 ShouldNotReachHere(); 3631 } 3632 } 3633 3634 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result, 3635 Register index, Register tmp2, Register tmp3, XMMRegister vnext, 3636 XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3, 3637 XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3, 3638 XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3, 3639 BasicType eltype) { 3640 ShortBranchVerifier sbv(this); 3641 assert(UseAVX >= 2, "AVX2 intrinsics are required"); 3642 assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3); 3643 assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3); 3644 3645 Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN, 3646 SHORT_UNROLLED_LOOP_EXIT, 3647 UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME, 3648 UNROLLED_VECTOR_LOOP_BEGIN, 3649 END; 3650 switch (eltype) { 3651 case T_BOOLEAN: 
                    BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break;
3652     case T_CHAR:    BLOCK_COMMENT("arrays_hashcode(char) {");          break;
3653     case T_BYTE:    BLOCK_COMMENT("arrays_hashcode(byte) {");          break;
3654     case T_SHORT:   BLOCK_COMMENT("arrays_hashcode(short) {");         break;
3655     case T_INT:     BLOCK_COMMENT("arrays_hashcode(int) {");           break;
3656     default:        BLOCK_COMMENT("arrays_hashcode {");                break;
3657   }
3658
3659   // Register aliases ("renaming") for readability of the code
3660   const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 },
3661                     vresult[] = { vresult0, vresult1, vresult2, vresult3 },
3662                     vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 };
3663
3664   const int elsize = arrays_hashcode_elsize(eltype);
3665
3666   /*
3667     if (cnt1 >= 2) {
3668       if (cnt1 >= 32) {
3669         UNROLLED VECTOR LOOP
3670       }
3671       UNROLLED SCALAR LOOP
3672     }
3673     SINGLE SCALAR
3674    */
3675
3676   cmpl(cnt1, 32);
3677   jcc(Assembler::less, SHORT_UNROLLED_BEGIN);
3678
3679   // cnt1 >= 32 && generate_vectorized_loop
3680   xorl(index, index);
3681
3682   // vresult = IntVector.zero(I256);
3683   for (int idx = 0; idx < 4; idx++) {
3684     vpxor(vresult[idx], vresult[idx]);
3685   }
3686   // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]);
3687   Register bound = tmp2;
3688   Register next = tmp3;
3689   lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint))));
3690   movl(next, Address(tmp2, 0));
3691   movdl(vnext, next);
3692   vpbroadcastd(vnext, vnext, Assembler::AVX_256bit);
3693
3694   // index = 0;
3695   // bound = cnt1 & ~(32 - 1);
3696   movl(bound, cnt1);
3697   andl(bound, ~(32 - 1));
3698   // for (; index < bound; index += 32) {
3699   bind(UNROLLED_VECTOR_LOOP_BEGIN);
3700   // result *= next;
3701   imull(result, next);
3702   // Loop fission to front-load the cost of fetching from memory; OOO execution
3703   // can then hopefully do a better job of prefetching
3704   for (int idx = 0; idx < 4; idx++) {
3705     arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype);
3706   }
3707   // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7];
3708   for (int idx = 0; idx < 4; idx++) {
3709     vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit);
3710     arrays_hashcode_elvcast(vtmp[idx], eltype);
3711     vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit);
3712   }
3713   // index += 32;
3714   addl(index, 32);
3715   // index < bound;
3716   cmpl(index, bound);
3717   jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN);
3718   // }
3719
3720   lea(ary1, Address(ary1, bound, Address::times(elsize)));
3721   subl(cnt1, bound);
3722   // release bound
3723
3724   // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1);
3725   for (int idx = 0; idx < 4; idx++) {
3726     lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint))));
3727     arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT);
3728     vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit);
3729   }
3730   // result += vresult.reduceLanes(ADD);
3731   for (int idx = 0; idx < 4; idx++) {
3732     reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]);
3733   }
3734
3735   // } else if (cnt1 < 32) {
3736
3737   bind(SHORT_UNROLLED_BEGIN);
3738   // int i = 1;
3739   movl(index, 1);
3740   cmpl(index, cnt1);
3741   jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT);
3742
3743   // for (; i < cnt1 ; i += 2) {
3744   bind(SHORT_UNROLLED_LOOP_BEGIN);
3745   movl(tmp3, 961);
3746   imull(result, tmp3);
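  // Each iteration of the loop below folds two elements into the hash using
  // the identity 31*(31*h + a[i-1]) + a[i] == 961*h + 31*a[i-1] + a[i]
  // (961 == 31*31), with 31*x computed as (x << 5) - x. In Java terms this
  // is a 2x-unrolled form of the usual polynomial hash (a sketch; a[] and h
  // stand for the array elements and the running result):
  //
  //   int h = result;
  //   for (int i = 1; i < cnt1; i += 2) {
  //     h = 961 * h + 31 * a[i - 1] + a[i];
  //   }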
arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3748 movl(tmp3, tmp2); 3749 shll(tmp3, 5); 3750 subl(tmp3, tmp2); 3751 addl(result, tmp3); 3752 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype); 3753 addl(result, tmp3); 3754 addl(index, 2); 3755 cmpl(index, cnt1); 3756 jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN); 3757 3758 // } 3759 // if (i >= cnt1) { 3760 bind(SHORT_UNROLLED_LOOP_EXIT); 3761 jccb(Assembler::greater, END); 3762 movl(tmp2, result); 3763 shll(result, 5); 3764 subl(result, tmp2); 3765 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3766 addl(result, tmp3); 3767 // } 3768 bind(END); 3769 3770 BLOCK_COMMENT("} // arrays_hashcode"); 3771 3772 } // arrays_hashcode 3773 3774 // helper function for string_compare 3775 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2, 3776 Address::ScaleFactor scale, Address::ScaleFactor scale1, 3777 Address::ScaleFactor scale2, Register index, int ae) { 3778 if (ae == StrIntrinsicNode::LL) { 3779 load_unsigned_byte(elem1, Address(str1, index, scale, 0)); 3780 load_unsigned_byte(elem2, Address(str2, index, scale, 0)); 3781 } else if (ae == StrIntrinsicNode::UU) { 3782 load_unsigned_short(elem1, Address(str1, index, scale, 0)); 3783 load_unsigned_short(elem2, Address(str2, index, scale, 0)); 3784 } else { 3785 load_unsigned_byte(elem1, Address(str1, index, scale1, 0)); 3786 load_unsigned_short(elem2, Address(str2, index, scale2, 0)); 3787 } 3788 } 3789 3790 // Compare strings, used for char[] and byte[]. 3791 void C2_MacroAssembler::string_compare(Register str1, Register str2, 3792 Register cnt1, Register cnt2, Register result, 3793 XMMRegister vec1, int ae, KRegister mask) { 3794 ShortBranchVerifier sbv(this); 3795 Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL; 3796 Label COMPARE_WIDE_VECTORS_LOOP_FAILED; // used only _LP64 && AVX3 3797 int stride, stride2, adr_stride, adr_stride1, adr_stride2; 3798 int stride2x2 = 0x40; 3799 Address::ScaleFactor scale = Address::no_scale; 3800 Address::ScaleFactor scale1 = Address::no_scale; 3801 Address::ScaleFactor scale2 = Address::no_scale; 3802 3803 if (ae != StrIntrinsicNode::LL) { 3804 stride2x2 = 0x20; 3805 } 3806 3807 if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) { 3808 shrl(cnt2, 1); 3809 } 3810 // Compute the minimum of the string lengths and the 3811 // difference of the string lengths (stack). 3812 // Do the conditional move stuff 3813 movl(result, cnt1); 3814 subl(cnt1, cnt2); 3815 push(cnt1); 3816 cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2) 3817 3818 // Is the minimum length zero? 
3819 testl(cnt2, cnt2); 3820 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3821 if (ae == StrIntrinsicNode::LL) { 3822 // Load first bytes 3823 load_unsigned_byte(result, Address(str1, 0)); // result = str1[0] 3824 load_unsigned_byte(cnt1, Address(str2, 0)); // cnt1 = str2[0] 3825 } else if (ae == StrIntrinsicNode::UU) { 3826 // Load first characters 3827 load_unsigned_short(result, Address(str1, 0)); 3828 load_unsigned_short(cnt1, Address(str2, 0)); 3829 } else { 3830 load_unsigned_byte(result, Address(str1, 0)); 3831 load_unsigned_short(cnt1, Address(str2, 0)); 3832 } 3833 subl(result, cnt1); 3834 jcc(Assembler::notZero, POP_LABEL); 3835 3836 if (ae == StrIntrinsicNode::UU) { 3837 // Divide length by 2 to get number of chars 3838 shrl(cnt2, 1); 3839 } 3840 cmpl(cnt2, 1); 3841 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3842 3843 // Check if the strings start at the same location and setup scale and stride 3844 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3845 cmpptr(str1, str2); 3846 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3847 if (ae == StrIntrinsicNode::LL) { 3848 scale = Address::times_1; 3849 stride = 16; 3850 } else { 3851 scale = Address::times_2; 3852 stride = 8; 3853 } 3854 } else { 3855 scale1 = Address::times_1; 3856 scale2 = Address::times_2; 3857 // scale not used 3858 stride = 8; 3859 } 3860 3861 if (UseAVX >= 2 && UseSSE42Intrinsics) { 3862 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR; 3863 Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR; 3864 Label COMPARE_WIDE_VECTORS_LOOP_AVX2; 3865 Label COMPARE_TAIL_LONG; 3866 Label COMPARE_WIDE_VECTORS_LOOP_AVX3; // used only _LP64 && AVX3 3867 3868 int pcmpmask = 0x19; 3869 if (ae == StrIntrinsicNode::LL) { 3870 pcmpmask &= ~0x01; 3871 } 3872 3873 // Setup to compare 16-chars (32-bytes) vectors, 3874 // start from first character again because it has aligned address. 3875 if (ae == StrIntrinsicNode::LL) { 3876 stride2 = 32; 3877 } else { 3878 stride2 = 16; 3879 } 3880 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3881 adr_stride = stride << scale; 3882 } else { 3883 adr_stride1 = 8; //stride << scale1; 3884 adr_stride2 = 16; //stride << scale2; 3885 } 3886 3887 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3888 // rax and rdx are used by pcmpestri as elements counters 3889 movl(result, cnt2); 3890 andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count 3891 jcc(Assembler::zero, COMPARE_TAIL_LONG); 3892 3893 // fast path : compare first 2 8-char vectors. 
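    // For reference, the pcmpestri control byte used here decodes as
    // follows (per the SSE4.2 encoding): 0x19 == 0b11001, i.e. bits 1:0 =
    // 01 (unsigned words), bits 3:2 = 10 ("equal each", element-wise
    // compare), bit 4 = 1 (negate the result so rcx reports the first
    // mismatching index). Clearing bit 0 above for the LL case selects
    // unsigned bytes instead (0x18).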
3894 bind(COMPARE_16_CHARS); 3895 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3896 movdqu(vec1, Address(str1, 0)); 3897 } else { 3898 pmovzxbw(vec1, Address(str1, 0)); 3899 } 3900 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3901 jccb(Assembler::below, COMPARE_INDEX_CHAR); 3902 3903 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3904 movdqu(vec1, Address(str1, adr_stride)); 3905 pcmpestri(vec1, Address(str2, adr_stride), pcmpmask); 3906 } else { 3907 pmovzxbw(vec1, Address(str1, adr_stride1)); 3908 pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask); 3909 } 3910 jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS); 3911 addl(cnt1, stride); 3912 3913 // Compare the characters at index in cnt1 3914 bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character 3915 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 3916 subl(result, cnt2); 3917 jmp(POP_LABEL); 3918 3919 // Setup the registers to start vector comparison loop 3920 bind(COMPARE_WIDE_VECTORS); 3921 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3922 lea(str1, Address(str1, result, scale)); 3923 lea(str2, Address(str2, result, scale)); 3924 } else { 3925 lea(str1, Address(str1, result, scale1)); 3926 lea(str2, Address(str2, result, scale2)); 3927 } 3928 subl(result, stride2); 3929 subl(cnt2, stride2); 3930 jcc(Assembler::zero, COMPARE_WIDE_TAIL); 3931 negptr(result); 3932 3933 // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest) 3934 bind(COMPARE_WIDE_VECTORS_LOOP); 3935 3936 #ifdef _LP64 3937 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 3938 cmpl(cnt2, stride2x2); 3939 jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2); 3940 testl(cnt2, stride2x2-1); // cnt2 holds the vector count 3941 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2); // means we cannot subtract by 0x40 3942 3943 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 3944 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3945 evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit); 3946 evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3947 } else { 3948 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit); 3949 evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3950 } 3951 kortestql(mask, mask); 3952 jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED); // miscompare 3953 addptr(result, stride2x2); // update since we already compared at this addr 3954 subl(cnt2, stride2x2); // and sub the size too 3955 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3); 3956 3957 vpxor(vec1, vec1); 3958 jmpb(COMPARE_WIDE_TAIL); 3959 }//if (VM_Version::supports_avx512vlbw()) 3960 #endif // _LP64 3961 3962 3963 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3964 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3965 vmovdqu(vec1, Address(str1, result, scale)); 3966 vpxor(vec1, Address(str2, result, scale)); 3967 } else { 3968 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit); 3969 vpxor(vec1, Address(str2, result, scale2)); 3970 } 3971 vptest(vec1, vec1); 3972 jcc(Assembler::notZero, VECTOR_NOT_EQUAL); 3973 addptr(result, stride2); 3974 subl(cnt2, stride2); 3975 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP); 3976 // clean upper bits of YMM registers 
3977     vpxor(vec1, vec1);
3978
3979     // compare wide vectors tail
3980     bind(COMPARE_WIDE_TAIL);
3981     testptr(result, result);
3982     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3983
3984     movl(result, stride2);
3985     movl(cnt2, result);
3986     negptr(result);
3987     jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3988
3989     // Identifies the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
3990     bind(VECTOR_NOT_EQUAL);
3991     // clean upper bits of YMM registers
3992     vpxor(vec1, vec1);
3993     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3994       lea(str1, Address(str1, result, scale));
3995       lea(str2, Address(str2, result, scale));
3996     } else {
3997       lea(str1, Address(str1, result, scale1));
3998       lea(str2, Address(str2, result, scale2));
3999     }
4000     jmp(COMPARE_16_CHARS);
4001
4002     // Compare tail chars, length between 1 and 15 chars
4003     bind(COMPARE_TAIL_LONG);
4004     movl(cnt2, result);
4005     cmpl(cnt2, stride);
4006     jcc(Assembler::less, COMPARE_SMALL_STR);
4007
4008     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
4009       movdqu(vec1, Address(str1, 0));
4010     } else {
4011       pmovzxbw(vec1, Address(str1, 0));
4012     }
4013     pcmpestri(vec1, Address(str2, 0), pcmpmask);
4014     jcc(Assembler::below, COMPARE_INDEX_CHAR);
4015     subptr(cnt2, stride);
4016     jcc(Assembler::zero, LENGTH_DIFF_LABEL);
4017     if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
4018       lea(str1, Address(str1, result, scale));
4019       lea(str2, Address(str2, result, scale));
4020     } else {
4021       lea(str1, Address(str1, result, scale1));
4022       lea(str2, Address(str2, result, scale2));
4023     }
4024     negptr(cnt2);
4025     jmpb(WHILE_HEAD_LABEL);
4026
4027     bind(COMPARE_SMALL_STR);
4028   } else if (UseSSE42Intrinsics) {
4029     Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
4030     int pcmpmask = 0x19;
4031     // Setup to compare 8-char (16-byte) vectors,
4032     // start from first character again because it has aligned address.
4033 movl(result, cnt2); 4034 andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count 4035 if (ae == StrIntrinsicNode::LL) { 4036 pcmpmask &= ~0x01; 4037 } 4038 jcc(Assembler::zero, COMPARE_TAIL); 4039 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 4040 lea(str1, Address(str1, result, scale)); 4041 lea(str2, Address(str2, result, scale)); 4042 } else { 4043 lea(str1, Address(str1, result, scale1)); 4044 lea(str2, Address(str2, result, scale2)); 4045 } 4046 negptr(result); 4047 4048 // pcmpestri 4049 // inputs: 4050 // vec1- substring 4051 // rax - negative string length (elements count) 4052 // mem - scanned string 4053 // rdx - string length (elements count) 4054 // pcmpmask - cmp mode: 11000 (string compare with negated result) 4055 // + 00 (unsigned bytes) or + 01 (unsigned shorts) 4056 // outputs: 4057 // rcx - first mismatched element index 4058 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 4059 4060 bind(COMPARE_WIDE_VECTORS); 4061 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 4062 movdqu(vec1, Address(str1, result, scale)); 4063 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 4064 } else { 4065 pmovzxbw(vec1, Address(str1, result, scale1)); 4066 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 4067 } 4068 // After pcmpestri cnt1(rcx) contains mismatched element index 4069 4070 jccb(Assembler::below, VECTOR_NOT_EQUAL); // CF==1 4071 addptr(result, stride); 4072 subptr(cnt2, stride); 4073 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 4074 4075 // compare wide vectors tail 4076 testptr(result, result); 4077 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 4078 4079 movl(cnt2, stride); 4080 movl(result, stride); 4081 negptr(result); 4082 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 4083 movdqu(vec1, Address(str1, result, scale)); 4084 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 4085 } else { 4086 pmovzxbw(vec1, Address(str1, result, scale1)); 4087 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 4088 } 4089 jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL); 4090 4091 // Mismatched characters in the vectors 4092 bind(VECTOR_NOT_EQUAL); 4093 addptr(cnt1, result); 4094 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 4095 subl(result, cnt2); 4096 jmpb(POP_LABEL); 4097 4098 bind(COMPARE_TAIL); // limit is zero 4099 movl(cnt2, result); 4100 // Fallthru to tail compare 4101 } 4102 // Shift str2 and str1 to the end of the arrays, negate min 4103 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 4104 lea(str1, Address(str1, cnt2, scale)); 4105 lea(str2, Address(str2, cnt2, scale)); 4106 } else { 4107 lea(str1, Address(str1, cnt2, scale1)); 4108 lea(str2, Address(str2, cnt2, scale2)); 4109 } 4110 decrementl(cnt2); // first character was compared already 4111 negptr(cnt2); 4112 4113 // Compare the rest of the elements 4114 bind(WHILE_HEAD_LABEL); 4115 load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae); 4116 subl(result, cnt1); 4117 jccb(Assembler::notZero, POP_LABEL); 4118 increment(cnt2); 4119 jccb(Assembler::notZero, WHILE_HEAD_LABEL); 4120 4121 // Strings are equal up to min length. Return the length difference. 
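  // In Java terms, when no mismatching element is found up to the common
  // length, the returned value follows the compareTo convention sketched
  // below (a paraphrase of the semantics, not JDK source; elem() stands
  // for the encoding-appropriate element load, and for UU the saved byte
  // difference is halved to a char count first):
  //
  //   int min = Math.min(len1, len2);
  //   for (int i = 0; i < min; i++) {
  //     int d = elem(s1, i) - elem(s2, i);
  //     if (d != 0) return d;
  //   }
  //   return len1 - len2;   // the value popped below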
4122   bind(LENGTH_DIFF_LABEL);
4123   pop(result);
4124   if (ae == StrIntrinsicNode::UU) {
4125     // Divide diff by 2 to get number of chars
4126     sarl(result, 1);
4127   }
4128   jmpb(DONE_LABEL);
4129
4130 #ifdef _LP64
4131   if (VM_Version::supports_avx512vlbw()) {
4132
4133     bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
4134
4135     kmovql(cnt1, mask);
4136     notq(cnt1);
4137     bsfq(cnt2, cnt1);
4138     if (ae != StrIntrinsicNode::LL) {
4139       // Divide diff by 2 to get number of chars
4140       sarl(cnt2, 1);
4141     }
4142     addq(result, cnt2);
4143     if (ae == StrIntrinsicNode::LL) {
4144       load_unsigned_byte(cnt1, Address(str2, result));
4145       load_unsigned_byte(result, Address(str1, result));
4146     } else if (ae == StrIntrinsicNode::UU) {
4147       load_unsigned_short(cnt1, Address(str2, result, scale));
4148       load_unsigned_short(result, Address(str1, result, scale));
4149     } else {
4150       load_unsigned_short(cnt1, Address(str2, result, scale2));
4151       load_unsigned_byte(result, Address(str1, result, scale1));
4152     }
4153     subl(result, cnt1);
4154     jmpb(POP_LABEL);
4155   }//if (VM_Version::supports_avx512vlbw())
4156 #endif // _LP64
4157
4158   // Discard the stored length difference
4159   bind(POP_LABEL);
4160   pop(cnt1);
4161
4162   // That's it
4163   bind(DONE_LABEL);
4164   if (ae == StrIntrinsicNode::UL) {
4165     negl(result);
4166   }
4167
4168 }
4169
4170 // Search for a non-ASCII character (negative byte value) in a byte array,
4171 // return the index of the first such character, otherwise the length
4172 // of the array segment searched.
4173 // ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
4174 // @IntrinsicCandidate
4175 // public static int countPositives(byte[] ba, int off, int len) {
4176 //   for (int i = off; i < off + len; i++) {
4177 //     if (ba[i] < 0) {
4178 //       return i - off;
4179 //     }
4180 //   }
4181 //   return len;
4182 // }
4183 void C2_MacroAssembler::count_positives(Register ary1, Register len,
4184                                         Register result, Register tmp1,
4185                                         XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
4186   // rsi: byte array
4187   // rcx: len
4188   // rax: result
4189   ShortBranchVerifier sbv(this);
4190   assert_different_registers(ary1, len, result, tmp1);
4191   assert_different_registers(vec1, vec2);
4192   Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
4193
4194   movl(result, len); // copy
4195   // len == 0
4196   testl(len, len);
4197   jcc(Assembler::zero, DONE);
4198
4199   if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
4200       VM_Version::supports_avx512vlbw() &&
4201       VM_Version::supports_bmi2()) {
4202
4203     Label test_64_loop, test_tail, BREAK_LOOP;
4204     movl(tmp1, len);
4205     vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
4206
4207     andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F
4208     andl(len,  0xffffffc0); // vector count (in chars)
4209     jccb(Assembler::zero, test_tail);
4210
4211     lea(ary1, Address(ary1, len, Address::times_1));
4212     negptr(len);
4213
4214     bind(test_64_loop);
4215     // Check whether our 64 byte-sized elements contain negatives
4216     evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
4217     kortestql(mask1, mask1);
4218     jcc(Assembler::notZero, BREAK_LOOP);
4219
4220     addptr(len, 64);
4221     jccb(Assembler::notZero, test_64_loop);
4222
4223     bind(test_tail);
4224     // bail out when there is nothing to be done
4225     testl(tmp1, -1);
4226     jcc(Assembler::zero, DONE);
4227
4228
4229     // check the tail for absence of negatives
4230     // ~(~0 << len) applied up to two times (for 32-bit scenario)
4231 #ifdef _LP64
4232     {
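      // Sketch of the mask computation below: shlxq shifts an all-ones
      // value left by tmp1 (the tail count) and the negation then leaves
      // exactly tmp1 low 1-bits. For example, with tmp1 == 3:
      //   ~0 << 3     == 0xFFFF...FFF8
      //   ~(~0 << 3)  == 0x0000...0007
      // kmovql turns this into a k-mask selecting only the tail byte lanes.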
4233       Register tmp3_aliased = len;
4234       mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
4235       shlxq(tmp3_aliased, tmp3_aliased, tmp1);
4236       notq(tmp3_aliased);
4237       kmovql(mask2, tmp3_aliased);
4238     }
4239 #else
4240     Label k_init;
4241     jmp(k_init);
4242
4243     // We cannot read 64 bits from a general purpose register on 32-bit, thus
4244     // we move the data required to compose 64 1's to the instruction stream.
4245     // We emit a 64-byte wide series of elements from 0..63 which later on is
4246     // used as a compare target with the tail count contained in the tmp1
4247     // register. The result is a k register having tmp1 consecutive 1s,
4248     // counting from the least significant bit.
4249     address tmp = pc();
4250     emit_int64(0x0706050403020100);
4251     emit_int64(0x0F0E0D0C0B0A0908);
4252     emit_int64(0x1716151413121110);
4253     emit_int64(0x1F1E1D1C1B1A1918);
4254     emit_int64(0x2726252423222120);
4255     emit_int64(0x2F2E2D2C2B2A2928);
4256     emit_int64(0x3736353433323130);
4257     emit_int64(0x3F3E3D3C3B3A3938);
4258
4259     bind(k_init);
4260     lea(len, InternalAddress(tmp));
4261     // create mask to test for negative byte inside a vector
4262     evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
4263     evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit);
4264
4265 #endif
4266     evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
4267     ktestq(mask1, mask2);
4268     jcc(Assembler::zero, DONE);
4269
4270     // do a full check for negative bytes in the tail
4271     movl(len, tmp1); // tmp1 holds the low 6 bits of the original len;
4272                      // ary1 is already pointing to the right place
4273     jmpb(TAIL_START);
4274
4275     bind(BREAK_LOOP);
4276     // At least one byte in the last 64 byte block was negative.
4277     // Set up to look at the last 64 bytes as if they were a tail
4278     lea(ary1, Address(ary1, len, Address::times_1));
4279     addptr(result, len);
4280     // Ignore the very last byte: if all others are positive,
4281     // it must be negative, so we can skip right to the 2+1 byte
4282     // end comparison at this point
4283     orl(result, 63);
4284     movl(len, 63);
4285     // Fallthru to tail compare
4286   } else {
4287
4288     if (UseAVX >= 2 && UseSSE >= 2) {
4289       // With AVX2, use 32-byte vector compare
4290       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4291
4292       // Compare 32-byte vectors
4293       testl(len, 0xffffffe0);   // vector count (in bytes)
4294       jccb(Assembler::zero, TAIL_START);
4295
4296       andl(len, 0xffffffe0);
4297       lea(ary1, Address(ary1, len, Address::times_1));
4298       negptr(len);
4299
4300       movl(tmp1, 0x80808080);   // create mask to test for negative bytes in vector
4301       movdl(vec2, tmp1);
4302       vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
4303
4304       bind(COMPARE_WIDE_VECTORS);
4305       vmovdqu(vec1, Address(ary1, len, Address::times_1));
4306       vptest(vec1, vec2);
4307       jccb(Assembler::notZero, BREAK_LOOP);
4308       addptr(len, 32);
4309       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4310
4311       testl(result, 0x0000001f);   // any bytes remaining?
4312       jcc(Assembler::zero, DONE);
4313
4314       // Quick test using the already prepared vector mask
4315       movl(len, result);
4316       andl(len, 0x0000001f);
4317       vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
4318       vptest(vec1, vec2);
4319       jcc(Assembler::zero, DONE);
4320       // There are negative bytes; jump to the tail to determine exactly where
4321       jmpb(TAIL_START);
4322
4323       bind(BREAK_LOOP);
4324       // At least one byte in the last 32-byte vector is negative.
4325       // Set up to look at the last 32 bytes as if they were a tail
4326       lea(ary1, Address(ary1, len, Address::times_1));
4327       addptr(result, len);
4328       // Ignore the very last byte: if all others are positive,
4329       // it must be negative, so we can skip right to the 2+1 byte
4330       // end comparison at this point
4331       orl(result, 31);
4332       movl(len, 31);
4333       // Fallthru to tail compare
4334     } else if (UseSSE42Intrinsics) {
4335       // With SSE4.2, use double quad vector compare
4336       Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
4337
4338       // Compare 16-byte vectors
4339       testl(len, 0xfffffff0);   // vector count (in bytes)
4340       jcc(Assembler::zero, TAIL_START);
4341
4342       andl(len, 0xfffffff0);
4343       lea(ary1, Address(ary1, len, Address::times_1));
4344       negptr(len);
4345
4346       movl(tmp1, 0x80808080);
4347       movdl(vec2, tmp1);
4348       pshufd(vec2, vec2, 0);
4349
4350       bind(COMPARE_WIDE_VECTORS);
4351       movdqu(vec1, Address(ary1, len, Address::times_1));
4352       ptest(vec1, vec2);
4353       jccb(Assembler::notZero, BREAK_LOOP);
4354       addptr(len, 16);
4355       jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4356
4357       testl(result, 0x0000000f); // len is zero, any bytes remaining?
4358       jcc(Assembler::zero, DONE);
4359
4360       // Quick test using the already prepared vector mask
4361       movl(len, result);
4362       andl(len, 0x0000000f);   // tail count (in bytes)
4363       movdqu(vec1, Address(ary1, len, Address::times_1, -16));
4364       ptest(vec1, vec2);
4365       jcc(Assembler::zero, DONE);
4366       jmpb(TAIL_START);
4367
4368       bind(BREAK_LOOP);
4369       // At least one byte in the last 16-byte vector is negative.
4370       // Set up and look at the last 16 bytes as if they were a tail
4371       lea(ary1, Address(ary1, len, Address::times_1));
4372       addptr(result, len);
4373       // Ignore the very last byte: if all others are positive,
4374       // it must be negative, so we can skip right to the 2+1 byte
4375       // end comparison at this point
4376       orl(result, 15);
4377       movl(len, 15);
4378       // Fallthru to tail compare
4379     }
4380   }
4381
4382   bind(TAIL_START);
4383   // Compare 4-byte vectors
4384   andl(len, 0xfffffffc); // vector count (in bytes)
4385   jccb(Assembler::zero, COMPARE_CHAR);
4386
4387   lea(ary1, Address(ary1, len, Address::times_1));
4388   negptr(len);
4389
4390   bind(COMPARE_VECTORS);
4391   movl(tmp1, Address(ary1, len, Address::times_1));
4392   andl(tmp1, 0x80808080);
4393   jccb(Assembler::notZero, TAIL_ADJUST);
4394   addptr(len, 4);
4395   jccb(Assembler::notZero, COMPARE_VECTORS);
4396
4397   // Compare trailing char (final 2-3 bytes), if any
4398   bind(COMPARE_CHAR);
4399
4400   testl(result, 0x2);   // tail char
4401   jccb(Assembler::zero, COMPARE_BYTE);
4402   load_unsigned_short(tmp1, Address(ary1, 0));
4403   andl(tmp1, 0x00008080);
4404   jccb(Assembler::notZero, CHAR_ADJUST);
4405   lea(ary1, Address(ary1, 2));
4406
4407   bind(COMPARE_BYTE);
4408   testl(result, 0x1);   // tail byte
4409   jccb(Assembler::zero, DONE);
4410   load_unsigned_byte(tmp1, Address(ary1, 0));
4411   testl(tmp1, 0x00000080);
4412   jccb(Assembler::zero, DONE);
4413   subptr(result, 1);
4414   jmpb(DONE);
4415
4416   bind(TAIL_ADJUST);
4417   // There is a negative byte in the last 4-byte block.
4418   // Adjust result and check the next three bytes
4419   addptr(result, len);
4420   orl(result, 3);
4421   lea(ary1, Address(ary1, len, Address::times_1));
4422   jmpb(COMPARE_CHAR);
4423
4424   bind(CHAR_ADJUST);
4425   // We are looking at a char + optional byte tail, and found that one
4426   // of the bytes in the char is negative. Adjust the result, check the
4427   // first byte and readjust if needed.
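  // A worked example of the adjustment below (the numbers are only
  // illustrative): for len == 7 the tail is bytes 4 and 5 (the char) plus
  // byte 6, and result & ~3 == 4, the index of the char's first byte.
  // tmp1 still holds the two char bytes; on little-endian x86 its low byte
  // is the byte at the lower index. If that byte's sign bit is set, the
  // index is already correct; otherwise the negative byte is the second
  // one and result is incremented.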
4428   andl(result, 0xfffffffc);
4429   testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
4430   jccb(Assembler::notZero, DONE);
4431   addptr(result, 1);
4432
4433   // That's it
4434   bind(DONE);
4435   if (UseAVX >= 2 && UseSSE >= 2) {
4436     // clean upper bits of YMM registers
4437     vpxor(vec1, vec1);
4438     vpxor(vec2, vec2);
4439   }
4440 }
4441
4442 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
4443 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
4444                                       Register limit, Register result, Register chr,
4445                                       XMMRegister vec1, XMMRegister vec2, bool is_char, KRegister mask) {
4446   ShortBranchVerifier sbv(this);
4447   Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
4448
4449   int length_offset = arrayOopDesc::length_offset_in_bytes();
4450   int base_offset   = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
4451
4452   if (is_array_equ) {
4453     // Check the input args
4454     cmpoop(ary1, ary2);
4455     jcc(Assembler::equal, TRUE_LABEL);
4456
4457     // Need additional checks for arrays_equals.
4458     testptr(ary1, ary1);
4459     jcc(Assembler::zero, FALSE_LABEL);
4460     testptr(ary2, ary2);
4461     jcc(Assembler::zero, FALSE_LABEL);
4462
4463     // Check the lengths
4464     movl(limit, Address(ary1, length_offset));
4465     cmpl(limit, Address(ary2, length_offset));
4466     jcc(Assembler::notEqual, FALSE_LABEL);
4467   }
4468
4469   // count == 0
4470   testl(limit, limit);
4471   jcc(Assembler::zero, TRUE_LABEL);
4472
4473   if (is_array_equ) {
4474     // Load array address
4475     lea(ary1, Address(ary1, base_offset));
4476     lea(ary2, Address(ary2, base_offset));
4477   }
4478
4479   if (is_array_equ && is_char) {
4480     // arrays_equals when used for char[].
4481     shll(limit, 1);      // byte count != 0
4482   }
4483   movl(result, limit); // copy
4484
4485   if (UseAVX >= 2) {
4486     // With AVX2, use 32-byte vector compare
4487     Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
4488
4489     // Compare 32-byte vectors
4490     andl(result, 0x0000001f);  // tail count (in bytes)
4491     andl(limit, 0xffffffe0);   // vector count (in bytes)
4492     jcc(Assembler::zero, COMPARE_TAIL);
4493
4494     lea(ary1, Address(ary1, limit, Address::times_1));
4495     lea(ary2, Address(ary2, limit, Address::times_1));
4496     negptr(limit);
4497
4498 #ifdef _LP64
4499     if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
4500       Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
4501
4502       cmpl(limit, -64);
4503       jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
4504
4505       bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
4506
4507       evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
4508       evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
4509       kortestql(mask, mask);
4510       jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
4511       addptr(limit, 64);  // update since we already compared at this addr
4512       cmpl(limit, -64);
4513       jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
4514
4515       // At this point we may still need to compare -limit+result bytes.
4516       // We could execute the next two instructions and just continue via non-wide path:
4517       //  cmpl(limit, 0);
4518       //  jcc(Assembler::equal, COMPARE_TAIL); // true
4519       // But since we stopped at the points ary{1,2}+limit which are
4520       // not farther than 64 bytes from the ends of arrays ary{1,2}+result
4521       // (|limit| <= 32 and result < 32),
4522       // we may just compare the last 64 bytes.
4523 // 4524 addptr(result, -64); // it is safe, bc we just came from this area 4525 evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit); 4526 evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit); 4527 kortestql(mask, mask); 4528 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4529 4530 jmp(TRUE_LABEL); 4531 4532 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 4533 4534 }//if (VM_Version::supports_avx512vlbw()) 4535 #endif //_LP64 4536 bind(COMPARE_WIDE_VECTORS); 4537 vmovdqu(vec1, Address(ary1, limit, Address::times_1)); 4538 vmovdqu(vec2, Address(ary2, limit, Address::times_1)); 4539 vpxor(vec1, vec2); 4540 4541 vptest(vec1, vec1); 4542 jcc(Assembler::notZero, FALSE_LABEL); 4543 addptr(limit, 32); 4544 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4545 4546 testl(result, result); 4547 jcc(Assembler::zero, TRUE_LABEL); 4548 4549 vmovdqu(vec1, Address(ary1, result, Address::times_1, -32)); 4550 vmovdqu(vec2, Address(ary2, result, Address::times_1, -32)); 4551 vpxor(vec1, vec2); 4552 4553 vptest(vec1, vec1); 4554 jccb(Assembler::notZero, FALSE_LABEL); 4555 jmpb(TRUE_LABEL); 4556 4557 bind(COMPARE_TAIL); // limit is zero 4558 movl(limit, result); 4559 // Fallthru to tail compare 4560 } else if (UseSSE42Intrinsics) { 4561 // With SSE4.2, use double quad vector compare 4562 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 4563 4564 // Compare 16-byte vectors 4565 andl(result, 0x0000000f); // tail count (in bytes) 4566 andl(limit, 0xfffffff0); // vector count (in bytes) 4567 jcc(Assembler::zero, COMPARE_TAIL); 4568 4569 lea(ary1, Address(ary1, limit, Address::times_1)); 4570 lea(ary2, Address(ary2, limit, Address::times_1)); 4571 negptr(limit); 4572 4573 bind(COMPARE_WIDE_VECTORS); 4574 movdqu(vec1, Address(ary1, limit, Address::times_1)); 4575 movdqu(vec2, Address(ary2, limit, Address::times_1)); 4576 pxor(vec1, vec2); 4577 4578 ptest(vec1, vec1); 4579 jcc(Assembler::notZero, FALSE_LABEL); 4580 addptr(limit, 16); 4581 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4582 4583 testl(result, result); 4584 jcc(Assembler::zero, TRUE_LABEL); 4585 4586 movdqu(vec1, Address(ary1, result, Address::times_1, -16)); 4587 movdqu(vec2, Address(ary2, result, Address::times_1, -16)); 4588 pxor(vec1, vec2); 4589 4590 ptest(vec1, vec1); 4591 jccb(Assembler::notZero, FALSE_LABEL); 4592 jmpb(TRUE_LABEL); 4593 4594 bind(COMPARE_TAIL); // limit is zero 4595 movl(limit, result); 4596 // Fallthru to tail compare 4597 } 4598 4599 // Compare 4-byte vectors 4600 andl(limit, 0xfffffffc); // vector count (in bytes) 4601 jccb(Assembler::zero, COMPARE_CHAR); 4602 4603 lea(ary1, Address(ary1, limit, Address::times_1)); 4604 lea(ary2, Address(ary2, limit, Address::times_1)); 4605 negptr(limit); 4606 4607 bind(COMPARE_VECTORS); 4608 movl(chr, Address(ary1, limit, Address::times_1)); 4609 cmpl(chr, Address(ary2, limit, Address::times_1)); 4610 jccb(Assembler::notEqual, FALSE_LABEL); 4611 addptr(limit, 4); 4612 jcc(Assembler::notZero, COMPARE_VECTORS); 4613 4614 // Compare trailing char (final 2 bytes), if any 4615 bind(COMPARE_CHAR); 4616 testl(result, 0x2); // tail char 4617 jccb(Assembler::zero, COMPARE_BYTE); 4618 load_unsigned_short(chr, Address(ary1, 0)); 4619 load_unsigned_short(limit, Address(ary2, 0)); 4620 cmpl(chr, limit); 4621 jccb(Assembler::notEqual, FALSE_LABEL); 4622 4623 if (is_array_equ && is_char) { 4624 bind(COMPARE_BYTE); 4625 } else { 4626 lea(ary1, Address(ary1, 2)); 4627 lea(ary2, Address(ary2, 2)); 4628 4629 bind(COMPARE_BYTE); 4630 testl(result, 0x1); 
// tail byte 4631 jccb(Assembler::zero, TRUE_LABEL); 4632 load_unsigned_byte(chr, Address(ary1, 0)); 4633 load_unsigned_byte(limit, Address(ary2, 0)); 4634 cmpl(chr, limit); 4635 jccb(Assembler::notEqual, FALSE_LABEL); 4636 } 4637 bind(TRUE_LABEL); 4638 movl(result, 1); // return true 4639 jmpb(DONE); 4640 4641 bind(FALSE_LABEL); 4642 xorl(result, result); // return false 4643 4644 // That's it 4645 bind(DONE); 4646 if (UseAVX >= 2) { 4647 // clean upper bits of YMM registers 4648 vpxor(vec1, vec1); 4649 vpxor(vec2, vec2); 4650 } 4651 } 4652 4653 #ifdef _LP64 4654 4655 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) { 4656 #define __ masm. 4657 Register dst = stub.data<0>(); 4658 XMMRegister src = stub.data<1>(); 4659 address target = stub.data<2>(); 4660 __ bind(stub.entry()); 4661 __ subptr(rsp, 8); 4662 __ movdbl(Address(rsp), src); 4663 __ call(RuntimeAddress(target)); 4664 __ pop(dst); 4665 __ jmp(stub.continuation()); 4666 #undef __ 4667 } 4668 4669 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) { 4670 assert(dst_bt == T_INT || dst_bt == T_LONG, ""); 4671 assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, ""); 4672 4673 address slowpath_target; 4674 if (dst_bt == T_INT) { 4675 if (src_bt == T_FLOAT) { 4676 cvttss2sil(dst, src); 4677 cmpl(dst, 0x80000000); 4678 slowpath_target = StubRoutines::x86::f2i_fixup(); 4679 } else { 4680 cvttsd2sil(dst, src); 4681 cmpl(dst, 0x80000000); 4682 slowpath_target = StubRoutines::x86::d2i_fixup(); 4683 } 4684 } else { 4685 if (src_bt == T_FLOAT) { 4686 cvttss2siq(dst, src); 4687 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip())); 4688 slowpath_target = StubRoutines::x86::f2l_fixup(); 4689 } else { 4690 cvttsd2siq(dst, src); 4691 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip())); 4692 slowpath_target = StubRoutines::x86::d2l_fixup(); 4693 } 4694 } 4695 4696 auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, 23, convertF2I_slowpath); 4697 jcc(Assembler::equal, stub->entry()); 4698 bind(stub->continuation()); 4699 } 4700 4701 #endif // _LP64 4702 4703 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4704 XMMRegister src1, int imm8, bool merge, int vlen_enc) { 4705 switch(ideal_opc) { 4706 case Op_LShiftVS: 4707 Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break; 4708 case Op_LShiftVI: 4709 Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break; 4710 case Op_LShiftVL: 4711 Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break; 4712 case Op_RShiftVS: 4713 Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break; 4714 case Op_RShiftVI: 4715 Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break; 4716 case Op_RShiftVL: 4717 Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break; 4718 case Op_URShiftVS: 4719 Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break; 4720 case Op_URShiftVI: 4721 Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break; 4722 case Op_URShiftVL: 4723 Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break; 4724 case Op_RotateRightV: 4725 evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4726 case Op_RotateLeftV: 4727 evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4728 default: 4729 fatal("Unsupported masked operation"); break; 4730 } 4731 } 4732 4733 void 
C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4734 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc, 4735 bool is_varshift) { 4736 switch (ideal_opc) { 4737 case Op_AddVB: 4738 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4739 case Op_AddVS: 4740 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4741 case Op_AddVI: 4742 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4743 case Op_AddVL: 4744 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4745 case Op_AddVF: 4746 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4747 case Op_AddVD: 4748 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4749 case Op_SubVB: 4750 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4751 case Op_SubVS: 4752 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4753 case Op_SubVI: 4754 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4755 case Op_SubVL: 4756 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4757 case Op_SubVF: 4758 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4759 case Op_SubVD: 4760 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4761 case Op_MulVS: 4762 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4763 case Op_MulVI: 4764 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4765 case Op_MulVL: 4766 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4767 case Op_MulVF: 4768 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4769 case Op_MulVD: 4770 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4771 case Op_DivVF: 4772 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4773 case Op_DivVD: 4774 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4775 case Op_SqrtVF: 4776 evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break; 4777 case Op_SqrtVD: 4778 evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break; 4779 case Op_AbsVB: 4780 evpabsb(dst, mask, src2, merge, vlen_enc); break; 4781 case Op_AbsVS: 4782 evpabsw(dst, mask, src2, merge, vlen_enc); break; 4783 case Op_AbsVI: 4784 evpabsd(dst, mask, src2, merge, vlen_enc); break; 4785 case Op_AbsVL: 4786 evpabsq(dst, mask, src2, merge, vlen_enc); break; 4787 case Op_FmaVF: 4788 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4789 case Op_FmaVD: 4790 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4791 case Op_VectorRearrange: 4792 evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break; 4793 case Op_LShiftVS: 4794 evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4795 case Op_LShiftVI: 4796 evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4797 case Op_LShiftVL: 4798 evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4799 case Op_RShiftVS: 4800 evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4801 case Op_RShiftVI: 4802 evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4803 case Op_RShiftVL: 4804 evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4805 case Op_URShiftVS: 4806 evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4807 case Op_URShiftVI: 4808 evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4809 case Op_URShiftVL: 4810 evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4811 case Op_RotateLeftV: 4812 evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4813 case Op_RotateRightV: 4814 evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break; 
4815 case Op_MaxV: 4816 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4817 case Op_MinV: 4818 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4819 case Op_XorV: 4820 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4821 case Op_OrV: 4822 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4823 case Op_AndV: 4824 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4825 default: 4826 fatal("Unsupported masked operation"); break; 4827 } 4828 } 4829 4830 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4831 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4832 switch (ideal_opc) { 4833 case Op_AddVB: 4834 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4835 case Op_AddVS: 4836 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4837 case Op_AddVI: 4838 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4839 case Op_AddVL: 4840 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4841 case Op_AddVF: 4842 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4843 case Op_AddVD: 4844 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4845 case Op_SubVB: 4846 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4847 case Op_SubVS: 4848 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4849 case Op_SubVI: 4850 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4851 case Op_SubVL: 4852 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4853 case Op_SubVF: 4854 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4855 case Op_SubVD: 4856 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4857 case Op_MulVS: 4858 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4859 case Op_MulVI: 4860 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4861 case Op_MulVL: 4862 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4863 case Op_MulVF: 4864 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4865 case Op_MulVD: 4866 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4867 case Op_DivVF: 4868 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4869 case Op_DivVD: 4870 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4871 case Op_FmaVF: 4872 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4873 case Op_FmaVD: 4874 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4875 case Op_MaxV: 4876 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4877 case Op_MinV: 4878 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4879 case Op_XorV: 4880 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4881 case Op_OrV: 4882 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4883 case Op_AndV: 4884 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4885 default: 4886 fatal("Unsupported masked operation"); break; 4887 } 4888 } 4889 4890 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst, 4891 KRegister src1, KRegister src2) { 4892 BasicType etype = T_ILLEGAL; 4893 switch(mask_len) { 4894 case 2: 4895 case 4: 4896 case 8: etype = T_BYTE; break; 4897 case 16: etype = T_SHORT; break; 4898 case 32: etype = T_INT; break; 4899 case 64: etype = T_LONG; break; 4900 default: fatal("Unsupported type"); break; 4901 } 4902 assert(etype != T_ILLEGAL, ""); 4903 switch(ideal_opc) { 4904 case Op_AndVMask: 4905 kand(etype, dst, src1, src2); break; 4906 case Op_OrVMask: 4907 kor(etype, dst, src1, src2); break; 4908 case Op_XorVMask: 
4909 kxor(etype, dst, src1, src2); break; 4910 default: 4911 fatal("Unsupported masked operation"); break; 4912 } 4913 } 4914 4915 /* 4916 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 4917 * If src is NaN, the result is 0. 4918 * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE, 4919 * the result is equal to the value of Integer.MIN_VALUE. 4920 * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE, 4921 * the result is equal to the value of Integer.MAX_VALUE. 4922 */ 4923 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4924 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 4925 Register rscratch, AddressLiteral float_sign_flip, 4926 int vec_enc) { 4927 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4928 Label done; 4929 vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch); 4930 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 4931 vptest(xtmp2, xtmp2, vec_enc); 4932 jccb(Assembler::equal, done); 4933 4934 vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc); 4935 vpxor(xtmp1, xtmp1, xtmp4, vec_enc); 4936 4937 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 4938 vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc); 4939 vblendvps(dst, dst, xtmp4, xtmp3, vec_enc); 4940 4941 // Recompute the mask for remaining special value. 4942 vpxor(xtmp2, xtmp2, xtmp3, vec_enc); 4943 // Extract SRC values corresponding to TRUE mask lanes. 4944 vpand(xtmp4, xtmp2, src, vec_enc); 4945 // Flip mask bits so that MSB bit of MASK lanes corresponding to +ve special 4946 // values are set. 4947 vpxor(xtmp3, xtmp2, xtmp4, vec_enc); 4948 4949 vblendvps(dst, dst, xtmp1, xtmp3, vec_enc); 4950 bind(done); 4951 } 4952 4953 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4954 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4955 Register rscratch, AddressLiteral float_sign_flip, 4956 int vec_enc) { 4957 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4958 Label done; 4959 evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 4960 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 4961 kortestwl(ktmp1, ktmp1); 4962 jccb(Assembler::equal, done); 4963 4964 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4965 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4966 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 4967 4968 kxorwl(ktmp1, ktmp1, ktmp2); 4969 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4970 vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4971 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 4972 bind(done); 4973 } 4974 4975 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4976 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4977 Register rscratch, AddressLiteral double_sign_flip, 4978 int vec_enc) { 4979 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing"); 4980 4981 Label done; 4982 evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch); 4983 Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc); 4984 kortestwl(ktmp1, ktmp1); 4985 jccb(Assembler::equal, done); 4986 4987 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4988 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4989 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc); 4990 4991 kxorwl(ktmp1, ktmp1, ktmp2); 
4992 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4993 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4994 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc); 4995 bind(done); 4996 } 4997 4998 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4999 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 5000 Register rscratch, AddressLiteral float_sign_flip, 5001 int vec_enc) { 5002 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 5003 Label done; 5004 evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 5005 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 5006 kortestwl(ktmp1, ktmp1); 5007 jccb(Assembler::equal, done); 5008 5009 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5010 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 5011 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 5012 5013 kxorwl(ktmp1, ktmp1, ktmp2); 5014 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 5015 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 5016 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 5017 bind(done); 5018 } 5019 5020 /* 5021 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 5022 * If src is NaN, the result is 0. 5023 * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE, 5024 * the result is equal to the value of Long.MIN_VALUE. 5025 * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE, 5026 * the result is equal to the value of Long.MAX_VALUE. 5027 */ 5028 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5029 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 5030 Register rscratch, AddressLiteral double_sign_flip, 5031 int vec_enc) { 5032 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing"); 5033 5034 Label done; 5035 evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch); 5036 evpcmpeqq(ktmp1, xtmp1, dst, vec_enc); 5037 kortestwl(ktmp1, ktmp1); 5038 jccb(Assembler::equal, done); 5039 5040 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5041 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 5042 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc); 5043 5044 kxorwl(ktmp1, ktmp1, ktmp2); 5045 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 5046 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 5047 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc); 5048 bind(done); 5049 } 5050 5051 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero, 5052 XMMRegister xtmp, int index, int vec_enc) { 5053 assert(vec_enc < Assembler::AVX_512bit, ""); 5054 if (vec_enc == Assembler::AVX_256bit) { 5055 vextractf128_high(xtmp, src); 5056 vshufps(dst, src, xtmp, index, vec_enc); 5057 } else { 5058 vshufps(dst, src, zero, index, vec_enc); 5059 } 5060 } 5061 5062 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 5063 XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch, 5064 AddressLiteral float_sign_flip, int src_vec_enc) { 5065 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 5066 5067 Label done; 5068 // Compare the destination lanes with float_sign_flip 5069 // value to get mask for all special values. 
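// The special-value pattern is 0x80000000: vcvttpd2dq produces this "integer
// indefinite" value for NaN and out-of-range inputs, matching Java scalar
// semantics (illustrative): (int)Double.NaN == 0, (int)1e30 == Integer.MAX_VALUE,
// (int)-1e30 == Integer.MIN_VALUE.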
5070 movdqu(xtmp1, float_sign_flip, rscratch); 5071 vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit); 5072 ptest(xtmp2, xtmp2); 5073 jccb(Assembler::equal, done); 5074 5075 // Flip float_sign_flip to get max integer value. 5076 vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit); 5077 pxor(xtmp1, xtmp4); 5078 5079 // Set destination lanes corresponding to unordered source lanes to zero. 5080 vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc); 5081 vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc); 5082 5083 // Shuffle mask vector and pack the lower double word from each quadword lane. 5084 vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc); 5085 vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit); 5086 5087 // Recompute the mask for the remaining special values. 5088 pxor(xtmp2, xtmp3); 5089 // Extract mask corresponding to non-negative source lanes. 5090 vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc); 5091 5092 // Shuffle mask vector and pack the lower double word from each quadword lane. 5093 vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc); 5094 pand(xtmp3, xtmp2); 5095 5096 // Replace destination lanes holding the special value (0x80000000) with max int 5097 // if the corresponding source lane holds a +ve value. 5098 vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit); 5099 bind(done); 5100 } 5101 5102 5103 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero, 5104 XMMRegister xtmp, Register rscratch, int vec_enc) { 5105 switch(to_elem_bt) { 5106 case T_SHORT: 5107 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing"); 5108 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch); 5109 vpackusdw(dst, dst, zero, vec_enc); 5110 if (vec_enc == Assembler::AVX_256bit) { 5111 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc); 5112 } 5113 break; 5114 case T_BYTE: 5115 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing"); 5116 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch); 5117 vpackusdw(dst, dst, zero, vec_enc); 5118 if (vec_enc == Assembler::AVX_256bit) { 5119 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc); 5120 } 5121 vpackuswb(dst, dst, zero, vec_enc); 5122 break; 5123 default: assert(false, "%s", type2name(to_elem_bt)); 5124 } 5125 } 5126 5127 /* 5128 * Algorithm for vector D2L and F2I conversions:- 5129 * a) Perform vector D2L/F2I cast. 5130 * b) Choose the fast path if none of the result vector lanes contains the 0x80000000 value. 5131 * A lane holding it signifies that the source value could be any of the special floating point 5132 * values (NaN, -Inf, Inf, Max, -Min). 5133 * c) Set the destination to zero if the source is NaN. 5134 * d) Replace 0x80000000 with MaxInt if the source lane contains a +ve value.
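 * Illustrative walk-through of the steps above for F2I (Java scalar semantics;
 * example values chosen for illustration):
 *   (int)Float.NaN               -> a) yields 0x80000000 -> c) result 0
 *   (int)Float.POSITIVE_INFINITY -> a) yields 0x80000000 -> d) result Integer.MAX_VALUE
 *   (int)-1.0e20f                -> a) yields 0x80000000 -> stays Integer.MIN_VALUE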
5135 */ 5136 5137 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5138 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 5139 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 5140 int to_elem_sz = type2aelembytes(to_elem_bt); 5141 assert(to_elem_sz <= 4, ""); 5142 vcvttps2dq(dst, src, vec_enc); 5143 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc); 5144 if (to_elem_sz < 4) { 5145 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 5146 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc); 5147 } 5148 } 5149 5150 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5151 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip, 5152 Register rscratch, int vec_enc) { 5153 int to_elem_sz = type2aelembytes(to_elem_bt); 5154 assert(to_elem_sz <= 4, ""); 5155 vcvttps2dq(dst, src, vec_enc); 5156 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc); 5157 switch(to_elem_bt) { 5158 case T_INT: 5159 break; 5160 case T_SHORT: 5161 evpmovdw(dst, dst, vec_enc); 5162 break; 5163 case T_BYTE: 5164 evpmovdb(dst, dst, vec_enc); 5165 break; 5166 default: assert(false, "%s", type2name(to_elem_bt)); 5167 } 5168 } 5169 5170 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 5171 KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip, 5172 Register rscratch, int vec_enc) { 5173 evcvttps2qq(dst, src, vec_enc); 5174 vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc); 5175 } 5176 5177 // Handling for downcasting from double to integer or sub-word types on AVX2. 5178 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5179 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, 5180 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 5181 int to_elem_sz = type2aelembytes(to_elem_bt); 5182 assert(to_elem_sz < 8, ""); 5183 vcvttpd2dq(dst, src, vec_enc); 5184 vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch, 5185 float_sign_flip, vec_enc); 5186 if (to_elem_sz < 4) { 5187 // xtmp4 holds all zero lanes. 
5188 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit); 5189 } 5190 } 5191 5192 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, 5193 XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, 5194 KRegister ktmp2, AddressLiteral sign_flip, 5195 Register rscratch, int vec_enc) { 5196 if (VM_Version::supports_avx512dq()) { 5197 evcvttpd2qq(dst, src, vec_enc); 5198 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 5199 switch(to_elem_bt) { 5200 case T_LONG: 5201 break; 5202 case T_INT: 5203 evpmovsqd(dst, dst, vec_enc); 5204 break; 5205 case T_SHORT: 5206 evpmovsqd(dst, dst, vec_enc); 5207 evpmovdw(dst, dst, vec_enc); 5208 break; 5209 case T_BYTE: 5210 evpmovsqd(dst, dst, vec_enc); 5211 evpmovdb(dst, dst, vec_enc); 5212 break; 5213 default: assert(false, "%s", type2name(to_elem_bt)); 5214 } 5215 } else { 5216 assert(type2aelembytes(to_elem_bt) <= 4, ""); 5217 vcvttpd2dq(dst, src, vec_enc); 5218 vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 5219 switch(to_elem_bt) { 5220 case T_INT: 5221 break; 5222 case T_SHORT: 5223 evpmovdw(dst, dst, vec_enc); 5224 break; 5225 case T_BYTE: 5226 evpmovdb(dst, dst, vec_enc); 5227 break; 5228 default: assert(false, "%s", type2name(to_elem_bt)); 5229 } 5230 } 5231 } 5232 5233 #ifdef _LP64 5234 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src, 5235 AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5236 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 5237 // Perform the floor(val+0.5) operation under the influence of MXCSR.RC mode set to round towards -inf, 5238 // and restore the original MXCSR.RC mode afterwards. 5239 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5240 5241 mov64(tmp, julong_cast(0.5L)); 5242 evpbroadcastq(xtmp1, tmp, vec_enc); 5243 vaddpd(xtmp1, src, xtmp1, vec_enc); 5244 evcvtpd2qq(dst, xtmp1, vec_enc); 5245 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 5246 double_sign_flip, vec_enc); 5247 5248 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5249 } 5250 5251 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src, 5252 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5253 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 5254 // Perform the floor(val+0.5) operation under the influence of MXCSR.RC mode set to round towards -inf, 5255 // and restore the original MXCSR.RC mode afterwards.
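// Illustrative scalar equivalence for the rounding trick below: with RC set to
// round towards -inf the conversion acts as floor(), so adding 0.5 and then
// converting matches Java's Math.round, e.g. 2.3f -> floor(2.8f) -> 2,
// 2.5f -> floor(3.0f) -> 3, and -2.5f -> floor(-2.0f) -> -2.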
5256 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5257 5258 movl(tmp, jint_cast(0.5)); 5259 movq(xtmp1, tmp); 5260 vbroadcastss(xtmp1, xtmp1, vec_enc); 5261 vaddps(xtmp1, src, xtmp1, vec_enc); 5262 vcvtps2dq(dst, xtmp1, vec_enc); 5263 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 5264 float_sign_flip, vec_enc); 5265 5266 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5267 } 5268 5269 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src, 5270 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5271 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) { 5272 // Perform the floor(val+0.5) operation under the influence of MXCSR.RC mode set to round towards -inf, 5273 // and restore the original MXCSR.RC mode afterwards. 5274 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5275 5276 movl(tmp, jint_cast(0.5)); 5277 movq(xtmp1, tmp); 5278 vbroadcastss(xtmp1, xtmp1, vec_enc); 5279 vaddps(xtmp1, src, xtmp1, vec_enc); 5280 vcvtps2dq(dst, xtmp1, vec_enc); 5281 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc); 5282 5283 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5284 } 5285 #endif // _LP64 5286 5287 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 5288 BasicType from_elem_bt, BasicType to_elem_bt) { 5289 switch (from_elem_bt) { 5290 case T_BYTE: 5291 switch (to_elem_bt) { 5292 case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break; 5293 case T_INT: vpmovzxbd(dst, src, vlen_enc); break; 5294 case T_LONG: vpmovzxbq(dst, src, vlen_enc); break; 5295 default: ShouldNotReachHere(); 5296 } 5297 break; 5298 case T_SHORT: 5299 switch (to_elem_bt) { 5300 case T_INT: vpmovzxwd(dst, src, vlen_enc); break; 5301 case T_LONG: vpmovzxwq(dst, src, vlen_enc); break; 5302 default: ShouldNotReachHere(); 5303 } 5304 break; 5305 case T_INT: 5306 assert(to_elem_bt == T_LONG, ""); 5307 vpmovzxdq(dst, src, vlen_enc); 5308 break; 5309 default: 5310 ShouldNotReachHere(); 5311 } 5312 } 5313 5314 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 5315 BasicType from_elem_bt, BasicType to_elem_bt) { 5316 switch (from_elem_bt) { 5317 case T_BYTE: 5318 switch (to_elem_bt) { 5319 case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break; 5320 case T_INT: vpmovsxbd(dst, src, vlen_enc); break; 5321 case T_LONG: vpmovsxbq(dst, src, vlen_enc); break; 5322 default: ShouldNotReachHere(); 5323 } 5324 break; 5325 case T_SHORT: 5326 switch (to_elem_bt) { 5327 case T_INT: vpmovsxwd(dst, src, vlen_enc); break; 5328 case T_LONG: vpmovsxwq(dst, src, vlen_enc); break; 5329 default: ShouldNotReachHere(); 5330 } 5331 break; 5332 case T_INT: 5333 assert(to_elem_bt == T_LONG, ""); 5334 vpmovsxdq(dst, src, vlen_enc); 5335 break; 5336 default: 5337 ShouldNotReachHere(); 5338 } 5339 } 5340 5341 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src, 5342 BasicType dst_bt, BasicType src_bt, int vlen) { 5343 int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen); 5344 assert(vlen_enc != AVX_512bit, ""); 5345 5346 int dst_bt_size = type2aelembytes(dst_bt); 5347 int src_bt_size = type2aelembytes(src_bt); 5348 if (dst_bt_size > src_bt_size) { 5349 switch (dst_bt_size / src_bt_size) { 5350 case 2: vpmovsxbw(dst, src, vlen_enc); break; 5351 case 4: vpmovsxbd(dst, src, vlen_enc); break;
5352 case 8: vpmovsxbq(dst, src, vlen_enc); break; 5353 default: ShouldNotReachHere(); 5354 } 5355 } else { 5356 assert(dst_bt_size < src_bt_size, ""); 5357 switch (src_bt_size / dst_bt_size) { 5358 case 2: { 5359 if (vlen_enc == AVX_128bit) { 5360 vpacksswb(dst, src, src, vlen_enc); 5361 } else { 5362 vpacksswb(dst, src, src, vlen_enc); 5363 vpermq(dst, dst, 0x08, vlen_enc); 5364 } 5365 break; 5366 } 5367 case 4: { 5368 if (vlen_enc == AVX_128bit) { 5369 vpackssdw(dst, src, src, vlen_enc); 5370 vpacksswb(dst, dst, dst, vlen_enc); 5371 } else { 5372 vpackssdw(dst, src, src, vlen_enc); 5373 vpermq(dst, dst, 0x08, vlen_enc); 5374 vpacksswb(dst, dst, dst, AVX_128bit); 5375 } 5376 break; 5377 } 5378 case 8: { 5379 if (vlen_enc == AVX_128bit) { 5380 vpshufd(dst, src, 0x08, vlen_enc); 5381 vpackssdw(dst, dst, dst, vlen_enc); 5382 vpacksswb(dst, dst, dst, vlen_enc); 5383 } else { 5384 vpshufd(dst, src, 0x08, vlen_enc); 5385 vpermq(dst, dst, 0x08, vlen_enc); 5386 vpackssdw(dst, dst, dst, AVX_128bit); 5387 vpacksswb(dst, dst, dst, AVX_128bit); 5388 } 5389 break; 5390 } 5391 default: ShouldNotReachHere(); 5392 } 5393 } 5394 } 5395 5396 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3, 5397 bool merge, BasicType bt, int vlen_enc) { 5398 if (bt == T_INT) { 5399 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 5400 } else { 5401 assert(bt == T_LONG, ""); 5402 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5403 } 5404 } 5405 5406 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3, 5407 bool merge, BasicType bt, int vlen_enc) { 5408 if (bt == T_INT) { 5409 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 5410 } else { 5411 assert(bt == T_LONG, ""); 5412 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5413 } 5414 } 5415 5416 #ifdef _LP64 5417 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1, 5418 Register rtmp2, XMMRegister xtmp, int mask_len, 5419 int vec_enc) { 5420 int index = 0; 5421 int vindex = 0; 5422 mov64(rtmp1, 0x0101010101010101L); 5423 pdepq(rtmp1, src, rtmp1); 5424 if (mask_len > 8) { 5425 movq(rtmp2, src); 5426 vpxor(xtmp, xtmp, xtmp, vec_enc); 5427 movq(xtmp, rtmp1); 5428 } 5429 movq(dst, rtmp1); 5430 5431 mask_len -= 8; 5432 while (mask_len > 0) { 5433 assert ((mask_len & 0x7) == 0, "mask must be multiple of 8"); 5434 index++; 5435 if ((index % 2) == 0) { 5436 pxor(xtmp, xtmp); 5437 } 5438 mov64(rtmp1, 0x0101010101010101L); 5439 shrq(rtmp2, 8); 5440 pdepq(rtmp1, rtmp2, rtmp1); 5441 pinsrq(xtmp, rtmp1, index % 2); 5442 vindex = index / 2; 5443 if (vindex) { 5444 // Write entire 16 byte vector when both 64 bit 5445 // lanes are updated to save redundant instructions.
5446 if (index % 2) { 5447 vinsertf128(dst, dst, xtmp, vindex); 5448 } 5449 } else { 5450 vmovdqu(dst, xtmp); 5451 } 5452 mask_len -= 8; 5453 } 5454 } 5455 5456 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) { 5457 switch(opc) { 5458 case Op_VectorMaskTrueCount: 5459 popcntq(dst, tmp); 5460 break; 5461 case Op_VectorMaskLastTrue: 5462 if (VM_Version::supports_lzcnt()) { 5463 lzcntq(tmp, tmp); 5464 movl(dst, 63); 5465 subl(dst, tmp); 5466 } else { 5467 movl(dst, -1); 5468 bsrq(tmp, tmp); 5469 cmov32(Assembler::notZero, dst, tmp); 5470 } 5471 break; 5472 case Op_VectorMaskFirstTrue: 5473 if (VM_Version::supports_bmi1()) { 5474 if (masklen < 32) { 5475 orl(tmp, 1 << masklen); 5476 tzcntl(dst, tmp); 5477 } else if (masklen == 32) { 5478 tzcntl(dst, tmp); 5479 } else { 5480 assert(masklen == 64, ""); 5481 tzcntq(dst, tmp); 5482 } 5483 } else { 5484 if (masklen < 32) { 5485 orl(tmp, 1 << masklen); 5486 bsfl(dst, tmp); 5487 } else { 5488 assert(masklen == 32 || masklen == 64, ""); 5489 movl(dst, masklen); 5490 if (masklen == 32) { 5491 bsfl(tmp, tmp); 5492 } else { 5493 bsfq(tmp, tmp); 5494 } 5495 cmov32(Assembler::notZero, dst, tmp); 5496 } 5497 } 5498 break; 5499 case Op_VectorMaskToLong: 5500 assert(dst == tmp, "Dst and tmp should be the same for toLong operations"); 5501 break; 5502 default: assert(false, "Unhandled mask operation"); 5503 } 5504 } 5505 5506 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp, 5507 int masklen, int masksize, int vec_enc) { 5508 assert(VM_Version::supports_popcnt(), ""); 5509 5510 if(VM_Version::supports_avx512bw()) { 5511 kmovql(tmp, mask); 5512 } else { 5513 assert(masklen <= 16, ""); 5514 kmovwl(tmp, mask); 5515 } 5516 5517 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5518 // operations needs to be clipped. 5519 if (masksize < 16 && opc != Op_VectorMaskFirstTrue) { 5520 andq(tmp, (1 << masklen) - 1); 5521 } 5522 5523 vector_mask_operation_helper(opc, dst, tmp, masklen); 5524 } 5525 5526 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp, 5527 Register tmp, int masklen, BasicType bt, int vec_enc) { 5528 assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) || 5529 (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), ""); 5530 assert(VM_Version::supports_popcnt(), ""); 5531 5532 bool need_clip = false; 5533 switch(bt) { 5534 case T_BOOLEAN: 5535 // While masks of other types contain 0, -1; boolean masks contain lane values of 0, 1 5536 vpxor(xtmp, xtmp, xtmp, vec_enc); 5537 vpsubb(xtmp, xtmp, mask, vec_enc); 5538 vpmovmskb(tmp, xtmp, vec_enc); 5539 need_clip = masklen < 16; 5540 break; 5541 case T_BYTE: 5542 vpmovmskb(tmp, mask, vec_enc); 5543 need_clip = masklen < 16; 5544 break; 5545 case T_SHORT: 5546 vpacksswb(xtmp, mask, mask, vec_enc); 5547 if (masklen >= 16) { 5548 vpermpd(xtmp, xtmp, 8, vec_enc); 5549 } 5550 vpmovmskb(tmp, xtmp, Assembler::AVX_128bit); 5551 need_clip = masklen < 16; 5552 break; 5553 case T_INT: 5554 case T_FLOAT: 5555 vmovmskps(tmp, mask, vec_enc); 5556 need_clip = masklen < 4; 5557 break; 5558 case T_LONG: 5559 case T_DOUBLE: 5560 vmovmskpd(tmp, mask, vec_enc); 5561 need_clip = masklen < 2; 5562 break; 5563 default: assert(false, "Unhandled type, %s", type2name(bt)); 5564 } 5565 5566 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5567 // operations needs to be clipped. 
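// Illustrative: for a byte mask with masklen == 4, vpmovmskb above fills 16 bits
// of tmp but only the low 4 correspond to real lanes, so tmp &= (1 << 4) - 1
// discards the stale bits. Op_VectorMaskFirstTrue needs no clipping since the
// helper bounds the search by OR-ing in bit masklen.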
5568 if (need_clip && opc != Op_VectorMaskFirstTrue) { 5569 // need_clip implies masklen < 32 5570 andq(tmp, (1 << masklen) - 1); 5571 } 5572 5573 vector_mask_operation_helper(opc, dst, tmp, masklen); 5574 } 5575 5576 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1, 5577 Register rtmp2, int mask_len) { 5578 kmov(rtmp1, src); 5579 andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len))); 5580 mov64(rtmp2, -1L); 5581 pextq(rtmp2, rtmp2, rtmp1); 5582 kmov(dst, rtmp2); 5583 } 5584 5585 #ifdef _LP64 5586 void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src, 5587 XMMRegister mask, Register rtmp, Register rscratch, 5588 XMMRegister permv, XMMRegister xtmp, BasicType bt, 5589 int vec_enc) { 5590 assert(type2aelembytes(bt) >= 4, ""); 5591 assert(opcode == Op_CompressV || opcode == Op_ExpandV, ""); 5592 address compress_perm_table = nullptr; 5593 address expand_perm_table = nullptr; 5594 if (type2aelembytes(bt) == 8) { 5595 compress_perm_table = StubRoutines::x86::compress_perm_table64(); 5596 expand_perm_table = StubRoutines::x86::expand_perm_table64(); 5597 vmovmskpd(rtmp, mask, vec_enc); 5598 } else { 5599 compress_perm_table = StubRoutines::x86::compress_perm_table32(); 5600 expand_perm_table = StubRoutines::x86::expand_perm_table32(); 5601 vmovmskps(rtmp, mask, vec_enc); 5602 } 5603 shlq(rtmp, 5); // for 32 byte permute row. 5604 if (opcode == Op_CompressV) { 5605 lea(rscratch, ExternalAddress(compress_perm_table)); 5606 } else { 5607 lea(rscratch, ExternalAddress(expand_perm_table)); 5608 } 5609 addptr(rtmp, rscratch); 5610 vmovdqu(permv, Address(rtmp)); 5611 vpermps(dst, permv, src, Assembler::AVX_256bit); 5612 vpxor(xtmp, xtmp, xtmp, vec_enc); 5613 // Blend the result with the zero vector using the permute mask: each column entry 5614 // in a permute table row contains either a valid permute index or a -1 (default) 5615 // value, so the row can double as a blending mask after 5616 // compressing/expanding the source vector lanes.
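// Illustrative 4-lane compress, assuming the stub table layout described above:
// mask = 0b0101 over src = [a,b,c,d] selects lanes 0 and 2; the fetched permute
// row would be [0, 2, -1, -1, ...], vpermps packs [a, c, ?, ?], and the -1
// entries (MSB set) then blend in zero lanes below to give [a, c, 0, 0].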
5617 vblendvps(dst, dst, xtmp, permv, vec_enc, false, permv); 5618 } 5619 #endif 5620 5621 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask, 5622 bool merge, BasicType bt, int vec_enc) { 5623 if (opcode == Op_CompressV) { 5624 switch(bt) { 5625 case T_BYTE: 5626 evpcompressb(dst, mask, src, merge, vec_enc); 5627 break; 5628 case T_CHAR: 5629 case T_SHORT: 5630 evpcompressw(dst, mask, src, merge, vec_enc); 5631 break; 5632 case T_INT: 5633 evpcompressd(dst, mask, src, merge, vec_enc); 5634 break; 5635 case T_FLOAT: 5636 evcompressps(dst, mask, src, merge, vec_enc); 5637 break; 5638 case T_LONG: 5639 evpcompressq(dst, mask, src, merge, vec_enc); 5640 break; 5641 case T_DOUBLE: 5642 evcompresspd(dst, mask, src, merge, vec_enc); 5643 break; 5644 default: 5645 fatal("Unsupported type %s", type2name(bt)); 5646 break; 5647 } 5648 } else { 5649 assert(opcode == Op_ExpandV, ""); 5650 switch(bt) { 5651 case T_BYTE: 5652 evpexpandb(dst, mask, src, merge, vec_enc); 5653 break; 5654 case T_CHAR: 5655 case T_SHORT: 5656 evpexpandw(dst, mask, src, merge, vec_enc); 5657 break; 5658 case T_INT: 5659 evpexpandd(dst, mask, src, merge, vec_enc); 5660 break; 5661 case T_FLOAT: 5662 evexpandps(dst, mask, src, merge, vec_enc); 5663 break; 5664 case T_LONG: 5665 evpexpandq(dst, mask, src, merge, vec_enc); 5666 break; 5667 case T_DOUBLE: 5668 evexpandpd(dst, mask, src, merge, vec_enc); 5669 break; 5670 default: 5671 fatal("Unsupported type %s", type2name(bt)); 5672 break; 5673 } 5674 } 5675 } 5676 #endif 5677 5678 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5679 KRegister ktmp1, int vec_enc) { 5680 if (opcode == Op_SignumVD) { 5681 vsubpd(dst, zero, one, vec_enc); 5682 // if src < 0 ? -1 : 1 5683 evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5684 evblendmpd(dst, ktmp1, one, dst, true, vec_enc); 5685 // if src == NaN, -0.0 or 0.0 return src. 5686 evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5687 evblendmpd(dst, ktmp1, dst, src, true, vec_enc); 5688 } else { 5689 assert(opcode == Op_SignumVF, ""); 5690 vsubps(dst, zero, one, vec_enc); 5691 // if src < 0 ? -1 : 1 5692 evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5693 evblendmps(dst, ktmp1, one, dst, true, vec_enc); 5694 // if src == NaN, -0.0 or 0.0 return src. 5695 evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5696 evblendmps(dst, ktmp1, dst, src, true, vec_enc); 5697 } 5698 } 5699 5700 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5701 XMMRegister xtmp1, int vec_enc) { 5702 if (opcode == Op_SignumVD) { 5703 vsubpd(dst, zero, one, vec_enc); 5704 // if src < 0 ? -1 : 1 5705 vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1); 5706 // if src == NaN, -0.0 or 0.0 return src. 5707 vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc); 5708 vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1); 5709 } else { 5710 assert(opcode == Op_SignumVF, ""); 5711 vsubps(dst, zero, one, vec_enc); 5712 // if src < 0 ? -1 : 1 5713 vblendvps(dst, one, dst, src, vec_enc, true, xtmp1); 5714 // if src == NaN, -0.0 or 0.0 return src. 
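// EQ_UQ compares as equal-or-unordered, so this single predicate catches both
// +/-0.0 (equal to zero) and NaN (unordered), matching Java's Math.signum,
// e.g. signum(-0.0) == -0.0 and signum(NaN) == NaN (illustrative).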
5715 vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc); 5716 vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1); 5717 } 5718 } 5719 5720 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) { 5721 if (VM_Version::supports_avx512bw()) { 5722 if (mask_len > 32) { 5723 kmovql(dst, src); 5724 } else { 5725 kmovdl(dst, src); 5726 if (mask_len != 32) { 5727 kshiftrdl(dst, dst, 32 - mask_len); 5728 } 5729 } 5730 } else { 5731 assert(mask_len <= 16, ""); 5732 kmovwl(dst, src); 5733 if (mask_len != 16) { 5734 kshiftrwl(dst, dst, 16 - mask_len); 5735 } 5736 } 5737 } 5738 5739 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) { 5740 int lane_size = type2aelembytes(bt); 5741 bool is_LP64 = LP64_ONLY(true) NOT_LP64(false); 5742 if ((is_LP64 || lane_size < 8) && 5743 ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) || 5744 (is_subword_type(bt) && VM_Version::supports_avx512vlbw()))) { 5745 movptr(rtmp, imm32); 5746 switch(lane_size) { 5747 case 1 : evpbroadcastb(dst, rtmp, vec_enc); break; 5748 case 2 : evpbroadcastw(dst, rtmp, vec_enc); break; 5749 case 4 : evpbroadcastd(dst, rtmp, vec_enc); break; 5750 case 8 : evpbroadcastq(dst, rtmp, vec_enc); break; 5751 default : fatal("Unsupported lane size %d", lane_size); 5752 break; 5753 } 5754 } else { 5755 movptr(rtmp, imm32); 5756 LP64_ONLY(movq(dst, rtmp)) NOT_LP64(movdl(dst, rtmp)); 5757 switch(lane_size) { 5758 case 1 : vpbroadcastb(dst, dst, vec_enc); break; 5759 case 2 : vpbroadcastw(dst, dst, vec_enc); break; 5760 case 4 : vpbroadcastd(dst, dst, vec_enc); break; 5761 case 8 : vpbroadcastq(dst, dst, vec_enc); break; 5762 default : fatal("Unsupported lane size %d", lane_size); 5763 break; 5764 } 5765 } 5766 } 5767 5768 // 5769 // Following is lookup table based popcount computation algorithm:- 5770 // Index Bit set count 5771 // [ 0000 -> 0, 5772 // 0001 -> 1, 5773 // 0010 -> 1, 5774 // 0011 -> 2, 5775 // 0100 -> 1, 5776 // 0101 -> 2, 5777 // 0110 -> 2, 5778 // 0111 -> 3, 5779 // 1000 -> 1, 5780 // 1001 -> 2, 5781 // 1010 -> 2, 5782 // 1011 -> 3, 5783 // 1100 -> 2, 5784 // 1101 -> 3, // 1110 -> 3, 5785 // 1111 -> 4 ] 5786 // a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as 5787 // shuffle indices for lookup table access. 5788 // b. Right shift each byte of vector lane by 4 positions. 5789 // c. Count the number of 1s in 4 MSB bits of each byte. These bits are used as 5790 // shuffle indices for lookup table access. 5791 // d. Add the bitset count of upper and lower 4 bits of each byte. 5792 // e. Unpack double words to quad words and compute sum of absolute difference of bitset 5793 // count of all the bytes of a quadword. 5794 // f. Perform step e. for upper 128bit vector lane. 5795 // g. Pack the bitset count of quadwords back to double word. 5796 // h. Unpacking and packing operations are not needed for 64bit vector lane.
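// A scalar sketch of steps a.-d. for a single byte (illustrative only, not part
// of the generated code; LUT is the 16-entry table above):
//   static const uint8_t LUT[16] = {0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4};
//   uint8_t popcount_byte(uint8_t b) {
//     return LUT[b & 0x0F] + LUT[b >> 4];
//   }
// e.g. b = 0x5B (0101 1011) -> LUT[0xB] + LUT[0x5] = 3 + 2 = 5.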
5797 5798 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5799 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5800 assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), ""); 5801 vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc); 5802 vpsrlw(dst, src, 4, vec_enc); 5803 vpand(dst, dst, xtmp1, vec_enc); 5804 vpand(xtmp1, src, xtmp1, vec_enc); 5805 vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg); 5806 vpshufb(xtmp1, xtmp2, xtmp1, vec_enc); 5807 vpshufb(dst, xtmp2, dst, vec_enc); 5808 vpaddb(dst, dst, xtmp1, vec_enc); 5809 } 5810 5811 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5812 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5813 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5814 // Following code is as per steps e,f,g and h of above algorithm. 5815 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5816 vpunpckhdq(dst, xtmp1, xtmp2, vec_enc); 5817 vpsadbw(dst, dst, xtmp2, vec_enc); 5818 vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc); 5819 vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc); 5820 vpackuswb(dst, xtmp1, dst, vec_enc); 5821 } 5822 5823 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5824 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5825 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5826 // Add the popcount of upper and lower bytes of word. 5827 vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc); 5828 vpsrlw(dst, xtmp1, 8, vec_enc); 5829 vpand(xtmp1, xtmp1, xtmp2, vec_enc); 5830 vpaddw(dst, dst, xtmp1, vec_enc); 5831 } 5832 5833 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5834 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5835 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5836 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5837 vpsadbw(dst, xtmp1, xtmp2, vec_enc); 5838 } 5839 5840 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5841 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5842 switch(bt) { 5843 case T_LONG: 5844 vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5845 break; 5846 case T_INT: 5847 vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5848 break; 5849 case T_CHAR: 5850 case T_SHORT: 5851 vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5852 break; 5853 case T_BYTE: 5854 case T_BOOLEAN: 5855 vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5856 break; 5857 default: 5858 fatal("Unsupported type %s", type2name(bt)); 5859 break; 5860 } 5861 } 5862 5863 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src, 5864 KRegister mask, bool merge, int vec_enc) { 5865 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 5866 switch(bt) { 5867 case T_LONG: 5868 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 5869 evpopcntq(dst, mask, src, merge, vec_enc); 5870 break; 5871 case T_INT: 5872 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 5873 evpopcntd(dst, mask, src, merge, vec_enc); 5874 break; 5875 case T_CHAR: 5876 case T_SHORT: 5877 assert(VM_Version::supports_avx512_bitalg(), ""); 5878 evpopcntw(dst, mask, src, merge, vec_enc); 5879 break; 5880 case T_BYTE: 5881 case T_BOOLEAN: 5882 assert(VM_Version::supports_avx512_bitalg(), ""); 5883 evpopcntb(dst, mask, 
src, merge, vec_enc); 5884 break; 5885 default: 5886 fatal("Unsupported type %s", type2name(bt)); 5887 break; 5888 } 5889 } 5890 5891 #ifndef _LP64 5892 void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) { 5893 assert(VM_Version::supports_avx512bw(), ""); 5894 kmovdl(tmp, src); 5895 kunpckdql(dst, tmp, tmp); 5896 } 5897 #endif 5898 5899 // Bit reversal algorithm first reverses the bits of each byte followed by 5900 // a byte level reversal for multi-byte primitive types (short/int/long). 5901 // Algorithm performs a lookup table access to get reverse bit sequence 5902 // corresponding to a 4 bit value. Thus a reverse bit sequence for a byte 5903 // is obtained by swapping the reverse bit sequences of upper and lower 5904 // nibble of a byte. 5905 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5906 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5907 if (VM_Version::supports_avx512vlbw()) { 5908 5909 // Get the reverse bit sequence of lower nibble of each byte. 5910 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg); 5911 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 5912 evpandq(dst, xtmp2, src, vec_enc); 5913 vpshufb(dst, xtmp1, dst, vec_enc); 5914 vpsllq(dst, dst, 4, vec_enc); 5915 5916 // Get the reverse bit sequence of upper nibble of each byte. 5917 vpandn(xtmp2, xtmp2, src, vec_enc); 5918 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 5919 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 5920 5921 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 5922 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 5923 evporq(xtmp2, dst, xtmp2, vec_enc); 5924 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 5925 5926 } else if(vec_enc == Assembler::AVX_512bit) { 5927 // Shift based bit reversal. 5928 assert(bt == T_LONG || bt == T_INT, ""); 5929 5930 // Swap lower and upper nibble of each byte. 5931 vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc); 5932 5933 // Swap two least and most significant bits of each nibble. 5934 vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc); 5935 5936 // Swap adjacent pair of bits. 5937 evmovdqul(xtmp1, k0, dst, true, vec_enc); 5938 vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc); 5939 5940 evmovdqul(xtmp1, k0, dst, true, vec_enc); 5941 vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc); 5942 } else { 5943 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp); 5944 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 5945 5946 // Get the reverse bit sequence of lower nibble of each byte. 5947 vpand(dst, xtmp2, src, vec_enc); 5948 vpshufb(dst, xtmp1, dst, vec_enc); 5949 vpsllq(dst, dst, 4, vec_enc); 5950 5951 // Get the reverse bit sequence of upper nibble of each byte. 5952 vpandn(xtmp2, xtmp2, src, vec_enc); 5953 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 5954 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 5955 5956 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 5957 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 
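// Illustrative single-byte example: b = 0b11010010; the reversed lower nibble
// LUT[0010] = 0100 was shifted into the high half, the reversed upper nibble
// LUT[1101] = 1011 lands in the low half; the OR below gives 0b01001011, the
// bit reversal of b.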
5958 vpor(xtmp2, dst, xtmp2, vec_enc); 5959 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 5960 } 5961 } 5962 5963 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc, 5964 XMMRegister xtmp, Register rscratch) { 5965 assert(VM_Version::supports_gfni(), ""); 5966 assert(rscratch != noreg || always_reachable(mask), "missing"); 5967 5968 // Galois field instruction based bit reversal based on following algorithm. 5969 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 5970 vpbroadcastq(xtmp, mask, vec_enc, rscratch); 5971 vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc); 5972 vector_reverse_byte(bt, dst, xtmp, vec_enc); 5973 } 5974 5975 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src, 5976 XMMRegister xtmp1, Register rtmp, int vec_enc) { 5977 vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc); 5978 evpandq(dst, xtmp1, src, vec_enc); 5979 vpsllq(dst, dst, nbits, vec_enc); 5980 vpandn(xtmp1, xtmp1, src, vec_enc); 5981 vpsrlq(xtmp1, xtmp1, nbits, vec_enc); 5982 evporq(dst, dst, xtmp1, vec_enc); 5983 } 5984 5985 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5986 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5987 // Shift based bit reversal. 5988 assert(VM_Version::supports_evex(), ""); 5989 switch(bt) { 5990 case T_LONG: 5991 // Swap upper and lower double word of each quad word. 5992 evprorq(xtmp1, k0, src, 32, true, vec_enc); 5993 evprord(xtmp1, k0, xtmp1, 16, true, vec_enc); 5994 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 5995 break; 5996 case T_INT: 5997 // Swap upper and lower word of each double word. 5998 evprord(xtmp1, k0, src, 16, true, vec_enc); 5999 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 6000 break; 6001 case T_CHAR: 6002 case T_SHORT: 6003 // Swap upper and lower byte of each word. 6004 vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc); 6005 break; 6006 case T_BYTE: 6007 evmovdquq(dst, k0, src, true, vec_enc); 6008 break; 6009 default: 6010 fatal("Unsupported type %s", type2name(bt)); 6011 break; 6012 } 6013 } 6014 6015 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) { 6016 if (bt == T_BYTE) { 6017 if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) { 6018 evmovdquq(dst, k0, src, true, vec_enc); 6019 } else { 6020 vmovdqu(dst, src); 6021 } 6022 return; 6023 } 6024 // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using 6025 // pre-computed shuffle indices. 
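// e.g. for T_INT the shuffle swaps the four bytes of every doubleword lane,
// so 0x11223344 becomes 0x44332211 (illustrative).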
6026 switch(bt) { 6027 case T_LONG: 6028 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg); 6029 break; 6030 case T_INT: 6031 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg); 6032 break; 6033 case T_CHAR: 6034 case T_SHORT: 6035 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg); 6036 break; 6037 default: 6038 fatal("Unsupported type %s", type2name(bt)); 6039 break; 6040 } 6041 vpshufb(dst, src, dst, vec_enc); 6042 } 6043 6044 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src, 6045 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 6046 KRegister ktmp, Register rtmp, bool merge, int vec_enc) { 6047 assert(is_integral_type(bt), ""); 6048 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 6049 assert(VM_Version::supports_avx512cd(), ""); 6050 switch(bt) { 6051 case T_LONG: 6052 evplzcntq(dst, ktmp, src, merge, vec_enc); 6053 break; 6054 case T_INT: 6055 evplzcntd(dst, ktmp, src, merge, vec_enc); 6056 break; 6057 case T_SHORT: 6058 vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc); 6059 vpunpcklwd(xtmp2, xtmp1, src, vec_enc); 6060 evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc); 6061 vpunpckhwd(dst, xtmp1, src, vec_enc); 6062 evplzcntd(dst, ktmp, dst, merge, vec_enc); 6063 vpackusdw(dst, xtmp2, dst, vec_enc); 6064 break; 6065 case T_BYTE: 6066 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 6067 // accessing the lookup table. 6068 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 6069 // accessing the lookup table. 6070 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 6071 assert(VM_Version::supports_avx512bw(), ""); 6072 evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp); 6073 vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc); 6074 vpand(xtmp2, dst, src, vec_enc); 6075 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 6076 vpsrlw(xtmp3, src, 4, vec_enc); 6077 vpand(xtmp3, dst, xtmp3, vec_enc); 6078 vpshufb(dst, xtmp1, xtmp3, vec_enc); 6079 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 6080 evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc); 6081 evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc); 6082 break; 6083 default: 6084 fatal("Unsupported type %s", type2name(bt)); 6085 break; 6086 } 6087 } 6088 6089 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6090 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6091 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp); 6092 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 6093 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 6094 // accessing the lookup table. 6095 vpand(dst, xtmp2, src, vec_enc); 6096 vpshufb(dst, xtmp1, dst, vec_enc); 6097 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 6098 // accessing the lookup table. 6099 vpsrlw(xtmp3, src, 4, vec_enc); 6100 vpand(xtmp3, xtmp2, xtmp3, vec_enc); 6101 vpshufb(xtmp2, xtmp1, xtmp3, vec_enc); 6102 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 
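// Illustrative per-byte walk-through, assuming the LUT maps a nibble to its
// leading zero count (0 -> 4, 1 -> 3, 2..3 -> 2, 4..7 -> 1, 8..15 -> 0):
//   b = 0x1A: MSB nibble 0x1 != 0 -> clz = T2 = 3
//   b = 0x05: MSB nibble 0x0 == 0 -> clz = T2 + T1 = 4 + 1 = 5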
6103 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 6104 vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc); 6105 vpaddb(dst, dst, xtmp2, vec_enc); 6106 vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc); 6107 } 6108 6109 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6110 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6111 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6112 // Add zero counts of lower byte and upper byte of a word if 6113 // upper byte holds a zero value. 6114 vpsrlw(xtmp3, src, 8, vec_enc); 6115 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx. 6116 vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc); 6117 vpsllw(xtmp2, dst, 8, vec_enc); 6118 vpaddw(xtmp2, xtmp2, dst, vec_enc); 6119 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 6120 vpsrlw(dst, dst, 8, vec_enc); 6121 } 6122 6123 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6124 XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) { 6125 // Since the IEEE 754 floating point format represents the mantissa in normalized 1.x form, 6126 // the biased exponent can be used to compute the leading zero count as per the 6127 // following formula:- 6128 // LZCNT = 31 - (biased_exp - 127), computed below as 32 - ((biased_exp - 127) + 1) 6129 // Special handling has been introduced for Zero, Max_Int and -ve source values. 6130 6131 // Broadcast 0xFF 6132 vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc); 6133 vpsrld(xtmp1, xtmp1, 24, vec_enc); 6134 6135 // Extract biased exponent. 6136 vcvtdq2ps(dst, src, vec_enc); 6137 vpsrld(dst, dst, 23, vec_enc); 6138 vpand(dst, dst, xtmp1, vec_enc); 6139 6140 // Broadcast 127. 6141 vpsrld(xtmp1, xtmp1, 1, vec_enc); 6142 // Exponent = biased_exp - 127 6143 vpsubd(dst, dst, xtmp1, vec_enc); 6144 6145 // Exponent = Exponent + 1 6146 vpsrld(xtmp3, xtmp1, 6, vec_enc); 6147 vpaddd(dst, dst, xtmp3, vec_enc); 6148 6149 // Replace -ve exponent with zero, exponent is -ve when src 6150 // lane contains a zero value. 6151 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 6152 vblendvps(dst, dst, xtmp2, dst, vec_enc); 6153 6154 // Rematerialize broadcast 32. 6155 vpslld(xtmp1, xtmp3, 5, vec_enc); 6156 // Exponent is 32 if corresponding source lane contains max_int value. 6157 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 6158 // LZCNT = 32 - exponent 6159 vpsubd(dst, xtmp1, dst, vec_enc); 6160 6161 // Replace LZCNT with a value 1 if corresponding source lane 6162 // contains max_int value. 6163 vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc); 6164 6165 // Replace LZCNT with 0 if the source lane value is less than zero. 6166 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 6167 vblendvps(dst, dst, xtmp2, src, vec_enc); 6168 } 6169 6170 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6171 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6172 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6173 // Add zero counts of lower word and upper word of a double word if 6174 // upper word holds a zero value. 6175 vpsrld(xtmp3, src, 16, vec_enc); 6176 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx. 6177 vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc); 6178 vpslld(xtmp2, dst, 16, vec_enc); 6179 vpaddd(xtmp2, xtmp2, dst, vec_enc); 6180 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 6181 vpsrld(dst, dst, 16, vec_enc); 6182 // Add zero counts of lower doubleword and upper doubleword of a 6183 // quadword if upper doubleword holds a zero value.
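// Illustrative: when the upper doubleword of a quadword is zero,
// clz64(q) = 32 + clz32(lo); otherwise clz64(q) = clz32(hi).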
  vpsrlq(xtmp3, src, 32, vec_enc);
  vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc);
  vpsllq(xtmp2, dst, 32, vec_enc);
  vpaddq(xtmp2, xtmp2, dst, vec_enc);
  vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
  vpsrlq(dst, dst, 32, vec_enc);
}

void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
                                                       XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
                                                       Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "unexpected type");
  assert(vec_enc < Assembler::AVX_512bit, "");
  switch(bt) {
    case T_LONG:
      vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
      break;
    case T_INT:
      vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
      break;
    case T_SHORT:
      vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
      break;
    case T_BYTE:
      vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
}

void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
  switch(bt) {
    case T_BYTE:
      vpsubb(dst, src1, src2, vec_enc);
      break;
    case T_SHORT:
      vpsubw(dst, src1, src2, vec_enc);
      break;
    case T_INT:
      vpsubd(dst, src1, src2, vec_enc);
      break;
    case T_LONG:
      vpsubq(dst, src1, src2, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
}

// Trailing zero count computation is based on the leading zero count operation
// as per the following equation. All AVX3 targets support the AVX512CD feature,
// which offers a direct vector instruction to compute leading zero counts.
//   CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
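// For example (illustrative): x = 0b1000 gives x - 1 = 0b0111 and
// (x - 1) & ~x = 0b0111, whose 32-bit CLZ is 29, so CTZ = 32 - 29 = 3.
// The all-zero input works out as well: (0 - 1) & ~0 = -1, CLZ = 0, CTZ = 32.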
void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
                                                         XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "");
  // xtmp = -1
  vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
  // xtmp = xtmp + src, i.e. src - 1
  vpadd(bt, xtmp4, xtmp4, src, vec_enc);
  // xtmp = xtmp & ~src
  vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
  vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
  vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
  vpsub(bt, dst, xtmp4, dst, vec_enc);
}

// Trailing zero count computation for AVX2 targets is based on the popcount
// operation as per the following equation:
//   CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                        XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "");
  // xtmp = 0
  vpxor(xtmp3, xtmp3, xtmp3, vec_enc);
  // xtmp = 0 - src
  vpsub(bt, xtmp3, xtmp3, src, vec_enc);
  // xtmp = xtmp | src
  vpor(xtmp3, xtmp3, src, vec_enc);
  vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
  vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
  vpsub(bt, dst, xtmp1, dst, vec_enc);
}

void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
  // See Hacker's Delight (2nd ed), section 9.3, which is implemented in java.lang.Long.divideUnsigned()
  movl(rdx, rax);
  subl(rdx, divisor);
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rdx, rax);
  } else {
    notl(rdx);
    andl(rax, rdx);
  }
  shrl(rax, 31);
  bind(done);
}

void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath when divisor < 0:
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3, which is implemented in java.lang.Long.remainderUnsigned()
  movl(rdx, rax);
  subl(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rax, rdx);
  } else {
    notl(rax);
    andl(rax, rdx);
  }
  sarl(rax, 31);
  andl(rax, divisor);
  subl(rdx, rax);
  bind(done);
}

void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
  Label done;
  Label neg_divisor_fastpath;

  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient  = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3, which is implemented in
  // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
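  // A scalar sketch of the fastpath (illustrative only): a divisor with its
  // sign bit set is >= 2^31 unsigned, so the unsigned quotient can only be
  // 0 or 1, and it is 1 exactly when dividend >= divisor as unsigned values:
  //   q = Integer.compareUnsigned(dividend, divisor) >= 0 ? 1 : 0;
  //   r = dividend - (q == 1 ? divisor : 0);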
  movl(rdx, rax);
  subl(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rax, rdx);
  } else {
    notl(rax);
    andl(rax, rdx);
  }
  movl(tmp, rax);
  shrl(rax, 31); // quotient
  sarl(tmp, 31);
  andl(tmp, divisor);
  subl(rdx, tmp); // remainder
  bind(done);
}

#ifdef _LP64
void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
                                 XMMRegister xtmp2, Register rtmp) {
  if (VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal, based on the following algorithm:
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
    mov64(rtmp, 0x8040201008040201L);
    movq(xtmp1, src);
    movq(xtmp2, rtmp);
    gf2p8affineqb(xtmp1, xtmp2, 0);
    movq(dst, xtmp1);
  } else {
    // Swap even and odd numbered bits.
    movl(rtmp, src);
    andl(rtmp, 0x55555555);
    shll(rtmp, 1);
    movl(dst, src);
    andl(dst, 0xAAAAAAAA);
    shrl(dst, 1);
    orl(dst, rtmp);

    // Swap LSB and MSB 2 bits of each nibble.
    movl(rtmp, dst);
    andl(rtmp, 0x33333333);
    shll(rtmp, 2);
    andl(dst, 0xCCCCCCCC);
    shrl(dst, 2);
    orl(dst, rtmp);

    // Swap LSB and MSB 4 bits of each byte.
    movl(rtmp, dst);
    andl(rtmp, 0x0F0F0F0F);
    shll(rtmp, 4);
    andl(dst, 0xF0F0F0F0);
    shrl(dst, 4);
    orl(dst, rtmp);
  }
  bswapl(dst);
}

void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
                                 XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
  if (VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal, based on the following algorithm:
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
    mov64(rtmp1, 0x8040201008040201L);
    movq(xtmp1, src);
    movq(xtmp2, rtmp1);
    gf2p8affineqb(xtmp1, xtmp2, 0);
    movq(dst, xtmp1);
  } else {
    // Swap even and odd numbered bits.
    movq(rtmp1, src);
    mov64(rtmp2, 0x5555555555555555L);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 1);
    movq(dst, src);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 1);
    orq(dst, rtmp1);

    // Swap LSB and MSB 2 bits of each nibble.
    movq(rtmp1, dst);
    mov64(rtmp2, 0x3333333333333333L);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 2);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 2);
    orq(dst, rtmp1);

    // Swap LSB and MSB 4 bits of each byte.
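    // Illustrative scalar equivalent of this stage (a sketch, not emitted code):
    //   x = ((x & 0x0F0F0F0F0F0F0F0FULL) << 4) | ((x >> 4) & 0x0F0F0F0F0F0F0F0FULL);
    // After the three swap stages every byte is bit-reversed; the bswapq below
    // then reverses the byte order to complete the 64-bit bit reversal.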
    movq(rtmp1, dst);
    mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 4);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 4);
    orq(dst, rtmp1);
  }
  bswapq(dst);
}

void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divq(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
  // See Hacker's Delight (2nd ed), section 9.3, which is implemented in java.lang.Long.divideUnsigned()
  movq(rdx, rax);
  subq(rdx, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rdx, rax);
  } else {
    notq(rdx);
    andq(rax, rdx);
  }
  shrq(rax, 63);
  bind(done);
}

void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorq(rdx, rdx);
  divq(divisor);
  jmp(done);
  bind(neg_divisor_fastpath);
  // Fastpath when divisor < 0:
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3, which is implemented in java.lang.Long.remainderUnsigned()
  movq(rdx, rax);
  subq(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rax, rdx);
  } else {
    notq(rax);
    andq(rax, rdx);
  }
  sarq(rax, 63);
  andq(rax, divisor);
  subq(rdx, rax);
  bind(done);
}

void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorq(rdx, rdx);
  divq(divisor);
  jmp(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient  = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3, which is implemented in
  // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
  movq(rdx, rax);
  subq(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rax, rdx);
  } else {
    notq(rax);
    andq(rax, rdx);
  }
  movq(tmp, rax);
  shrq(rax, 63); // quotient
  sarq(tmp, 63);
  andq(tmp, divisor);
  subq(rdx, tmp); // remainder
  bind(done);
}
#endif

void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
                                        XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
                                        int vlen_enc) {
  assert(VM_Version::supports_avx512bw(), "");
  // Byte shuffles are in-lane operations and indices are determined using the
  // lower 4 bits of each shuffle lane, thus all shuffle indices are
  // normalized to the index range 0-15. This makes sure that all multiples
  // of an index value are placed at the same relative position within a
  // 128-bit lane, i.e. shuffle indices 16, 32 and 48 are all normalized to 0
  // and select the first element of their respective 128-bit lanes.
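  // Illustrative outline of the four rounds below (a sketch, not emitted code):
  //   for (int lane = 0; lane < 4; lane++) {
  //     mask  = byte positions whose shuffle index i satisfies 16*lane <= i < 16*(lane+1);
  //     bcast = src with its 128-bit lane `lane` broadcast across the vector;
  //     dst   = merge shuffled bcast bytes into dst under mask;
  //   }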
  movl(rtmp, 16);
  evpbroadcastb(xtmp1, rtmp, vlen_enc);

  // Compute a mask for the shuffle vector by comparing indices with the
  // expression INDEX < 16. Broadcast the first 128-bit lane across the entire
  // vector, shuffle the vector lanes using the original shuffle indices, and
  // move the shuffled lanes corresponding to a true mask into the destination
  // vector.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
  evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);

  // Perform the above steps with the lane comparison expression
  // INDEX >= 16 && INDEX < 32, broadcasting the second 128-bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
  vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Perform the above steps with the lane comparison expression
  // INDEX >= 32 && INDEX < 48, broadcasting the third 128-bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp2, Assembler::nlt, true, vlen_enc);
  vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Perform the above steps with the lane comparison expression
  // INDEX >= 48 && INDEX < 64, broadcasting the fourth 128-bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
  vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
}

void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
                                                   XMMRegister shuffle, XMMRegister src, int vlen_enc) {
  if (vlen_enc == AVX_128bit) {
    vpermilps(dst, src, shuffle, vlen_enc);
  } else if (bt == T_INT) {
    vpermd(dst, shuffle, src, vlen_enc);
  } else {
    assert(bt == T_FLOAT, "");
    vpermps(dst, shuffle, src, vlen_enc);
  }
}

#ifdef _LP64
void C2_MacroAssembler::load_nklass_compact_c2(Register dst, Register obj, Register index, Address::ScaleFactor scale, int disp) {
  // Note: Don't clobber obj anywhere in this method!

  // The incoming address points to obj-start + klass_offset_in_bytes. We need to extract
  // obj-start, so that we can load from the object's mark word instead. Usually the address
  // comes as obj-start in obj and klass_offset_in_bytes in disp. However, sometimes C2
  // emits code that pre-computes obj-start + klass_offset_in_bytes into a register, and
  // then passes that register as obj and 0 in disp. The following code extracts the base
  // and offset to load the mark word.
  int offset = oopDesc::mark_offset_in_bytes() + disp - oopDesc::klass_offset_in_bytes();
  movq(dst, Address(obj, index, scale, offset));
  shrq(dst, markWord::klass_shift);
}
#endif