/*
 * Copyright (c) 2020, 2023, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "oops/methodData.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/output.hpp"
#include "opto/opcodes.hpp"
#include "opto/subnode.hpp"
#include "runtime/globals.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/globalDefinitions.hpp"
#include "utilities/powerOfTwo.hpp"
#include "utilities/sizes.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

// C2 compiled method's prolog code.
void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {

  // WARNING: Initial instruction MUST be 5 bytes or longer so that
  // NativeJump::patch_verified_entry will be able to patch out the entry
  // code safely. The push to verify stack depth is ok at 5 bytes,
  // the frame allocation can be either 3 or 6 bytes. So if we don't do
  // stack bang then we must use the 6 byte frame allocation even if
  // we have no frame. :-(
  assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");

  assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  // Remove word for return addr
  framesize -= wordSize;
  stack_bang_size -= wordSize;

  // Calls to C2R adapters often do not accept exceptional returns.
  // We require that their callers must bang for them. But be careful, because
  // some VM calls (such as call site linkage) can use several kilobytes of
  // stack. But the stack safety zone should account for that.
  // See bugs 4446381, 4468289, 4497237.
  if (stack_bang_size > 0) {
    generate_stack_overflow_check(stack_bang_size);

    // We always push rbp, so that on return to interpreter rbp, will be
    // restored correctly and we can correct the stack.
    push(rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      mov(rbp, rsp);
    }
    // Remove word for ebp
    framesize -= wordSize;

    // Create frame
    if (framesize) {
      subptr(rsp, framesize);
    }
  } else {
    // Create frame (force generation of a 4 byte immediate value)
    subptr_imm32(rsp, framesize);

    // Save RBP register now.
    framesize -= wordSize;
    movptr(Address(rsp, framesize), rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      movptr(rbp, rsp);
      if (framesize > 0) {
        addptr(rbp, framesize);
      }
    }
  }

  if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
    framesize -= wordSize;
    movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
  }

#ifndef _LP64
  // If method sets FPU control word do it now
  if (fp_mode_24b) {
    fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24()));
  }
  if (UseSSE >= 2 && VerifyFPU) {
    verify_FPU(0, "FPU stack must be clean on entry");
  }
#endif

#ifdef ASSERT
  if (VerifyStackAtCalls) {
    Label L;
    push(rax);
    mov(rax, rsp);
    andptr(rax, StackAlignmentInBytes-1);
    cmpptr(rax, StackAlignmentInBytes-wordSize);
    pop(rax);
    jcc(Assembler::equal, L);
    STOP("Stack is not properly aligned!");
    bind(L);
  }
#endif

  if (!is_stub) {
    BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
#ifdef _LP64
    if (BarrierSet::barrier_set()->barrier_set_nmethod() != nullptr) {
      // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
      Label dummy_slow_path;
      Label dummy_continuation;
      Label* slow_path = &dummy_slow_path;
      Label* continuation = &dummy_continuation;
      if (!Compile::current()->output()->in_scratch_emit_size()) {
        // Use real labels from actual stub when not emitting code for the purpose of measuring its size
        C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
        Compile::current()->output()->add_stub(stub);
        slow_path = &stub->entry();
        continuation = &stub->continuation();
      }
      bs->nmethod_entry_barrier(this, slow_path, continuation);
    }
#else
    // Don't bother with out-of-line nmethod entry barrier stub for x86_32.
    bs->nmethod_entry_barrier(this, nullptr /* slow_path */, nullptr /* continuation */);
#endif
  }
}

inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
  switch (vlen_in_bytes) {
    case  4: // fall-through
    case  8: // fall-through
    case 16: return Assembler::AVX_128bit;
    case 32: return Assembler::AVX_256bit;
    case 64: return Assembler::AVX_512bit;

    default: {
      ShouldNotReachHere();
      return Assembler::AVX_NoVec;
    }
  }
}

#if INCLUDE_RTM_OPT

// Update rtm_counters based on abort status
// input: abort_status
//        rtm_counters (RTMLockingCounters*)
// flags are killed
void C2_MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {

  atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
  if (PrintPreciseRTMLockingStatistics) {
    for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
      Label check_abort;
      testl(abort_status, (1<<i));
      jccb(Assembler::equal, check_abort);
      atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
      bind(check_abort);
    }
  }
}

// Branch if (random & (count-1) != 0), count is 2^n
// tmp, scr and flags are killed
void C2_MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
  assert(tmp == rax, "");
  assert(scr == rdx, "");
  rdtsc(); // modifies EDX:EAX
  andptr(tmp, count-1);
  jccb(Assembler::notZero, brLabel);
}

// Perform abort ratio calculation, set no_rtm bit if high ratio
// input:  rtm_counters_Reg (RTMLockingCounters* address)
// tmpReg, rtm_counters_Reg and flags are killed
void C2_MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
                                                    Register rtm_counters_Reg,
                                                    RTMLockingCounters* rtm_counters,
                                                    Metadata* method_data) {
  Label L_done, L_check_always_rtm1, L_check_always_rtm2;

  if (RTMLockingCalculationDelay > 0) {
    // Delay calculation
    movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()));
    testptr(tmpReg, tmpReg);
    jccb(Assembler::equal, L_done);
  }
  // Abort ratio calculation only if abort_count > RTMAbortThreshold
  //   Aborted transactions = abort_count * 100
  //   All transactions = total_count * RTMTotalCountIncrRate
  //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)

  movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
  cmpptr(tmpReg, RTMAbortThreshold);
  jccb(Assembler::below, L_check_always_rtm2);
  imulptr(tmpReg, tmpReg, 100);

  Register scrReg = rtm_counters_Reg;
  movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
  imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
  imulptr(scrReg, scrReg, RTMAbortRatio);
  cmpptr(tmpReg, scrReg);
  jccb(Assembler::below, L_check_always_rtm1);
  if (method_data != nullptr) {
    // set rtm_state to "no rtm" in MDO
    mov_metadata(tmpReg, method_data);
    lock();
    orl(Address(tmpReg, MethodData::rtm_state_offset()), NoRTM);
  }
  jmpb(L_done);
  bind(L_check_always_rtm1);
  // Reload RTMLockingCounters* address
  lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
  bind(L_check_always_rtm2);
  movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
  cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
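  // Note: total_count is scaled down by RTMTotalCountIncrRate (see the
  // "All transactions" comment above), so the comparison above roughly asks
  // whether at least RTMLockingThreshold transactions have been observed
  // before the branch below decides to leave the RTM state unchanged.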
  jccb(Assembler::below, L_done);
  if (method_data != nullptr) {
    // set rtm_state to "always rtm" in MDO
    mov_metadata(tmpReg, method_data);
    lock();
    orl(Address(tmpReg, MethodData::rtm_state_offset()), UseRTM);
  }
  bind(L_done);
}

// Update counters and perform abort ratio calculation
// input: abort_status_Reg
// rtm_counters_Reg, flags are killed
void C2_MacroAssembler::rtm_profiling(Register abort_status_Reg,
                                      Register rtm_counters_Reg,
                                      RTMLockingCounters* rtm_counters,
                                      Metadata* method_data,
                                      bool profile_rtm) {

  assert(rtm_counters != nullptr, "should not be null when profiling RTM");
  // update rtm counters based on rax value at abort
  // reads abort_status_Reg, updates flags
  lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
  rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
  if (profile_rtm) {
    // Save abort status because abort_status_Reg is used by following code.
    if (RTMRetryCount > 0) {
      push(abort_status_Reg);
    }
    assert(rtm_counters != nullptr, "should not be null when profiling RTM");
    rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
    // restore abort status
    if (RTMRetryCount > 0) {
      pop(abort_status_Reg);
    }
  }
}

// Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
// inputs: retry_count_Reg
//       : abort_status_Reg
// output: retry_count_Reg decremented by 1
// flags are killed
void C2_MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
  Label doneRetry;
  assert(abort_status_Reg == rax, "");
  // The abort reason bits are in eax (see all states in rtmLocking.hpp)
  // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
  // if reason is in 0x6 and retry count != 0 then retry
  andptr(abort_status_Reg, 0x6);
  jccb(Assembler::zero, doneRetry);
  testl(retry_count_Reg, retry_count_Reg);
  jccb(Assembler::zero, doneRetry);
  pause();
  decrementl(retry_count_Reg);
  jmp(retryLabel);
  bind(doneRetry);
}

// Spin and retry if lock is busy,
// inputs: box_Reg (monitor address)
//       : retry_count_Reg
// output: retry_count_Reg decremented by 1
//       : clear z flag if retry count exceeded
// tmp_Reg, scr_Reg, flags are killed
void C2_MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
                                               Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
  Label SpinLoop, SpinExit, doneRetry;
  int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);

  testl(retry_count_Reg, retry_count_Reg);
  jccb(Assembler::zero, doneRetry);
  decrementl(retry_count_Reg);
  movptr(scr_Reg, RTMSpinLoopCount);

  bind(SpinLoop);
  pause();
  decrementl(scr_Reg);
  jccb(Assembler::lessEqual, SpinExit);
  movptr(tmp_Reg, Address(box_Reg, owner_offset));
  testptr(tmp_Reg, tmp_Reg);
  jccb(Assembler::notZero, SpinLoop);

  bind(SpinExit);
  jmp(retryLabel);
  bind(doneRetry);
  incrementl(retry_count_Reg); // clear z flag
}

// Use RTM for normal stack locks
// Input: objReg (object to lock)
void C2_MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
                                          Register retry_on_abort_count_Reg,
                                          RTMLockingCounters* stack_rtm_counters,
                                          Metadata* method_data, bool profile_rtm,
                                          Label& DONE_LABEL, Label& IsInflated) {
  assert(UseRTMForStackLocks, "why call this otherwise?");
  assert(tmpReg == rax, "");
  assert(scrReg == rdx, "");
  Label L_rtm_retry, L_decrement_retry, L_on_abort;

  if (RTMRetryCount > 0) {
    movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
    bind(L_rtm_retry);
  }
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
  testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral
  jcc(Assembler::notZero, IsInflated);

  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    Label L_noincrement;
    if (RTMTotalCountIncrRate > 1) {
      // tmpReg, scrReg and flags are killed
      branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
    }
    assert(stack_rtm_counters != nullptr, "should not be null when profiling RTM");
    atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
    bind(L_noincrement);
  }
  xbegin(L_on_abort);
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
  andptr(tmpReg, markWord::lock_mask_in_place);                     // look at 2 lock bits
  cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 01 unlocked
  jcc(Assembler::equal, DONE_LABEL);                                // all done if unlocked

  Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
  if (UseRTMXendForLockBusy) {
    xend();
    movptr(abort_status_Reg, 0x2); // Set the abort status to 2 (so we can retry)
    jmp(L_decrement_retry);
  }
  else {
    xabort(0);
  }
  bind(L_on_abort);
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
  }
  bind(L_decrement_retry);
  if (RTMRetryCount > 0) {
    // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
  }
}

// Use RTM for inflating locks
// inputs: objReg (object to lock)
//         boxReg (on-stack box address (displaced header location) - KILLED)
//         tmpReg (ObjectMonitor address + markWord::monitor_value)
void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
                                             Register scrReg, Register retry_on_busy_count_Reg,
                                             Register retry_on_abort_count_Reg,
                                             RTMLockingCounters* rtm_counters,
                                             Metadata* method_data, bool profile_rtm,
                                             Label& DONE_LABEL) {
  assert(UseRTMLocking, "why call this otherwise?");
  assert(tmpReg == rax, "");
  assert(scrReg == rdx, "");
  Label L_rtm_retry, L_decrement_retry, L_on_abort;
  int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);

  movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
  movptr(boxReg, tmpReg); // Save ObjectMonitor address

  if (RTMRetryCount > 0) {
    movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
    movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
    bind(L_rtm_retry);
  }
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    Label L_noincrement;
    if (RTMTotalCountIncrRate > 1) {
      // tmpReg, scrReg and flags are killed
      branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
    }
    assert(rtm_counters != nullptr, "should not be null when profiling RTM");
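    // Bump the shared total_count. When RTMTotalCountIncrRate > 1 only a
    // sampled subset of lock attempts reaches this increment (see the
    // branch_on_random_using_rdtsc call above), which keeps the atomic update
    // off most fast-path executions.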
    atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
    bind(L_noincrement);
  }
  xbegin(L_on_abort);
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
  movptr(tmpReg, Address(tmpReg, owner_offset));
  testptr(tmpReg, tmpReg);
  jcc(Assembler::zero, DONE_LABEL);
  if (UseRTMXendForLockBusy) {
    xend();
    jmp(L_decrement_retry);
  }
  else {
    xabort(0);
  }
  bind(L_on_abort);
  Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
  }
  if (RTMRetryCount > 0) {
    // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
  }

  movptr(tmpReg, Address(boxReg, owner_offset));
  testptr(tmpReg, tmpReg);
  jccb(Assembler::notZero, L_decrement_retry);

  // Appears unlocked - try to swing _owner from null to non-null.
  // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
#ifdef _LP64
  Register threadReg = r15_thread;
#else
  get_thread(scrReg);
  Register threadReg = scrReg;
#endif
  lock();
  cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg

  if (RTMRetryCount > 0) {
    // success done else retry
    jccb(Assembler::equal, DONE_LABEL);
    bind(L_decrement_retry);
    // Spin and retry if lock is busy.
    rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
  }
  else {
    bind(L_decrement_retry);
  }
}

#endif // INCLUDE_RTM_OPT

// fast_lock and fast_unlock used by C2

// Because the transitions from emitted code to the runtime
// monitorenter/exit helper stubs are so slow it's critical that
// we inline both the stack-locking fast path and the inflated fast path.
//
// See also: cmpFastLock and cmpFastUnlock.
//
// What follows is a specialized inline transliteration of the code
// in enter() and exit(). If we're concerned about I$ bloat another
// option would be to emit TrySlowEnter and TrySlowExit methods
// at startup-time. These methods would accept arguments as
// (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
// indications in the icc.ZFlag. fast_lock and fast_unlock would simply
// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
// In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
//
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods. That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
// to park() or unpark() threads. We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) explicit barriers or fence operations.
//
// TODO:
//
// *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
//    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
//    Given TLAB allocation, Self is usually manifested in a register, so passing it into
//    the lock operators would typically be faster than reifying Self.
//
// *  Ideally I'd define the primitives as:
//       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
//       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
//    Unfortunately ADLC bugs prevent us from expressing the ideal form.
//    Instead, we're stuck with the rather awkward and brittle register assignments below.
//    Furthermore the register assignments are overconstrained, possibly resulting in
//    sub-optimal code near the synchronization site.
//
// *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
//    Alternately, use a better sp-proximity test.
//
// *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
//    Either one is sufficient to uniquely identify a thread.
//    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
//
// *  Intrinsify notify() and notifyAll() for the common cases where the
//    object is locked by the calling thread but the waitlist is empty.
//    Avoid the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
//
// *  Use jccb and jmpb instead of jcc and jmp to improve code density.
//    But beware of excessive branch density on AMD Opterons.
//
// *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
//    or failure of the fast path. If the fast path fails then we pass
//    control to the slow path, typically in C. In fast_lock and
//    fast_unlock we often branch to DONE_LABEL, just to find that C2
//    will emit a conditional branch immediately after the node.
//    So we have branches to branches and lots of ICC.ZF games.
//    Instead, it might be better to have C2 pass a "FailureLabel"
//    into fast_lock and fast_unlock. In the case of success, control
//    will drop through the node. ICC.ZF is undefined at exit.
//    In the case of failure, the node will branch directly to the
//    FailureLabel.


// obj: object to lock
// box: on-stack box address (displaced header location) - KILLED
// rax: tmp -- KILLED
// scr: tmp -- KILLED
void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
                                  Register scrReg, Register cx1Reg, Register cx2Reg, Register thread,
                                  RTMLockingCounters* rtm_counters,
                                  RTMLockingCounters* stack_rtm_counters,
                                  Metadata* method_data,
                                  bool use_rtm, bool profile_rtm) {
  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
  // Ensure the register assignments are disjoint
  assert(tmpReg == rax, "");

  if (use_rtm) {
    assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
  } else {
    assert(cx1Reg == noreg, "");
    assert(cx2Reg == noreg, "");
    assert_different_registers(objReg, boxReg, tmpReg, scrReg);
  }

  // Possible cases that we'll encounter in fast_lock
  // ------------------------------------------------
  // * Inflated
  //    -- unlocked
  //    -- Locked
  //       = by self
  //       = by other
  // * neutral
  // * stack-locked
  //    -- by self
  //       = sp-proximity test hits
  //       = sp-proximity test generates false-negative
  //    -- by other
  //

  Label IsInflated, DONE_LABEL, NO_COUNT, COUNT;

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(tmpReg, objReg, scrReg);
    movl(tmpReg, Address(tmpReg, Klass::access_flags_offset()));
    testl(tmpReg, JVM_ACC_IS_VALUE_BASED_CLASS);
    jcc(Assembler::notZero, DONE_LABEL);
  }

#if INCLUDE_RTM_OPT
  if (UseRTMForStackLocks && use_rtm) {
    assert(LockingMode != LM_MONITOR, "LockingMode == 0 (LM_MONITOR) and +UseRTMForStackLocks are mutually exclusive");
    rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
                      stack_rtm_counters, method_data, profile_rtm,
                      DONE_LABEL, IsInflated);
  }
#endif // INCLUDE_RTM_OPT

  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // [FETCH]
  testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral
  jcc(Assembler::notZero, IsInflated);

  if (LockingMode == LM_MONITOR) {
    // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
    testptr(objReg, objReg);
  } else {
    assert(LockingMode == LM_LEGACY, "must be");
    // Attempt stack-locking ...
    orptr (tmpReg, markWord::unlocked_value);
    movptr(Address(boxReg, 0), tmpReg);  // Anticipate successful CAS
    lock();
    cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Updates tmpReg
    jcc(Assembler::equal, COUNT);        // Success

    // Recursive locking.
    // The object is stack-locked: markword contains stack pointer to BasicLock.
    // Locked by current thread if difference with current SP is less than one page.
    subptr(tmpReg, rsp);
    // The next instruction sets ZFlag == 1 (Success) if the difference is less than one page.
    andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - (int)os::vm_page_size())) );
    movptr(Address(boxReg, 0), tmpReg);
  }
  jmp(DONE_LABEL);

  bind(IsInflated);
  // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value

#if INCLUDE_RTM_OPT
  // Use the same RTM locking code in 32- and 64-bit VM.
  if (use_rtm) {
    rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
                         rtm_counters, method_data, profile_rtm, DONE_LABEL);
  } else {
#endif // INCLUDE_RTM_OPT

#ifndef _LP64
  // The object is inflated.

  // boxReg refers to the on-stack BasicLock in the current frame.
  // We'd like to write:
  //   set box->_displaced_header = markWord::unused_mark(). Any non-0 value suffices.
  // This is convenient but results in an ST-before-CAS penalty. The following CAS suffers
  // additional latency as we have another ST in the store buffer that must drain.

  // avoid ST-before-CAS
  // register juggle because we need tmpReg for cmpxchgptr below
  movptr(scrReg, boxReg);
  movptr(boxReg, tmpReg); // consider: LEA box, [tmp-2]

  // Optimistic form: consider XORL tmpReg,tmpReg
  movptr(tmpReg, NULL_WORD);

  // Appears unlocked - try to swing _owner from null to non-null.
  // Ideally, I'd manifest "Self" with get_thread and then attempt
  // to CAS the register containing Self into m->Owner.
  // But we don't have enough registers, so instead we can either try to CAS
  // rsp or the address of the box (in scr) into &m->owner. If the CAS succeeds
  // we later store "Self" into m->Owner. Transiently storing a stack address
  // (rsp or the address of the box) into m->owner is harmless.
  // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
  lock();
  cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  movptr(Address(scrReg, 0), 3); // box->_displaced_header = 3
  // If we weren't able to swing _owner from null to the BasicLock
  // then take the slow path.
  jccb  (Assembler::notZero, NO_COUNT);
  // update _owner from BasicLock to thread
  get_thread (scrReg); // beware: clobbers ICCs
  movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
  xorptr(boxReg, boxReg); // set icc.ZFlag = 1 to indicate success

  // If the CAS fails we can either retry or pass control to the slow path.
  // We use the latter tactic.
  // Pass the CAS result in the icc.ZFlag into DONE_LABEL
  // If the CAS was successful ...
  //   Self has acquired the lock
  //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
  // Intentional fall-through into DONE_LABEL ...
#else // _LP64
  // It's inflated and we use scrReg for ObjectMonitor* in this section.
  movq(scrReg, tmpReg);
  xorq(tmpReg, tmpReg);
  lock();
  cmpxchgptr(thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // Unconditionally set box->_displaced_header = markWord::unused_mark().
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
  // Propagate ICC.ZF from CAS above into DONE_LABEL.
  jccb(Assembler::equal, COUNT);       // CAS above succeeded; propagate ZF = 1 (success)

  cmpptr(thread, rax);                 // Check if we are already the owner (recursive lock)
  jccb(Assembler::notEqual, NO_COUNT); // If not recursive, ZF = 0 at this point (fail)
  incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
#endif // _LP64
#if INCLUDE_RTM_OPT
  } // use_rtm()
#endif
  bind(DONE_LABEL);

  // ZFlag == 1 count in fast path
  // ZFlag == 0 count in slow path
  jccb(Assembler::notZero, NO_COUNT); // jump if ZFlag == 0

  bind(COUNT);
  // Count monitors in fast path
  increment(Address(thread, JavaThread::held_monitor_count_offset()));

  xorl(tmpReg, tmpReg); // Set ZF == 1

  bind(NO_COUNT);

  // At NO_COUNT the icc ZFlag is set as follows ...
  //  fast_unlock uses the same protocol.
  // ZFlag == 1 -> Success
  // ZFlag == 0 -> Failure - force control through the slow path
}

// obj: object to unlock
// box: box address (displaced header location), killed. Must be EAX.
// tmp: killed, cannot be obj nor box.
//
// Some commentary on balanced locking:
//
// fast_lock and fast_unlock are emitted only for provably balanced lock sites.
// Methods that don't have provably balanced locking are forced to run in the
// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
// The interpreter provides two properties:
// I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired by the current activation (frame). Recall that the
//      interpreter maintains an on-stack list of locks currently held by
//      a frame.
// I2:  If a method attempts to unlock an object that is not held by the
//      frame, the interpreter throws IMSX.
//
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
// B() doesn't have provably balanced locking so it runs in the interpreter.
// Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O
// is still locked by A().
//
// The only other source of unbalanced locking would be JNI. The "Java Native Interface:
// Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
// should not be unlocked by "normal" java-level locking and vice-versa. The specification
// doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably given that the spec legislates the JNI case as undefined our implementation
// could reasonably *avoid* checking owner in fast_unlock().
// In the interest of performance we elide the m->Owner==Self check in unlock.
// A perfectly viable alternative is to elide the owner check except when
// Xcheck:jni is enabled.
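// (For context: a Java synchronized block compiles to a matched
// monitorenter/monitorexit pair in the same method, with an exception handler
// that performs the exit on the abrupt-completion path, which is what makes
// such a site provably balanced in the first place.)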

void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
  assert(boxReg == rax, "");
  assert_different_registers(objReg, boxReg, tmpReg);

  Label DONE_LABEL, Stacked, COUNT, NO_COUNT;

#if INCLUDE_RTM_OPT
  if (UseRTMForStackLocks && use_rtm) {
    assert(LockingMode != LM_MONITOR, "LockingMode == 0 (LM_MONITOR) and +UseRTMForStackLocks are mutually exclusive");
    Label L_regular_unlock;
    movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
    andptr(tmpReg, markWord::lock_mask_in_place);                     // look at 2 lock bits
    cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 01 unlocked
    jccb(Assembler::notEqual, L_regular_unlock);                      // if !HLE RegularLock
    xend();                                                           // otherwise end...
    jmp(DONE_LABEL);                                                  // ... and we're done
    bind(L_regular_unlock);
  }
#endif

  if (LockingMode == LM_LEGACY) {
    cmpptr(Address(boxReg, 0), NULL_WORD);                            // Examine the displaced header
    jcc   (Assembler::zero, COUNT);                                   // 0 indicates recursive stack-lock
  }
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));   // Examine the object's markword
  if (LockingMode != LM_MONITOR) {
    testptr(tmpReg, markWord::monitor_value);                         // Inflated?
    jcc(Assembler::zero, Stacked);
  }

  // It's inflated.

#if INCLUDE_RTM_OPT
  if (use_rtm) {
    Label L_regular_inflated_unlock;
    int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
    movptr(boxReg, Address(tmpReg, owner_offset));
    testptr(boxReg, boxReg);
    jccb(Assembler::notZero, L_regular_inflated_unlock);
    xend();
    jmp(DONE_LABEL);
    bind(L_regular_inflated_unlock);
  }
#endif

  // Despite our balanced locking property we still check that m->_owner == Self
  // as java routines or native JNI code called by this thread might
  // have released the lock.
  // Refer to the comments in synchronizer.cpp for how we might encode extra
  // state in _succ so we can avoid fetching EntryList|cxq.
  //
  // If there's no contention try a 1-0 exit. That is, exit without
  // a costly MEMBAR or CAS. See synchronizer.cpp for details on how
  // we detect and recover from the race that the 1-0 exit admits.
  //
  // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
  // before it STs null into _owner, releasing the lock. Updates
  // to data protected by the critical section must be visible before
  // we drop the lock (and thus before any other thread could acquire
  // the lock and observe the fields protected by the lock).
  // IA32's memory-model is SPO, so STs are ordered with respect to
  // each other and there's no need for an explicit barrier (fence).
  // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
#ifndef _LP64
  // Note that we could employ various encoding schemes to reduce
  // the number of loads below (currently 4) to just 2 or 3.
  // Refer to the comments in synchronizer.cpp.
  // In practice the chain of fetches doesn't seem to impact performance, however.
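  // 1-0 exit: if the lock is not recursively held and both EntryList and cxq
  // are empty, simply clear _owner (ZF stays set, i.e. success); otherwise
  // branch to DONE_LABEL with ZF == 0 so control passes to the slow path.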
  xorptr(boxReg, boxReg);
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jccb  (Assembler::notZero, DONE_LABEL);
  movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
  jccb  (Assembler::notZero, DONE_LABEL);
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
  jmpb  (DONE_LABEL);
#else // _LP64
  // It's inflated
  Label CheckSucc, LNotRecursive, LSuccess, LGoSlowPath;

  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
  jccb(Assembler::equal, LNotRecursive);

  // Recursive inflated unlock
  decq(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jmpb(LSuccess);

  bind(LNotRecursive);
  movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
  jccb  (Assembler::notZero, CheckSucc);
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
  jmpb  (DONE_LABEL);

  // Try to avoid passing control into the slow_path ...
  bind  (CheckSucc);

  // The following optional optimization can be elided if necessary
  // Effectively: if (succ == null) goto slow path
  // The code reduces the window for a race, however,
  // and thus benefits performance.
  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
  jccb  (Assembler::zero, LGoSlowPath);

  xorptr(boxReg, boxReg);
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);

  // Memory barrier/fence
  // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
  // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
  // This is faster on Nehalem and AMD Shanghai/Barcelona.
  // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
  // We might also restructure (ST Owner=0;barrier;LD _Succ) to
  // (mov box,0; xchgq box, &m->Owner; LD _succ) .
  lock(); addl(Address(rsp, 0), 0);

  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
  jccb  (Assembler::notZero, LSuccess);

  // Rare inopportune interleaving - race.
  // The successor vanished in the small window above.
  // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
  // We need to ensure progress and succession.
  // Try to reacquire the lock.
  // If that fails then the new owner is responsible for succession and this
  // thread needs to take no further action and can exit via the fast path (success).
  // If the re-acquire succeeds then pass control into the slow path.
  // As implemented, this latter mode is horrible because we generate more
  // coherence traffic on the lock *and* artificially extend the critical section
  // length by virtue of passing control into the slow path.

  // box is really RAX -- the following CMPXCHG depends on that binding
  // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
  lock();
  cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // There's no successor so we tried to regrab the lock.
  // If that didn't work, then another thread grabbed the
  // lock so we're done (and exit was a success).
  jccb  (Assembler::notEqual, LSuccess);
  // Intentional fall-through into slow path

  bind  (LGoSlowPath);
  orl   (boxReg, 1);   // set ICC.ZF=0 to indicate failure
  jmpb  (DONE_LABEL);

  bind  (LSuccess);
  testl (boxReg, 0);   // set ICC.ZF=1 to indicate success
  jmpb  (DONE_LABEL);

#endif
  if (LockingMode == LM_LEGACY) {
    bind  (Stacked);
    movptr(tmpReg, Address (boxReg, 0)); // re-fetch
    lock();
    cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
    // Intentional fall-thru into DONE_LABEL
  }

  bind(DONE_LABEL);

  // ZFlag == 1 count in fast path
  // ZFlag == 0 count in slow path
  jccb(Assembler::notZero, NO_COUNT);

  bind(COUNT);
  // Count monitors in fast path
#ifndef _LP64
  get_thread(tmpReg);
  decrementl(Address(tmpReg, JavaThread::held_monitor_count_offset()));
#else // _LP64
  decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
#endif

  xorl(tmpReg, tmpReg); // Set ZF == 1

  bind(NO_COUNT);
}

void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register rax_reg,
                                              Register t, Register thread) {
  assert(LockingMode == LM_LIGHTWEIGHT, "must be");
  assert(rax_reg == rax, "Used for CAS");
  assert_different_registers(obj, box, rax_reg, t, thread);

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. ZF value is irrelevant.
  Label locked;
  // Finish fast lock unsuccessfully. MUST jump with ZF == 0
  Label slow_path;

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(rax_reg, obj, t);
    movl(rax_reg, Address(rax_reg, Klass::access_flags_offset()));
    testl(rax_reg, JVM_ACC_IS_VALUE_BASED_CLASS);
    jcc(Assembler::notZero, slow_path);
  }

  const Register mark = t;

  { // Lightweight Lock

    Label push;

    const Register top = box;

    // Load the mark.
    movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));

    // Prefetch top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    // Check for monitor (0b10).
    testptr(mark, markWord::monitor_value);
    jcc(Assembler::notZero, inflated);

    // Check if lock-stack is full.
    cmpl(top, LockStack::end_offset() - 1);
    jcc(Assembler::greater, slow_path);

    // Check if recursive.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    jccb(Assembler::equal, push);

    // Try to lock. Transition lock bits 0b01 => 0b00
    movptr(rax_reg, mark);
    orptr(rax_reg, markWord::unlocked_value);
    andptr(mark, ~(int32_t)markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    jcc(Assembler::notEqual, slow_path);

    bind(push);
    // After successful lock, push object on lock-stack.
    movptr(Address(thread, top), obj);
    addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
    jmpb(locked);
  }

  { // Handle inflated monitor.
    bind(inflated);

    const Register tagged_monitor = mark;

    // CAS owner (null => current thread).
    xorptr(rax_reg, rax_reg);
    lock(); cmpxchgptr(thread, Address(tagged_monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
    jccb(Assembler::equal, locked);

    // Check if recursive.
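    // The failed CAS above left the current owner in rax_reg; if it is this
    // thread, the monitor is already owned recursively.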
    cmpptr(thread, rax_reg);
    jccb(Assembler::notEqual, slow_path);

    // Recursive.
    increment(Address(tagged_monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  }

  bind(locked);
  increment(Address(thread, JavaThread::held_monitor_count_offset()));
  // Set ZF = 1
  xorl(rax_reg, rax_reg);

#ifdef ASSERT
  // Check that locked label is reached with ZF set.
  Label zf_correct;
  jccb(Assembler::zero, zf_correct);
  stop("Fast Lock ZF != 1");
#endif

  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with ZF not set.
  jccb(Assembler::notZero, zf_correct);
  stop("Fast Lock ZF != 0");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}

void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register reg_rax, Register t, Register thread) {
  assert(LockingMode == LM_LIGHTWEIGHT, "must be");
  assert(reg_rax == rax, "Used for CAS");
  assert_different_registers(obj, reg_rax, t);

  // Handle inflated monitor.
  Label inflated, inflated_check_lock_stack;
  // Finish fast unlock successfully. MUST jump with ZF == 1
  Label unlocked;

  // Assume success.
  decrement(Address(thread, JavaThread::held_monitor_count_offset()));

  const Register mark = t;
  const Register top = reg_rax;

  Label dummy;
  C2FastUnlockLightweightStub* stub = nullptr;

  if (!Compile::current()->output()->in_scratch_emit_size()) {
    stub = new (Compile::current()->comp_arena()) C2FastUnlockLightweightStub(obj, mark, reg_rax, thread);
    Compile::current()->output()->add_stub(stub);
  }

  Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path();
  Label& check_successor = stub == nullptr ? dummy : stub->check_successor();

  { // Lightweight Unlock

    // Load top.
    movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

    // Prefetch mark.
    movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));

    // Check if obj is top of lock-stack.
    cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
    // Top of lock stack was not obj. Must be monitor.
    jcc(Assembler::notEqual, inflated_check_lock_stack);

    // Pop lock-stack.
    DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
    subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);

    // Check if recursive.
    cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
    jcc(Assembler::equal, unlocked);

    // We elide the monitor check, let the CAS fail instead.

    // Try to unlock. Transition lock bits 0b00 => 0b01
    movptr(reg_rax, mark);
    andptr(reg_rax, ~(int32_t)markWord::lock_mask);
    orptr(mark, markWord::unlocked_value);
    lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
    jcc(Assembler::notEqual, push_and_slow_path);
    jmp(unlocked);
  }


  { // Handle inflated monitor.
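    // In debug builds, first walk the remaining lock-stack entries below top to
    // verify that obj is not stack-locked elsewhere before treating it as an
    // inflated monitor.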
    bind(inflated_check_lock_stack);
#ifdef ASSERT
    Label check_done;
    subl(top, oopSize);
    cmpl(top, in_bytes(JavaThread::lock_stack_base_offset()));
    jcc(Assembler::below, check_done);
    cmpptr(obj, Address(thread, top));
    jccb(Assembler::notEqual, inflated_check_lock_stack);
    stop("Fast Unlock lock on stack");
    bind(check_done);
    testptr(mark, markWord::monitor_value);
    jccb(Assembler::notZero, inflated);
    stop("Fast Unlock not monitor");
#endif

    bind(inflated);

    // mark contains the tagged ObjectMonitor*.
    const Register monitor = mark;

#ifndef _LP64
    // Check if recursive.
    xorptr(reg_rax, reg_rax);
    orptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
    jcc(Assembler::notZero, check_successor);

    // Check if the entry lists are empty.
    movptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
    orptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
    jcc(Assembler::notZero, check_successor);

    // Release lock.
    movptr(Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
#else // _LP64
    Label recursive;

    // Check if recursive.
    cmpptr(Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
    jccb(Assembler::notEqual, recursive);

    // Check if the entry lists are empty.
    movptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
    orptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
    jcc(Assembler::notZero, check_successor);

    // Release lock.
    movptr(Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
    jmpb(unlocked);

    // Recursive unlock.
    bind(recursive);
    decrement(Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
    xorl(t, t);
#endif
  }

  bind(unlocked);
  if (stub != nullptr) {
    bind(stub->unlocked_continuation());
  }

#ifdef ASSERT
  // Check that unlocked label is reached with ZF set.
  Label zf_correct;
  jccb(Assembler::zero, zf_correct);
  stop("Fast Unlock ZF != 1");
#endif

  if (stub != nullptr) {
    bind(stub->slow_path_continuation());
  }
#ifdef ASSERT
  // Check that stub->continuation() label is reached with ZF not set.
  jccb(Assembler::notZero, zf_correct);
  stop("Fast Unlock ZF != 0");
  bind(zf_correct);
#endif
  // C2 uses the value of ZF to determine the continuation.
}

//-------------------------------------------------------------------------------------------
// Generic instruction support for use in .ad files C2 code generation

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVD) {
    andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
  } else {
    assert((opcode == Op_NegVD), "opcode should be Op_NegVD");
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
  }
}

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
  if (opcode == Op_AbsVD) {
    vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
  } else {
    assert((opcode == Op_NegVD), "opcode should be Op_NegVD");
    vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVF) {
    andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
  } else {
    assert((opcode == Op_NegVF), "opcode should be Op_NegVF");
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
  if (opcode == Op_AbsVF) {
    vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
  } else {
    assert((opcode == Op_NegVF), "opcode should be Op_NegVF");
    vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
  }
}

void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
  assert(tmp == xnoreg || elem_bt == T_LONG, "unused");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      pminsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pminsw(dst, src);
    } else if (elem_bt == T_INT) {
      pminsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      movdqu(xmm0, dst);
      pcmpgtq(xmm0, src);
      blendvpd(dst, src); // xmm0 as mask
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      pmaxsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pmaxsw(dst, src);
    } else if (elem_bt == T_INT) {
      pmaxsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      movdqu(xmm0, src);
      pcmpgtq(xmm0, dst);
      blendvpd(dst, src); // xmm0 as mask
    }
  }
}

void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
                                 XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                 int vlen_enc) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      vpminsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpminsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpminsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpminsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src1, src2, dst, vlen_enc);
      }
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      vpmaxsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpmaxsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpmaxsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpmaxsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src2, src1, dst, vlen_enc);
      }
    }
  }
}

// Float/Double min max

void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
                                   XMMRegister dst, XMMRegister a, XMMRegister b,
                                   XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                   int vlen_enc) {
  assert(UseAVX > 0, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(a, b, tmp, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);

  if (!is_double_word && is_min) {
    vblendvps(atmp, a, b, a, vlen_enc);
    vblendvps(btmp, b, a, a, vlen_enc);
    vminps(tmp, atmp, btmp, vlen_enc);
    vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvps(dst, tmp, atmp, btmp, vlen_enc);
  } else if (!is_double_word && !is_min) {
    vblendvps(btmp, b, a, b, vlen_enc);
    vblendvps(atmp, a, b, b, vlen_enc);
    vmaxps(tmp, atmp, btmp, vlen_enc);
    vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvps(dst, tmp, atmp, btmp, vlen_enc);
  } else if (is_double_word && is_min) {
    vblendvpd(atmp, a, b, a, vlen_enc);
    vblendvpd(btmp, b, a, a, vlen_enc);
    vminpd(tmp, atmp, btmp, vlen_enc);
    vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
  } else {
    assert(is_double_word && !is_min, "sanity");
    vblendvpd(btmp, b, a, b, vlen_enc);
    vblendvpd(atmp, a, b, b, vlen_enc);
    vmaxpd(tmp, atmp, btmp, vlen_enc);
    vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
  }
}

void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
                                    XMMRegister dst, XMMRegister a, XMMRegister b,
                                    KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
                                    int vlen_enc) {
  assert(UseAVX > 2, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(dst, a, b, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);
  bool merge = true;

  if (!is_double_word && is_min) {
    evpmovd2m(ktmp, a, vlen_enc);
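    // ktmp now holds the per-element sign bits of a; the blends below use it to
    // order the operands so that -0.0 is preferred over +0.0 for min, and the
    // unordered compare afterwards patches up NaN inputs.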
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vminps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (!is_double_word && !is_min) {
    evpmovd2m(ktmp, b, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (is_double_word && is_min) {
    evpmovq2m(ktmp, a, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vminpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  } else {
    assert(is_double_word && !is_min, "sanity");
    evpmovq2m(ktmp, b, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  }
}

// Float/Double signum
void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
  assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");

  Label DONE_LABEL;

  if (opcode == Op_SignumF) {
    assert(UseSSE > 0, "required");
    ucomiss(dst, zero);
    jcc(Assembler::equal, DONE_LABEL);  // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
    jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN
    movflt(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
  } else if (opcode == Op_SignumD) {
    assert(UseSSE > 1, "required");
    ucomisd(dst, zero);
    jcc(Assembler::equal, DONE_LABEL);  // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
    jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN
    movdbl(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
  }

  bind(DONE_LABEL);
}

void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
  if (sign) {
    pmovsxbw(dst, src);
  } else {
    pmovzxbw(dst, src);
  }
}

void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbw(dst, src, vector_len);
  } else {
    vpmovzxbw(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbd(dst, src, vector_len);
  } else {
    vpmovzxbd(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxwd(dst, src, vector_len);
  } else {
    vpmovzxwd(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
                                     int shift, int vector_len) {
  if (opcode == Op_RotateLeftV) {
Op_RotateLeftV) { 1454 if (etype == T_INT) { 1455 evprold(dst, src, shift, vector_len); 1456 } else { 1457 assert(etype == T_LONG, "expected type T_LONG"); 1458 evprolq(dst, src, shift, vector_len); 1459 } 1460 } else { 1461 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV"); 1462 if (etype == T_INT) { 1463 evprord(dst, src, shift, vector_len); 1464 } else { 1465 assert(etype == T_LONG, "expected type T_LONG"); 1466 evprorq(dst, src, shift, vector_len); 1467 } 1468 } 1469 } 1470 1471 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src, 1472 XMMRegister shift, int vector_len) { 1473 if (opcode == Op_RotateLeftV) { 1474 if (etype == T_INT) { 1475 evprolvd(dst, src, shift, vector_len); 1476 } else { 1477 assert(etype == T_LONG, "expected type T_LONG"); 1478 evprolvq(dst, src, shift, vector_len); 1479 } 1480 } else { 1481 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV"); 1482 if (etype == T_INT) { 1483 evprorvd(dst, src, shift, vector_len); 1484 } else { 1485 assert(etype == T_LONG, "expected type T_LONG"); 1486 evprorvq(dst, src, shift, vector_len); 1487 } 1488 } 1489 } 1490 1491 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) { 1492 if (opcode == Op_RShiftVI) { 1493 psrad(dst, shift); 1494 } else if (opcode == Op_LShiftVI) { 1495 pslld(dst, shift); 1496 } else { 1497 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI"); 1498 psrld(dst, shift); 1499 } 1500 } 1501 1502 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) { 1503 switch (opcode) { 1504 case Op_RShiftVI: psrad(dst, shift); break; 1505 case Op_LShiftVI: pslld(dst, shift); break; 1506 case Op_URShiftVI: psrld(dst, shift); break; 1507 1508 default: assert(false, "%s", NodeClassNames[opcode]); 1509 } 1510 } 1511 1512 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 1513 if (opcode == Op_RShiftVI) { 1514 vpsrad(dst, nds, shift, vector_len); 1515 } else if (opcode == Op_LShiftVI) { 1516 vpslld(dst, nds, shift, vector_len); 1517 } else { 1518 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI"); 1519 vpsrld(dst, nds, shift, vector_len); 1520 } 1521 } 1522 1523 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1524 switch (opcode) { 1525 case Op_RShiftVI: vpsrad(dst, src, shift, vlen_enc); break; 1526 case Op_LShiftVI: vpslld(dst, src, shift, vlen_enc); break; 1527 case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break; 1528 1529 default: assert(false, "%s", NodeClassNames[opcode]); 1530 } 1531 } 1532 1533 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) { 1534 switch (opcode) { 1535 case Op_RShiftVB: // fall-through 1536 case Op_RShiftVS: psraw(dst, shift); break; 1537 1538 case Op_LShiftVB: // fall-through 1539 case Op_LShiftVS: psllw(dst, shift); break; 1540 1541 case Op_URShiftVS: // fall-through 1542 case Op_URShiftVB: psrlw(dst, shift); break; 1543 1544 default: assert(false, "%s", NodeClassNames[opcode]); 1545 } 1546 } 1547 1548 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1549 switch (opcode) { 1550 case Op_RShiftVB: // fall-through 1551 case Op_RShiftVS: vpsraw(dst, src, shift, vlen_enc); break; 1552 1553 case Op_LShiftVB: // fall-through 1554 case Op_LShiftVS: vpsllw(dst, src, shift, vlen_enc); break; 1555 1556 case Op_URShiftVS: // 
fall-through 1557 case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break; 1558 1559 default: assert(false, "%s", NodeClassNames[opcode]); 1560 } 1561 } 1562 1563 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) { 1564 switch (opcode) { 1565 case Op_RShiftVL: psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems 1566 case Op_LShiftVL: psllq(dst, shift); break; 1567 case Op_URShiftVL: psrlq(dst, shift); break; 1568 1569 default: assert(false, "%s", NodeClassNames[opcode]); 1570 } 1571 } 1572 1573 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) { 1574 if (opcode == Op_RShiftVL) { 1575 psrlq(dst, shift); // using srl to implement sra on pre-avx512 systems 1576 } else if (opcode == Op_LShiftVL) { 1577 psllq(dst, shift); 1578 } else { 1579 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL"); 1580 psrlq(dst, shift); 1581 } 1582 } 1583 1584 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1585 switch (opcode) { 1586 case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break; 1587 case Op_LShiftVL: vpsllq(dst, src, shift, vlen_enc); break; 1588 case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break; 1589 1590 default: assert(false, "%s", NodeClassNames[opcode]); 1591 } 1592 } 1593 1594 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 1595 if (opcode == Op_RShiftVL) { 1596 evpsraq(dst, nds, shift, vector_len); 1597 } else if (opcode == Op_LShiftVL) { 1598 vpsllq(dst, nds, shift, vector_len); 1599 } else { 1600 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL"); 1601 vpsrlq(dst, nds, shift, vector_len); 1602 } 1603 } 1604 1605 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1606 switch (opcode) { 1607 case Op_RShiftVB: // fall-through 1608 case Op_RShiftVS: // fall-through 1609 case Op_RShiftVI: vpsravd(dst, src, shift, vlen_enc); break; 1610 1611 case Op_LShiftVB: // fall-through 1612 case Op_LShiftVS: // fall-through 1613 case Op_LShiftVI: vpsllvd(dst, src, shift, vlen_enc); break; 1614 1615 case Op_URShiftVB: // fall-through 1616 case Op_URShiftVS: // fall-through 1617 case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break; 1618 1619 default: assert(false, "%s", NodeClassNames[opcode]); 1620 } 1621 } 1622 1623 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1624 switch (opcode) { 1625 case Op_RShiftVB: // fall-through 1626 case Op_RShiftVS: evpsravw(dst, src, shift, vlen_enc); break; 1627 1628 case Op_LShiftVB: // fall-through 1629 case Op_LShiftVS: evpsllvw(dst, src, shift, vlen_enc); break; 1630 1631 case Op_URShiftVB: // fall-through 1632 case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break; 1633 1634 default: assert(false, "%s", NodeClassNames[opcode]); 1635 } 1636 } 1637 1638 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) { 1639 assert(UseAVX >= 2, "required"); 1640 switch (opcode) { 1641 case Op_RShiftVL: { 1642 if (UseAVX > 2) { 1643 assert(tmp == xnoreg, "not used"); 1644 if (!VM_Version::supports_avx512vl()) { 1645 vlen_enc = Assembler::AVX_512bit; 1646 } 1647 evpsravq(dst, src, shift, vlen_enc); 1648 } else { 1649 vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask())); 1650 vpsrlvq(dst, src, shift,
vlen_enc); 1651 vpsrlvq(tmp, tmp, shift, vlen_enc); 1652 vpxor(dst, dst, tmp, vlen_enc); 1653 vpsubq(dst, dst, tmp, vlen_enc); 1654 } 1655 break; 1656 } 1657 case Op_LShiftVL: { 1658 assert(tmp == xnoreg, "not used"); 1659 vpsllvq(dst, src, shift, vlen_enc); 1660 break; 1661 } 1662 case Op_URShiftVL: { 1663 assert(tmp == xnoreg, "not used"); 1664 vpsrlvq(dst, src, shift, vlen_enc); 1665 break; 1666 } 1667 default: assert(false, "%s", NodeClassNames[opcode]); 1668 } 1669 } 1670 1671 // Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst 1672 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) { 1673 assert(opcode == Op_LShiftVB || 1674 opcode == Op_RShiftVB || 1675 opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]); 1676 bool sign = (opcode != Op_URShiftVB); 1677 assert(vector_len == 0, "required"); 1678 vextendbd(sign, dst, src, 1); 1679 vpmovzxbd(vtmp, shift, 1); 1680 varshiftd(opcode, dst, dst, vtmp, 1); 1681 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg); 1682 vextracti128_high(vtmp, dst); 1683 vpackusdw(dst, dst, vtmp, 0); 1684 } 1685 1686 // Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst 1687 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) { 1688 assert(opcode == Op_LShiftVB || 1689 opcode == Op_RShiftVB || 1690 opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]); 1691 bool sign = (opcode != Op_URShiftVB); 1692 int ext_vector_len = vector_len + 1; 1693 vextendbw(sign, dst, src, ext_vector_len); 1694 vpmovzxbw(vtmp, shift, ext_vector_len); 1695 varshiftw(opcode, dst, dst, vtmp, ext_vector_len); 1696 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg); 1697 if (vector_len == 0) { 1698 vextracti128_high(vtmp, dst); 1699 vpackuswb(dst, dst, vtmp, vector_len); 1700 } else { 1701 vextracti64x4_high(vtmp, dst); 1702 vpackuswb(dst, dst, vtmp, vector_len); 1703 vpermq(dst, dst, 0xD8, vector_len); 1704 } 1705 } 1706 1707 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) { 1708 switch(typ) { 1709 case T_BYTE: 1710 pinsrb(dst, val, idx); 1711 break; 1712 case T_SHORT: 1713 pinsrw(dst, val, idx); 1714 break; 1715 case T_INT: 1716 pinsrd(dst, val, idx); 1717 break; 1718 case T_LONG: 1719 pinsrq(dst, val, idx); 1720 break; 1721 default: 1722 assert(false,"Should not reach here."); 1723 break; 1724 } 1725 } 1726 1727 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) { 1728 switch(typ) { 1729 case T_BYTE: 1730 vpinsrb(dst, src, val, idx); 1731 break; 1732 case T_SHORT: 1733 vpinsrw(dst, src, val, idx); 1734 break; 1735 case T_INT: 1736 vpinsrd(dst, src, val, idx); 1737 break; 1738 case T_LONG: 1739 vpinsrq(dst, src, val, idx); 1740 break; 1741 default: 1742 assert(false,"Should not reach here."); 1743 break; 1744 } 1745 } 1746 1747 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) { 1748 switch(typ) { 1749 case T_INT: 1750 vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len); 1751 break; 1752 case T_FLOAT: 1753 vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len); 1754 break; 1755 case T_LONG: 1756 vpgatherdq(dst, Address(base, idx, Address::times_8), mask, 
vector_len); 1757 break; 1758 case T_DOUBLE: 1759 vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len); 1760 break; 1761 default: 1762 assert(false,"Should not reach here."); 1763 break; 1764 } 1765 } 1766 1767 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) { 1768 switch(typ) { 1769 case T_INT: 1770 evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len); 1771 break; 1772 case T_FLOAT: 1773 evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len); 1774 break; 1775 case T_LONG: 1776 evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len); 1777 break; 1778 case T_DOUBLE: 1779 evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len); 1780 break; 1781 default: 1782 assert(false,"Should not reach here."); 1783 break; 1784 } 1785 } 1786 1787 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) { 1788 switch(typ) { 1789 case T_INT: 1790 evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len); 1791 break; 1792 case T_FLOAT: 1793 evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len); 1794 break; 1795 case T_LONG: 1796 evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len); 1797 break; 1798 case T_DOUBLE: 1799 evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len); 1800 break; 1801 default: 1802 assert(false,"Should not reach here."); 1803 break; 1804 } 1805 } 1806 1807 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) { 1808 if (vlen_in_bytes <= 16) { 1809 pxor (dst, dst); 1810 psubb(dst, src); 1811 switch (elem_bt) { 1812 case T_BYTE: /* nothing to do */ break; 1813 case T_SHORT: pmovsxbw(dst, dst); break; 1814 case T_INT: pmovsxbd(dst, dst); break; 1815 case T_FLOAT: pmovsxbd(dst, dst); break; 1816 case T_LONG: pmovsxbq(dst, dst); break; 1817 case T_DOUBLE: pmovsxbq(dst, dst); break; 1818 1819 default: assert(false, "%s", type2name(elem_bt)); 1820 } 1821 } else { 1822 assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, ""); 1823 int vlen_enc = vector_length_encoding(vlen_in_bytes); 1824 1825 vpxor (dst, dst, dst, vlen_enc); 1826 vpsubb(dst, dst, src, is_legacy ? 
AVX_256bit : vlen_enc); 1827 1828 switch (elem_bt) { 1829 case T_BYTE: /* nothing to do */ break; 1830 case T_SHORT: vpmovsxbw(dst, dst, vlen_enc); break; 1831 case T_INT: vpmovsxbd(dst, dst, vlen_enc); break; 1832 case T_FLOAT: vpmovsxbd(dst, dst, vlen_enc); break; 1833 case T_LONG: vpmovsxbq(dst, dst, vlen_enc); break; 1834 case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break; 1835 1836 default: assert(false, "%s", type2name(elem_bt)); 1837 } 1838 } 1839 } 1840 1841 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) { 1842 if (novlbwdq) { 1843 vpmovsxbd(xtmp, src, vlen_enc); 1844 evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()), 1845 Assembler::eq, true, vlen_enc, noreg); 1846 } else { 1847 vpxor(xtmp, xtmp, xtmp, vlen_enc); 1848 vpsubb(xtmp, xtmp, src, vlen_enc); 1849 evpmovb2m(dst, xtmp, vlen_enc); 1850 } 1851 } 1852 1853 void C2_MacroAssembler::load_vector(XMMRegister dst, Address src, int vlen_in_bytes) { 1854 switch (vlen_in_bytes) { 1855 case 4: movdl(dst, src); break; 1856 case 8: movq(dst, src); break; 1857 case 16: movdqu(dst, src); break; 1858 case 32: vmovdqu(dst, src); break; 1859 case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break; 1860 default: ShouldNotReachHere(); 1861 } 1862 } 1863 1864 void C2_MacroAssembler::load_vector(XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) { 1865 assert(rscratch != noreg || always_reachable(src), "missing"); 1866 1867 if (reachable(src)) { 1868 load_vector(dst, as_Address(src), vlen_in_bytes); 1869 } else { 1870 lea(rscratch, src); 1871 load_vector(dst, Address(rscratch, 0), vlen_in_bytes); 1872 } 1873 } 1874 1875 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) { 1876 int vlen_enc = vector_length_encoding(vlen); 1877 if (VM_Version::supports_avx()) { 1878 if (bt == T_LONG) { 1879 if (VM_Version::supports_avx2()) { 1880 vpbroadcastq(dst, src, vlen_enc); 1881 } else { 1882 vmovddup(dst, src, vlen_enc); 1883 } 1884 } else if (bt == T_DOUBLE) { 1885 if (vlen_enc != Assembler::AVX_128bit) { 1886 vbroadcastsd(dst, src, vlen_enc, noreg); 1887 } else { 1888 vmovddup(dst, src, vlen_enc); 1889 } 1890 } else { 1891 if (VM_Version::supports_avx2() && is_integral_type(bt)) { 1892 vpbroadcastd(dst, src, vlen_enc); 1893 } else { 1894 vbroadcastss(dst, src, vlen_enc); 1895 } 1896 } 1897 } else if (VM_Version::supports_sse3()) { 1898 movddup(dst, src); 1899 } else { 1900 movq(dst, src); 1901 if (vlen == 16) { 1902 punpcklqdq(dst, dst); 1903 } 1904 } 1905 } 1906 1907 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) { 1908 // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64. 1909 int offset = exact_log2(type2aelembytes(bt)) << 6; 1910 if (is_floating_point_type(bt)) { 1911 offset += 128; 1912 } 1913 ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset); 1914 load_vector(dst, addr, vlen_in_bytes); 1915 } 1916 1917 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles. 
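// The reductions below repeatedly fold the upper half of the vector onto the lower half
// (vextracti64x4_high / vextracti128_high / pshufd) using reduce_operation_128/256; the
// integer variants then merge the incoming scalar (src1) into the final element before
// moving it into the general-purpose result register.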
1918 1919 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) { 1920 int vector_len = Assembler::AVX_128bit; 1921 1922 switch (opcode) { 1923 case Op_AndReductionV: pand(dst, src); break; 1924 case Op_OrReductionV: por (dst, src); break; 1925 case Op_XorReductionV: pxor(dst, src); break; 1926 case Op_MinReductionV: 1927 switch (typ) { 1928 case T_BYTE: pminsb(dst, src); break; 1929 case T_SHORT: pminsw(dst, src); break; 1930 case T_INT: pminsd(dst, src); break; 1931 case T_LONG: assert(UseAVX > 2, "required"); 1932 vpminsq(dst, dst, src, Assembler::AVX_128bit); break; 1933 default: assert(false, "wrong type"); 1934 } 1935 break; 1936 case Op_MaxReductionV: 1937 switch (typ) { 1938 case T_BYTE: pmaxsb(dst, src); break; 1939 case T_SHORT: pmaxsw(dst, src); break; 1940 case T_INT: pmaxsd(dst, src); break; 1941 case T_LONG: assert(UseAVX > 2, "required"); 1942 vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break; 1943 default: assert(false, "wrong type"); 1944 } 1945 break; 1946 case Op_AddReductionVF: addss(dst, src); break; 1947 case Op_AddReductionVD: addsd(dst, src); break; 1948 case Op_AddReductionVI: 1949 switch (typ) { 1950 case T_BYTE: paddb(dst, src); break; 1951 case T_SHORT: paddw(dst, src); break; 1952 case T_INT: paddd(dst, src); break; 1953 default: assert(false, "wrong type"); 1954 } 1955 break; 1956 case Op_AddReductionVL: paddq(dst, src); break; 1957 case Op_MulReductionVF: mulss(dst, src); break; 1958 case Op_MulReductionVD: mulsd(dst, src); break; 1959 case Op_MulReductionVI: 1960 switch (typ) { 1961 case T_SHORT: pmullw(dst, src); break; 1962 case T_INT: pmulld(dst, src); break; 1963 default: assert(false, "wrong type"); 1964 } 1965 break; 1966 case Op_MulReductionVL: assert(UseAVX > 2, "required"); 1967 evpmullq(dst, dst, src, vector_len); break; 1968 default: assert(false, "wrong opcode"); 1969 } 1970 } 1971 1972 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 1973 int vector_len = Assembler::AVX_256bit; 1974 1975 switch (opcode) { 1976 case Op_AndReductionV: vpand(dst, src1, src2, vector_len); break; 1977 case Op_OrReductionV: vpor (dst, src1, src2, vector_len); break; 1978 case Op_XorReductionV: vpxor(dst, src1, src2, vector_len); break; 1979 case Op_MinReductionV: 1980 switch (typ) { 1981 case T_BYTE: vpminsb(dst, src1, src2, vector_len); break; 1982 case T_SHORT: vpminsw(dst, src1, src2, vector_len); break; 1983 case T_INT: vpminsd(dst, src1, src2, vector_len); break; 1984 case T_LONG: assert(UseAVX > 2, "required"); 1985 vpminsq(dst, src1, src2, vector_len); break; 1986 default: assert(false, "wrong type"); 1987 } 1988 break; 1989 case Op_MaxReductionV: 1990 switch (typ) { 1991 case T_BYTE: vpmaxsb(dst, src1, src2, vector_len); break; 1992 case T_SHORT: vpmaxsw(dst, src1, src2, vector_len); break; 1993 case T_INT: vpmaxsd(dst, src1, src2, vector_len); break; 1994 case T_LONG: assert(UseAVX > 2, "required"); 1995 vpmaxsq(dst, src1, src2, vector_len); break; 1996 default: assert(false, "wrong type"); 1997 } 1998 break; 1999 case Op_AddReductionVI: 2000 switch (typ) { 2001 case T_BYTE: vpaddb(dst, src1, src2, vector_len); break; 2002 case T_SHORT: vpaddw(dst, src1, src2, vector_len); break; 2003 case T_INT: vpaddd(dst, src1, src2, vector_len); break; 2004 default: assert(false, "wrong type"); 2005 } 2006 break; 2007 case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break; 2008 case Op_MulReductionVI: 2009 switch (typ) { 2010 
case T_SHORT: vpmullw(dst, src1, src2, vector_len); break; 2011 case T_INT: vpmulld(dst, src1, src2, vector_len); break; 2012 default: assert(false, "wrong type"); 2013 } 2014 break; 2015 case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break; 2016 default: assert(false, "wrong opcode"); 2017 } 2018 } 2019 2020 void C2_MacroAssembler::reduce_fp(int opcode, int vlen, 2021 XMMRegister dst, XMMRegister src, 2022 XMMRegister vtmp1, XMMRegister vtmp2) { 2023 switch (opcode) { 2024 case Op_AddReductionVF: 2025 case Op_MulReductionVF: 2026 reduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 2027 break; 2028 2029 case Op_AddReductionVD: 2030 case Op_MulReductionVD: 2031 reduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 2032 break; 2033 2034 default: assert(false, "wrong opcode"); 2035 } 2036 } 2037 2038 void C2_MacroAssembler::reduceB(int opcode, int vlen, 2039 Register dst, Register src1, XMMRegister src2, 2040 XMMRegister vtmp1, XMMRegister vtmp2) { 2041 switch (vlen) { 2042 case 8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2043 case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2044 case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2045 case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2046 2047 default: assert(false, "wrong vector length"); 2048 } 2049 } 2050 2051 void C2_MacroAssembler::mulreduceB(int opcode, int vlen, 2052 Register dst, Register src1, XMMRegister src2, 2053 XMMRegister vtmp1, XMMRegister vtmp2) { 2054 switch (vlen) { 2055 case 8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2056 case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2057 case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2058 case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2059 2060 default: assert(false, "wrong vector length"); 2061 } 2062 } 2063 2064 void C2_MacroAssembler::reduceS(int opcode, int vlen, 2065 Register dst, Register src1, XMMRegister src2, 2066 XMMRegister vtmp1, XMMRegister vtmp2) { 2067 switch (vlen) { 2068 case 4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2069 case 8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2070 case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2071 case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2072 2073 default: assert(false, "wrong vector length"); 2074 } 2075 } 2076 2077 void C2_MacroAssembler::reduceI(int opcode, int vlen, 2078 Register dst, Register src1, XMMRegister src2, 2079 XMMRegister vtmp1, XMMRegister vtmp2) { 2080 switch (vlen) { 2081 case 2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2082 case 4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2083 case 8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2084 case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2085 2086 default: assert(false, "wrong vector length"); 2087 } 2088 } 2089 2090 #ifdef _LP64 2091 void C2_MacroAssembler::reduceL(int opcode, int vlen, 2092 Register dst, Register src1, XMMRegister src2, 2093 XMMRegister vtmp1, XMMRegister vtmp2) { 2094 switch (vlen) { 2095 case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2096 case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2097 case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2098 2099 default: assert(false, "wrong vector length"); 2100 } 2101 } 2102 #endif // _LP64 2103 2104 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister 
src, XMMRegister vtmp1, XMMRegister vtmp2) { 2105 switch (vlen) { 2106 case 2: 2107 assert(vtmp2 == xnoreg, ""); 2108 reduce2F(opcode, dst, src, vtmp1); 2109 break; 2110 case 4: 2111 assert(vtmp2 == xnoreg, ""); 2112 reduce4F(opcode, dst, src, vtmp1); 2113 break; 2114 case 8: 2115 reduce8F(opcode, dst, src, vtmp1, vtmp2); 2116 break; 2117 case 16: 2118 reduce16F(opcode, dst, src, vtmp1, vtmp2); 2119 break; 2120 default: assert(false, "wrong vector length"); 2121 } 2122 } 2123 2124 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2125 switch (vlen) { 2126 case 2: 2127 assert(vtmp2 == xnoreg, ""); 2128 reduce2D(opcode, dst, src, vtmp1); 2129 break; 2130 case 4: 2131 reduce4D(opcode, dst, src, vtmp1, vtmp2); 2132 break; 2133 case 8: 2134 reduce8D(opcode, dst, src, vtmp1, vtmp2); 2135 break; 2136 default: assert(false, "wrong vector length"); 2137 } 2138 } 2139 2140 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2141 if (opcode == Op_AddReductionVI) { 2142 if (vtmp1 != src2) { 2143 movdqu(vtmp1, src2); 2144 } 2145 phaddd(vtmp1, vtmp1); 2146 } else { 2147 pshufd(vtmp1, src2, 0x1); 2148 reduce_operation_128(T_INT, opcode, vtmp1, src2); 2149 } 2150 movdl(vtmp2, src1); 2151 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2152 movdl(dst, vtmp1); 2153 } 2154 2155 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2156 if (opcode == Op_AddReductionVI) { 2157 if (vtmp1 != src2) { 2158 movdqu(vtmp1, src2); 2159 } 2160 phaddd(vtmp1, src2); 2161 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2162 } else { 2163 pshufd(vtmp2, src2, 0xE); 2164 reduce_operation_128(T_INT, opcode, vtmp2, src2); 2165 reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2166 } 2167 } 2168 2169 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2170 if (opcode == Op_AddReductionVI) { 2171 vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit); 2172 vextracti128_high(vtmp2, vtmp1); 2173 vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit); 2174 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2175 } else { 2176 vextracti128_high(vtmp1, src2); 2177 reduce_operation_128(T_INT, opcode, vtmp1, src2); 2178 reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2179 } 2180 } 2181 2182 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2183 vextracti64x4_high(vtmp2, src2); 2184 reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2); 2185 reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2186 } 2187 2188 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2189 pshufd(vtmp2, src2, 0x1); 2190 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 2191 movdqu(vtmp1, vtmp2); 2192 psrldq(vtmp1, 2); 2193 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 2194 movdqu(vtmp2, vtmp1); 2195 psrldq(vtmp2, 1); 2196 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 2197 movdl(vtmp2, src1); 2198 pmovsxbd(vtmp1, vtmp1); 2199 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2200 pextrb(dst, vtmp1, 0x0); 2201 movsbl(dst, dst); 2202 } 2203 2204 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2205 
pshufd(vtmp1, src2, 0xE); 2206 reduce_operation_128(T_BYTE, opcode, vtmp1, src2); 2207 reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2208 } 2209 2210 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2211 vextracti128_high(vtmp2, src2); 2212 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 2213 reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2214 } 2215 2216 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2217 vextracti64x4_high(vtmp1, src2); 2218 reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2); 2219 reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2220 } 2221 2222 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2223 pmovsxbw(vtmp2, src2); 2224 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2225 } 2226 2227 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2228 if (UseAVX > 1) { 2229 int vector_len = Assembler::AVX_256bit; 2230 vpmovsxbw(vtmp1, src2, vector_len); 2231 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2232 } else { 2233 pmovsxbw(vtmp2, src2); 2234 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2235 pshufd(vtmp2, src2, 0x1); 2236 pmovsxbw(vtmp2, src2); 2237 reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2238 } 2239 } 2240 2241 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2242 if (UseAVX > 2 && VM_Version::supports_avx512bw()) { 2243 int vector_len = Assembler::AVX_512bit; 2244 vpmovsxbw(vtmp1, src2, vector_len); 2245 reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2246 } else { 2247 assert(UseAVX >= 2,"Should not reach here."); 2248 mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); 2249 vextracti128_high(vtmp2, src2); 2250 mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2251 } 2252 } 2253 2254 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2255 mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); 2256 vextracti64x4_high(vtmp2, src2); 2257 mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2258 } 2259 2260 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2261 if (opcode == Op_AddReductionVI) { 2262 if (vtmp1 != src2) { 2263 movdqu(vtmp1, src2); 2264 } 2265 phaddw(vtmp1, vtmp1); 2266 phaddw(vtmp1, vtmp1); 2267 } else { 2268 pshufd(vtmp2, src2, 0x1); 2269 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2270 movdqu(vtmp1, vtmp2); 2271 psrldq(vtmp1, 2); 2272 reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2); 2273 } 2274 movdl(vtmp2, src1); 2275 pmovsxwd(vtmp1, vtmp1); 2276 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2277 pextrw(dst, vtmp1, 0x0); 2278 movswl(dst, dst); 2279 } 2280 2281 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2282 if (opcode == Op_AddReductionVI) { 2283 if (vtmp1 != src2) { 2284 movdqu(vtmp1, src2); 2285 } 2286 phaddw(vtmp1, src2); 2287 } else { 2288 pshufd(vtmp1, src2, 0xE); 2289 reduce_operation_128(T_SHORT, opcode, vtmp1, src2); 2290 } 2291 reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2292 } 2293 2294 void 
C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2295 if (opcode == Op_AddReductionVI) { 2296 int vector_len = Assembler::AVX_256bit; 2297 vphaddw(vtmp2, src2, src2, vector_len); 2298 vpermq(vtmp2, vtmp2, 0xD8, vector_len); 2299 } else { 2300 vextracti128_high(vtmp2, src2); 2301 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2302 } 2303 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2304 } 2305 2306 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2307 int vector_len = Assembler::AVX_256bit; 2308 vextracti64x4_high(vtmp1, src2); 2309 reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2); 2310 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2311 } 2312 2313 #ifdef _LP64 2314 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2315 pshufd(vtmp2, src2, 0xE); 2316 reduce_operation_128(T_LONG, opcode, vtmp2, src2); 2317 movdq(vtmp1, src1); 2318 reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2); 2319 movdq(dst, vtmp1); 2320 } 2321 2322 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2323 vextracti128_high(vtmp1, src2); 2324 reduce_operation_128(T_LONG, opcode, vtmp1, src2); 2325 reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2326 } 2327 2328 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2329 vextracti64x4_high(vtmp2, src2); 2330 reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2); 2331 reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2332 } 2333 2334 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) { 2335 mov64(temp, -1L); 2336 bzhiq(temp, temp, len); 2337 kmovql(dst, temp); 2338 } 2339 #endif // _LP64 2340 2341 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2342 reduce_operation_128(T_FLOAT, opcode, dst, src); 2343 pshufd(vtmp, src, 0x1); 2344 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2345 } 2346 2347 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2348 reduce2F(opcode, dst, src, vtmp); 2349 pshufd(vtmp, src, 0x2); 2350 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2351 pshufd(vtmp, src, 0x3); 2352 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2353 } 2354 2355 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2356 reduce4F(opcode, dst, src, vtmp2); 2357 vextractf128_high(vtmp2, src); 2358 reduce4F(opcode, dst, vtmp2, vtmp1); 2359 } 2360 2361 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2362 reduce8F(opcode, dst, src, vtmp1, vtmp2); 2363 vextracti64x4_high(vtmp1, src); 2364 reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2); 2365 } 2366 2367 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2368 reduce_operation_128(T_DOUBLE, opcode, dst, src); 2369 pshufd(vtmp, src, 0xE); 2370 reduce_operation_128(T_DOUBLE, opcode, dst, vtmp); 2371 } 2372 2373 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2374 reduce2D(opcode, dst, src, vtmp2); 2375 vextractf128_high(vtmp2, src); 2376 
reduce2D(opcode, dst, vtmp2, vtmp1); 2377 } 2378 2379 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2380 reduce4D(opcode, dst, src, vtmp1, vtmp2); 2381 vextracti64x4_high(vtmp1, src); 2382 reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2); 2383 } 2384 2385 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) { 2386 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2387 } 2388 2389 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) { 2390 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2391 } 2392 2393 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask, 2394 int vec_enc) { 2395 switch(elem_bt) { 2396 case T_INT: 2397 case T_FLOAT: 2398 vmaskmovps(dst, src, mask, vec_enc); 2399 break; 2400 case T_LONG: 2401 case T_DOUBLE: 2402 vmaskmovpd(dst, src, mask, vec_enc); 2403 break; 2404 default: 2405 fatal("Unsupported type %s", type2name(elem_bt)); 2406 break; 2407 } 2408 } 2409 2410 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask, 2411 int vec_enc) { 2412 switch(elem_bt) { 2413 case T_INT: 2414 case T_FLOAT: 2415 vmaskmovps(dst, src, mask, vec_enc); 2416 break; 2417 case T_LONG: 2418 case T_DOUBLE: 2419 vmaskmovpd(dst, src, mask, vec_enc); 2420 break; 2421 default: 2422 fatal("Unsupported type %s", type2name(elem_bt)); 2423 break; 2424 } 2425 } 2426 2427 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid, 2428 XMMRegister dst, XMMRegister src, 2429 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2430 XMMRegister xmm_0, XMMRegister xmm_1) { 2431 const int permconst[] = {1, 14}; 2432 XMMRegister wsrc = src; 2433 XMMRegister wdst = xmm_0; 2434 XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1; 2435 2436 int vlen_enc = Assembler::AVX_128bit; 2437 if (vlen == 16) { 2438 vlen_enc = Assembler::AVX_256bit; 2439 } 2440 2441 for (int i = log2(vlen) - 1; i >=0; i--) { 2442 if (i == 0 && !is_dst_valid) { 2443 wdst = dst; 2444 } 2445 if (i == 3) { 2446 vextracti64x4_high(wtmp, wsrc); 2447 } else if (i == 2) { 2448 vextracti128_high(wtmp, wsrc); 2449 } else { // i = [0,1] 2450 vpermilps(wtmp, wsrc, permconst[i], vlen_enc); 2451 } 2452 vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2453 wsrc = wdst; 2454 vlen_enc = Assembler::AVX_128bit; 2455 } 2456 if (is_dst_valid) { 2457 vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2458 } 2459 } 2460 2461 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src, 2462 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2463 XMMRegister xmm_0, XMMRegister xmm_1) { 2464 XMMRegister wsrc = src; 2465 XMMRegister wdst = xmm_0; 2466 XMMRegister wtmp = (xmm_1 == xnoreg) ? 
xmm_0: xmm_1; 2467 int vlen_enc = Assembler::AVX_128bit; 2468 if (vlen == 8) { 2469 vlen_enc = Assembler::AVX_256bit; 2470 } 2471 for (int i = log2(vlen) - 1; i >=0; i--) { 2472 if (i == 0 && !is_dst_valid) { 2473 wdst = dst; 2474 } 2475 if (i == 1) { 2476 vextracti128_high(wtmp, wsrc); 2477 } else if (i == 2) { 2478 vextracti64x4_high(wtmp, wsrc); 2479 } else { 2480 assert(i == 0, "%d", i); 2481 vpermilpd(wtmp, wsrc, 1, vlen_enc); 2482 } 2483 vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2484 wsrc = wdst; 2485 vlen_enc = Assembler::AVX_128bit; 2486 } 2487 if (is_dst_valid) { 2488 vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2489 } 2490 } 2491 2492 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) { 2493 switch (bt) { 2494 case T_BYTE: pextrb(dst, src, idx); break; 2495 case T_SHORT: pextrw(dst, src, idx); break; 2496 case T_INT: pextrd(dst, src, idx); break; 2497 case T_LONG: pextrq(dst, src, idx); break; 2498 2499 default: 2500 assert(false,"Should not reach here."); 2501 break; 2502 } 2503 } 2504 2505 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) { 2506 int esize = type2aelembytes(typ); 2507 int elem_per_lane = 16/esize; 2508 int lane = elemindex / elem_per_lane; 2509 int eindex = elemindex % elem_per_lane; 2510 2511 if (lane >= 2) { 2512 assert(UseAVX > 2, "required"); 2513 vextractf32x4(dst, src, lane & 3); 2514 return dst; 2515 } else if (lane > 0) { 2516 assert(UseAVX > 0, "required"); 2517 vextractf128(dst, src, lane); 2518 return dst; 2519 } else { 2520 return src; 2521 } 2522 } 2523 2524 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) { 2525 if (typ == T_BYTE) { 2526 movsbl(dst, dst); 2527 } else if (typ == T_SHORT) { 2528 movswl(dst, dst); 2529 } 2530 } 2531 2532 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) { 2533 int esize = type2aelembytes(typ); 2534 int elem_per_lane = 16/esize; 2535 int eindex = elemindex % elem_per_lane; 2536 assert(is_integral_type(typ),"required"); 2537 2538 if (eindex == 0) { 2539 if (typ == T_LONG) { 2540 movq(dst, src); 2541 } else { 2542 movdl(dst, src); 2543 movsxl(typ, dst); 2544 } 2545 } else { 2546 extract(typ, dst, src, eindex); 2547 movsxl(typ, dst); 2548 } 2549 } 2550 2551 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) { 2552 int esize = type2aelembytes(typ); 2553 int elem_per_lane = 16/esize; 2554 int eindex = elemindex % elem_per_lane; 2555 assert((typ == T_FLOAT || typ == T_DOUBLE),"required"); 2556 2557 if (eindex == 0) { 2558 movq(dst, src); 2559 } else { 2560 if (typ == T_FLOAT) { 2561 if (UseAVX == 0) { 2562 movdqu(dst, src); 2563 shufps(dst, dst, eindex); 2564 } else { 2565 vshufps(dst, src, src, eindex, Assembler::AVX_128bit); 2566 } 2567 } else { 2568 if (UseAVX == 0) { 2569 movdqu(dst, src); 2570 psrldq(dst, eindex*esize); 2571 } else { 2572 vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit); 2573 } 2574 movq(dst, dst); 2575 } 2576 } 2577 // Zero upper bits 2578 if (typ == T_FLOAT) { 2579 if (UseAVX == 0) { 2580 assert(vtmp != xnoreg, "required."); 2581 movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg); 2582 pand(dst, vtmp); 2583 } else { 2584 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg); 2585 } 2586 } 2587 } 2588 2589 void 
C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) { 2590 switch(typ) { 2591 case T_BYTE: 2592 case T_BOOLEAN: 2593 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2594 break; 2595 case T_SHORT: 2596 case T_CHAR: 2597 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2598 break; 2599 case T_INT: 2600 case T_FLOAT: 2601 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2602 break; 2603 case T_LONG: 2604 case T_DOUBLE: 2605 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2606 break; 2607 default: 2608 assert(false,"Should not reach here."); 2609 break; 2610 } 2611 } 2612 2613 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) { 2614 assert(rscratch != noreg || always_reachable(src2), "missing"); 2615 2616 switch(typ) { 2617 case T_BOOLEAN: 2618 case T_BYTE: 2619 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2620 break; 2621 case T_CHAR: 2622 case T_SHORT: 2623 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2624 break; 2625 case T_INT: 2626 case T_FLOAT: 2627 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2628 break; 2629 case T_LONG: 2630 case T_DOUBLE: 2631 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2632 break; 2633 default: 2634 assert(false,"Should not reach here."); 2635 break; 2636 } 2637 } 2638 2639 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) { 2640 switch(typ) { 2641 case T_BYTE: 2642 evpblendmb(dst, kmask, src1, src2, merge, vector_len); 2643 break; 2644 case T_SHORT: 2645 evpblendmw(dst, kmask, src1, src2, merge, vector_len); 2646 break; 2647 case T_INT: 2648 case T_FLOAT: 2649 evpblendmd(dst, kmask, src1, src2, merge, vector_len); 2650 break; 2651 case T_LONG: 2652 case T_DOUBLE: 2653 evpblendmq(dst, kmask, src1, src2, merge, vector_len); 2654 break; 2655 default: 2656 assert(false,"Should not reach here."); 2657 break; 2658 } 2659 } 2660 2661 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) { 2662 assert(vlen_in_bytes <= 32, ""); 2663 int esize = type2aelembytes(bt); 2664 if (vlen_in_bytes == 32) { 2665 assert(vtmp == xnoreg, "required."); 2666 if (esize >= 4) { 2667 vtestps(src1, src2, AVX_256bit); 2668 } else { 2669 vptest(src1, src2, AVX_256bit); 2670 } 2671 return; 2672 } 2673 if (vlen_in_bytes < 16) { 2674 // Duplicate the lower part to fill the whole register, 2675 // Don't need to do so for src2 2676 assert(vtmp != xnoreg, "required"); 2677 int shuffle_imm = (vlen_in_bytes == 4) ? 
0x00 : 0x04; 2678 pshufd(vtmp, src1, shuffle_imm); 2679 } else { 2680 assert(vtmp == xnoreg, "required"); 2681 vtmp = src1; 2682 } 2683 if (esize >= 4 && VM_Version::supports_avx()) { 2684 vtestps(vtmp, src2, AVX_128bit); 2685 } else { 2686 ptest(vtmp, src2); 2687 } 2688 } 2689 2690 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) { 2691 assert(UseAVX >= 2, "required"); 2692 #ifdef ASSERT 2693 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2694 bool is_bw_supported = VM_Version::supports_avx512bw(); 2695 if (is_bw && !is_bw_supported) { 2696 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2697 assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16), 2698 "XMM register should be 0-15"); 2699 } 2700 #endif // ASSERT 2701 switch (elem_bt) { 2702 case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return; 2703 case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return; 2704 case T_INT: vpaddd(dst, src1, src2, vlen_enc); return; 2705 case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return; 2706 case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return; 2707 case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return; 2708 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2709 } 2710 } 2711 2712 #ifdef _LP64 2713 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) { 2714 assert(UseAVX >= 2, "required"); 2715 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2716 bool is_vl = vlen_enc != Assembler::AVX_512bit; 2717 if ((UseAVX > 2) && 2718 (!is_bw || VM_Version::supports_avx512bw()) && 2719 (!is_vl || VM_Version::supports_avx512vl())) { 2720 switch (elem_bt) { 2721 case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return; 2722 case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return; 2723 case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return; 2724 case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return; 2725 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2726 } 2727 } else { 2728 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2729 assert((dst->encoding() < 16),"XMM register should be 0-15"); 2730 switch (elem_bt) { 2731 case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return; 2732 case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return; 2733 case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return; 2734 case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return; 2735 case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return; 2736 case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return; 2737 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2738 } 2739 } 2740 } 2741 #endif 2742 2743 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) { 2744 switch (to_elem_bt) { 2745 case T_SHORT: 2746 vpmovsxbw(dst, src, vlen_enc); 2747 break; 2748 case T_INT: 2749 vpmovsxbd(dst, src, vlen_enc); 2750 break; 2751 case T_FLOAT: 2752 vpmovsxbd(dst, src, vlen_enc); 2753 vcvtdq2ps(dst, dst, vlen_enc); 2754 break; 2755 case T_LONG: 2756 vpmovsxbq(dst, src, vlen_enc); 2757 break; 2758 case T_DOUBLE: { 2759 int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? 
Assembler::AVX_256bit : Assembler::AVX_128bit; 2760 vpmovsxbd(dst, src, mid_vlen_enc); 2761 vcvtdq2pd(dst, dst, vlen_enc); 2762 break; 2763 } 2764 default: 2765 fatal("Unsupported type %s", type2name(to_elem_bt)); 2766 break; 2767 } 2768 } 2769 2770 //------------------------------------------------------------------------------------------- 2771 2772 // IndexOf for constant substrings with size >= 8 chars 2773 // which don't need to be loaded through stack. 2774 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2, 2775 Register cnt1, Register cnt2, 2776 int int_cnt2, Register result, 2777 XMMRegister vec, Register tmp, 2778 int ae) { 2779 ShortBranchVerifier sbv(this); 2780 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2781 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2782 2783 // This method uses the pcmpestri instruction with bound registers 2784 // inputs: 2785 // xmm - substring 2786 // rax - substring length (elements count) 2787 // mem - scanned string 2788 // rdx - string length (elements count) 2789 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2790 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2791 // outputs: 2792 // rcx - matched index in string 2793 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2794 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2795 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2796 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2797 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1; 2798 2799 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, 2800 RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR, 2801 MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE; 2802 2803 // Note, inline_string_indexOf() generates checks: 2804 // if (substr.count > string.count) return -1; 2805 // if (substr.count == 0) return 0; 2806 assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars"); 2807 2808 // Load substring. 2809 if (ae == StrIntrinsicNode::UL) { 2810 pmovzxbw(vec, Address(str2, 0)); 2811 } else { 2812 movdqu(vec, Address(str2, 0)); 2813 } 2814 movl(cnt2, int_cnt2); 2815 movptr(result, str1); // string addr 2816 2817 if (int_cnt2 > stride) { 2818 jmpb(SCAN_TO_SUBSTR); 2819 2820 // Reload substr for rescan, this code 2821 // is executed only for large substrings (> 8 chars) 2822 bind(RELOAD_SUBSTR); 2823 if (ae == StrIntrinsicNode::UL) { 2824 pmovzxbw(vec, Address(str2, 0)); 2825 } else { 2826 movdqu(vec, Address(str2, 0)); 2827 } 2828 negptr(cnt2); // Jumped here with negative cnt2, convert to positive 2829 2830 bind(RELOAD_STR); 2831 // We came here after the beginning of the substring was 2832 // matched but the rest of it was not so we need to search 2833 // again. Start from the next element after the previous match. 2834 2835 // cnt2 is number of substring remaining elements and 2836 // cnt1 is number of string remaining elements when cmp failed.
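// Both counters are rewound to the values they had when this candidate was first
// found, and the scan below restarts one element past the previous match position.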
2837 // Restored cnt1 = cnt1 - cnt2 + int_cnt2 2838 subl(cnt1, cnt2); 2839 addl(cnt1, int_cnt2); 2840 movl(cnt2, int_cnt2); // Now restore cnt2 2841 2842 decrementl(cnt1); // Shift to next element 2843 cmpl(cnt1, cnt2); 2844 jcc(Assembler::negative, RET_NOT_FOUND); // Left less than substring 2845 2846 addptr(result, (1<<scale1)); 2847 2848 } // (int_cnt2 > 8) 2849 2850 // Scan string for start of substr in 16-byte vectors 2851 bind(SCAN_TO_SUBSTR); 2852 pcmpestri(vec, Address(result, 0), mode); 2853 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 2854 subl(cnt1, stride); 2855 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 2856 cmpl(cnt1, cnt2); 2857 jccb(Assembler::negative, RET_NOT_FOUND); // Left less than substring 2858 addptr(result, 16); 2859 jmpb(SCAN_TO_SUBSTR); 2860 2861 // Found a potential substr 2862 bind(FOUND_CANDIDATE); 2863 // Matched whole vector if first element matched (tmp(rcx) == 0). 2864 if (int_cnt2 == stride) { 2865 jccb(Assembler::overflow, RET_FOUND); // OF == 1 2866 } else { // int_cnt2 > 8 2867 jccb(Assembler::overflow, FOUND_SUBSTR); 2868 } 2869 // After pcmpestri tmp(rcx) contains matched element index 2870 // Compute start addr of substr 2871 lea(result, Address(result, tmp, scale1)); 2872 2873 // Make sure string is still long enough 2874 subl(cnt1, tmp); 2875 cmpl(cnt1, cnt2); 2876 if (int_cnt2 == stride) { 2877 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 2878 } else { // int_cnt2 > 8 2879 jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD); 2880 } 2881 // Left less than substring. 2882 2883 bind(RET_NOT_FOUND); 2884 movl(result, -1); 2885 jmp(EXIT); 2886 2887 if (int_cnt2 > stride) { 2888 // This code is optimized for the case when whole substring 2889 // is matched if its head is matched. 2890 bind(MATCH_SUBSTR_HEAD); 2891 pcmpestri(vec, Address(result, 0), mode); 2892 // Reload only the string if it does not match 2893 jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0 2894 2895 Label CONT_SCAN_SUBSTR; 2896 // Compare the rest of substring (> 8 chars). 2897 bind(FOUND_SUBSTR); 2898 // First 8 chars are already matched.
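// cnt2 is turned into a negative index that counts up towards zero: together with the
// tail offset it addresses successive 16-byte chunks of the substring relative to its
// end, and the SCAN_SUBSTR loop is finished once cnt2 becomes non-negative.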
2899 negptr(cnt2); 2900 addptr(cnt2, stride); 2901 2902 bind(SCAN_SUBSTR); 2903 subl(cnt1, stride); 2904 cmpl(cnt2, -stride); // Do not read beyond substring 2905 jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR); 2906 // Back-up strings to avoid reading beyond substring: 2907 // cnt1 = cnt1 - cnt2 + 8 2908 addl(cnt1, cnt2); // cnt2 is negative 2909 addl(cnt1, stride); 2910 movl(cnt2, stride); negptr(cnt2); 2911 bind(CONT_SCAN_SUBSTR); 2912 if (int_cnt2 < (int)G) { 2913 int tail_off1 = int_cnt2<<scale1; 2914 int tail_off2 = int_cnt2<<scale2; 2915 if (ae == StrIntrinsicNode::UL) { 2916 pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2)); 2917 } else { 2918 movdqu(vec, Address(str2, cnt2, scale2, tail_off2)); 2919 } 2920 pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode); 2921 } else { 2922 // calculate index in register to avoid integer overflow (int_cnt2*2) 2923 movl(tmp, int_cnt2); 2924 addptr(tmp, cnt2); 2925 if (ae == StrIntrinsicNode::UL) { 2926 pmovzxbw(vec, Address(str2, tmp, scale2, 0)); 2927 } else { 2928 movdqu(vec, Address(str2, tmp, scale2, 0)); 2929 } 2930 pcmpestri(vec, Address(result, tmp, scale1, 0), mode); 2931 } 2932 // Need to reload strings pointers if not matched whole vector 2933 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 2934 addptr(cnt2, stride); 2935 jcc(Assembler::negative, SCAN_SUBSTR); 2936 // Fall through if found full substring 2937 2938 } // (int_cnt2 > 8) 2939 2940 bind(RET_FOUND); 2941 // Found result if we matched full small substring. 2942 // Compute substr offset 2943 subptr(result, str1); 2944 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 2945 shrl(result, 1); // index 2946 } 2947 bind(EXIT); 2948 2949 } // string_indexofC8 2950 2951 // Small strings are loaded through stack if they cross page boundary. 2952 void C2_MacroAssembler::string_indexof(Register str1, Register str2, 2953 Register cnt1, Register cnt2, 2954 int int_cnt2, Register result, 2955 XMMRegister vec, Register tmp, 2956 int ae) { 2957 ShortBranchVerifier sbv(this); 2958 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2959 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2960 2961 // 2962 // int_cnt2 is length of small (< 8 chars) constant substring 2963 // or (-1) for non constant substring in which case its length 2964 // is in cnt2 register. 2965 // 2966 // Note, inline_string_indexOf() generates checks: 2967 // if (substr.count > string.count) return -1; 2968 // if (substr.count == 0) return 0; 2969 // 2970 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2971 assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0"); 2972 // This method uses the pcmpestri instruction with bound registers 2973 // inputs: 2974 // xmm - substring 2975 // rax - substring length (elements count) 2976 // mem - scanned string 2977 // rdx - string length (elements count) 2978 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2979 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2980 // outputs: 2981 // rcx - matched index in string 2982 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2983 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2984 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2985 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? 
Address::times_1 : scale1; 2986 2987 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR, 2988 RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR, 2989 FOUND_CANDIDATE; 2990 2991 { //======================================================== 2992 // We don't know where these strings are located 2993 // and we can't read beyond them. Load them through stack. 2994 Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR; 2995 2996 movptr(tmp, rsp); // save old SP 2997 2998 if (int_cnt2 > 0) { // small (< 8 chars) constant substring 2999 if (int_cnt2 == (1>>scale2)) { // One byte 3000 assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding"); 3001 load_unsigned_byte(result, Address(str2, 0)); 3002 movdl(vec, result); // move 32 bits 3003 } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) { // Three bytes 3004 // Not enough header space in 32-bit VM: 12+3 = 15. 3005 movl(result, Address(str2, -1)); 3006 shrl(result, 8); 3007 movdl(vec, result); // move 32 bits 3008 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) { // One char 3009 load_unsigned_short(result, Address(str2, 0)); 3010 movdl(vec, result); // move 32 bits 3011 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars 3012 movdl(vec, Address(str2, 0)); // move 32 bits 3013 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars 3014 movq(vec, Address(str2, 0)); // move 64 bits 3015 } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7}) 3016 // Array header size is 12 bytes in 32-bit VM 3017 // + 6 bytes for 3 chars == 18 bytes, 3018 // enough space to load vec and shift. 3019 assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity"); 3020 if (ae == StrIntrinsicNode::UL) { 3021 int tail_off = int_cnt2-8; 3022 pmovzxbw(vec, Address(str2, tail_off)); 3023 psrldq(vec, -2*tail_off); 3024 } 3025 else { 3026 int tail_off = int_cnt2*(1<<scale2); 3027 movdqu(vec, Address(str2, tail_off-16)); 3028 psrldq(vec, 16-tail_off); 3029 } 3030 } 3031 } else { // not constant substring 3032 cmpl(cnt2, stride); 3033 jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough 3034 3035 // We can read beyond string if str+16 does not cross page boundary 3036 // since heaps are aligned and mapped by pages. 3037 assert(os::vm_page_size() < (int)G, "default page should be small"); 3038 movl(result, str2); // We need only low 32 bits 3039 andl(result, ((int)os::vm_page_size()-1)); 3040 cmpl(result, ((int)os::vm_page_size()-16)); 3041 jccb(Assembler::belowEqual, CHECK_STR); 3042 3043 // Move small strings to stack to allow loading 16 bytes into vec. 3044 subptr(rsp, 16); 3045 int stk_offset = wordSize-(1<<scale2); 3046 push(cnt2); 3047 3048 bind(COPY_SUBSTR); 3049 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) { 3050 load_unsigned_byte(result, Address(str2, cnt2, scale2, -1)); 3051 movb(Address(rsp, cnt2, scale2, stk_offset), result); 3052 } else if (ae == StrIntrinsicNode::UU) { 3053 load_unsigned_short(result, Address(str2, cnt2, scale2, -2)); 3054 movw(Address(rsp, cnt2, scale2, stk_offset), result); 3055 } 3056 decrement(cnt2); 3057 jccb(Assembler::notZero, COPY_SUBSTR); 3058 3059 pop(cnt2); 3060 movptr(str2, rsp); // New substring address 3061 } // non constant 3062 3063 bind(CHECK_STR); 3064 cmpl(cnt1, stride); 3065 jccb(Assembler::aboveEqual, BIG_STRINGS); 3066 3067 // Check cross page boundary.
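// A 16-byte vector load from str1 is safe when (str1 & (page_size - 1)) is at most
// page_size - 16, i.e. when the load cannot spill into the next, possibly unmapped,
// page; otherwise the short string is copied onto the stack first.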
3068 movl(result, str1); // We need only low 32 bits 3069 andl(result, ((int)os::vm_page_size()-1)); 3070 cmpl(result, ((int)os::vm_page_size()-16)); 3071 jccb(Assembler::belowEqual, BIG_STRINGS); 3072 3073 subptr(rsp, 16); 3074 int stk_offset = -(1<<scale1); 3075 if (int_cnt2 < 0) { // not constant 3076 push(cnt2); 3077 stk_offset += wordSize; 3078 } 3079 movl(cnt2, cnt1); 3080 3081 bind(COPY_STR); 3082 if (ae == StrIntrinsicNode::LL) { 3083 load_unsigned_byte(result, Address(str1, cnt2, scale1, -1)); 3084 movb(Address(rsp, cnt2, scale1, stk_offset), result); 3085 } else { 3086 load_unsigned_short(result, Address(str1, cnt2, scale1, -2)); 3087 movw(Address(rsp, cnt2, scale1, stk_offset), result); 3088 } 3089 decrement(cnt2); 3090 jccb(Assembler::notZero, COPY_STR); 3091 3092 if (int_cnt2 < 0) { // not constant 3093 pop(cnt2); 3094 } 3095 movptr(str1, rsp); // New string address 3096 3097 bind(BIG_STRINGS); 3098 // Load substring. 3099 if (int_cnt2 < 0) { // -1 3100 if (ae == StrIntrinsicNode::UL) { 3101 pmovzxbw(vec, Address(str2, 0)); 3102 } else { 3103 movdqu(vec, Address(str2, 0)); 3104 } 3105 push(cnt2); // substr count 3106 push(str2); // substr addr 3107 push(str1); // string addr 3108 } else { 3109 // Small (< 8 chars) constant substrings are loaded already. 3110 movl(cnt2, int_cnt2); 3111 } 3112 push(tmp); // original SP 3113 3114 } // Finished loading 3115 3116 //======================================================== 3117 // Start search 3118 // 3119 3120 movptr(result, str1); // string addr 3121 3122 if (int_cnt2 < 0) { // Only for non constant substring 3123 jmpb(SCAN_TO_SUBSTR); 3124 3125 // SP saved at sp+0 3126 // String saved at sp+1*wordSize 3127 // Substr saved at sp+2*wordSize 3128 // Substr count saved at sp+3*wordSize 3129 3130 // Reload substr for rescan, this code 3131 // is executed only for large substrings (> 8 chars) 3132 bind(RELOAD_SUBSTR); 3133 movptr(str2, Address(rsp, 2*wordSize)); 3134 movl(cnt2, Address(rsp, 3*wordSize)); 3135 if (ae == StrIntrinsicNode::UL) { 3136 pmovzxbw(vec, Address(str2, 0)); 3137 } else { 3138 movdqu(vec, Address(str2, 0)); 3139 } 3140 // We came here after the beginning of the substring was 3141 // matched but the rest of it was not so we need to search 3142 // again. Start from the next element after the previous match. 3143 subptr(str1, result); // Restore counter 3144 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3145 shrl(str1, 1); 3146 } 3147 addl(cnt1, str1); 3148 decrementl(cnt1); // Shift to next element 3149 cmpl(cnt1, cnt2); 3150 jcc(Assembler::negative, RET_NOT_FOUND); // Left less then substring 3151 3152 addptr(result, (1<<scale1)); 3153 } // non constant 3154 3155 // Scan string for start of substr in 16-byte vectors 3156 bind(SCAN_TO_SUBSTR); 3157 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 3158 pcmpestri(vec, Address(result, 0), mode); 3159 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 3160 subl(cnt1, stride); 3161 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 3162 cmpl(cnt1, cnt2); 3163 jccb(Assembler::negative, RET_NOT_FOUND); // Left less then substring 3164 addptr(result, 16); 3165 3166 bind(ADJUST_STR); 3167 cmpl(cnt1, stride); // Do not read beyond string 3168 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 3169 // Back-up string to avoid reading beyond string. 
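  // (Fewer than 'stride' elements are left: rewind 'result' so the final 16-byte
  //  load ends exactly at the end of the string. A few already scanned elements
  //  are compared again, which is harmless.)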
3170 lea(result, Address(result, cnt1, scale1, -16)); 3171 movl(cnt1, stride); 3172 jmpb(SCAN_TO_SUBSTR); 3173 3174 // Found a potential substr 3175 bind(FOUND_CANDIDATE); 3176 // After pcmpestri tmp(rcx) contains matched element index 3177 3178 // Make sure string is still long enough 3179 subl(cnt1, tmp); 3180 cmpl(cnt1, cnt2); 3181 jccb(Assembler::greaterEqual, FOUND_SUBSTR); 3182 // Left less then substring. 3183 3184 bind(RET_NOT_FOUND); 3185 movl(result, -1); 3186 jmp(CLEANUP); 3187 3188 bind(FOUND_SUBSTR); 3189 // Compute start addr of substr 3190 lea(result, Address(result, tmp, scale1)); 3191 if (int_cnt2 > 0) { // Constant substring 3192 // Repeat search for small substring (< 8 chars) 3193 // from new point without reloading substring. 3194 // Have to check that we don't read beyond string. 3195 cmpl(tmp, stride-int_cnt2); 3196 jccb(Assembler::greater, ADJUST_STR); 3197 // Fall through if matched whole substring. 3198 } else { // non constant 3199 assert(int_cnt2 == -1, "should be != 0"); 3200 3201 addl(tmp, cnt2); 3202 // Found result if we matched whole substring. 3203 cmpl(tmp, stride); 3204 jcc(Assembler::lessEqual, RET_FOUND); 3205 3206 // Repeat search for small substring (<= 8 chars) 3207 // from new point 'str1' without reloading substring. 3208 cmpl(cnt2, stride); 3209 // Have to check that we don't read beyond string. 3210 jccb(Assembler::lessEqual, ADJUST_STR); 3211 3212 Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG; 3213 // Compare the rest of substring (> 8 chars). 3214 movptr(str1, result); 3215 3216 cmpl(tmp, cnt2); 3217 // First 8 chars are already matched. 3218 jccb(Assembler::equal, CHECK_NEXT); 3219 3220 bind(SCAN_SUBSTR); 3221 pcmpestri(vec, Address(str1, 0), mode); 3222 // Need to reload strings pointers if not matched whole vector 3223 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 3224 3225 bind(CHECK_NEXT); 3226 subl(cnt2, stride); 3227 jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring 3228 addptr(str1, 16); 3229 if (ae == StrIntrinsicNode::UL) { 3230 addptr(str2, 8); 3231 } else { 3232 addptr(str2, 16); 3233 } 3234 subl(cnt1, stride); 3235 cmpl(cnt2, stride); // Do not read beyond substring 3236 jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR); 3237 // Back-up strings to avoid reading beyond substring. 
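    // (That is: rewind str1/str2 so the last full 16-byte chunk ends exactly at
    //  the end of the substring, and fix the counters up accordingly:
    //  cnt1 = cnt1 - cnt2 + stride, cnt2 = stride.)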
3238 3239 if (ae == StrIntrinsicNode::UL) { 3240 lea(str2, Address(str2, cnt2, scale2, -8)); 3241 lea(str1, Address(str1, cnt2, scale1, -16)); 3242 } else { 3243 lea(str2, Address(str2, cnt2, scale2, -16)); 3244 lea(str1, Address(str1, cnt2, scale1, -16)); 3245 } 3246 subl(cnt1, cnt2); 3247 movl(cnt2, stride); 3248 addl(cnt1, stride); 3249 bind(CONT_SCAN_SUBSTR); 3250 if (ae == StrIntrinsicNode::UL) { 3251 pmovzxbw(vec, Address(str2, 0)); 3252 } else { 3253 movdqu(vec, Address(str2, 0)); 3254 } 3255 jmp(SCAN_SUBSTR); 3256 3257 bind(RET_FOUND_LONG); 3258 movptr(str1, Address(rsp, wordSize)); 3259 } // non constant 3260 3261 bind(RET_FOUND); 3262 // Compute substr offset 3263 subptr(result, str1); 3264 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3265 shrl(result, 1); // index 3266 } 3267 bind(CLEANUP); 3268 pop(rsp); // restore SP 3269 3270 } // string_indexof 3271 3272 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3273 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3274 ShortBranchVerifier sbv(this); 3275 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3276 3277 int stride = 8; 3278 3279 Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP, 3280 SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP, 3281 RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT, 3282 FOUND_SEQ_CHAR, DONE_LABEL; 3283 3284 movptr(result, str1); 3285 if (UseAVX >= 2) { 3286 cmpl(cnt1, stride); 3287 jcc(Assembler::less, SCAN_TO_CHAR); 3288 cmpl(cnt1, 2*stride); 3289 jcc(Assembler::less, SCAN_TO_8_CHAR_INIT); 3290 movdl(vec1, ch); 3291 vpbroadcastw(vec1, vec1, Assembler::AVX_256bit); 3292 vpxor(vec2, vec2); 3293 movl(tmp, cnt1); 3294 andl(tmp, 0xFFFFFFF0); //vector count (in chars) 3295 andl(cnt1,0x0000000F); //tail count (in chars) 3296 3297 bind(SCAN_TO_16_CHAR_LOOP); 3298 vmovdqu(vec3, Address(result, 0)); 3299 vpcmpeqw(vec3, vec3, vec1, 1); 3300 vptest(vec2, vec3); 3301 jcc(Assembler::carryClear, FOUND_CHAR); 3302 addptr(result, 32); 3303 subl(tmp, 2*stride); 3304 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); 3305 jmp(SCAN_TO_8_CHAR); 3306 bind(SCAN_TO_8_CHAR_INIT); 3307 movdl(vec1, ch); 3308 pshuflw(vec1, vec1, 0x00); 3309 pshufd(vec1, vec1, 0); 3310 pxor(vec2, vec2); 3311 } 3312 bind(SCAN_TO_8_CHAR); 3313 cmpl(cnt1, stride); 3314 jcc(Assembler::less, SCAN_TO_CHAR); 3315 if (UseAVX < 2) { 3316 movdl(vec1, ch); 3317 pshuflw(vec1, vec1, 0x00); 3318 pshufd(vec1, vec1, 0); 3319 pxor(vec2, vec2); 3320 } 3321 movl(tmp, cnt1); 3322 andl(tmp, 0xFFFFFFF8); //vector count (in chars) 3323 andl(cnt1,0x00000007); //tail count (in chars) 3324 3325 bind(SCAN_TO_8_CHAR_LOOP); 3326 movdqu(vec3, Address(result, 0)); 3327 pcmpeqw(vec3, vec1); 3328 ptest(vec2, vec3); 3329 jcc(Assembler::carryClear, FOUND_CHAR); 3330 addptr(result, 16); 3331 subl(tmp, stride); 3332 jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP); 3333 bind(SCAN_TO_CHAR); 3334 testl(cnt1, cnt1); 3335 jcc(Assembler::zero, RET_NOT_FOUND); 3336 bind(SCAN_TO_CHAR_LOOP); 3337 load_unsigned_short(tmp, Address(result, 0)); 3338 cmpl(ch, tmp); 3339 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3340 addptr(result, 2); 3341 subl(cnt1, 1); 3342 jccb(Assembler::zero, RET_NOT_FOUND); 3343 jmp(SCAN_TO_CHAR_LOOP); 3344 3345 bind(RET_NOT_FOUND); 3346 movl(result, -1); 3347 jmpb(DONE_LABEL); 3348 3349 bind(FOUND_CHAR); 3350 if (UseAVX >= 2) { 3351 vpmovmskb(tmp, vec3); 3352 } else { 3353 pmovmskb(tmp, vec3); 3354 } 3355 bsfl(ch, tmp); 3356 addptr(result, ch); 3357 3358 bind(FOUND_SEQ_CHAR); 3359 
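  // result currently holds the address of the matching char; the subtraction and
  // shift below turn it into a char index relative to str1.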
subptr(result, str1); 3360 shrl(result, 1); 3361 3362 bind(DONE_LABEL); 3363 } // string_indexof_char 3364 3365 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3366 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3367 ShortBranchVerifier sbv(this); 3368 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3369 3370 int stride = 16; 3371 3372 Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP, 3373 SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP, 3374 RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT, 3375 FOUND_SEQ_CHAR, DONE_LABEL; 3376 3377 movptr(result, str1); 3378 if (UseAVX >= 2) { 3379 cmpl(cnt1, stride); 3380 jcc(Assembler::less, SCAN_TO_CHAR_INIT); 3381 cmpl(cnt1, stride*2); 3382 jcc(Assembler::less, SCAN_TO_16_CHAR_INIT); 3383 movdl(vec1, ch); 3384 vpbroadcastb(vec1, vec1, Assembler::AVX_256bit); 3385 vpxor(vec2, vec2); 3386 movl(tmp, cnt1); 3387 andl(tmp, 0xFFFFFFE0); //vector count (in chars) 3388 andl(cnt1,0x0000001F); //tail count (in chars) 3389 3390 bind(SCAN_TO_32_CHAR_LOOP); 3391 vmovdqu(vec3, Address(result, 0)); 3392 vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit); 3393 vptest(vec2, vec3); 3394 jcc(Assembler::carryClear, FOUND_CHAR); 3395 addptr(result, 32); 3396 subl(tmp, stride*2); 3397 jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP); 3398 jmp(SCAN_TO_16_CHAR); 3399 3400 bind(SCAN_TO_16_CHAR_INIT); 3401 movdl(vec1, ch); 3402 pxor(vec2, vec2); 3403 pshufb(vec1, vec2); 3404 } 3405 3406 bind(SCAN_TO_16_CHAR); 3407 cmpl(cnt1, stride); 3408 jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left 3409 if (UseAVX < 2) { 3410 movdl(vec1, ch); 3411 pxor(vec2, vec2); 3412 pshufb(vec1, vec2); 3413 } 3414 movl(tmp, cnt1); 3415 andl(tmp, 0xFFFFFFF0); //vector count (in bytes) 3416 andl(cnt1,0x0000000F); //tail count (in bytes) 3417 3418 bind(SCAN_TO_16_CHAR_LOOP); 3419 movdqu(vec3, Address(result, 0)); 3420 pcmpeqb(vec3, vec1); 3421 ptest(vec2, vec3); 3422 jcc(Assembler::carryClear, FOUND_CHAR); 3423 addptr(result, 16); 3424 subl(tmp, stride); 3425 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items... 
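  // Fewer than 16 bytes remain; fall through to a byte-at-a-time scalar scan.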
3426 3427 bind(SCAN_TO_CHAR_INIT); 3428 testl(cnt1, cnt1); 3429 jcc(Assembler::zero, RET_NOT_FOUND); 3430 bind(SCAN_TO_CHAR_LOOP); 3431 load_unsigned_byte(tmp, Address(result, 0)); 3432 cmpl(ch, tmp); 3433 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3434 addptr(result, 1); 3435 subl(cnt1, 1); 3436 jccb(Assembler::zero, RET_NOT_FOUND); 3437 jmp(SCAN_TO_CHAR_LOOP); 3438 3439 bind(RET_NOT_FOUND); 3440 movl(result, -1); 3441 jmpb(DONE_LABEL); 3442 3443 bind(FOUND_CHAR); 3444 if (UseAVX >= 2) { 3445 vpmovmskb(tmp, vec3); 3446 } else { 3447 pmovmskb(tmp, vec3); 3448 } 3449 bsfl(ch, tmp); 3450 addptr(result, ch); 3451 3452 bind(FOUND_SEQ_CHAR); 3453 subptr(result, str1); 3454 3455 bind(DONE_LABEL); 3456 } // stringL_indexof_char 3457 3458 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) { 3459 switch (eltype) { 3460 case T_BOOLEAN: return sizeof(jboolean); 3461 case T_BYTE: return sizeof(jbyte); 3462 case T_SHORT: return sizeof(jshort); 3463 case T_CHAR: return sizeof(jchar); 3464 case T_INT: return sizeof(jint); 3465 default: 3466 ShouldNotReachHere(); 3467 return -1; 3468 } 3469 } 3470 3471 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) { 3472 switch (eltype) { 3473 // T_BOOLEAN used as surrogate for unsigned byte 3474 case T_BOOLEAN: movzbl(dst, src); break; 3475 case T_BYTE: movsbl(dst, src); break; 3476 case T_SHORT: movswl(dst, src); break; 3477 case T_CHAR: movzwl(dst, src); break; 3478 case T_INT: movl(dst, src); break; 3479 default: 3480 ShouldNotReachHere(); 3481 } 3482 } 3483 3484 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) { 3485 load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8); 3486 } 3487 3488 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) { 3489 load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8); 3490 } 3491 3492 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) { 3493 const int vlen = Assembler::AVX_256bit; 3494 switch (eltype) { 3495 case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3496 case T_BYTE: vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3497 case T_SHORT: vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3498 case T_CHAR: vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3499 case T_INT: 3500 // do nothing 3501 break; 3502 default: 3503 ShouldNotReachHere(); 3504 } 3505 } 3506 3507 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result, 3508 Register index, Register tmp2, Register tmp3, XMMRegister vnext, 3509 XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3, 3510 XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3, 3511 XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3, 3512 BasicType eltype) { 3513 ShortBranchVerifier sbv(this); 3514 assert(UseAVX >= 2, "AVX2 intrinsics are required"); 3515 assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3); 3516 assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3); 3517 3518 Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN, 3519 SHORT_UNROLLED_LOOP_EXIT, 3520 UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME, 3521 UNROLLED_VECTOR_LOOP_BEGIN, 3522 END; 3523 switch (eltype) { 3524 case T_BOOLEAN: 
BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break; 3525 case T_CHAR: BLOCK_COMMENT("arrays_hashcode(char) {"); break; 3526 case T_BYTE: BLOCK_COMMENT("arrays_hashcode(byte) {"); break; 3527 case T_SHORT: BLOCK_COMMENT("arrays_hashcode(short) {"); break; 3528 case T_INT: BLOCK_COMMENT("arrays_hashcode(int) {"); break; 3529 default: BLOCK_COMMENT("arrays_hashcode {"); break; 3530 } 3531 3532 // For "renaming" for readibility of the code 3533 const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 }, 3534 vresult[] = { vresult0, vresult1, vresult2, vresult3 }, 3535 vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 }; 3536 3537 const int elsize = arrays_hashcode_elsize(eltype); 3538 3539 /* 3540 if (cnt1 >= 2) { 3541 if (cnt1 >= 32) { 3542 UNROLLED VECTOR LOOP 3543 } 3544 UNROLLED SCALAR LOOP 3545 } 3546 SINGLE SCALAR 3547 */ 3548 3549 cmpl(cnt1, 32); 3550 jcc(Assembler::less, SHORT_UNROLLED_BEGIN); 3551 3552 // cnt1 >= 32 && generate_vectorized_loop 3553 xorl(index, index); 3554 3555 // vresult = IntVector.zero(I256); 3556 for (int idx = 0; idx < 4; idx++) { 3557 vpxor(vresult[idx], vresult[idx]); 3558 } 3559 // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]); 3560 Register bound = tmp2; 3561 Register next = tmp3; 3562 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint)))); 3563 movl(next, Address(tmp2, 0)); 3564 movdl(vnext, next); 3565 vpbroadcastd(vnext, vnext, Assembler::AVX_256bit); 3566 3567 // index = 0; 3568 // bound = cnt1 & ~(32 - 1); 3569 movl(bound, cnt1); 3570 andl(bound, ~(32 - 1)); 3571 // for (; index < bound; index += 32) { 3572 bind(UNROLLED_VECTOR_LOOP_BEGIN); 3573 // result *= next; 3574 imull(result, next); 3575 // loop fission to upfront the cost of fetching from memory, OOO execution 3576 // can then hopefully do a better job of prefetching 3577 for (int idx = 0; idx < 4; idx++) { 3578 arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype); 3579 } 3580 // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7]; 3581 for (int idx = 0; idx < 4; idx++) { 3582 vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit); 3583 arrays_hashcode_elvcast(vtmp[idx], eltype); 3584 vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit); 3585 } 3586 // index += 32; 3587 addl(index, 32); 3588 // index < bound; 3589 cmpl(index, bound); 3590 jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN); 3591 // } 3592 3593 lea(ary1, Address(ary1, bound, Address::times(elsize))); 3594 subl(cnt1, bound); 3595 // release bound 3596 3597 // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1); 3598 for (int idx = 0; idx < 4; idx++) { 3599 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint)))); 3600 arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT); 3601 vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit); 3602 } 3603 // result += vresult.reduceLanes(ADD); 3604 for (int idx = 0; idx < 4; idx++) { 3605 reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]); 3606 } 3607 3608 // } else if (cnt1 < 32) { 3609 3610 bind(SHORT_UNROLLED_BEGIN); 3611 // int i = 1; 3612 movl(index, 1); 3613 cmpl(index, cnt1); 3614 jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT); 3615 3616 // for (; i < cnt1 ; i += 2) { 3617 bind(SHORT_UNROLLED_LOOP_BEGIN); 3618 movl(tmp3, 961); 3619 imull(result, tmp3); 3620 
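  // (The iteration as a whole computes result = result*31*31 + a[i-1]*31 + a[i];
  //  961 == 31*31 above, and (x << 5) - x == 31*x below, avoiding extra multiplies.)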
arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3621 movl(tmp3, tmp2); 3622 shll(tmp3, 5); 3623 subl(tmp3, tmp2); 3624 addl(result, tmp3); 3625 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype); 3626 addl(result, tmp3); 3627 addl(index, 2); 3628 cmpl(index, cnt1); 3629 jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN); 3630 3631 // } 3632 // if (i >= cnt1) { 3633 bind(SHORT_UNROLLED_LOOP_EXIT); 3634 jccb(Assembler::greater, END); 3635 movl(tmp2, result); 3636 shll(result, 5); 3637 subl(result, tmp2); 3638 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3639 addl(result, tmp3); 3640 // } 3641 bind(END); 3642 3643 BLOCK_COMMENT("} // arrays_hashcode"); 3644 3645 } // arrays_hashcode 3646 3647 // helper function for string_compare 3648 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2, 3649 Address::ScaleFactor scale, Address::ScaleFactor scale1, 3650 Address::ScaleFactor scale2, Register index, int ae) { 3651 if (ae == StrIntrinsicNode::LL) { 3652 load_unsigned_byte(elem1, Address(str1, index, scale, 0)); 3653 load_unsigned_byte(elem2, Address(str2, index, scale, 0)); 3654 } else if (ae == StrIntrinsicNode::UU) { 3655 load_unsigned_short(elem1, Address(str1, index, scale, 0)); 3656 load_unsigned_short(elem2, Address(str2, index, scale, 0)); 3657 } else { 3658 load_unsigned_byte(elem1, Address(str1, index, scale1, 0)); 3659 load_unsigned_short(elem2, Address(str2, index, scale2, 0)); 3660 } 3661 } 3662 3663 // Compare strings, used for char[] and byte[]. 3664 void C2_MacroAssembler::string_compare(Register str1, Register str2, 3665 Register cnt1, Register cnt2, Register result, 3666 XMMRegister vec1, int ae, KRegister mask) { 3667 ShortBranchVerifier sbv(this); 3668 Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL; 3669 Label COMPARE_WIDE_VECTORS_LOOP_FAILED; // used only _LP64 && AVX3 3670 int stride, stride2, adr_stride, adr_stride1, adr_stride2; 3671 int stride2x2 = 0x40; 3672 Address::ScaleFactor scale = Address::no_scale; 3673 Address::ScaleFactor scale1 = Address::no_scale; 3674 Address::ScaleFactor scale2 = Address::no_scale; 3675 3676 if (ae != StrIntrinsicNode::LL) { 3677 stride2x2 = 0x20; 3678 } 3679 3680 if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) { 3681 shrl(cnt2, 1); 3682 } 3683 // Compute the minimum of the string lengths and the 3684 // difference of the string lengths (stack). 3685 // Do the conditional move stuff 3686 movl(result, cnt1); 3687 subl(cnt1, cnt2); 3688 push(cnt1); 3689 cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2) 3690 3691 // Is the minimum length zero? 
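  // (If so, one of the strings is empty and the length difference pushed above is
  //  already the answer.)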
3692 testl(cnt2, cnt2); 3693 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3694 if (ae == StrIntrinsicNode::LL) { 3695 // Load first bytes 3696 load_unsigned_byte(result, Address(str1, 0)); // result = str1[0] 3697 load_unsigned_byte(cnt1, Address(str2, 0)); // cnt1 = str2[0] 3698 } else if (ae == StrIntrinsicNode::UU) { 3699 // Load first characters 3700 load_unsigned_short(result, Address(str1, 0)); 3701 load_unsigned_short(cnt1, Address(str2, 0)); 3702 } else { 3703 load_unsigned_byte(result, Address(str1, 0)); 3704 load_unsigned_short(cnt1, Address(str2, 0)); 3705 } 3706 subl(result, cnt1); 3707 jcc(Assembler::notZero, POP_LABEL); 3708 3709 if (ae == StrIntrinsicNode::UU) { 3710 // Divide length by 2 to get number of chars 3711 shrl(cnt2, 1); 3712 } 3713 cmpl(cnt2, 1); 3714 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3715 3716 // Check if the strings start at the same location and setup scale and stride 3717 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3718 cmpptr(str1, str2); 3719 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3720 if (ae == StrIntrinsicNode::LL) { 3721 scale = Address::times_1; 3722 stride = 16; 3723 } else { 3724 scale = Address::times_2; 3725 stride = 8; 3726 } 3727 } else { 3728 scale1 = Address::times_1; 3729 scale2 = Address::times_2; 3730 // scale not used 3731 stride = 8; 3732 } 3733 3734 if (UseAVX >= 2 && UseSSE42Intrinsics) { 3735 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR; 3736 Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR; 3737 Label COMPARE_WIDE_VECTORS_LOOP_AVX2; 3738 Label COMPARE_TAIL_LONG; 3739 Label COMPARE_WIDE_VECTORS_LOOP_AVX3; // used only _LP64 && AVX3 3740 3741 int pcmpmask = 0x19; 3742 if (ae == StrIntrinsicNode::LL) { 3743 pcmpmask &= ~0x01; 3744 } 3745 3746 // Setup to compare 16-chars (32-bytes) vectors, 3747 // start from first character again because it has aligned address. 3748 if (ae == StrIntrinsicNode::LL) { 3749 stride2 = 32; 3750 } else { 3751 stride2 = 16; 3752 } 3753 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3754 adr_stride = stride << scale; 3755 } else { 3756 adr_stride1 = 8; //stride << scale1; 3757 adr_stride2 = 16; //stride << scale2; 3758 } 3759 3760 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3761 // rax and rdx are used by pcmpestri as elements counters 3762 movl(result, cnt2); 3763 andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count 3764 jcc(Assembler::zero, COMPARE_TAIL_LONG); 3765 3766 // fast path : compare first 2 8-char vectors. 
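    // (pcmpmask 0x19 selects the equal-each aggregation with negated result on
    //  unsigned words -- bit 0 was cleared above for the byte/byte case -- so
    //  pcmpestri leaves the index of the first mismatching element in rcx and
    //  sets CF when any mismatch exists.)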
3767 bind(COMPARE_16_CHARS); 3768 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3769 movdqu(vec1, Address(str1, 0)); 3770 } else { 3771 pmovzxbw(vec1, Address(str1, 0)); 3772 } 3773 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3774 jccb(Assembler::below, COMPARE_INDEX_CHAR); 3775 3776 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3777 movdqu(vec1, Address(str1, adr_stride)); 3778 pcmpestri(vec1, Address(str2, adr_stride), pcmpmask); 3779 } else { 3780 pmovzxbw(vec1, Address(str1, adr_stride1)); 3781 pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask); 3782 } 3783 jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS); 3784 addl(cnt1, stride); 3785 3786 // Compare the characters at index in cnt1 3787 bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character 3788 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 3789 subl(result, cnt2); 3790 jmp(POP_LABEL); 3791 3792 // Setup the registers to start vector comparison loop 3793 bind(COMPARE_WIDE_VECTORS); 3794 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3795 lea(str1, Address(str1, result, scale)); 3796 lea(str2, Address(str2, result, scale)); 3797 } else { 3798 lea(str1, Address(str1, result, scale1)); 3799 lea(str2, Address(str2, result, scale2)); 3800 } 3801 subl(result, stride2); 3802 subl(cnt2, stride2); 3803 jcc(Assembler::zero, COMPARE_WIDE_TAIL); 3804 negptr(result); 3805 3806 // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest) 3807 bind(COMPARE_WIDE_VECTORS_LOOP); 3808 3809 #ifdef _LP64 3810 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 3811 cmpl(cnt2, stride2x2); 3812 jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2); 3813 testl(cnt2, stride2x2-1); // cnt2 holds the vector count 3814 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2); // means we cannot subtract by 0x40 3815 3816 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 3817 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3818 evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit); 3819 evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3820 } else { 3821 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit); 3822 evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3823 } 3824 kortestql(mask, mask); 3825 jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED); // miscompare 3826 addptr(result, stride2x2); // update since we already compared at this addr 3827 subl(cnt2, stride2x2); // and sub the size too 3828 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3); 3829 3830 vpxor(vec1, vec1); 3831 jmpb(COMPARE_WIDE_TAIL); 3832 }//if (VM_Version::supports_avx512vlbw()) 3833 #endif // _LP64 3834 3835 3836 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3837 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3838 vmovdqu(vec1, Address(str1, result, scale)); 3839 vpxor(vec1, Address(str2, result, scale)); 3840 } else { 3841 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit); 3842 vpxor(vec1, Address(str2, result, scale2)); 3843 } 3844 vptest(vec1, vec1); 3845 jcc(Assembler::notZero, VECTOR_NOT_EQUAL); 3846 addptr(result, stride2); 3847 subl(cnt2, stride2); 3848 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP); 3849 // clean upper bits of YMM registers 
    vpxor(vec1, vec1);

    // compare wide vectors tail
    bind(COMPARE_WIDE_TAIL);
    testptr(result, result);
    jcc(Assembler::zero, LENGTH_DIFF_LABEL);

    movl(result, stride2);
    movl(cnt2, result);
    negptr(result);
    jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);

    // Identifies the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
    bind(VECTOR_NOT_EQUAL);
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    jmp(COMPARE_16_CHARS);

    // Compare tail chars, length between 1 and 15 chars
    bind(COMPARE_TAIL_LONG);
    movl(cnt2, result);
    cmpl(cnt2, stride);
    jcc(Assembler::less, COMPARE_SMALL_STR);

    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, 0));
    } else {
      pmovzxbw(vec1, Address(str1, 0));
    }
    pcmpestri(vec1, Address(str2, 0), pcmpmask);
    jcc(Assembler::below, COMPARE_INDEX_CHAR);
    subptr(cnt2, stride);
    jcc(Assembler::zero, LENGTH_DIFF_LABEL);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    negptr(cnt2);
    jmpb(WHILE_HEAD_LABEL);

    bind(COMPARE_SMALL_STR);
  } else if (UseSSE42Intrinsics) {
    Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
    int pcmpmask = 0x19;
    // Setup to compare 8-char (16-byte) vectors,
    // start from first character again because it has aligned address.
3906 movl(result, cnt2); 3907 andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count 3908 if (ae == StrIntrinsicNode::LL) { 3909 pcmpmask &= ~0x01; 3910 } 3911 jcc(Assembler::zero, COMPARE_TAIL); 3912 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3913 lea(str1, Address(str1, result, scale)); 3914 lea(str2, Address(str2, result, scale)); 3915 } else { 3916 lea(str1, Address(str1, result, scale1)); 3917 lea(str2, Address(str2, result, scale2)); 3918 } 3919 negptr(result); 3920 3921 // pcmpestri 3922 // inputs: 3923 // vec1- substring 3924 // rax - negative string length (elements count) 3925 // mem - scanned string 3926 // rdx - string length (elements count) 3927 // pcmpmask - cmp mode: 11000 (string compare with negated result) 3928 // + 00 (unsigned bytes) or + 01 (unsigned shorts) 3929 // outputs: 3930 // rcx - first mismatched element index 3931 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3932 3933 bind(COMPARE_WIDE_VECTORS); 3934 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3935 movdqu(vec1, Address(str1, result, scale)); 3936 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 3937 } else { 3938 pmovzxbw(vec1, Address(str1, result, scale1)); 3939 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 3940 } 3941 // After pcmpestri cnt1(rcx) contains mismatched element index 3942 3943 jccb(Assembler::below, VECTOR_NOT_EQUAL); // CF==1 3944 addptr(result, stride); 3945 subptr(cnt2, stride); 3946 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 3947 3948 // compare wide vectors tail 3949 testptr(result, result); 3950 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3951 3952 movl(cnt2, stride); 3953 movl(result, stride); 3954 negptr(result); 3955 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3956 movdqu(vec1, Address(str1, result, scale)); 3957 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 3958 } else { 3959 pmovzxbw(vec1, Address(str1, result, scale1)); 3960 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 3961 } 3962 jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL); 3963 3964 // Mismatched characters in the vectors 3965 bind(VECTOR_NOT_EQUAL); 3966 addptr(cnt1, result); 3967 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 3968 subl(result, cnt2); 3969 jmpb(POP_LABEL); 3970 3971 bind(COMPARE_TAIL); // limit is zero 3972 movl(cnt2, result); 3973 // Fallthru to tail compare 3974 } 3975 // Shift str2 and str1 to the end of the arrays, negate min 3976 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3977 lea(str1, Address(str1, cnt2, scale)); 3978 lea(str2, Address(str2, cnt2, scale)); 3979 } else { 3980 lea(str1, Address(str1, cnt2, scale1)); 3981 lea(str2, Address(str2, cnt2, scale2)); 3982 } 3983 decrementl(cnt2); // first character was compared already 3984 negptr(cnt2); 3985 3986 // Compare the rest of the elements 3987 bind(WHILE_HEAD_LABEL); 3988 load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae); 3989 subl(result, cnt1); 3990 jccb(Assembler::notZero, POP_LABEL); 3991 increment(cnt2); 3992 jccb(Assembler::notZero, WHILE_HEAD_LABEL); 3993 3994 // Strings are equal up to min length. Return the length difference. 
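  // (The value popped below is cnt1 - cnt2 as pushed at the start; e.g. for two
  //  Latin-1 strings "ab" and "abc" this path returns -1.)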
3995 bind(LENGTH_DIFF_LABEL); 3996 pop(result); 3997 if (ae == StrIntrinsicNode::UU) { 3998 // Divide diff by 2 to get number of chars 3999 sarl(result, 1); 4000 } 4001 jmpb(DONE_LABEL); 4002 4003 #ifdef _LP64 4004 if (VM_Version::supports_avx512vlbw()) { 4005 4006 bind(COMPARE_WIDE_VECTORS_LOOP_FAILED); 4007 4008 kmovql(cnt1, mask); 4009 notq(cnt1); 4010 bsfq(cnt2, cnt1); 4011 if (ae != StrIntrinsicNode::LL) { 4012 // Divide diff by 2 to get number of chars 4013 sarl(cnt2, 1); 4014 } 4015 addq(result, cnt2); 4016 if (ae == StrIntrinsicNode::LL) { 4017 load_unsigned_byte(cnt1, Address(str2, result)); 4018 load_unsigned_byte(result, Address(str1, result)); 4019 } else if (ae == StrIntrinsicNode::UU) { 4020 load_unsigned_short(cnt1, Address(str2, result, scale)); 4021 load_unsigned_short(result, Address(str1, result, scale)); 4022 } else { 4023 load_unsigned_short(cnt1, Address(str2, result, scale2)); 4024 load_unsigned_byte(result, Address(str1, result, scale1)); 4025 } 4026 subl(result, cnt1); 4027 jmpb(POP_LABEL); 4028 }//if (VM_Version::supports_avx512vlbw()) 4029 #endif // _LP64 4030 4031 // Discard the stored length difference 4032 bind(POP_LABEL); 4033 pop(cnt1); 4034 4035 // That's it 4036 bind(DONE_LABEL); 4037 if(ae == StrIntrinsicNode::UL) { 4038 negl(result); 4039 } 4040 4041 } 4042 4043 // Search for Non-ASCII character (Negative byte value) in a byte array, 4044 // return the index of the first such character, otherwise the length 4045 // of the array segment searched. 4046 // ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java 4047 // @IntrinsicCandidate 4048 // public static int countPositives(byte[] ba, int off, int len) { 4049 // for (int i = off; i < off + len; i++) { 4050 // if (ba[i] < 0) { 4051 // return i - off; 4052 // } 4053 // } 4054 // return len; 4055 // } 4056 void C2_MacroAssembler::count_positives(Register ary1, Register len, 4057 Register result, Register tmp1, 4058 XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) { 4059 // rsi: byte array 4060 // rcx: len 4061 // rax: result 4062 ShortBranchVerifier sbv(this); 4063 assert_different_registers(ary1, len, result, tmp1); 4064 assert_different_registers(vec1, vec2); 4065 Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE; 4066 4067 movl(result, len); // copy 4068 // len == 0 4069 testl(len, len); 4070 jcc(Assembler::zero, DONE); 4071 4072 if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512 4073 VM_Version::supports_avx512vlbw() && 4074 VM_Version::supports_bmi2()) { 4075 4076 Label test_64_loop, test_tail, BREAK_LOOP; 4077 Register tmp3_aliased = len; 4078 4079 movl(tmp1, len); 4080 vpxor(vec2, vec2, vec2, Assembler::AVX_512bit); 4081 4082 andl(tmp1, 64 - 1); // tail count (in chars) 0x3F 4083 andl(len, ~(64 - 1)); // vector count (in chars) 4084 jccb(Assembler::zero, test_tail); 4085 4086 lea(ary1, Address(ary1, len, Address::times_1)); 4087 negptr(len); 4088 4089 bind(test_64_loop); 4090 // Check whether our 64 elements of size byte contain negatives 4091 evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit); 4092 kortestql(mask1, mask1); 4093 jcc(Assembler::notZero, BREAK_LOOP); 4094 4095 addptr(len, 64); 4096 jccb(Assembler::notZero, test_64_loop); 4097 4098 bind(test_tail); 4099 // bail out when there is nothing to be done 4100 testl(tmp1, -1); 4101 jcc(Assembler::zero, DONE); 4102 4103 // ~(~0 << len) applied up to two times (for 32-bit scenario) 4104 #ifdef _LP64 4105 mov64(tmp3_aliased, 
          0xFFFFFFFFFFFFFFFF);
    shlxq(tmp3_aliased, tmp3_aliased, tmp1);
    notq(tmp3_aliased);
    kmovql(mask2, tmp3_aliased);
#else
    Label k_init;
    jmp(k_init);

    // We cannot read 64 bits from a general purpose register here (32-bit VM), so
    // the data required to compose the 64 one-bits is placed in the instruction
    // stream: a 64-byte series of the values 0..63, used below as a compare target
    // against the tail count held in the tmp1 register. The result is a k register
    // holding tmp1 consecutive 1 bits, counting from the least significant bit.
    address tmp = pc();
    emit_int64(0x0706050403020100);
    emit_int64(0x0F0E0D0C0B0A0908);
    emit_int64(0x1716151413121110);
    emit_int64(0x1F1E1D1C1B1A1918);
    emit_int64(0x2726252423222120);
    emit_int64(0x2F2E2D2C2B2A2928);
    emit_int64(0x3736353433323130);
    emit_int64(0x3F3E3D3C3B3A3938);

    bind(k_init);
    lea(len, InternalAddress(tmp));
    // create mask to test for negative byte inside a vector
    evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
    evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit);

#endif
    evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
    ktestq(mask1, mask2);
    jcc(Assembler::zero, DONE);

    bind(BREAK_LOOP);
    // At least one byte in the last 64 bytes is negative.
    // Set up to look at the last 64 bytes as if they were a tail
    lea(ary1, Address(ary1, len, Address::times_1));
    addptr(result, len);
    // Ignore the very last byte: if all others are positive,
    // it must be negative, so we can skip right to the 2+1 byte
    // end comparison at this point
    orl(result, 63);
    movl(len, 63);
    // Fallthru to tail compare
  } else {

    if (UseAVX >= 2 && UseSSE >= 2) {
      // With AVX2, use 32-byte vector compare
      Label COMPARE_WIDE_VECTORS, BREAK_LOOP;

      // Compare 32-byte vectors
      testl(len, 0xffffffe0);   // vector count (in bytes)
      jccb(Assembler::zero, TAIL_START);

      andl(len, 0xffffffe0);
      lea(ary1, Address(ary1, len, Address::times_1));
      negptr(len);

      movl(tmp1, 0x80808080);   // create mask to test for negative bytes in the vector
      movdl(vec2, tmp1);
      vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);

      bind(COMPARE_WIDE_VECTORS);
      vmovdqu(vec1, Address(ary1, len, Address::times_1));
      vptest(vec1, vec2);
      jccb(Assembler::notZero, BREAK_LOOP);
      addptr(len, 32);
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);

      testl(result, 0x0000001f);   // any bytes remaining?
      jcc(Assembler::zero, DONE);

      // Quick test using the already prepared vector mask
      movl(len, result);
      andl(len, 0x0000001f);
      vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
      vptest(vec1, vec2);
      jcc(Assembler::zero, DONE);
      // A negative byte is present; fall to the tail code to locate it exactly
      jmpb(TAIL_START);

      bind(BREAK_LOOP);
      // At least one byte in the last 32-byte vector is negative.
4190 // Set up to look at the last 32 bytes as if they were a tail 4191 lea(ary1, Address(ary1, len, Address::times_1)); 4192 addptr(result, len); 4193 // Ignore the very last byte: if all others are positive, 4194 // it must be negative, so we can skip right to the 2+1 byte 4195 // end comparison at this point 4196 orl(result, 31); 4197 movl(len, 31); 4198 // Fallthru to tail compare 4199 } else if (UseSSE42Intrinsics) { 4200 // With SSE4.2, use double quad vector compare 4201 Label COMPARE_WIDE_VECTORS, BREAK_LOOP; 4202 4203 // Compare 16-byte vectors 4204 testl(len, 0xfffffff0); // vector count (in bytes) 4205 jcc(Assembler::zero, TAIL_START); 4206 4207 andl(len, 0xfffffff0); 4208 lea(ary1, Address(ary1, len, Address::times_1)); 4209 negptr(len); 4210 4211 movl(tmp1, 0x80808080); 4212 movdl(vec2, tmp1); 4213 pshufd(vec2, vec2, 0); 4214 4215 bind(COMPARE_WIDE_VECTORS); 4216 movdqu(vec1, Address(ary1, len, Address::times_1)); 4217 ptest(vec1, vec2); 4218 jccb(Assembler::notZero, BREAK_LOOP); 4219 addptr(len, 16); 4220 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 4221 4222 testl(result, 0x0000000f); // len is zero, any bytes remaining? 4223 jcc(Assembler::zero, DONE); 4224 4225 // Quick test using the already prepared vector mask 4226 movl(len, result); 4227 andl(len, 0x0000000f); // tail count (in bytes) 4228 movdqu(vec1, Address(ary1, len, Address::times_1, -16)); 4229 ptest(vec1, vec2); 4230 jcc(Assembler::zero, DONE); 4231 jmpb(TAIL_START); 4232 4233 bind(BREAK_LOOP); 4234 // At least one byte in the last 16-byte vector is negative. 4235 // Set up and look at the last 16 bytes as if they were a tail 4236 lea(ary1, Address(ary1, len, Address::times_1)); 4237 addptr(result, len); 4238 // Ignore the very last byte: if all others are positive, 4239 // it must be negative, so we can skip right to the 2+1 byte 4240 // end comparison at this point 4241 orl(result, 15); 4242 movl(len, 15); 4243 // Fallthru to tail compare 4244 } 4245 } 4246 4247 bind(TAIL_START); 4248 // Compare 4-byte vectors 4249 andl(len, 0xfffffffc); // vector count (in bytes) 4250 jccb(Assembler::zero, COMPARE_CHAR); 4251 4252 lea(ary1, Address(ary1, len, Address::times_1)); 4253 negptr(len); 4254 4255 bind(COMPARE_VECTORS); 4256 movl(tmp1, Address(ary1, len, Address::times_1)); 4257 andl(tmp1, 0x80808080); 4258 jccb(Assembler::notZero, TAIL_ADJUST); 4259 addptr(len, 4); 4260 jccb(Assembler::notZero, COMPARE_VECTORS); 4261 4262 // Compare trailing char (final 2-3 bytes), if any 4263 bind(COMPARE_CHAR); 4264 4265 testl(result, 0x2); // tail char 4266 jccb(Assembler::zero, COMPARE_BYTE); 4267 load_unsigned_short(tmp1, Address(ary1, 0)); 4268 andl(tmp1, 0x00008080); 4269 jccb(Assembler::notZero, CHAR_ADJUST); 4270 lea(ary1, Address(ary1, 2)); 4271 4272 bind(COMPARE_BYTE); 4273 testl(result, 0x1); // tail byte 4274 jccb(Assembler::zero, DONE); 4275 load_unsigned_byte(tmp1, Address(ary1, 0)); 4276 testl(tmp1, 0x00000080); 4277 jccb(Assembler::zero, DONE); 4278 subptr(result, 1); 4279 jmpb(DONE); 4280 4281 bind(TAIL_ADJUST); 4282 // there are negative bits in the last 4 byte block. 4283 // Adjust result and check the next three bytes 4284 addptr(result, len); 4285 orl(result, 3); 4286 lea(ary1, Address(ary1, len, Address::times_1)); 4287 jmpb(COMPARE_CHAR); 4288 4289 bind(CHAR_ADJUST); 4290 // We are looking at a char + optional byte tail, and found that one 4291 // of the bytes in the char is negative. Adjust the result, check the 4292 // first byte and readjust if needed. 
4293 andl(result, 0xfffffffc); 4294 testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first 4295 jccb(Assembler::notZero, DONE); 4296 addptr(result, 1); 4297 4298 // That's it 4299 bind(DONE); 4300 if (UseAVX >= 2 && UseSSE >= 2) { 4301 // clean upper bits of YMM registers 4302 vpxor(vec1, vec1); 4303 vpxor(vec2, vec2); 4304 } 4305 } 4306 4307 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings. 4308 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2, 4309 Register limit, Register result, Register chr, 4310 XMMRegister vec1, XMMRegister vec2, bool is_char, KRegister mask) { 4311 ShortBranchVerifier sbv(this); 4312 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE; 4313 4314 int length_offset = arrayOopDesc::length_offset_in_bytes(); 4315 int base_offset = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE); 4316 4317 if (is_array_equ) { 4318 // Check the input args 4319 cmpoop(ary1, ary2); 4320 jcc(Assembler::equal, TRUE_LABEL); 4321 4322 // Need additional checks for arrays_equals. 4323 testptr(ary1, ary1); 4324 jcc(Assembler::zero, FALSE_LABEL); 4325 testptr(ary2, ary2); 4326 jcc(Assembler::zero, FALSE_LABEL); 4327 4328 // Check the lengths 4329 movl(limit, Address(ary1, length_offset)); 4330 cmpl(limit, Address(ary2, length_offset)); 4331 jcc(Assembler::notEqual, FALSE_LABEL); 4332 } 4333 4334 // count == 0 4335 testl(limit, limit); 4336 jcc(Assembler::zero, TRUE_LABEL); 4337 4338 if (is_array_equ) { 4339 // Load array address 4340 lea(ary1, Address(ary1, base_offset)); 4341 lea(ary2, Address(ary2, base_offset)); 4342 } 4343 4344 if (is_array_equ && is_char) { 4345 // arrays_equals when used for char[]. 4346 shll(limit, 1); // byte count != 0 4347 } 4348 movl(result, limit); // copy 4349 4350 if (UseAVX >= 2) { 4351 // With AVX2, use 32-byte vector compare 4352 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 4353 4354 // Compare 32-byte vectors 4355 andl(result, 0x0000001f); // tail count (in bytes) 4356 andl(limit, 0xffffffe0); // vector count (in bytes) 4357 jcc(Assembler::zero, COMPARE_TAIL); 4358 4359 lea(ary1, Address(ary1, limit, Address::times_1)); 4360 lea(ary2, Address(ary2, limit, Address::times_1)); 4361 negptr(limit); 4362 4363 #ifdef _LP64 4364 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 4365 Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3; 4366 4367 cmpl(limit, -64); 4368 jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2); 4369 4370 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 4371 4372 evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit); 4373 evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit); 4374 kortestql(mask, mask); 4375 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4376 addptr(limit, 64); // update since we already compared at this addr 4377 cmpl(limit, -64); 4378 jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3); 4379 4380 // At this point we may still need to compare -limit+result bytes. 4381 // We could execute the next two instruction and just continue via non-wide path: 4382 // cmpl(limit, 0); 4383 // jcc(Assembler::equal, COMPARE_TAIL); // true 4384 // But since we stopped at the points ary{1,2}+limit which are 4385 // not farther than 64 bytes from the ends of arrays ary{1,2}+result 4386 // (|limit| <= 32 and result < 32), 4387 // we may just compare the last 64 bytes. 
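      // (For example, with 100-byte arrays: limit is -96 on entry, one 64-byte
      //  iteration leaves limit == -32 and result == 4, and the compare below
      //  covers bytes [36, 100) -- a superset of everything not yet compared.)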
4388 // 4389 addptr(result, -64); // it is safe, bc we just came from this area 4390 evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit); 4391 evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit); 4392 kortestql(mask, mask); 4393 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4394 4395 jmp(TRUE_LABEL); 4396 4397 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 4398 4399 }//if (VM_Version::supports_avx512vlbw()) 4400 #endif //_LP64 4401 bind(COMPARE_WIDE_VECTORS); 4402 vmovdqu(vec1, Address(ary1, limit, Address::times_1)); 4403 vmovdqu(vec2, Address(ary2, limit, Address::times_1)); 4404 vpxor(vec1, vec2); 4405 4406 vptest(vec1, vec1); 4407 jcc(Assembler::notZero, FALSE_LABEL); 4408 addptr(limit, 32); 4409 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4410 4411 testl(result, result); 4412 jcc(Assembler::zero, TRUE_LABEL); 4413 4414 vmovdqu(vec1, Address(ary1, result, Address::times_1, -32)); 4415 vmovdqu(vec2, Address(ary2, result, Address::times_1, -32)); 4416 vpxor(vec1, vec2); 4417 4418 vptest(vec1, vec1); 4419 jccb(Assembler::notZero, FALSE_LABEL); 4420 jmpb(TRUE_LABEL); 4421 4422 bind(COMPARE_TAIL); // limit is zero 4423 movl(limit, result); 4424 // Fallthru to tail compare 4425 } else if (UseSSE42Intrinsics) { 4426 // With SSE4.2, use double quad vector compare 4427 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 4428 4429 // Compare 16-byte vectors 4430 andl(result, 0x0000000f); // tail count (in bytes) 4431 andl(limit, 0xfffffff0); // vector count (in bytes) 4432 jcc(Assembler::zero, COMPARE_TAIL); 4433 4434 lea(ary1, Address(ary1, limit, Address::times_1)); 4435 lea(ary2, Address(ary2, limit, Address::times_1)); 4436 negptr(limit); 4437 4438 bind(COMPARE_WIDE_VECTORS); 4439 movdqu(vec1, Address(ary1, limit, Address::times_1)); 4440 movdqu(vec2, Address(ary2, limit, Address::times_1)); 4441 pxor(vec1, vec2); 4442 4443 ptest(vec1, vec1); 4444 jcc(Assembler::notZero, FALSE_LABEL); 4445 addptr(limit, 16); 4446 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4447 4448 testl(result, result); 4449 jcc(Assembler::zero, TRUE_LABEL); 4450 4451 movdqu(vec1, Address(ary1, result, Address::times_1, -16)); 4452 movdqu(vec2, Address(ary2, result, Address::times_1, -16)); 4453 pxor(vec1, vec2); 4454 4455 ptest(vec1, vec1); 4456 jccb(Assembler::notZero, FALSE_LABEL); 4457 jmpb(TRUE_LABEL); 4458 4459 bind(COMPARE_TAIL); // limit is zero 4460 movl(limit, result); 4461 // Fallthru to tail compare 4462 } 4463 4464 // Compare 4-byte vectors 4465 andl(limit, 0xfffffffc); // vector count (in bytes) 4466 jccb(Assembler::zero, COMPARE_CHAR); 4467 4468 lea(ary1, Address(ary1, limit, Address::times_1)); 4469 lea(ary2, Address(ary2, limit, Address::times_1)); 4470 negptr(limit); 4471 4472 bind(COMPARE_VECTORS); 4473 movl(chr, Address(ary1, limit, Address::times_1)); 4474 cmpl(chr, Address(ary2, limit, Address::times_1)); 4475 jccb(Assembler::notEqual, FALSE_LABEL); 4476 addptr(limit, 4); 4477 jcc(Assembler::notZero, COMPARE_VECTORS); 4478 4479 // Compare trailing char (final 2 bytes), if any 4480 bind(COMPARE_CHAR); 4481 testl(result, 0x2); // tail char 4482 jccb(Assembler::zero, COMPARE_BYTE); 4483 load_unsigned_short(chr, Address(ary1, 0)); 4484 load_unsigned_short(limit, Address(ary2, 0)); 4485 cmpl(chr, limit); 4486 jccb(Assembler::notEqual, FALSE_LABEL); 4487 4488 if (is_array_equ && is_char) { 4489 bind(COMPARE_BYTE); 4490 } else { 4491 lea(ary1, Address(ary1, 2)); 4492 lea(ary2, Address(ary2, 2)); 4493 4494 bind(COMPARE_BYTE); 4495 testl(result, 0x1); 
// tail byte 4496 jccb(Assembler::zero, TRUE_LABEL); 4497 load_unsigned_byte(chr, Address(ary1, 0)); 4498 load_unsigned_byte(limit, Address(ary2, 0)); 4499 cmpl(chr, limit); 4500 jccb(Assembler::notEqual, FALSE_LABEL); 4501 } 4502 bind(TRUE_LABEL); 4503 movl(result, 1); // return true 4504 jmpb(DONE); 4505 4506 bind(FALSE_LABEL); 4507 xorl(result, result); // return false 4508 4509 // That's it 4510 bind(DONE); 4511 if (UseAVX >= 2) { 4512 // clean upper bits of YMM registers 4513 vpxor(vec1, vec1); 4514 vpxor(vec2, vec2); 4515 } 4516 } 4517 4518 #ifdef _LP64 4519 4520 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) { 4521 #define __ masm. 4522 Register dst = stub.data<0>(); 4523 XMMRegister src = stub.data<1>(); 4524 address target = stub.data<2>(); 4525 __ bind(stub.entry()); 4526 __ subptr(rsp, 8); 4527 __ movdbl(Address(rsp), src); 4528 __ call(RuntimeAddress(target)); 4529 __ pop(dst); 4530 __ jmp(stub.continuation()); 4531 #undef __ 4532 } 4533 4534 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) { 4535 assert(dst_bt == T_INT || dst_bt == T_LONG, ""); 4536 assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, ""); 4537 4538 address slowpath_target; 4539 if (dst_bt == T_INT) { 4540 if (src_bt == T_FLOAT) { 4541 cvttss2sil(dst, src); 4542 cmpl(dst, 0x80000000); 4543 slowpath_target = StubRoutines::x86::f2i_fixup(); 4544 } else { 4545 cvttsd2sil(dst, src); 4546 cmpl(dst, 0x80000000); 4547 slowpath_target = StubRoutines::x86::d2i_fixup(); 4548 } 4549 } else { 4550 if (src_bt == T_FLOAT) { 4551 cvttss2siq(dst, src); 4552 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip())); 4553 slowpath_target = StubRoutines::x86::f2l_fixup(); 4554 } else { 4555 cvttsd2siq(dst, src); 4556 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip())); 4557 slowpath_target = StubRoutines::x86::d2l_fixup(); 4558 } 4559 } 4560 4561 auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, 23, convertF2I_slowpath); 4562 jcc(Assembler::equal, stub->entry()); 4563 bind(stub->continuation()); 4564 } 4565 4566 #endif // _LP64 4567 4568 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4569 XMMRegister src1, int imm8, bool merge, int vlen_enc) { 4570 switch(ideal_opc) { 4571 case Op_LShiftVS: 4572 Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break; 4573 case Op_LShiftVI: 4574 Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break; 4575 case Op_LShiftVL: 4576 Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break; 4577 case Op_RShiftVS: 4578 Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break; 4579 case Op_RShiftVI: 4580 Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break; 4581 case Op_RShiftVL: 4582 Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break; 4583 case Op_URShiftVS: 4584 Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break; 4585 case Op_URShiftVI: 4586 Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break; 4587 case Op_URShiftVL: 4588 Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break; 4589 case Op_RotateRightV: 4590 evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4591 case Op_RotateLeftV: 4592 evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4593 default: 4594 fatal("Unsupported masked operation"); break; 4595 } 4596 } 4597 4598 void 
C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4599 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc, 4600 bool is_varshift) { 4601 switch (ideal_opc) { 4602 case Op_AddVB: 4603 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4604 case Op_AddVS: 4605 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4606 case Op_AddVI: 4607 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4608 case Op_AddVL: 4609 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4610 case Op_AddVF: 4611 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4612 case Op_AddVD: 4613 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4614 case Op_SubVB: 4615 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4616 case Op_SubVS: 4617 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4618 case Op_SubVI: 4619 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4620 case Op_SubVL: 4621 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4622 case Op_SubVF: 4623 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4624 case Op_SubVD: 4625 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4626 case Op_MulVS: 4627 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4628 case Op_MulVI: 4629 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4630 case Op_MulVL: 4631 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4632 case Op_MulVF: 4633 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4634 case Op_MulVD: 4635 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4636 case Op_DivVF: 4637 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4638 case Op_DivVD: 4639 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4640 case Op_SqrtVF: 4641 evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break; 4642 case Op_SqrtVD: 4643 evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break; 4644 case Op_AbsVB: 4645 evpabsb(dst, mask, src2, merge, vlen_enc); break; 4646 case Op_AbsVS: 4647 evpabsw(dst, mask, src2, merge, vlen_enc); break; 4648 case Op_AbsVI: 4649 evpabsd(dst, mask, src2, merge, vlen_enc); break; 4650 case Op_AbsVL: 4651 evpabsq(dst, mask, src2, merge, vlen_enc); break; 4652 case Op_FmaVF: 4653 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4654 case Op_FmaVD: 4655 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4656 case Op_VectorRearrange: 4657 evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break; 4658 case Op_LShiftVS: 4659 evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4660 case Op_LShiftVI: 4661 evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4662 case Op_LShiftVL: 4663 evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4664 case Op_RShiftVS: 4665 evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4666 case Op_RShiftVI: 4667 evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4668 case Op_RShiftVL: 4669 evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4670 case Op_URShiftVS: 4671 evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4672 case Op_URShiftVI: 4673 evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4674 case Op_URShiftVL: 4675 evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4676 case Op_RotateLeftV: 4677 evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4678 case Op_RotateRightV: 4679 evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break; 
4680 case Op_MaxV: 4681 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4682 case Op_MinV: 4683 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4684 case Op_XorV: 4685 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4686 case Op_OrV: 4687 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4688 case Op_AndV: 4689 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4690 default: 4691 fatal("Unsupported masked operation"); break; 4692 } 4693 } 4694 4695 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4696 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4697 switch (ideal_opc) { 4698 case Op_AddVB: 4699 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4700 case Op_AddVS: 4701 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4702 case Op_AddVI: 4703 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4704 case Op_AddVL: 4705 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4706 case Op_AddVF: 4707 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4708 case Op_AddVD: 4709 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4710 case Op_SubVB: 4711 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4712 case Op_SubVS: 4713 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4714 case Op_SubVI: 4715 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4716 case Op_SubVL: 4717 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4718 case Op_SubVF: 4719 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4720 case Op_SubVD: 4721 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4722 case Op_MulVS: 4723 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4724 case Op_MulVI: 4725 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4726 case Op_MulVL: 4727 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4728 case Op_MulVF: 4729 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4730 case Op_MulVD: 4731 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4732 case Op_DivVF: 4733 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4734 case Op_DivVD: 4735 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4736 case Op_FmaVF: 4737 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4738 case Op_FmaVD: 4739 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4740 case Op_MaxV: 4741 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4742 case Op_MinV: 4743 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4744 case Op_XorV: 4745 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4746 case Op_OrV: 4747 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4748 case Op_AndV: 4749 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4750 default: 4751 fatal("Unsupported masked operation"); break; 4752 } 4753 } 4754 4755 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst, 4756 KRegister src1, KRegister src2) { 4757 BasicType etype = T_ILLEGAL; 4758 switch(mask_len) { 4759 case 2: 4760 case 4: 4761 case 8: etype = T_BYTE; break; 4762 case 16: etype = T_SHORT; break; 4763 case 32: etype = T_INT; break; 4764 case 64: etype = T_LONG; break; 4765 default: fatal("Unsupported type"); break; 4766 } 4767 assert(etype != T_ILLEGAL, ""); 4768 switch(ideal_opc) { 4769 case Op_AndVMask: 4770 kand(etype, dst, src1, src2); break; 4771 case Op_OrVMask: 4772 kor(etype, dst, src1, src2); break; 4773 case Op_XorVMask: 
4774 kxor(etype, dst, src1, src2); break; 4775 default: 4776 fatal("Unsupported masked operation"); break; 4777 } 4778 } 4779 4780 /* 4781 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 4782 * If src is NaN, the result is 0. 4783 * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE, 4784 * the result is equal to the value of Integer.MIN_VALUE. 4785 * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE, 4786 * the result is equal to the value of Integer.MAX_VALUE. 4787 */ 4788 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4789 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 4790 Register rscratch, AddressLiteral float_sign_flip, 4791 int vec_enc) { 4792 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4793 Label done; 4794 vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch); 4795 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 4796 vptest(xtmp2, xtmp2, vec_enc); 4797 jccb(Assembler::equal, done); 4798 4799 vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc); 4800 vpxor(xtmp1, xtmp1, xtmp4, vec_enc); 4801 4802 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 4803 vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc); 4804 vblendvps(dst, dst, xtmp4, xtmp3, vec_enc); 4805 4806 // Recompute the mask for remaining special value. 4807 vpxor(xtmp2, xtmp2, xtmp3, vec_enc); 4808 // Extract SRC values corresponding to TRUE mask lanes. 4809 vpand(xtmp4, xtmp2, src, vec_enc); 4810 // Flip mask bits so that MSB bit of MASK lanes corresponding to +ve special 4811 // values are set. 4812 vpxor(xtmp3, xtmp2, xtmp4, vec_enc); 4813 4814 vblendvps(dst, dst, xtmp1, xtmp3, vec_enc); 4815 bind(done); 4816 } 4817 4818 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4819 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4820 Register rscratch, AddressLiteral float_sign_flip, 4821 int vec_enc) { 4822 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4823 Label done; 4824 evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 4825 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 4826 kortestwl(ktmp1, ktmp1); 4827 jccb(Assembler::equal, done); 4828 4829 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4830 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4831 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 4832 4833 kxorwl(ktmp1, ktmp1, ktmp2); 4834 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4835 vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4836 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 4837 bind(done); 4838 } 4839 4840 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4841 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4842 Register rscratch, AddressLiteral double_sign_flip, 4843 int vec_enc) { 4844 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing"); 4845 4846 Label done; 4847 evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch); 4848 Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc); 4849 kortestwl(ktmp1, ktmp1); 4850 jccb(Assembler::equal, done); 4851 4852 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4853 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4854 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc); 4855 4856 kxorwl(ktmp1, ktmp1, ktmp2); 
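  // ktmp1 now flags the special-result lanes whose source was not NaN. The masked compare
  // below keeps only those lanes with a non-negative source (+Inf or positive overflow),
  // vpternlogq with imm8 0x11 materializes ~double_sign_flip (i.e. the maximum value) in
  // xtmp2, and the final masked move installs that maximum in exactly those lanes.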
4857 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4858 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4859 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc); 4860 bind(done); 4861 } 4862 4863 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4864 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4865 Register rscratch, AddressLiteral float_sign_flip, 4866 int vec_enc) { 4867 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4868 Label done; 4869 evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 4870 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 4871 kortestwl(ktmp1, ktmp1); 4872 jccb(Assembler::equal, done); 4873 4874 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4875 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4876 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 4877 4878 kxorwl(ktmp1, ktmp1, ktmp2); 4879 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4880 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4881 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 4882 bind(done); 4883 } 4884 4885 /* 4886 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 4887 * If src is NaN, the result is 0. 4888 * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE, 4889 * the result is equal to the value of Long.MIN_VALUE. 4890 * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE, 4891 * the result is equal to the value of Long.MAX_VALUE. 4892 */ 4893 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4894 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4895 Register rscratch, AddressLiteral double_sign_flip, 4896 int vec_enc) { 4897 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing"); 4898 4899 Label done; 4900 evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch); 4901 evpcmpeqq(ktmp1, xtmp1, dst, vec_enc); 4902 kortestwl(ktmp1, ktmp1); 4903 jccb(Assembler::equal, done); 4904 4905 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4906 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4907 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc); 4908 4909 kxorwl(ktmp1, ktmp1, ktmp2); 4910 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4911 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4912 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc); 4913 bind(done); 4914 } 4915 4916 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero, 4917 XMMRegister xtmp, int index, int vec_enc) { 4918 assert(vec_enc < Assembler::AVX_512bit, ""); 4919 if (vec_enc == Assembler::AVX_256bit) { 4920 vextractf128_high(xtmp, src); 4921 vshufps(dst, src, xtmp, index, vec_enc); 4922 } else { 4923 vshufps(dst, src, zero, index, vec_enc); 4924 } 4925 } 4926 4927 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 4928 XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch, 4929 AddressLiteral float_sign_flip, int src_vec_enc) { 4930 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4931 4932 Label done; 4933 // Compare the destination lanes with float_sign_flip 4934 // value to get mask for all special values. 
  movdqu(xtmp1, float_sign_flip, rscratch);
  vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
  ptest(xtmp2, xtmp2);
  jccb(Assembler::equal, done);

  // Flip float_sign_flip to get max integer value.
  vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
  pxor(xtmp1, xtmp4);

  // Set destination lanes corresponding to unordered source lanes as zero.
  vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
  vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);

  // Shuffle mask vector and pack the lower double word from each quadword lane.
  vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
  vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);

  // Recompute the mask for the remaining special values.
  pxor(xtmp2, xtmp3);
  // Extract mask corresponding to non-negative source lanes.
  vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);

  // Shuffle mask vector and pack the lower double word from each quadword lane.
  vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
  pand(xtmp3, xtmp2);

  // Replace destination lanes holding the special value (0x80000000) with max int
  // if the corresponding source lane holds a +ve value.
  vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
  bind(done);
}

void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
                                                   XMMRegister xtmp, Register rscratch, int vec_enc) {
  switch(to_elem_bt) {
    case T_SHORT:
      assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
      vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
      vpackusdw(dst, dst, zero, vec_enc);
      if (vec_enc == Assembler::AVX_256bit) {
        vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
      }
      break;
    case T_BYTE:
      assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
      vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
      vpackusdw(dst, dst, zero, vec_enc);
      if (vec_enc == Assembler::AVX_256bit) {
        vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
      }
      vpackuswb(dst, dst, zero, vec_enc);
      break;
    default: assert(false, "%s", type2name(to_elem_bt));
  }
}

/*
 * Algorithm for vector D2L and F2I conversions:-
 * a) Perform vector D2L/F2I cast.
 * b) Choose fast path if none of the result vector lanes contains the 0x80000000 value.
 *    It signifies that the source value could be any of the special floating point
 *    values (NaN, -Inf, Inf, Max, -Min).
 * c) Set destination to zero if source is a NaN value.
 * d) Replace 0x80000000 with MaxInt if source lane contains a +ve value.
5000 */ 5001 5002 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5003 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 5004 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 5005 int to_elem_sz = type2aelembytes(to_elem_bt); 5006 assert(to_elem_sz <= 4, ""); 5007 vcvttps2dq(dst, src, vec_enc); 5008 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc); 5009 if (to_elem_sz < 4) { 5010 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 5011 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc); 5012 } 5013 } 5014 5015 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5016 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip, 5017 Register rscratch, int vec_enc) { 5018 int to_elem_sz = type2aelembytes(to_elem_bt); 5019 assert(to_elem_sz <= 4, ""); 5020 vcvttps2dq(dst, src, vec_enc); 5021 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc); 5022 switch(to_elem_bt) { 5023 case T_INT: 5024 break; 5025 case T_SHORT: 5026 evpmovdw(dst, dst, vec_enc); 5027 break; 5028 case T_BYTE: 5029 evpmovdb(dst, dst, vec_enc); 5030 break; 5031 default: assert(false, "%s", type2name(to_elem_bt)); 5032 } 5033 } 5034 5035 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 5036 KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip, 5037 Register rscratch, int vec_enc) { 5038 evcvttps2qq(dst, src, vec_enc); 5039 vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc); 5040 } 5041 5042 // Handling for downcasting from double to integer or sub-word types on AVX2. 5043 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5044 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, 5045 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 5046 int to_elem_sz = type2aelembytes(to_elem_bt); 5047 assert(to_elem_sz < 8, ""); 5048 vcvttpd2dq(dst, src, vec_enc); 5049 vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch, 5050 float_sign_flip, vec_enc); 5051 if (to_elem_sz < 4) { 5052 // xtmp4 holds all zero lanes. 
5053 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit); 5054 } 5055 } 5056 5057 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, 5058 XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, 5059 KRegister ktmp2, AddressLiteral sign_flip, 5060 Register rscratch, int vec_enc) { 5061 if (VM_Version::supports_avx512dq()) { 5062 evcvttpd2qq(dst, src, vec_enc); 5063 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 5064 switch(to_elem_bt) { 5065 case T_LONG: 5066 break; 5067 case T_INT: 5068 evpmovsqd(dst, dst, vec_enc); 5069 break; 5070 case T_SHORT: 5071 evpmovsqd(dst, dst, vec_enc); 5072 evpmovdw(dst, dst, vec_enc); 5073 break; 5074 case T_BYTE: 5075 evpmovsqd(dst, dst, vec_enc); 5076 evpmovdb(dst, dst, vec_enc); 5077 break; 5078 default: assert(false, "%s", type2name(to_elem_bt)); 5079 } 5080 } else { 5081 assert(type2aelembytes(to_elem_bt) <= 4, ""); 5082 vcvttpd2dq(dst, src, vec_enc); 5083 vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 5084 switch(to_elem_bt) { 5085 case T_INT: 5086 break; 5087 case T_SHORT: 5088 evpmovdw(dst, dst, vec_enc); 5089 break; 5090 case T_BYTE: 5091 evpmovdb(dst, dst, vec_enc); 5092 break; 5093 default: assert(false, "%s", type2name(to_elem_bt)); 5094 } 5095 } 5096 } 5097 5098 #ifdef _LP64 5099 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src, 5100 AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5101 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 5102 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5103 // and re-instantiate original MXCSR.RC mode after that. 5104 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5105 5106 mov64(tmp, julong_cast(0.5L)); 5107 evpbroadcastq(xtmp1, tmp, vec_enc); 5108 vaddpd(xtmp1, src , xtmp1, vec_enc); 5109 evcvtpd2qq(dst, xtmp1, vec_enc); 5110 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 5111 double_sign_flip, vec_enc);; 5112 5113 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5114 } 5115 5116 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src, 5117 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5118 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 5119 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5120 // and re-instantiate original MXCSR.RC mode after that. 
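  // This matches Java Math.round() semantics (round half up): with MXCSR.RC forced to
  // round-toward-negative-infinity, the conversion computes floor(val + 0.5), so e.g.
  // -2.5 rounds to -2.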
5121 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5122 5123 movl(tmp, jint_cast(0.5)); 5124 movq(xtmp1, tmp); 5125 vbroadcastss(xtmp1, xtmp1, vec_enc); 5126 vaddps(xtmp1, src , xtmp1, vec_enc); 5127 vcvtps2dq(dst, xtmp1, vec_enc); 5128 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 5129 float_sign_flip, vec_enc); 5130 5131 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5132 } 5133 5134 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src, 5135 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5136 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) { 5137 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5138 // and re-instantiate original MXCSR.RC mode after that. 5139 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5140 5141 movl(tmp, jint_cast(0.5)); 5142 movq(xtmp1, tmp); 5143 vbroadcastss(xtmp1, xtmp1, vec_enc); 5144 vaddps(xtmp1, src , xtmp1, vec_enc); 5145 vcvtps2dq(dst, xtmp1, vec_enc); 5146 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc); 5147 5148 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5149 } 5150 #endif // _LP64 5151 5152 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 5153 BasicType from_elem_bt, BasicType to_elem_bt) { 5154 switch (from_elem_bt) { 5155 case T_BYTE: 5156 switch (to_elem_bt) { 5157 case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break; 5158 case T_INT: vpmovzxbd(dst, src, vlen_enc); break; 5159 case T_LONG: vpmovzxbq(dst, src, vlen_enc); break; 5160 default: ShouldNotReachHere(); 5161 } 5162 break; 5163 case T_SHORT: 5164 switch (to_elem_bt) { 5165 case T_INT: vpmovzxwd(dst, src, vlen_enc); break; 5166 case T_LONG: vpmovzxwq(dst, src, vlen_enc); break; 5167 default: ShouldNotReachHere(); 5168 } 5169 break; 5170 case T_INT: 5171 assert(to_elem_bt == T_LONG, ""); 5172 vpmovzxdq(dst, src, vlen_enc); 5173 break; 5174 default: 5175 ShouldNotReachHere(); 5176 } 5177 } 5178 5179 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 5180 BasicType from_elem_bt, BasicType to_elem_bt) { 5181 switch (from_elem_bt) { 5182 case T_BYTE: 5183 switch (to_elem_bt) { 5184 case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break; 5185 case T_INT: vpmovsxbd(dst, src, vlen_enc); break; 5186 case T_LONG: vpmovsxbq(dst, src, vlen_enc); break; 5187 default: ShouldNotReachHere(); 5188 } 5189 break; 5190 case T_SHORT: 5191 switch (to_elem_bt) { 5192 case T_INT: vpmovsxwd(dst, src, vlen_enc); break; 5193 case T_LONG: vpmovsxwq(dst, src, vlen_enc); break; 5194 default: ShouldNotReachHere(); 5195 } 5196 break; 5197 case T_INT: 5198 assert(to_elem_bt == T_LONG, ""); 5199 vpmovsxdq(dst, src, vlen_enc); 5200 break; 5201 default: 5202 ShouldNotReachHere(); 5203 } 5204 } 5205 5206 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src, 5207 BasicType dst_bt, BasicType src_bt, int vlen) { 5208 int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen); 5209 assert(vlen_enc != AVX_512bit, ""); 5210 5211 int dst_bt_size = type2aelembytes(dst_bt); 5212 int src_bt_size = type2aelembytes(src_bt); 5213 if (dst_bt_size > src_bt_size) { 5214 switch (dst_bt_size / src_bt_size) { 5215 case 2: vpmovsxbw(dst, src, vlen_enc); break; 5216 case 4: vpmovsxbd(dst, src, 
vlen_enc); break; 5217 case 8: vpmovsxbq(dst, src, vlen_enc); break; 5218 default: ShouldNotReachHere(); 5219 } 5220 } else { 5221 assert(dst_bt_size < src_bt_size, ""); 5222 switch (src_bt_size / dst_bt_size) { 5223 case 2: { 5224 if (vlen_enc == AVX_128bit) { 5225 vpacksswb(dst, src, src, vlen_enc); 5226 } else { 5227 vpacksswb(dst, src, src, vlen_enc); 5228 vpermq(dst, dst, 0x08, vlen_enc); 5229 } 5230 break; 5231 } 5232 case 4: { 5233 if (vlen_enc == AVX_128bit) { 5234 vpackssdw(dst, src, src, vlen_enc); 5235 vpacksswb(dst, dst, dst, vlen_enc); 5236 } else { 5237 vpackssdw(dst, src, src, vlen_enc); 5238 vpermq(dst, dst, 0x08, vlen_enc); 5239 vpacksswb(dst, dst, dst, AVX_128bit); 5240 } 5241 break; 5242 } 5243 case 8: { 5244 if (vlen_enc == AVX_128bit) { 5245 vpshufd(dst, src, 0x08, vlen_enc); 5246 vpackssdw(dst, dst, dst, vlen_enc); 5247 vpacksswb(dst, dst, dst, vlen_enc); 5248 } else { 5249 vpshufd(dst, src, 0x08, vlen_enc); 5250 vpermq(dst, dst, 0x08, vlen_enc); 5251 vpackssdw(dst, dst, dst, AVX_128bit); 5252 vpacksswb(dst, dst, dst, AVX_128bit); 5253 } 5254 break; 5255 } 5256 default: ShouldNotReachHere(); 5257 } 5258 } 5259 } 5260 5261 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3, 5262 bool merge, BasicType bt, int vlen_enc) { 5263 if (bt == T_INT) { 5264 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 5265 } else { 5266 assert(bt == T_LONG, ""); 5267 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5268 } 5269 } 5270 5271 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3, 5272 bool merge, BasicType bt, int vlen_enc) { 5273 if (bt == T_INT) { 5274 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 5275 } else { 5276 assert(bt == T_LONG, ""); 5277 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5278 } 5279 } 5280 5281 #ifdef _LP64 5282 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1, 5283 Register rtmp2, XMMRegister xtmp, int mask_len, 5284 int vec_enc) { 5285 int index = 0; 5286 int vindex = 0; 5287 mov64(rtmp1, 0x0101010101010101L); 5288 pdepq(rtmp1, src, rtmp1); 5289 if (mask_len > 8) { 5290 movq(rtmp2, src); 5291 vpxor(xtmp, xtmp, xtmp, vec_enc); 5292 movq(xtmp, rtmp1); 5293 } 5294 movq(dst, rtmp1); 5295 5296 mask_len -= 8; 5297 while (mask_len > 0) { 5298 assert ((mask_len & 0x7) == 0, "mask must be multiple of 8"); 5299 index++; 5300 if ((index % 2) == 0) { 5301 pxor(xtmp, xtmp); 5302 } 5303 mov64(rtmp1, 0x0101010101010101L); 5304 shrq(rtmp2, 8); 5305 pdepq(rtmp1, rtmp2, rtmp1); 5306 pinsrq(xtmp, rtmp1, index % 2); 5307 vindex = index / 2; 5308 if (vindex) { 5309 // Write entire 16 byte vector when both 64 bit 5310 // lanes are update to save redundant instructions. 
5311 if (index % 2) { 5312 vinsertf128(dst, dst, xtmp, vindex); 5313 } 5314 } else { 5315 vmovdqu(dst, xtmp); 5316 } 5317 mask_len -= 8; 5318 } 5319 } 5320 5321 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) { 5322 switch(opc) { 5323 case Op_VectorMaskTrueCount: 5324 popcntq(dst, tmp); 5325 break; 5326 case Op_VectorMaskLastTrue: 5327 if (VM_Version::supports_lzcnt()) { 5328 lzcntq(tmp, tmp); 5329 movl(dst, 63); 5330 subl(dst, tmp); 5331 } else { 5332 movl(dst, -1); 5333 bsrq(tmp, tmp); 5334 cmov32(Assembler::notZero, dst, tmp); 5335 } 5336 break; 5337 case Op_VectorMaskFirstTrue: 5338 if (VM_Version::supports_bmi1()) { 5339 if (masklen < 32) { 5340 orl(tmp, 1 << masklen); 5341 tzcntl(dst, tmp); 5342 } else if (masklen == 32) { 5343 tzcntl(dst, tmp); 5344 } else { 5345 assert(masklen == 64, ""); 5346 tzcntq(dst, tmp); 5347 } 5348 } else { 5349 if (masklen < 32) { 5350 orl(tmp, 1 << masklen); 5351 bsfl(dst, tmp); 5352 } else { 5353 assert(masklen == 32 || masklen == 64, ""); 5354 movl(dst, masklen); 5355 if (masklen == 32) { 5356 bsfl(tmp, tmp); 5357 } else { 5358 bsfq(tmp, tmp); 5359 } 5360 cmov32(Assembler::notZero, dst, tmp); 5361 } 5362 } 5363 break; 5364 case Op_VectorMaskToLong: 5365 assert(dst == tmp, "Dst and tmp should be the same for toLong operations"); 5366 break; 5367 default: assert(false, "Unhandled mask operation"); 5368 } 5369 } 5370 5371 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp, 5372 int masklen, int masksize, int vec_enc) { 5373 assert(VM_Version::supports_popcnt(), ""); 5374 5375 if(VM_Version::supports_avx512bw()) { 5376 kmovql(tmp, mask); 5377 } else { 5378 assert(masklen <= 16, ""); 5379 kmovwl(tmp, mask); 5380 } 5381 5382 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5383 // operations needs to be clipped. 5384 if (masksize < 16 && opc != Op_VectorMaskFirstTrue) { 5385 andq(tmp, (1 << masklen) - 1); 5386 } 5387 5388 vector_mask_operation_helper(opc, dst, tmp, masklen); 5389 } 5390 5391 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp, 5392 Register tmp, int masklen, BasicType bt, int vec_enc) { 5393 assert(vec_enc == AVX_128bit && VM_Version::supports_avx() || 5394 vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4), ""); 5395 assert(VM_Version::supports_popcnt(), ""); 5396 5397 bool need_clip = false; 5398 switch(bt) { 5399 case T_BOOLEAN: 5400 // While masks of other types contain 0, -1; boolean masks contain lane values of 0, 1 5401 vpxor(xtmp, xtmp, xtmp, vec_enc); 5402 vpsubb(xtmp, xtmp, mask, vec_enc); 5403 vpmovmskb(tmp, xtmp, vec_enc); 5404 need_clip = masklen < 16; 5405 break; 5406 case T_BYTE: 5407 vpmovmskb(tmp, mask, vec_enc); 5408 need_clip = masklen < 16; 5409 break; 5410 case T_SHORT: 5411 vpacksswb(xtmp, mask, mask, vec_enc); 5412 if (masklen >= 16) { 5413 vpermpd(xtmp, xtmp, 8, vec_enc); 5414 } 5415 vpmovmskb(tmp, xtmp, Assembler::AVX_128bit); 5416 need_clip = masklen < 16; 5417 break; 5418 case T_INT: 5419 case T_FLOAT: 5420 vmovmskps(tmp, mask, vec_enc); 5421 need_clip = masklen < 4; 5422 break; 5423 case T_LONG: 5424 case T_DOUBLE: 5425 vmovmskpd(tmp, mask, vec_enc); 5426 need_clip = masklen < 2; 5427 break; 5428 default: assert(false, "Unhandled type, %s", type2name(bt)); 5429 } 5430 5431 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5432 // operations needs to be clipped. 
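  // Op_VectorMaskFirstTrue tolerates the stale upper bits because vector_mask_operation_helper
  // ors in a sentinel bit at position masklen before the tzcnt/bsf.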
5433 if (need_clip && opc != Op_VectorMaskFirstTrue) { 5434 // need_clip implies masklen < 32 5435 andq(tmp, (1 << masklen) - 1); 5436 } 5437 5438 vector_mask_operation_helper(opc, dst, tmp, masklen); 5439 } 5440 5441 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1, 5442 Register rtmp2, int mask_len) { 5443 kmov(rtmp1, src); 5444 andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len))); 5445 mov64(rtmp2, -1L); 5446 pextq(rtmp2, rtmp2, rtmp1); 5447 kmov(dst, rtmp2); 5448 } 5449 5450 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask, 5451 bool merge, BasicType bt, int vec_enc) { 5452 if (opcode == Op_CompressV) { 5453 switch(bt) { 5454 case T_BYTE: 5455 evpcompressb(dst, mask, src, merge, vec_enc); 5456 break; 5457 case T_CHAR: 5458 case T_SHORT: 5459 evpcompressw(dst, mask, src, merge, vec_enc); 5460 break; 5461 case T_INT: 5462 evpcompressd(dst, mask, src, merge, vec_enc); 5463 break; 5464 case T_FLOAT: 5465 evcompressps(dst, mask, src, merge, vec_enc); 5466 break; 5467 case T_LONG: 5468 evpcompressq(dst, mask, src, merge, vec_enc); 5469 break; 5470 case T_DOUBLE: 5471 evcompresspd(dst, mask, src, merge, vec_enc); 5472 break; 5473 default: 5474 fatal("Unsupported type %s", type2name(bt)); 5475 break; 5476 } 5477 } else { 5478 assert(opcode == Op_ExpandV, ""); 5479 switch(bt) { 5480 case T_BYTE: 5481 evpexpandb(dst, mask, src, merge, vec_enc); 5482 break; 5483 case T_CHAR: 5484 case T_SHORT: 5485 evpexpandw(dst, mask, src, merge, vec_enc); 5486 break; 5487 case T_INT: 5488 evpexpandd(dst, mask, src, merge, vec_enc); 5489 break; 5490 case T_FLOAT: 5491 evexpandps(dst, mask, src, merge, vec_enc); 5492 break; 5493 case T_LONG: 5494 evpexpandq(dst, mask, src, merge, vec_enc); 5495 break; 5496 case T_DOUBLE: 5497 evexpandpd(dst, mask, src, merge, vec_enc); 5498 break; 5499 default: 5500 fatal("Unsupported type %s", type2name(bt)); 5501 break; 5502 } 5503 } 5504 } 5505 #endif 5506 5507 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5508 KRegister ktmp1, int vec_enc) { 5509 if (opcode == Op_SignumVD) { 5510 vsubpd(dst, zero, one, vec_enc); 5511 // if src < 0 ? -1 : 1 5512 evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5513 evblendmpd(dst, ktmp1, one, dst, true, vec_enc); 5514 // if src == NaN, -0.0 or 0.0 return src. 5515 evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5516 evblendmpd(dst, ktmp1, dst, src, true, vec_enc); 5517 } else { 5518 assert(opcode == Op_SignumVF, ""); 5519 vsubps(dst, zero, one, vec_enc); 5520 // if src < 0 ? -1 : 1 5521 evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5522 evblendmps(dst, ktmp1, one, dst, true, vec_enc); 5523 // if src == NaN, -0.0 or 0.0 return src. 5524 evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5525 evblendmps(dst, ktmp1, dst, src, true, vec_enc); 5526 } 5527 } 5528 5529 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5530 XMMRegister xtmp1, int vec_enc) { 5531 if (opcode == Op_SignumVD) { 5532 vsubpd(dst, zero, one, vec_enc); 5533 // if src < 0 ? -1 : 1 5534 vblendvpd(dst, one, dst, src, vec_enc); 5535 // if src == NaN, -0.0 or 0.0 return src. 5536 vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc); 5537 vblendvpd(dst, dst, src, xtmp1, vec_enc); 5538 } else { 5539 assert(opcode == Op_SignumVF, ""); 5540 vsubps(dst, zero, one, vec_enc); 5541 // if src < 0 ? 
-1 : 1
    vblendvps(dst, one, dst, src, vec_enc);
    // if src == NaN, -0.0 or 0.0 return src.
    vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
    vblendvps(dst, dst, src, xtmp1, vec_enc);
  }
}

void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
  if (VM_Version::supports_avx512bw()) {
    if (mask_len > 32) {
      kmovql(dst, src);
    } else {
      kmovdl(dst, src);
      if (mask_len != 32) {
        kshiftrdl(dst, dst, 32 - mask_len);
      }
    }
  } else {
    assert(mask_len <= 16, "");
    kmovwl(dst, src);
    if (mask_len != 16) {
      kshiftrwl(dst, dst, 16 - mask_len);
    }
  }
}

void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
  int lane_size = type2aelembytes(bt);
  bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
  if ((is_LP64 || lane_size < 8) &&
      ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
       (is_subword_type(bt) && VM_Version::supports_avx512vlbw()))) {
    movptr(rtmp, imm32);
    switch(lane_size) {
      case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
      case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
      case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
      case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
      default : fatal("Unsupported lane size %d", lane_size); break;
    }
  } else {
    movptr(rtmp, imm32);
    LP64_ONLY(movq(dst, rtmp)) NOT_LP64(movdl(dst, rtmp));
    switch(lane_size) {
      case 1 : vpbroadcastb(dst, dst, vec_enc); break;
      case 2 : vpbroadcastw(dst, dst, vec_enc); break;
      case 4 : vpbroadcastd(dst, dst, vec_enc); break;
      case 8 : vpbroadcastq(dst, dst, vec_enc); break;
      default : fatal("Unsupported lane size %d", lane_size); break;
    }
  }
}

//
// Following is a lookup table based popcount computation algorithm:-
//           Index   Bit set count
//     [ 0000 -> 0,
//       0001 -> 1,
//       0010 -> 1,
//       0011 -> 2,
//       0100 -> 1,
//       0101 -> 2,
//       0110 -> 2,
//       0111 -> 3,
//       1000 -> 1,
//       1001 -> 2,
//       1010 -> 2,
//       1011 -> 3,
//       1100 -> 2,
//       1101 -> 3,
//       1110 -> 3,
//       1111 -> 4 ]
// a. Count the number of 1s in the 4 LSB bits of each byte. These bits are used as
//    shuffle indices for lookup table access.
// b. Right shift each byte of vector lane by 4 positions.
// c. Count the number of 1s in the 4 MSB bits of each byte. These bits are used as
//    shuffle indices for lookup table access.
// d. Add the bitset count of upper and lower 4 bits of each byte.
// e. Unpack double words to quad words and compute the sum of absolute differences of the
//    bitset counts of all the bytes of a quadword.
// f. Perform step e. for the upper 128 bit vector lane.
// g. Pack the bitset count of quadwords back to double word.
// h. Unpacking and packing operations are not needed for 64 bit vector lane.
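//
// For example, the byte 0xD3 = 0b11010011 has low nibble 0b0011 (lookup count 2) and high
// nibble 0b1101 (lookup count 3), so step d. yields a per-byte popcount of 5.
//
// A scalar sketch of steps a.-d. (illustration only, not part of the emitted code; the
// names below are made up for this comment):
//
//   static const uint8_t popcount_lut[16] = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};
//   static inline int popcount_byte_lut(uint8_t b) {
//     return popcount_lut[b & 0x0F] + popcount_lut[b >> 4];
//   }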
5626 5627 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5628 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5629 assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), ""); 5630 vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc); 5631 vpsrlw(dst, src, 4, vec_enc); 5632 vpand(dst, dst, xtmp1, vec_enc); 5633 vpand(xtmp1, src, xtmp1, vec_enc); 5634 vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg); 5635 vpshufb(xtmp1, xtmp2, xtmp1, vec_enc); 5636 vpshufb(dst, xtmp2, dst, vec_enc); 5637 vpaddb(dst, dst, xtmp1, vec_enc); 5638 } 5639 5640 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5641 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5642 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5643 // Following code is as per steps e,f,g and h of above algorithm. 5644 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5645 vpunpckhdq(dst, xtmp1, xtmp2, vec_enc); 5646 vpsadbw(dst, dst, xtmp2, vec_enc); 5647 vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc); 5648 vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc); 5649 vpackuswb(dst, xtmp1, dst, vec_enc); 5650 } 5651 5652 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5653 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5654 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5655 // Add the popcount of upper and lower bytes of word. 5656 vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc); 5657 vpsrlw(dst, xtmp1, 8, vec_enc); 5658 vpand(xtmp1, xtmp1, xtmp2, vec_enc); 5659 vpaddw(dst, dst, xtmp1, vec_enc); 5660 } 5661 5662 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5663 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5664 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5665 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5666 vpsadbw(dst, xtmp1, xtmp2, vec_enc); 5667 } 5668 5669 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5670 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5671 switch(bt) { 5672 case T_LONG: 5673 vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5674 break; 5675 case T_INT: 5676 vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5677 break; 5678 case T_CHAR: 5679 case T_SHORT: 5680 vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5681 break; 5682 case T_BYTE: 5683 case T_BOOLEAN: 5684 vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5685 break; 5686 default: 5687 fatal("Unsupported type %s", type2name(bt)); 5688 break; 5689 } 5690 } 5691 5692 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src, 5693 KRegister mask, bool merge, int vec_enc) { 5694 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 5695 switch(bt) { 5696 case T_LONG: 5697 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 5698 evpopcntq(dst, mask, src, merge, vec_enc); 5699 break; 5700 case T_INT: 5701 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 5702 evpopcntd(dst, mask, src, merge, vec_enc); 5703 break; 5704 case T_CHAR: 5705 case T_SHORT: 5706 assert(VM_Version::supports_avx512_bitalg(), ""); 5707 evpopcntw(dst, mask, src, merge, vec_enc); 5708 break; 5709 case T_BYTE: 5710 case T_BOOLEAN: 5711 assert(VM_Version::supports_avx512_bitalg(), ""); 5712 evpopcntb(dst, mask, 
src, merge, vec_enc); 5713 break; 5714 default: 5715 fatal("Unsupported type %s", type2name(bt)); 5716 break; 5717 } 5718 } 5719 5720 #ifndef _LP64 5721 void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) { 5722 assert(VM_Version::supports_avx512bw(), ""); 5723 kmovdl(tmp, src); 5724 kunpckdql(dst, tmp, tmp); 5725 } 5726 #endif 5727 5728 // Bit reversal algorithm first reverses the bits of each byte followed by 5729 // a byte level reversal for multi-byte primitive types (short/int/long). 5730 // Algorithm performs a lookup table access to get reverse bit sequence 5731 // corresponding to a 4 bit value. Thus a reverse bit sequence for a byte 5732 // is obtained by swapping the reverse bit sequences of upper and lower 5733 // nibble of a byte. 5734 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5735 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5736 if (VM_Version::supports_avx512vlbw()) { 5737 5738 // Get the reverse bit sequence of lower nibble of each byte. 5739 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg); 5740 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 5741 evpandq(dst, xtmp2, src, vec_enc); 5742 vpshufb(dst, xtmp1, dst, vec_enc); 5743 vpsllq(dst, dst, 4, vec_enc); 5744 5745 // Get the reverse bit sequence of upper nibble of each byte. 5746 vpandn(xtmp2, xtmp2, src, vec_enc); 5747 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 5748 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 5749 5750 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 5751 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 5752 evporq(xtmp2, dst, xtmp2, vec_enc); 5753 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 5754 5755 } else if(vec_enc == Assembler::AVX_512bit) { 5756 // Shift based bit reversal. 5757 assert(bt == T_LONG || bt == T_INT, ""); 5758 5759 // Swap lower and upper nibble of each byte. 5760 vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc); 5761 5762 // Swap two least and most significant bits of each nibble. 5763 vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc); 5764 5765 // Swap adjacent pair of bits. 5766 evmovdqul(xtmp1, k0, dst, true, vec_enc); 5767 vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc); 5768 5769 evmovdqul(xtmp1, k0, dst, true, vec_enc); 5770 vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc); 5771 } else { 5772 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp); 5773 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 5774 5775 // Get the reverse bit sequence of lower nibble of each byte. 5776 vpand(dst, xtmp2, src, vec_enc); 5777 vpshufb(dst, xtmp1, dst, vec_enc); 5778 vpsllq(dst, dst, 4, vec_enc); 5779 5780 // Get the reverse bit sequence of upper nibble of each byte. 5781 vpandn(xtmp2, xtmp2, src, vec_enc); 5782 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 5783 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 5784 5785 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 5786 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 
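    // For example, byte 0x2D = 0b00101101 becomes 0xB4 = 0b10110100: the reversed low nibble
    // (lookup of 0xD gives 0xB) ends up in the upper half and the reversed high nibble
    // (lookup of 0x2 gives 0x4) in the lower half.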
5787 vpor(xtmp2, dst, xtmp2, vec_enc); 5788 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 5789 } 5790 } 5791 5792 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc, 5793 XMMRegister xtmp, Register rscratch) { 5794 assert(VM_Version::supports_gfni(), ""); 5795 assert(rscratch != noreg || always_reachable(mask), "missing"); 5796 5797 // Galois field instruction based bit reversal based on following algorithm. 5798 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 5799 vpbroadcastq(xtmp, mask, vec_enc, rscratch); 5800 vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc); 5801 vector_reverse_byte(bt, dst, xtmp, vec_enc); 5802 } 5803 5804 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src, 5805 XMMRegister xtmp1, Register rtmp, int vec_enc) { 5806 vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc); 5807 evpandq(dst, xtmp1, src, vec_enc); 5808 vpsllq(dst, dst, nbits, vec_enc); 5809 vpandn(xtmp1, xtmp1, src, vec_enc); 5810 vpsrlq(xtmp1, xtmp1, nbits, vec_enc); 5811 evporq(dst, dst, xtmp1, vec_enc); 5812 } 5813 5814 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5815 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5816 // Shift based bit reversal. 5817 assert(VM_Version::supports_evex(), ""); 5818 switch(bt) { 5819 case T_LONG: 5820 // Swap upper and lower double word of each quad word. 5821 evprorq(xtmp1, k0, src, 32, true, vec_enc); 5822 evprord(xtmp1, k0, xtmp1, 16, true, vec_enc); 5823 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 5824 break; 5825 case T_INT: 5826 // Swap upper and lower word of each double word. 5827 evprord(xtmp1, k0, src, 16, true, vec_enc); 5828 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 5829 break; 5830 case T_CHAR: 5831 case T_SHORT: 5832 // Swap upper and lower byte of each word. 5833 vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc); 5834 break; 5835 case T_BYTE: 5836 evmovdquq(dst, k0, src, true, vec_enc); 5837 break; 5838 default: 5839 fatal("Unsupported type %s", type2name(bt)); 5840 break; 5841 } 5842 } 5843 5844 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) { 5845 if (bt == T_BYTE) { 5846 if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) { 5847 evmovdquq(dst, k0, src, true, vec_enc); 5848 } else { 5849 vmovdqu(dst, src); 5850 } 5851 return; 5852 } 5853 // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using 5854 // pre-computed shuffle indices. 
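  // The int mask, for instance, holds the per-128-bit-lane byte indices 3,2,1,0, 7,6,5,4,
  // 11,10,9,8, 15,14,13,12, so the vpshufb below swaps the four bytes of every int element
  // in place.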
5855 switch(bt) { 5856 case T_LONG: 5857 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg); 5858 break; 5859 case T_INT: 5860 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg); 5861 break; 5862 case T_CHAR: 5863 case T_SHORT: 5864 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg); 5865 break; 5866 default: 5867 fatal("Unsupported type %s", type2name(bt)); 5868 break; 5869 } 5870 vpshufb(dst, src, dst, vec_enc); 5871 } 5872 5873 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src, 5874 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 5875 KRegister ktmp, Register rtmp, bool merge, int vec_enc) { 5876 assert(is_integral_type(bt), ""); 5877 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 5878 assert(VM_Version::supports_avx512cd(), ""); 5879 switch(bt) { 5880 case T_LONG: 5881 evplzcntq(dst, ktmp, src, merge, vec_enc); 5882 break; 5883 case T_INT: 5884 evplzcntd(dst, ktmp, src, merge, vec_enc); 5885 break; 5886 case T_SHORT: 5887 vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc); 5888 vpunpcklwd(xtmp2, xtmp1, src, vec_enc); 5889 evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc); 5890 vpunpckhwd(dst, xtmp1, src, vec_enc); 5891 evplzcntd(dst, ktmp, dst, merge, vec_enc); 5892 vpackusdw(dst, xtmp2, dst, vec_enc); 5893 break; 5894 case T_BYTE: 5895 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 5896 // accessing the lookup table. 5897 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 5898 // accessing the lookup table. 5899 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 5900 assert(VM_Version::supports_avx512bw(), ""); 5901 evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp); 5902 vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc); 5903 vpand(xtmp2, dst, src, vec_enc); 5904 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 5905 vpsrlw(xtmp3, src, 4, vec_enc); 5906 vpand(xtmp3, dst, xtmp3, vec_enc); 5907 vpshufb(dst, xtmp1, xtmp3, vec_enc); 5908 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 5909 evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc); 5910 evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc); 5911 break; 5912 default: 5913 fatal("Unsupported type %s", type2name(bt)); 5914 break; 5915 } 5916 } 5917 5918 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5919 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 5920 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp); 5921 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 5922 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 5923 // accessing the lookup table. 5924 vpand(dst, xtmp2, src, vec_enc); 5925 vpshufb(dst, xtmp1, dst, vec_enc); 5926 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 5927 // accessing the lookup table. 5928 vpsrlw(xtmp3, src, 4, vec_enc); 5929 vpand(xtmp3, xtmp2, xtmp3, vec_enc); 5930 vpshufb(xtmp2, xtmp1, xtmp3, vec_enc); 5931 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 
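  // For example, for byte 0x1A the high nibble 0x1 yields a lookup count of 3 and is non-zero,
  // so the low-nibble count is ignored and CLZ = 3; for byte 0x0A the zero high nibble
  // contributes 4 and the low nibble 0xA contributes 0, giving CLZ = 4.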
5932 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 5933 vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc); 5934 vpaddb(dst, dst, xtmp2, vec_enc); 5935 vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc); 5936 } 5937 5938 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5939 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 5940 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 5941 // Add zero counts of lower byte and upper byte of a word if 5942 // upper byte holds a zero value. 5943 vpsrlw(xtmp3, src, 8, vec_enc); 5944 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx. 5945 vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc); 5946 vpsllw(xtmp2, dst, 8, vec_enc); 5947 vpaddw(xtmp2, xtmp2, dst, vec_enc); 5948 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 5949 vpsrlw(dst, dst, 8, vec_enc); 5950 } 5951 5952 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5953 XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) { 5954 // Since IEEE 754 floating point format represents mantissa in 1.0 format 5955 // hence biased exponent can be used to compute leading zero count as per 5956 // following formula:- 5957 // LZCNT = 32 - (biased_exp - 127) 5958 // Special handling has been introduced for Zero, Max_Int and -ve source values. 5959 5960 // Broadcast 0xFF 5961 vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc); 5962 vpsrld(xtmp1, xtmp1, 24, vec_enc); 5963 5964 // Extract biased exponent. 5965 vcvtdq2ps(dst, src, vec_enc); 5966 vpsrld(dst, dst, 23, vec_enc); 5967 vpand(dst, dst, xtmp1, vec_enc); 5968 5969 // Broadcast 127. 5970 vpsrld(xtmp1, xtmp1, 1, vec_enc); 5971 // Exponent = biased_exp - 127 5972 vpsubd(dst, dst, xtmp1, vec_enc); 5973 5974 // Exponent = Exponent + 1 5975 vpsrld(xtmp3, xtmp1, 6, vec_enc); 5976 vpaddd(dst, dst, xtmp3, vec_enc); 5977 5978 // Replace -ve exponent with zero, exponent is -ve when src 5979 // lane contains a zero value. 5980 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5981 vblendvps(dst, dst, xtmp2, dst, vec_enc); 5982 5983 // Rematerialize broadcast 32. 5984 vpslld(xtmp1, xtmp3, 5, vec_enc); 5985 // Exponent is 32 if corresponding source lane contains max_int value. 5986 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 5987 // LZCNT = 32 - exponent 5988 vpsubd(dst, xtmp1, dst, vec_enc); 5989 5990 // Replace LZCNT with a value 1 if corresponding source lane 5991 // contains max_int value. 5992 vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc); 5993 5994 // Replace biased_exp with 0 if source lane value is less than zero. 5995 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5996 vblendvps(dst, dst, xtmp2, src, vec_enc); 5997 } 5998 5999 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6000 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6001 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6002 // Add zero counts of lower word and upper word of a double word if 6003 // upper word holds a zero value. 6004 vpsrld(xtmp3, src, 16, vec_enc); 6005 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx. 6006 vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc); 6007 vpslld(xtmp2, dst, 16, vec_enc); 6008 vpaddd(xtmp2, xtmp2, dst, vec_enc); 6009 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 6010 vpsrld(dst, dst, 16, vec_enc); 6011 // Add zero counts of lower doubleword and upper doubleword of a 6012 // quadword if upper doubleword holds a zero value. 
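  // Same merge, one level up: e.g. src = 0x0000000100000000 has doubleword counts (31, 32);
  // the upper doubleword is non-zero, so the combined 64-bit count stays 31.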
6013 vpsrlq(xtmp3, src, 32, vec_enc); 6014 vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc); 6015 vpsllq(xtmp2, dst, 32, vec_enc); 6016 vpaddq(xtmp2, xtmp2, dst, vec_enc); 6017 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 6018 vpsrlq(dst, dst, 32, vec_enc); 6019 } 6020 6021 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, 6022 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 6023 Register rtmp, int vec_enc) { 6024 assert(is_integral_type(bt), "unexpected type"); 6025 assert(vec_enc < Assembler::AVX_512bit, ""); 6026 switch(bt) { 6027 case T_LONG: 6028 vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6029 break; 6030 case T_INT: 6031 vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc); 6032 break; 6033 case T_SHORT: 6034 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6035 break; 6036 case T_BYTE: 6037 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6038 break; 6039 default: 6040 fatal("Unsupported type %s", type2name(bt)); 6041 break; 6042 } 6043 } 6044 6045 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) { 6046 switch(bt) { 6047 case T_BYTE: 6048 vpsubb(dst, src1, src2, vec_enc); 6049 break; 6050 case T_SHORT: 6051 vpsubw(dst, src1, src2, vec_enc); 6052 break; 6053 case T_INT: 6054 vpsubd(dst, src1, src2, vec_enc); 6055 break; 6056 case T_LONG: 6057 vpsubq(dst, src1, src2, vec_enc); 6058 break; 6059 default: 6060 fatal("Unsupported type %s", type2name(bt)); 6061 break; 6062 } 6063 } 6064 6065 // Trailing zero count computation is based on leading zero count operation as per 6066 // following equation. All AVX3 targets support AVX512CD feature which offers 6067 // direct vector instruction to compute leading zero count. 
// CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
// e.g. for a 32-bit lane with x = 8: (x - 1) & ~x = 0x7, CLZ(0x7) = 29, so CTZ = 32 - 29 = 3.
void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
                                                         XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "");
  // xtmp = -1
  vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
  // xtmp = xtmp + src
  vpadd(bt, xtmp4, xtmp4, src, vec_enc);
  // xtmp = xtmp & ~src
  vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
  vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
  vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
  vpsub(bt, dst, xtmp4, dst, vec_enc);
}

// Trailing zero count computation for AVX2 targets is based on popcount operation as per the following equation
// CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                        XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "");
  // xtmp = 0
  vpxor(xtmp3, xtmp3, xtmp3, vec_enc);
  // xtmp = 0 - src
  vpsub(bt, xtmp3, xtmp3, src, vec_enc);
  // xtmp = xtmp | src
  vpor(xtmp3, xtmp3, src, vec_enc);
  vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
  vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
  vpsub(bt, dst, xtmp1, dst, vec_enc);
}

void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
  // With the divisor's sign bit set the unsigned quotient can only be 0 or 1, and the
  // expression evaluates to 1 exactly when dividend >= divisor (unsigned).
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
  movl(rdx, rax);
  subl(rdx, divisor);
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rdx, rax);
  } else {
    notl(rdx);
    andl(rax, rdx);
  }
  shrl(rax, 31);
  bind(done);
}

void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath when divisor < 0:
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
  movl(rdx, rax);
  subl(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rax, rdx);
  } else {
    notl(rax);
    andl(rax, rdx);
  }
  sarl(rax, 31);
  andl(rax, divisor);
  subl(rdx, rax);
  bind(done);
}

void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
  Label done;
  Label neg_divisor_fastpath;

  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
  //
remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor) 6163 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in 6164 // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned() 6165 movl(rdx, rax); 6166 subl(rax, divisor); 6167 if (VM_Version::supports_bmi1()) { 6168 andnl(rax, rax, rdx); 6169 } else { 6170 notl(rax); 6171 andl(rax, rdx); 6172 } 6173 movl(tmp, rax); 6174 shrl(rax, 31); // quotient 6175 sarl(tmp, 31); 6176 andl(tmp, divisor); 6177 subl(rdx, tmp); // remainder 6178 bind(done); 6179 } 6180 6181 #ifdef _LP64 6182 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1, 6183 XMMRegister xtmp2, Register rtmp) { 6184 if(VM_Version::supports_gfni()) { 6185 // Galois field instruction based bit reversal based on following algorithm. 6186 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 6187 mov64(rtmp, 0x8040201008040201L); 6188 movq(xtmp1, src); 6189 movq(xtmp2, rtmp); 6190 gf2p8affineqb(xtmp1, xtmp2, 0); 6191 movq(dst, xtmp1); 6192 } else { 6193 // Swap even and odd numbered bits. 6194 movl(rtmp, src); 6195 andl(rtmp, 0x55555555); 6196 shll(rtmp, 1); 6197 movl(dst, src); 6198 andl(dst, 0xAAAAAAAA); 6199 shrl(dst, 1); 6200 orl(dst, rtmp); 6201 6202 // Swap LSB and MSB 2 bits of each nibble. 6203 movl(rtmp, dst); 6204 andl(rtmp, 0x33333333); 6205 shll(rtmp, 2); 6206 andl(dst, 0xCCCCCCCC); 6207 shrl(dst, 2); 6208 orl(dst, rtmp); 6209 6210 // Swap LSB and MSB 4 bits of each byte. 6211 movl(rtmp, dst); 6212 andl(rtmp, 0x0F0F0F0F); 6213 shll(rtmp, 4); 6214 andl(dst, 0xF0F0F0F0); 6215 shrl(dst, 4); 6216 orl(dst, rtmp); 6217 } 6218 bswapl(dst); 6219 } 6220 6221 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1, 6222 XMMRegister xtmp2, Register rtmp1, Register rtmp2) { 6223 if(VM_Version::supports_gfni()) { 6224 // Galois field instruction based bit reversal based on following algorithm. 6225 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 6226 mov64(rtmp1, 0x8040201008040201L); 6227 movq(xtmp1, src); 6228 movq(xtmp2, rtmp1); 6229 gf2p8affineqb(xtmp1, xtmp2, 0); 6230 movq(dst, xtmp1); 6231 } else { 6232 // Swap even and odd numbered bits. 6233 movq(rtmp1, src); 6234 mov64(rtmp2, 0x5555555555555555L); 6235 andq(rtmp1, rtmp2); 6236 shlq(rtmp1, 1); 6237 movq(dst, src); 6238 notq(rtmp2); 6239 andq(dst, rtmp2); 6240 shrq(dst, 1); 6241 orq(dst, rtmp1); 6242 6243 // Swap LSB and MSB 2 bits of each nibble. 6244 movq(rtmp1, dst); 6245 mov64(rtmp2, 0x3333333333333333L); 6246 andq(rtmp1, rtmp2); 6247 shlq(rtmp1, 2); 6248 notq(rtmp2); 6249 andq(dst, rtmp2); 6250 shrq(dst, 2); 6251 orq(dst, rtmp1); 6252 6253 // Swap LSB and MSB 4 bits of each byte. 
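    // (Together, the three masked swaps reverse the bits within every byte; the final bswapq
    // then reverses the byte order, completing the full 64-bit bit reversal.)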
6254 movq(rtmp1, dst); 6255 mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL); 6256 andq(rtmp1, rtmp2); 6257 shlq(rtmp1, 4); 6258 notq(rtmp2); 6259 andq(dst, rtmp2); 6260 shrq(dst, 4); 6261 orq(dst, rtmp1); 6262 } 6263 bswapq(dst); 6264 } 6265 6266 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) { 6267 Label done; 6268 Label neg_divisor_fastpath; 6269 cmpq(divisor, 0); 6270 jccb(Assembler::less, neg_divisor_fastpath); 6271 xorl(rdx, rdx); 6272 divq(divisor); 6273 jmpb(done); 6274 bind(neg_divisor_fastpath); 6275 // Fastpath for divisor < 0: 6276 // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1) 6277 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned() 6278 movq(rdx, rax); 6279 subq(rdx, divisor); 6280 if (VM_Version::supports_bmi1()) { 6281 andnq(rax, rdx, rax); 6282 } else { 6283 notq(rdx); 6284 andq(rax, rdx); 6285 } 6286 shrq(rax, 63); 6287 bind(done); 6288 } 6289 6290 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) { 6291 Label done; 6292 Label neg_divisor_fastpath; 6293 cmpq(divisor, 0); 6294 jccb(Assembler::less, neg_divisor_fastpath); 6295 xorq(rdx, rdx); 6296 divq(divisor); 6297 jmp(done); 6298 bind(neg_divisor_fastpath); 6299 // Fastpath when divisor < 0: 6300 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor) 6301 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned() 6302 movq(rdx, rax); 6303 subq(rax, divisor); 6304 if (VM_Version::supports_bmi1()) { 6305 andnq(rax, rax, rdx); 6306 } else { 6307 notq(rax); 6308 andq(rax, rdx); 6309 } 6310 sarq(rax, 63); 6311 andq(rax, divisor); 6312 subq(rdx, rax); 6313 bind(done); 6314 } 6315 6316 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) { 6317 Label done; 6318 Label neg_divisor_fastpath; 6319 cmpq(divisor, 0); 6320 jccb(Assembler::less, neg_divisor_fastpath); 6321 xorq(rdx, rdx); 6322 divq(divisor); 6323 jmp(done); 6324 bind(neg_divisor_fastpath); 6325 // Fastpath for divisor < 0: 6326 // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1) 6327 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor) 6328 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in 6329 // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned() 6330 movq(rdx, rax); 6331 subq(rax, divisor); 6332 if (VM_Version::supports_bmi1()) { 6333 andnq(rax, rax, rdx); 6334 } else { 6335 notq(rax); 6336 andq(rax, rdx); 6337 } 6338 movq(tmp, rax); 6339 shrq(rax, 63); // quotient 6340 sarq(tmp, 63); 6341 andq(tmp, divisor); 6342 subq(rdx, tmp); // remainder 6343 bind(done); 6344 } 6345 #endif 6346 6347 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1, 6348 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp, 6349 int vlen_enc) { 6350 assert(VM_Version::supports_avx512bw(), ""); 6351 // Byte shuffles are inlane operations and indices are determined using 6352 // lower 4 bit of each shuffle lane, thus all shuffle indices are 6353 // normalized to index range 0-15. This makes sure that all the multiples 6354 // of an index value are placed at same relative position in 128 bit 6355 // lane i.e. elements corresponding to shuffle indices 16, 32 and 64 6356 // will be 16th element in their respective 128 bit lanes. 
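  // Four rounds follow, one per 128-bit lane of src: each round broadcasts that lane across
  // the vector and uses a masked vpshufb to fill the destination bytes whose shuffle index
  // selects it (i.e. indices [16*k, 16*k + 15] for round k).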
6357 movl(rtmp, 16); 6358 evpbroadcastb(xtmp1, rtmp, vlen_enc); 6359 6360 // Compute a mask for shuffle vector by comparing indices with expression INDEX < 16, 6361 // Broadcast first 128 bit lane across entire vector, shuffle the vector lanes using 6362 // original shuffle indices and move the shuffled lanes corresponding to true 6363 // mask to destination vector. 6364 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc); 6365 evshufi64x2(xtmp2, src, src, 0x0, vlen_enc); 6366 evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc); 6367 6368 // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32 6369 // and broadcasting second 128 bit lane. 6370 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc); 6371 vpsllq(xtmp2, xtmp1, 0x1, vlen_enc); 6372 evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc); 6373 evshufi64x2(xtmp3, src, src, 0x55, vlen_enc); 6374 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc); 6375 6376 // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48 6377 // and broadcasting third 128 bit lane. 6378 evpcmpb(ktmp, k0, shuffle, xtmp2, Assembler::nlt, true, vlen_enc); 6379 vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc); 6380 evpcmpb(ktmp, ktmp, shuffle, xtmp1, Assembler::lt, true, vlen_enc); 6381 evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc); 6382 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc); 6383 6384 // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64 6385 // and broadcasting third 128 bit lane. 6386 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc); 6387 vpsllq(xtmp2, xtmp2, 0x1, vlen_enc); 6388 evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc); 6389 evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc); 6390 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc); 6391 } 6392 6393 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst, 6394 XMMRegister shuffle, XMMRegister src, int vlen_enc) { 6395 if (vlen_enc == AVX_128bit) { 6396 vpermilps(dst, src, shuffle, vlen_enc); 6397 } else if (bt == T_INT) { 6398 vpermd(dst, shuffle, src, vlen_enc); 6399 } else { 6400 assert(bt == T_FLOAT, ""); 6401 vpermps(dst, shuffle, src, vlen_enc); 6402 } 6403 } 6404 6405 #ifdef _LP64 6406 void C2_MacroAssembler::load_nklass_compact_c2(Register dst, Register obj, Register index, Address::ScaleFactor scale, int disp) { 6407 C2LoadNKlassStub* stub = new (Compile::current()->comp_arena()) C2LoadNKlassStub(dst); 6408 Compile::current()->output()->add_stub(stub); 6409 6410 // Note: Don't clobber obj anywhere in that method! 6411 6412 // The incoming address is pointing into obj-start + klass_offset_in_bytes. We need to extract 6413 // obj-start, so that we can load from the object's mark-word instead. Usually the address 6414 // comes as obj-start in obj and klass_offset_in_bytes in disp. However, sometimes C2 6415 // emits code that pre-computes obj-start + klass_offset_in_bytes into a register, and 6416 // then passes that register as obj and 0 in disp. The following code extracts the base 6417 // and offset to load the mark-word. 6418 int offset = oopDesc::mark_offset_in_bytes() + disp - oopDesc::klass_offset_in_bytes(); 6419 movq(dst, Address(obj, index, scale, offset)); 6420 testb(dst, markWord::monitor_value); 6421 jcc(Assembler::notZero, stub->entry()); 6422 bind(stub->continuation()); 6423 shrq(dst, markWord::klass_shift); 6424 } 6425 #endif