1 /* 2 * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 22 * 23 */ 24 25 #include "precompiled.hpp" 26 #include "asm/assembler.hpp" 27 #include "asm/assembler.inline.hpp" 28 #include "gc/shared/barrierSet.hpp" 29 #include "gc/shared/barrierSetAssembler.hpp" 30 #include "oops/methodData.hpp" 31 #include "opto/c2_MacroAssembler.hpp" 32 #include "opto/intrinsicnode.hpp" 33 #include "opto/output.hpp" 34 #include "opto/opcodes.hpp" 35 #include "opto/subnode.hpp" 36 #include "runtime/globals.hpp" 37 #include "runtime/objectMonitor.hpp" 38 #include "runtime/stubRoutines.hpp" 39 #include "utilities/checkedCast.hpp" 40 #include "utilities/globalDefinitions.hpp" 41 #include "utilities/powerOfTwo.hpp" 42 #include "utilities/sizes.hpp" 43 44 #ifdef PRODUCT 45 #define BLOCK_COMMENT(str) /* nothing */ 46 #define STOP(error) stop(error) 47 #else 48 #define BLOCK_COMMENT(str) block_comment(str) 49 #define STOP(error) block_comment(error); stop(error) 50 #endif 51 52 // C2 compiled method's prolog code. 53 void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) { 54 55 // WARNING: Initial instruction MUST be 5 bytes or longer so that 56 // NativeJump::patch_verified_entry will be able to patch out the entry 57 // code safely. The push to verify stack depth is ok at 5 bytes, 58 // the frame allocation can be either 3 or 6 bytes. So if we don't do 59 // stack bang then we must use the 6 byte frame allocation even if 60 // we have no frame. :-( 61 assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect"); 62 63 assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned"); 64 // Remove word for return addr 65 framesize -= wordSize; 66 stack_bang_size -= wordSize; 67 68 // Calls to C2R adapters often do not accept exceptional returns. 69 // We require that their callers must bang for them. But be careful, because 70 // some VM calls (such as call site linkage) can use several kilobytes of 71 // stack. But the stack safety zone should account for that. 72 // See bugs 4446381, 4468289, 4497237. 73 if (stack_bang_size > 0) { 74 generate_stack_overflow_check(stack_bang_size); 75 76 // We always push rbp, so that on return to interpreter rbp, will be 77 // restored correctly and we can correct the stack. 78 push(rbp); 79 // Save caller's stack pointer into RBP if the frame pointer is preserved. 
80 if (PreserveFramePointer) { 81 mov(rbp, rsp); 82 } 83 // Remove word for ebp 84 framesize -= wordSize; 85 86 // Create frame 87 if (framesize) { 88 subptr(rsp, framesize); 89 } 90 } else { 91 // Create frame (force generation of a 4 byte immediate value) 92 subptr_imm32(rsp, framesize); 93 94 // Save RBP register now. 95 framesize -= wordSize; 96 movptr(Address(rsp, framesize), rbp); 97 // Save caller's stack pointer into RBP if the frame pointer is preserved. 98 if (PreserveFramePointer) { 99 movptr(rbp, rsp); 100 if (framesize > 0) { 101 addptr(rbp, framesize); 102 } 103 } 104 } 105 106 if (VerifyStackAtCalls) { // Majik cookie to verify stack depth 107 framesize -= wordSize; 108 movptr(Address(rsp, framesize), (int32_t)0xbadb100d); 109 } 110 111 #ifndef _LP64 112 // If method sets FPU control word do it now 113 if (fp_mode_24b) { 114 fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24())); 115 } 116 if (UseSSE >= 2 && VerifyFPU) { 117 verify_FPU(0, "FPU stack must be clean on entry"); 118 } 119 #endif 120 121 #ifdef ASSERT 122 if (VerifyStackAtCalls) { 123 Label L; 124 push(rax); 125 mov(rax, rsp); 126 andptr(rax, StackAlignmentInBytes-1); 127 cmpptr(rax, StackAlignmentInBytes-wordSize); 128 pop(rax); 129 jcc(Assembler::equal, L); 130 STOP("Stack is not properly aligned!"); 131 bind(L); 132 } 133 #endif 134 135 if (!is_stub) { 136 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 137 #ifdef _LP64 138 if (BarrierSet::barrier_set()->barrier_set_nmethod() != nullptr) { 139 // We put the non-hot code of the nmethod entry barrier out-of-line in a stub. 140 Label dummy_slow_path; 141 Label dummy_continuation; 142 Label* slow_path = &dummy_slow_path; 143 Label* continuation = &dummy_continuation; 144 if (!Compile::current()->output()->in_scratch_emit_size()) { 145 // Use real labels from actual stub when not emitting code for the purpose of measuring its size 146 C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub(); 147 Compile::current()->output()->add_stub(stub); 148 slow_path = &stub->entry(); 149 continuation = &stub->continuation(); 150 } 151 bs->nmethod_entry_barrier(this, slow_path, continuation); 152 } 153 #else 154 // Don't bother with out-of-line nmethod entry barrier stub for x86_32. 
155 bs->nmethod_entry_barrier(this, nullptr /* slow_path */, nullptr /* continuation */); 156 #endif 157 } 158 } 159 160 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) { 161 switch (vlen_in_bytes) { 162 case 4: // fall-through 163 case 8: // fall-through 164 case 16: return Assembler::AVX_128bit; 165 case 32: return Assembler::AVX_256bit; 166 case 64: return Assembler::AVX_512bit; 167 168 default: { 169 ShouldNotReachHere(); 170 return Assembler::AVX_NoVec; 171 } 172 } 173 } 174 175 #if INCLUDE_RTM_OPT 176 177 // Update rtm_counters based on abort status 178 // input: abort_status 179 // rtm_counters (RTMLockingCounters*) 180 // flags are killed 181 void C2_MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) { 182 183 atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset())); 184 if (PrintPreciseRTMLockingStatistics) { 185 for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) { 186 Label check_abort; 187 testl(abort_status, (1<<i)); 188 jccb(Assembler::equal, check_abort); 189 atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx)))); 190 bind(check_abort); 191 } 192 } 193 } 194 195 // Branch if (random & (count-1) != 0), count is 2^n 196 // tmp, scr and flags are killed 197 void C2_MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) { 198 assert(tmp == rax, ""); 199 assert(scr == rdx, ""); 200 rdtsc(); // modifies EDX:EAX 201 andptr(tmp, count-1); 202 jccb(Assembler::notZero, brLabel); 203 } 204 205 // Perform abort ratio calculation, set no_rtm bit if high ratio 206 // input: rtm_counters_Reg (RTMLockingCounters* address) 207 // tmpReg, rtm_counters_Reg and flags are killed 208 void C2_MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg, 209 Register rtm_counters_Reg, 210 RTMLockingCounters* rtm_counters, 211 Metadata* method_data) { 212 Label L_done, L_check_always_rtm1, L_check_always_rtm2; 213 214 if (RTMLockingCalculationDelay > 0) { 215 // Delay calculation 216 movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr())); 217 testptr(tmpReg, tmpReg); 218 jccb(Assembler::equal, L_done); 219 } 220 // Abort ratio calculation only if abort_count > RTMAbortThreshold 221 // Aborted transactions = abort_count * 100 222 // All transactions = total_count * RTMTotalCountIncrRate 223 // Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio) 224 225 movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset())); 226 cmpptr(tmpReg, RTMAbortThreshold); 227 jccb(Assembler::below, L_check_always_rtm2); 228 imulptr(tmpReg, tmpReg, 100); 229 230 Register scrReg = rtm_counters_Reg; 231 movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset())); 232 imulptr(scrReg, scrReg, RTMTotalCountIncrRate); 233 imulptr(scrReg, scrReg, RTMAbortRatio); 234 cmpptr(tmpReg, scrReg); 235 jccb(Assembler::below, L_check_always_rtm1); 236 if (method_data != nullptr) { 237 // set rtm_state to "no rtm" in MDO 238 mov_metadata(tmpReg, method_data); 239 lock(); 240 orl(Address(tmpReg, MethodData::rtm_state_offset()), NoRTM); 241 } 242 jmpb(L_done); 243 bind(L_check_always_rtm1); 244 // Reload RTMLockingCounters* address 245 lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters)); 246 bind(L_check_always_rtm2); 247 movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset())); 248 cmpptr(tmpReg, 
RTMLockingThreshold / RTMTotalCountIncrRate); 249 jccb(Assembler::below, L_done); 250 if (method_data != nullptr) { 251 // set rtm_state to "always rtm" in MDO 252 mov_metadata(tmpReg, method_data); 253 lock(); 254 orl(Address(tmpReg, MethodData::rtm_state_offset()), UseRTM); 255 } 256 bind(L_done); 257 } 258 259 // Update counters and perform abort ratio calculation 260 // input: abort_status_Reg 261 // rtm_counters_Reg, flags are killed 262 void C2_MacroAssembler::rtm_profiling(Register abort_status_Reg, 263 Register rtm_counters_Reg, 264 RTMLockingCounters* rtm_counters, 265 Metadata* method_data, 266 bool profile_rtm) { 267 268 assert(rtm_counters != nullptr, "should not be null when profiling RTM"); 269 // update rtm counters based on rax value at abort 270 // reads abort_status_Reg, updates flags 271 lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters)); 272 rtm_counters_update(abort_status_Reg, rtm_counters_Reg); 273 if (profile_rtm) { 274 // Save abort status because abort_status_Reg is used by following code. 275 if (RTMRetryCount > 0) { 276 push(abort_status_Reg); 277 } 278 assert(rtm_counters != nullptr, "should not be null when profiling RTM"); 279 rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data); 280 // restore abort status 281 if (RTMRetryCount > 0) { 282 pop(abort_status_Reg); 283 } 284 } 285 } 286 287 // Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4) 288 // inputs: retry_count_Reg 289 // : abort_status_Reg 290 // output: retry_count_Reg decremented by 1 291 // flags are killed 292 void C2_MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) { 293 Label doneRetry; 294 assert(abort_status_Reg == rax, ""); 295 // The abort reason bits are in eax (see all states in rtmLocking.hpp) 296 // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4) 297 // if reason is in 0x6 and retry count != 0 then retry 298 andptr(abort_status_Reg, 0x6); 299 jccb(Assembler::zero, doneRetry); 300 testl(retry_count_Reg, retry_count_Reg); 301 jccb(Assembler::zero, doneRetry); 302 pause(); 303 decrementl(retry_count_Reg); 304 jmp(retryLabel); 305 bind(doneRetry); 306 } 307 308 // Spin and retry if lock is busy, 309 // inputs: box_Reg (monitor address) 310 // : retry_count_Reg 311 // output: retry_count_Reg decremented by 1 312 // : clear z flag if retry count exceeded 313 // tmp_Reg, scr_Reg, flags are killed 314 void C2_MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg, 315 Register tmp_Reg, Register scr_Reg, Label& retryLabel) { 316 Label SpinLoop, SpinExit, doneRetry; 317 int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner); 318 319 testl(retry_count_Reg, retry_count_Reg); 320 jccb(Assembler::zero, doneRetry); 321 decrementl(retry_count_Reg); 322 movptr(scr_Reg, RTMSpinLoopCount); 323 324 bind(SpinLoop); 325 pause(); 326 decrementl(scr_Reg); 327 jccb(Assembler::lessEqual, SpinExit); 328 movptr(tmp_Reg, Address(box_Reg, owner_offset)); 329 testptr(tmp_Reg, tmp_Reg); 330 jccb(Assembler::notZero, SpinLoop); 331 332 bind(SpinExit); 333 jmp(retryLabel); 334 bind(doneRetry); 335 incrementl(retry_count_Reg); // clear z flag 336 } 337 338 // Use RTM for normal stack locks 339 // Input: objReg (object to lock) 340 void C2_MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg, 341 Register retry_on_abort_count_Reg, 342 RTMLockingCounters* stack_rtm_counters, 343 Metadata* method_data, bool 
profile_rtm, 344 Label& DONE_LABEL, Label& IsInflated) { 345 assert(UseRTMForStackLocks, "why call this otherwise?"); 346 assert(tmpReg == rax, ""); 347 assert(scrReg == rdx, ""); 348 Label L_rtm_retry, L_decrement_retry, L_on_abort; 349 350 if (RTMRetryCount > 0) { 351 movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort 352 bind(L_rtm_retry); 353 } 354 movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); 355 testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral 356 jcc(Assembler::notZero, IsInflated); 357 358 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 359 Label L_noincrement; 360 if (RTMTotalCountIncrRate > 1) { 361 // tmpReg, scrReg and flags are killed 362 branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement); 363 } 364 assert(stack_rtm_counters != nullptr, "should not be null when profiling RTM"); 365 atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg); 366 bind(L_noincrement); 367 } 368 xbegin(L_on_abort); 369 movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword 370 andptr(tmpReg, markWord::lock_mask_in_place); // look at 2 lock bits 371 cmpptr(tmpReg, markWord::unlocked_value); // bits = 01 unlocked 372 jcc(Assembler::equal, DONE_LABEL); // all done if unlocked 373 374 Register abort_status_Reg = tmpReg; // status of abort is stored in RAX 375 if (UseRTMXendForLockBusy) { 376 xend(); 377 movptr(abort_status_Reg, 0x2); // Set the abort status to 2 (so we can retry) 378 jmp(L_decrement_retry); 379 } 380 else { 381 xabort(0); 382 } 383 bind(L_on_abort); 384 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 385 rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm); 386 } 387 bind(L_decrement_retry); 388 if (RTMRetryCount > 0) { 389 // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4) 390 rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry); 391 } 392 } 393 394 // Use RTM for inflating locks 395 // inputs: objReg (object to lock) 396 // boxReg (on-stack box address (displaced header location) - KILLED) 397 // tmpReg (ObjectMonitor address + markWord::monitor_value) 398 void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg, 399 Register scrReg, Register retry_on_busy_count_Reg, 400 Register retry_on_abort_count_Reg, 401 RTMLockingCounters* rtm_counters, 402 Metadata* method_data, bool profile_rtm, 403 Label& DONE_LABEL) { 404 assert(UseRTMLocking, "why call this otherwise?"); 405 assert(tmpReg == rax, ""); 406 assert(scrReg == rdx, ""); 407 Label L_rtm_retry, L_decrement_retry, L_on_abort; 408 int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner); 409 410 movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value())); 411 movptr(boxReg, tmpReg); // Save ObjectMonitor address 412 413 if (RTMRetryCount > 0) { 414 movl(retry_on_busy_count_Reg, RTMRetryCount); // Retry on lock busy 415 movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort 416 bind(L_rtm_retry); 417 } 418 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 419 Label L_noincrement; 420 if (RTMTotalCountIncrRate > 1) { 421 // tmpReg, scrReg and flags are killed 422 branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement); 423 } 424 assert(rtm_counters != nullptr, "should not be null when profiling RTM"); 425 
atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg); 426 bind(L_noincrement); 427 } 428 xbegin(L_on_abort); 429 movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); 430 movptr(tmpReg, Address(tmpReg, owner_offset)); 431 testptr(tmpReg, tmpReg); 432 jcc(Assembler::zero, DONE_LABEL); 433 if (UseRTMXendForLockBusy) { 434 xend(); 435 jmp(L_decrement_retry); 436 } 437 else { 438 xabort(0); 439 } 440 bind(L_on_abort); 441 Register abort_status_Reg = tmpReg; // status of abort is stored in RAX 442 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 443 rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm); 444 } 445 if (RTMRetryCount > 0) { 446 // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4) 447 rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry); 448 } 449 450 movptr(tmpReg, Address(boxReg, owner_offset)) ; 451 testptr(tmpReg, tmpReg) ; 452 jccb(Assembler::notZero, L_decrement_retry) ; 453 454 // Appears unlocked - try to swing _owner from null to non-null. 455 // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand. 456 #ifdef _LP64 457 Register threadReg = r15_thread; 458 #else 459 get_thread(scrReg); 460 Register threadReg = scrReg; 461 #endif 462 lock(); 463 cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg 464 465 if (RTMRetryCount > 0) { 466 // success done else retry 467 jccb(Assembler::equal, DONE_LABEL) ; 468 bind(L_decrement_retry); 469 // Spin and retry if lock is busy. 470 rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry); 471 } 472 else { 473 bind(L_decrement_retry); 474 } 475 } 476 477 #endif // INCLUDE_RTM_OPT 478 479 // fast_lock and fast_unlock used by C2 480 481 // Because the transitions from emitted code to the runtime 482 // monitorenter/exit helper stubs are so slow it's critical that 483 // we inline both the stack-locking fast path and the inflated fast path. 484 // 485 // See also: cmpFastLock and cmpFastUnlock. 486 // 487 // What follows is a specialized inline transliteration of the code 488 // in enter() and exit(). If we're concerned about I$ bloat another 489 // option would be to emit TrySlowEnter and TrySlowExit methods 490 // at startup-time. These methods would accept arguments as 491 // (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure 492 // indications in the icc.ZFlag. fast_lock and fast_unlock would simply 493 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit. 494 // In practice, however, the # of lock sites is bounded and is usually small. 495 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer 496 // if the processor uses simple bimodal branch predictors keyed by EIP 497 // Since the helper routines would be called from multiple synchronization 498 // sites. 499 // 500 // An even better approach would be write "MonitorEnter()" and "MonitorExit()" 501 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites 502 // to those specialized methods. That'd give us a mostly platform-independent 503 // implementation that the JITs could optimize and inline at their pleasure. 504 // Done correctly, the only time we'd need to cross to native could would be 505 // to park() or unpark() threads. We'd also need a few more unsafe operators 506 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and 507 // (b) explicit barriers or fence operations. 
508 // 509 // TODO: 510 // 511 // * Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr). 512 // This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals. 513 // Given TLAB allocation, Self is usually manifested in a register, so passing it into 514 // the lock operators would typically be faster than reifying Self. 515 // 516 // * Ideally I'd define the primitives as: 517 // fast_lock (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED. 518 // fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED 519 // Unfortunately ADLC bugs prevent us from expressing the ideal form. 520 // Instead, we're stuck with a rather awkward and brittle register assignments below. 521 // Furthermore the register assignments are overconstrained, possibly resulting in 522 // sub-optimal code near the synchronization site. 523 // 524 // * Eliminate the sp-proximity tests and just use "== Self" tests instead. 525 // Alternately, use a better sp-proximity test. 526 // 527 // * Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value. 528 // Either one is sufficient to uniquely identify a thread. 529 // TODO: eliminate use of sp in _owner and use get_thread(tr) instead. 530 // 531 // * Intrinsify notify() and notifyAll() for the common cases where the 532 // object is locked by the calling thread but the waitlist is empty. 533 // avoid the expensive JNI call to JVM_Notify() and JVM_NotifyAll(). 534 // 535 // * use jccb and jmpb instead of jcc and jmp to improve code density. 536 // But beware of excessive branch density on AMD Opterons. 537 // 538 // * Both fast_lock and fast_unlock set the ICC.ZF to indicate success 539 // or failure of the fast path. If the fast path fails then we pass 540 // control to the slow path, typically in C. In fast_lock and 541 // fast_unlock we often branch to DONE_LABEL, just to find that C2 542 // will emit a conditional branch immediately after the node. 543 // So we have branches to branches and lots of ICC.ZF games. 544 // Instead, it might be better to have C2 pass a "FailureLabel" 545 // into fast_lock and fast_unlock. In the case of success, control 546 // will drop through the node. ICC.ZF is undefined at exit. 
547 // In the case of failure, the node will branch directly to the 548 // FailureLabel 549 550 551 // obj: object to lock 552 // box: on-stack box address (displaced header location) - KILLED 553 // rax,: tmp -- KILLED 554 // scr: tmp -- KILLED 555 void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg, 556 Register scrReg, Register cx1Reg, Register cx2Reg, Register thread, 557 RTMLockingCounters* rtm_counters, 558 RTMLockingCounters* stack_rtm_counters, 559 Metadata* method_data, 560 bool use_rtm, bool profile_rtm) { 561 assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight"); 562 // Ensure the register assignments are disjoint 563 assert(tmpReg == rax, ""); 564 565 if (use_rtm) { 566 assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg); 567 } else { 568 assert(cx1Reg == noreg, ""); 569 assert(cx2Reg == noreg, ""); 570 assert_different_registers(objReg, boxReg, tmpReg, scrReg); 571 } 572 573 // Possible cases that we'll encounter in fast_lock 574 // ------------------------------------------------ 575 // * Inflated 576 // -- unlocked 577 // -- Locked 578 // = by self 579 // = by other 580 // * neutral 581 // * stack-locked 582 // -- by self 583 // = sp-proximity test hits 584 // = sp-proximity test generates false-negative 585 // -- by other 586 // 587 588 Label IsInflated, DONE_LABEL, NO_COUNT, COUNT; 589 590 if (DiagnoseSyncOnValueBasedClasses != 0) { 591 load_klass(tmpReg, objReg, scrReg); 592 movl(tmpReg, Address(tmpReg, Klass::access_flags_offset())); 593 testl(tmpReg, JVM_ACC_IS_VALUE_BASED_CLASS); 594 jcc(Assembler::notZero, DONE_LABEL); 595 } 596 597 #if INCLUDE_RTM_OPT 598 if (UseRTMForStackLocks && use_rtm) { 599 assert(LockingMode != LM_MONITOR, "LockingMode == 0 (LM_MONITOR) and +UseRTMForStackLocks are mutually exclusive"); 600 rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg, 601 stack_rtm_counters, method_data, profile_rtm, 602 DONE_LABEL, IsInflated); 603 } 604 #endif // INCLUDE_RTM_OPT 605 606 movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // [FETCH] 607 testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral 608 jcc(Assembler::notZero, IsInflated); 609 610 if (LockingMode == LM_MONITOR) { 611 // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0. 612 testptr(objReg, objReg); 613 } else { 614 assert(LockingMode == LM_LEGACY, "must be"); 615 // Attempt stack-locking ... 616 orptr (tmpReg, markWord::unlocked_value); 617 movptr(Address(boxReg, 0), tmpReg); // Anticipate successful CAS 618 lock(); 619 cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Updates tmpReg 620 jcc(Assembler::equal, COUNT); // Success 621 622 // Recursive locking. 623 // The object is stack-locked: markword contains stack pointer to BasicLock. 624 // Locked by current thread if difference with current SP is less than one page. 625 subptr(tmpReg, rsp); 626 // Next instruction set ZFlag == 1 (Success) if difference is less then one page. 627 andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - (int)os::vm_page_size())) ); 628 movptr(Address(boxReg, 0), tmpReg); 629 } 630 jmp(DONE_LABEL); 631 632 bind(IsInflated); 633 // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value 634 635 #if INCLUDE_RTM_OPT 636 // Use the same RTM locking code in 32- and 64-bit VM. 
637 if (use_rtm) { 638 rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg, 639 rtm_counters, method_data, profile_rtm, DONE_LABEL); 640 } else { 641 #endif // INCLUDE_RTM_OPT 642 643 #ifndef _LP64 644 // The object is inflated. 645 646 // boxReg refers to the on-stack BasicLock in the current frame. 647 // We'd like to write: 648 // set box->_displaced_header = markWord::unused_mark(). Any non-0 value suffices. 649 // This is convenient but results a ST-before-CAS penalty. The following CAS suffers 650 // additional latency as we have another ST in the store buffer that must drain. 651 652 // avoid ST-before-CAS 653 // register juggle because we need tmpReg for cmpxchgptr below 654 movptr(scrReg, boxReg); 655 movptr(boxReg, tmpReg); // consider: LEA box, [tmp-2] 656 657 // Optimistic form: consider XORL tmpReg,tmpReg 658 movptr(tmpReg, NULL_WORD); 659 660 // Appears unlocked - try to swing _owner from null to non-null. 661 // Ideally, I'd manifest "Self" with get_thread and then attempt 662 // to CAS the register containing Self into m->Owner. 663 // But we don't have enough registers, so instead we can either try to CAS 664 // rsp or the address of the box (in scr) into &m->owner. If the CAS succeeds 665 // we later store "Self" into m->Owner. Transiently storing a stack address 666 // (rsp or the address of the box) into m->owner is harmless. 667 // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand. 668 lock(); 669 cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner))); 670 movptr(Address(scrReg, 0), 3); // box->_displaced_header = 3 671 // If we weren't able to swing _owner from null to the BasicLock 672 // then take the slow path. 673 jccb (Assembler::notZero, NO_COUNT); 674 // update _owner from BasicLock to thread 675 get_thread (scrReg); // beware: clobbers ICCs 676 movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg); 677 xorptr(boxReg, boxReg); // set icc.ZFlag = 1 to indicate success 678 679 // If the CAS fails we can either retry or pass control to the slow path. 680 // We use the latter tactic. 681 // Pass the CAS result in the icc.ZFlag into DONE_LABEL 682 // If the CAS was successful ... 683 // Self has acquired the lock 684 // Invariant: m->_recursions should already be 0, so we don't need to explicitly set it. 685 // Intentional fall-through into DONE_LABEL ... 686 #else // _LP64 687 // It's inflated and we use scrReg for ObjectMonitor* in this section. 688 movq(scrReg, tmpReg); 689 xorq(tmpReg, tmpReg); 690 lock(); 691 cmpxchgptr(thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner))); 692 // Unconditionally set box->_displaced_header = markWord::unused_mark(). 693 // Without cast to int32_t this style of movptr will destroy r10 which is typically obj. 694 movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value())); 695 // Propagate ICC.ZF from CAS above into DONE_LABEL. 
696 jccb(Assembler::equal, COUNT); // CAS above succeeded; propagate ZF = 1 (success) 697 698 cmpptr(thread, rax); // Check if we are already the owner (recursive lock) 699 jccb(Assembler::notEqual, NO_COUNT); // If not recursive, ZF = 0 at this point (fail) 700 incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions))); 701 xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success 702 #endif // _LP64 703 #if INCLUDE_RTM_OPT 704 } // use_rtm() 705 #endif 706 bind(DONE_LABEL); 707 708 // ZFlag == 1 count in fast path 709 // ZFlag == 0 count in slow path 710 jccb(Assembler::notZero, NO_COUNT); // jump if ZFlag == 0 711 712 bind(COUNT); 713 // Count monitors in fast path 714 increment(Address(thread, JavaThread::held_monitor_count_offset())); 715 716 xorl(tmpReg, tmpReg); // Set ZF == 1 717 718 bind(NO_COUNT); 719 720 // At NO_COUNT the icc ZFlag is set as follows ... 721 // fast_unlock uses the same protocol. 722 // ZFlag == 1 -> Success 723 // ZFlag == 0 -> Failure - force control through the slow path 724 } 725 726 // obj: object to unlock 727 // box: box address (displaced header location), killed. Must be EAX. 728 // tmp: killed, cannot be obj nor box. 729 // 730 // Some commentary on balanced locking: 731 // 732 // fast_lock and fast_unlock are emitted only for provably balanced lock sites. 733 // Methods that don't have provably balanced locking are forced to run in the 734 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock. 735 // The interpreter provides two properties: 736 // I1: At return-time the interpreter automatically and quietly unlocks any 737 // objects acquired the current activation (frame). Recall that the 738 // interpreter maintains an on-stack list of locks currently held by 739 // a frame. 740 // I2: If a method attempts to unlock an object that is not held by the 741 // the frame the interpreter throws IMSX. 742 // 743 // Lets say A(), which has provably balanced locking, acquires O and then calls B(). 744 // B() doesn't have provably balanced locking so it runs in the interpreter. 745 // Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O 746 // is still locked by A(). 747 // 748 // The only other source of unbalanced locking would be JNI. The "Java Native Interface: 749 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter 750 // should not be unlocked by "normal" java-level locking and vice-versa. The specification 751 // doesn't specify what will occur if a program engages in such mixed-mode locking, however. 752 // Arguably given that the spec legislates the JNI case as undefined our implementation 753 // could reasonably *avoid* checking owner in fast_unlock(). 754 // In the interest of performance we elide m->Owner==Self check in unlock. 755 // A perfectly viable alternative is to elide the owner check except when 756 // Xcheck:jni is enabled. 
757 758 void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) { 759 assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight"); 760 assert(boxReg == rax, ""); 761 assert_different_registers(objReg, boxReg, tmpReg); 762 763 Label DONE_LABEL, Stacked, COUNT, NO_COUNT; 764 765 #if INCLUDE_RTM_OPT 766 if (UseRTMForStackLocks && use_rtm) { 767 assert(LockingMode != LM_MONITOR, "LockingMode == 0 (LM_MONITOR) and +UseRTMForStackLocks are mutually exclusive"); 768 Label L_regular_unlock; 769 movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword 770 andptr(tmpReg, markWord::lock_mask_in_place); // look at 2 lock bits 771 cmpptr(tmpReg, markWord::unlocked_value); // bits = 01 unlocked 772 jccb(Assembler::notEqual, L_regular_unlock); // if !HLE RegularLock 773 xend(); // otherwise end... 774 jmp(DONE_LABEL); // ... and we're done 775 bind(L_regular_unlock); 776 } 777 #endif 778 779 if (LockingMode == LM_LEGACY) { 780 cmpptr(Address(boxReg, 0), NULL_WORD); // Examine the displaced header 781 jcc (Assembler::zero, COUNT); // 0 indicates recursive stack-lock 782 } 783 movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Examine the object's markword 784 if (LockingMode != LM_MONITOR) { 785 testptr(tmpReg, markWord::monitor_value); // Inflated? 786 jcc(Assembler::zero, Stacked); 787 } 788 789 // It's inflated. 790 791 #if INCLUDE_RTM_OPT 792 if (use_rtm) { 793 Label L_regular_inflated_unlock; 794 int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner); 795 movptr(boxReg, Address(tmpReg, owner_offset)); 796 testptr(boxReg, boxReg); 797 jccb(Assembler::notZero, L_regular_inflated_unlock); 798 xend(); 799 jmp(DONE_LABEL); 800 bind(L_regular_inflated_unlock); 801 } 802 #endif 803 804 // Despite our balanced locking property we still check that m->_owner == Self 805 // as java routines or native JNI code called by this thread might 806 // have released the lock. 807 // Refer to the comments in synchronizer.cpp for how we might encode extra 808 // state in _succ so we can avoid fetching EntryList|cxq. 809 // 810 // If there's no contention try a 1-0 exit. That is, exit without 811 // a costly MEMBAR or CAS. See synchronizer.cpp for details on how 812 // we detect and recover from the race that the 1-0 exit admits. 813 // 814 // Conceptually fast_unlock() must execute a STST|LDST "release" barrier 815 // before it STs null into _owner, releasing the lock. Updates 816 // to data protected by the critical section must be visible before 817 // we drop the lock (and thus before any other thread could acquire 818 // the lock and observe the fields protected by the lock). 819 // IA32's memory-model is SPO, so STs are ordered with respect to 820 // each other and there's no need for an explicit barrier (fence). 821 // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html. 822 #ifndef _LP64 823 // Note that we could employ various encoding schemes to reduce 824 // the number of loads below (currently 4) to just 2 or 3. 825 // Refer to the comments in synchronizer.cpp. 826 // In practice the chain of fetches doesn't seem to impact performance, however. 
827 xorptr(boxReg, boxReg); 828 orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions))); 829 jccb (Assembler::notZero, DONE_LABEL); 830 movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList))); 831 orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq))); 832 jccb (Assembler::notZero, DONE_LABEL); 833 movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD); 834 jmpb (DONE_LABEL); 835 #else // _LP64 836 // It's inflated 837 Label CheckSucc, LNotRecursive, LSuccess, LGoSlowPath; 838 839 cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0); 840 jccb(Assembler::equal, LNotRecursive); 841 842 // Recursive inflated unlock 843 decq(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions))); 844 jmpb(LSuccess); 845 846 bind(LNotRecursive); 847 movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq))); 848 orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList))); 849 jccb (Assembler::notZero, CheckSucc); 850 // Without cast to int32_t this style of movptr will destroy r10 which is typically obj. 851 movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD); 852 jmpb (DONE_LABEL); 853 854 // Try to avoid passing control into the slow_path ... 855 bind (CheckSucc); 856 857 // The following optional optimization can be elided if necessary 858 // Effectively: if (succ == null) goto slow path 859 // The code reduces the window for a race, however, 860 // and thus benefits performance. 861 cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD); 862 jccb (Assembler::zero, LGoSlowPath); 863 864 xorptr(boxReg, boxReg); 865 // Without cast to int32_t this style of movptr will destroy r10 which is typically obj. 866 movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD); 867 868 // Memory barrier/fence 869 // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ 870 // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack. 871 // This is faster on Nehalem and AMD Shanghai/Barcelona. 872 // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences 873 // We might also restructure (ST Owner=0;barrier;LD _Succ) to 874 // (mov box,0; xchgq box, &m->Owner; LD _succ) . 875 lock(); addl(Address(rsp, 0), 0); 876 877 cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD); 878 jccb (Assembler::notZero, LSuccess); 879 880 // Rare inopportune interleaving - race. 881 // The successor vanished in the small window above. 882 // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor. 883 // We need to ensure progress and succession. 884 // Try to reacquire the lock. 885 // If that fails then the new owner is responsible for succession and this 886 // thread needs to take no further action and can exit via the fast path (success). 887 // If the re-acquire succeeds then pass control into the slow path. 888 // As implemented, this latter mode is horrible because we generated more 889 // coherence traffic on the lock *and* artificially extended the critical section 890 // length while by virtue of passing control into the slow path. 891 892 // box is really RAX -- the following CMPXCHG depends on that binding 893 // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R) 894 lock(); 895 cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner))); 896 // There's no successor so we tried to regrab the lock. 
897 // If that didn't work, then another thread grabbed the 898 // lock so we're done (and exit was a success). 899 jccb (Assembler::notEqual, LSuccess); 900 // Intentional fall-through into slow path 901 902 bind (LGoSlowPath); 903 orl (boxReg, 1); // set ICC.ZF=0 to indicate failure 904 jmpb (DONE_LABEL); 905 906 bind (LSuccess); 907 testl (boxReg, 0); // set ICC.ZF=1 to indicate success 908 jmpb (DONE_LABEL); 909 910 #endif 911 if (LockingMode == LM_LEGACY) { 912 bind (Stacked); 913 movptr(tmpReg, Address (boxReg, 0)); // re-fetch 914 lock(); 915 cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box 916 // Intentional fall-thru into DONE_LABEL 917 } 918 919 bind(DONE_LABEL); 920 921 // ZFlag == 1 count in fast path 922 // ZFlag == 0 count in slow path 923 jccb(Assembler::notZero, NO_COUNT); 924 925 bind(COUNT); 926 // Count monitors in fast path 927 #ifndef _LP64 928 get_thread(tmpReg); 929 decrementl(Address(tmpReg, JavaThread::held_monitor_count_offset())); 930 #else // _LP64 931 decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset())); 932 #endif 933 934 xorl(tmpReg, tmpReg); // Set ZF == 1 935 936 bind(NO_COUNT); 937 } 938 939 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register rax_reg, 940 Register t, Register thread) { 941 assert(LockingMode == LM_LIGHTWEIGHT, "must be"); 942 assert(rax_reg == rax, "Used for CAS"); 943 assert_different_registers(obj, box, rax_reg, t, thread); 944 945 // Handle inflated monitor. 946 Label inflated; 947 // Finish fast lock successfully. ZF value is irrelevant. 948 Label locked; 949 // Finish fast lock unsuccessfully. MUST jump with ZF == 0 950 Label slow_path; 951 952 if (DiagnoseSyncOnValueBasedClasses != 0) { 953 load_klass(rax_reg, obj, t); 954 movl(rax_reg, Address(rax_reg, Klass::access_flags_offset())); 955 testl(rax_reg, JVM_ACC_IS_VALUE_BASED_CLASS); 956 jcc(Assembler::notZero, slow_path); 957 } 958 959 const Register mark = t; 960 961 { // Lightweight Lock 962 963 Label push; 964 965 const Register top = box; 966 967 // Load the mark. 968 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 969 970 // Prefetch top. 971 movl(top, Address(thread, JavaThread::lock_stack_top_offset())); 972 973 // Check for monitor (0b10). 974 testptr(mark, markWord::monitor_value); 975 jcc(Assembler::notZero, inflated); 976 977 // Check if lock-stack is full. 978 cmpl(top, LockStack::end_offset() - 1); 979 jcc(Assembler::greater, slow_path); 980 981 // Check if recursive. 982 cmpptr(obj, Address(thread, top, Address::times_1, -oopSize)); 983 jccb(Assembler::equal, push); 984 985 // Try to lock. Transition lock bits 0b01 => 0b00 986 movptr(rax_reg, mark); 987 orptr(rax_reg, markWord::unlocked_value); 988 andptr(mark, ~(int32_t)markWord::unlocked_value); 989 lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 990 jcc(Assembler::notEqual, slow_path); 991 992 bind(push); 993 // After successful lock, push object on lock-stack. 994 movptr(Address(thread, top), obj); 995 addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize); 996 jmpb(locked); 997 } 998 999 { // Handle inflated monitor. 1000 bind(inflated); 1001 1002 const Register tagged_monitor = mark; 1003 1004 // CAS owner (null => current thread). 1005 xorptr(rax_reg, rax_reg); 1006 lock(); cmpxchgptr(thread, Address(tagged_monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner))); 1007 jccb(Assembler::equal, locked); 1008 1009 // Check if recursive. 
1010 cmpptr(thread, rax_reg); 1011 jccb(Assembler::notEqual, slow_path); 1012 1013 // Recursive. 1014 increment(Address(tagged_monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions))); 1015 } 1016 1017 bind(locked); 1018 increment(Address(thread, JavaThread::held_monitor_count_offset())); 1019 // Set ZF = 1 1020 xorl(rax_reg, rax_reg); 1021 1022 #ifdef ASSERT 1023 // Check that locked label is reached with ZF set. 1024 Label zf_correct; 1025 Label zf_bad_zero; 1026 jcc(Assembler::zero, zf_correct); 1027 jmp(zf_bad_zero); 1028 #endif 1029 1030 bind(slow_path); 1031 #ifdef ASSERT 1032 // Check that slow_path label is reached with ZF not set. 1033 jcc(Assembler::notZero, zf_correct); 1034 stop("Fast Lock ZF != 0"); 1035 bind(zf_bad_zero); 1036 stop("Fast Lock ZF != 1"); 1037 bind(zf_correct); 1038 #endif 1039 // C2 uses the value of ZF to determine the continuation. 1040 } 1041 1042 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register reg_rax, Register t, Register thread) { 1043 assert(LockingMode == LM_LIGHTWEIGHT, "must be"); 1044 assert(reg_rax == rax, "Used for CAS"); 1045 assert_different_registers(obj, reg_rax, t); 1046 1047 // Handle inflated monitor. 1048 Label inflated, inflated_check_lock_stack; 1049 // Finish fast unlock successfully. MUST jump with ZF == 1 1050 Label unlocked; 1051 1052 // Assume success. 1053 decrement(Address(thread, JavaThread::held_monitor_count_offset())); 1054 1055 const Register mark = t; 1056 const Register top = reg_rax; 1057 1058 Label dummy; 1059 C2FastUnlockLightweightStub* stub = nullptr; 1060 1061 if (!Compile::current()->output()->in_scratch_emit_size()) { 1062 stub = new (Compile::current()->comp_arena()) C2FastUnlockLightweightStub(obj, mark, reg_rax, thread); 1063 Compile::current()->output()->add_stub(stub); 1064 } 1065 1066 Label& push_and_slow_path = stub == nullptr ? dummy : stub->push_and_slow_path(); 1067 Label& check_successor = stub == nullptr ? dummy : stub->check_successor(); 1068 1069 { // Lightweight Unlock 1070 1071 // Load top. 1072 movl(top, Address(thread, JavaThread::lock_stack_top_offset())); 1073 1074 // Prefetch mark. 1075 movptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 1076 1077 // Check if obj is top of lock-stack. 1078 cmpptr(obj, Address(thread, top, Address::times_1, -oopSize)); 1079 // Top of lock stack was not obj. Must be monitor. 1080 jcc(Assembler::notEqual, inflated_check_lock_stack); 1081 1082 // Pop lock-stack. 1083 DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);) 1084 subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize); 1085 1086 // Check if recursive. 1087 cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize)); 1088 jcc(Assembler::equal, unlocked); 1089 1090 // We elide the monitor check, let the CAS fail instead. 1091 1092 // Try to unlock. Transition lock bits 0b00 => 0b01 1093 movptr(reg_rax, mark); 1094 andptr(reg_rax, ~(int32_t)markWord::lock_mask); 1095 orptr(mark, markWord::unlocked_value); 1096 lock(); cmpxchgptr(mark, Address(obj, oopDesc::mark_offset_in_bytes())); 1097 jcc(Assembler::notEqual, push_and_slow_path); 1098 jmp(unlocked); 1099 } 1100 1101 1102 { // Handle inflated monitor. 
1103 bind(inflated_check_lock_stack); 1104 #ifdef ASSERT 1105 Label check_done; 1106 subl(top, oopSize); 1107 cmpl(top, in_bytes(JavaThread::lock_stack_base_offset())); 1108 jcc(Assembler::below, check_done); 1109 cmpptr(obj, Address(thread, top)); 1110 jccb(Assembler::notEqual, inflated_check_lock_stack); 1111 stop("Fast Unlock lock on stack"); 1112 bind(check_done); 1113 testptr(mark, markWord::monitor_value); 1114 jccb(Assembler::notZero, inflated); 1115 stop("Fast Unlock not monitor"); 1116 #endif 1117 1118 bind(inflated); 1119 1120 // mark contains the tagged ObjectMonitor*. 1121 const Register monitor = mark; 1122 1123 #ifndef _LP64 1124 // Check if recursive. 1125 xorptr(reg_rax, reg_rax); 1126 orptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions))); 1127 jcc(Assembler::notZero, check_successor); 1128 1129 // Check if the entry lists are empty. 1130 movptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList))); 1131 orptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq))); 1132 jcc(Assembler::notZero, check_successor); 1133 1134 // Release lock. 1135 movptr(Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD); 1136 #else // _LP64 1137 Label recursive; 1138 1139 // Check if recursive. 1140 cmpptr(Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0); 1141 jccb(Assembler::notEqual, recursive); 1142 1143 // Check if the entry lists are empty. 1144 movptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq))); 1145 orptr(reg_rax, Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList))); 1146 jcc(Assembler::notZero, check_successor); 1147 1148 // Release lock. 1149 movptr(Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD); 1150 jmpb(unlocked); 1151 1152 // Recursive unlock. 1153 bind(recursive); 1154 decrement(Address(monitor, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions))); 1155 xorl(t, t); 1156 #endif 1157 } 1158 1159 bind(unlocked); 1160 if (stub != nullptr) { 1161 bind(stub->unlocked_continuation()); 1162 } 1163 1164 #ifdef ASSERT 1165 // Check that unlocked label is reached with ZF set. 1166 Label zf_correct; 1167 jcc(Assembler::zero, zf_correct); 1168 stop("Fast Unlock ZF != 1"); 1169 #endif 1170 1171 if (stub != nullptr) { 1172 bind(stub->slow_path_continuation()); 1173 } 1174 #ifdef ASSERT 1175 // Check that stub->continuation() label is reached with ZF not set. 1176 jccb(Assembler::notZero, zf_correct); 1177 stop("Fast Unlock ZF != 0"); 1178 bind(zf_correct); 1179 #endif 1180 // C2 uses the value of ZF to determine the continuation. 
1181 } 1182 1183 //------------------------------------------------------------------------------------------- 1184 // Generic instructions support for use in .ad files C2 code generation 1185 1186 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) { 1187 if (dst != src) { 1188 movdqu(dst, src); 1189 } 1190 if (opcode == Op_AbsVD) { 1191 andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg); 1192 } else { 1193 assert((opcode == Op_NegVD),"opcode should be Op_NegD"); 1194 xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg); 1195 } 1196 } 1197 1198 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) { 1199 if (opcode == Op_AbsVD) { 1200 vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg); 1201 } else { 1202 assert((opcode == Op_NegVD),"opcode should be Op_NegD"); 1203 vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg); 1204 } 1205 } 1206 1207 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) { 1208 if (dst != src) { 1209 movdqu(dst, src); 1210 } 1211 if (opcode == Op_AbsVF) { 1212 andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg); 1213 } else { 1214 assert((opcode == Op_NegVF),"opcode should be Op_NegF"); 1215 xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg); 1216 } 1217 } 1218 1219 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) { 1220 if (opcode == Op_AbsVF) { 1221 vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg); 1222 } else { 1223 assert((opcode == Op_NegVF),"opcode should be Op_NegF"); 1224 vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg); 1225 } 1226 } 1227 1228 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) { 1229 assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity"); 1230 assert(tmp == xnoreg || elem_bt == T_LONG, "unused"); 1231 1232 if (opcode == Op_MinV) { 1233 if (elem_bt == T_BYTE) { 1234 pminsb(dst, src); 1235 } else if (elem_bt == T_SHORT) { 1236 pminsw(dst, src); 1237 } else if (elem_bt == T_INT) { 1238 pminsd(dst, src); 1239 } else { 1240 assert(elem_bt == T_LONG, "required"); 1241 assert(tmp == xmm0, "required"); 1242 assert_different_registers(dst, src, tmp); 1243 movdqu(xmm0, dst); 1244 pcmpgtq(xmm0, src); 1245 blendvpd(dst, src); // xmm0 as mask 1246 } 1247 } else { // opcode == Op_MaxV 1248 if (elem_bt == T_BYTE) { 1249 pmaxsb(dst, src); 1250 } else if (elem_bt == T_SHORT) { 1251 pmaxsw(dst, src); 1252 } else if (elem_bt == T_INT) { 1253 pmaxsd(dst, src); 1254 } else { 1255 assert(elem_bt == T_LONG, "required"); 1256 assert(tmp == xmm0, "required"); 1257 assert_different_registers(dst, src, tmp); 1258 movdqu(xmm0, src); 1259 pcmpgtq(xmm0, dst); 1260 blendvpd(dst, src); // xmm0 as mask 1261 } 1262 } 1263 } 1264 1265 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt, 1266 XMMRegister dst, XMMRegister src1, XMMRegister src2, 1267 int vlen_enc) { 1268 assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity"); 1269 1270 if (opcode == Op_MinV) { 1271 if (elem_bt == T_BYTE) { 1272 vpminsb(dst, src1, src2, vlen_enc); 1273 } else if (elem_bt == T_SHORT) { 1274 vpminsw(dst, src1, src2, vlen_enc); 1275 } else if (elem_bt == T_INT) { 1276 vpminsd(dst, 
src1, src2, vlen_enc); 1277 } else { 1278 assert(elem_bt == T_LONG, "required"); 1279 if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) { 1280 vpminsq(dst, src1, src2, vlen_enc); 1281 } else { 1282 assert_different_registers(dst, src1, src2); 1283 vpcmpgtq(dst, src1, src2, vlen_enc); 1284 vblendvpd(dst, src1, src2, dst, vlen_enc); 1285 } 1286 } 1287 } else { // opcode == Op_MaxV 1288 if (elem_bt == T_BYTE) { 1289 vpmaxsb(dst, src1, src2, vlen_enc); 1290 } else if (elem_bt == T_SHORT) { 1291 vpmaxsw(dst, src1, src2, vlen_enc); 1292 } else if (elem_bt == T_INT) { 1293 vpmaxsd(dst, src1, src2, vlen_enc); 1294 } else { 1295 assert(elem_bt == T_LONG, "required"); 1296 if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) { 1297 vpmaxsq(dst, src1, src2, vlen_enc); 1298 } else { 1299 assert_different_registers(dst, src1, src2); 1300 vpcmpgtq(dst, src1, src2, vlen_enc); 1301 vblendvpd(dst, src2, src1, dst, vlen_enc); 1302 } 1303 } 1304 } 1305 } 1306 1307 // Float/Double min max 1308 1309 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt, 1310 XMMRegister dst, XMMRegister a, XMMRegister b, 1311 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 1312 int vlen_enc) { 1313 assert(UseAVX > 0, "required"); 1314 assert(opcode == Op_MinV || opcode == Op_MinReductionV || 1315 opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity"); 1316 assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity"); 1317 assert_different_registers(a, tmp, atmp, btmp); 1318 assert_different_registers(b, tmp, atmp, btmp); 1319 1320 bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV); 1321 bool is_double_word = is_double_word_type(elem_bt); 1322 1323 /* Note on 'non-obvious' assembly sequence: 1324 * 1325 * While there are vminps/vmaxps instructions, there are two important differences between hardware 1326 * and Java on how they handle floats: 1327 * a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal) 1328 * b. NaN is not necesarily propagated (vminps/vmaxps will return second parameter when either input is NaN) 1329 * 1330 * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing: 1331 * a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps) 1332 * (only useful when signs differ, noop otherwise) 1333 * b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q]) 1334 1335 * Following pseudo code describes the algorithm for max[FD] (Min algorithm is on similar lines): 1336 * btmp = (b < +0.0) ? a : b 1337 * atmp = (b < +0.0) ? b : a 1338 * Tmp = Max_Float(atmp , btmp) 1339 * Res = (atmp == NaN) ? 
atmp : Tmp 1340 */ 1341 1342 void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister); 1343 void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int); 1344 void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int); 1345 XMMRegister mask; 1346 1347 if (!is_double_word && is_min) { 1348 mask = a; 1349 vblend = &MacroAssembler::vblendvps; 1350 vmaxmin = &MacroAssembler::vminps; 1351 vcmp = &MacroAssembler::vcmpps; 1352 } else if (!is_double_word && !is_min) { 1353 mask = b; 1354 vblend = &MacroAssembler::vblendvps; 1355 vmaxmin = &MacroAssembler::vmaxps; 1356 vcmp = &MacroAssembler::vcmpps; 1357 } else if (is_double_word && is_min) { 1358 mask = a; 1359 vblend = &MacroAssembler::vblendvpd; 1360 vmaxmin = &MacroAssembler::vminpd; 1361 vcmp = &MacroAssembler::vcmppd; 1362 } else { 1363 assert(is_double_word && !is_min, "sanity"); 1364 mask = b; 1365 vblend = &MacroAssembler::vblendvpd; 1366 vmaxmin = &MacroAssembler::vmaxpd; 1367 vcmp = &MacroAssembler::vcmppd; 1368 } 1369 1370 // Make sure EnableX86ECoreOpts isn't disabled on register overlaps 1371 XMMRegister maxmin, scratch; 1372 if (dst == btmp) { 1373 maxmin = btmp; 1374 scratch = tmp; 1375 } else { 1376 maxmin = tmp; 1377 scratch = btmp; 1378 } 1379 1380 bool precompute_mask = EnableX86ECoreOpts && UseAVX>1; 1381 if (precompute_mask && !is_double_word) { 1382 vpsrad(tmp, mask, 32, vlen_enc); 1383 mask = tmp; 1384 } else if (precompute_mask && is_double_word) { 1385 vpxor(tmp, tmp, tmp, vlen_enc); 1386 vpcmpgtq(tmp, tmp, mask, vlen_enc); 1387 mask = tmp; 1388 } 1389 1390 (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp); 1391 (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp); 1392 (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc); 1393 (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1394 (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch); 1395 } 1396 1397 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt, 1398 XMMRegister dst, XMMRegister a, XMMRegister b, 1399 KRegister ktmp, XMMRegister atmp, XMMRegister btmp, 1400 int vlen_enc) { 1401 assert(UseAVX > 2, "required"); 1402 assert(opcode == Op_MinV || opcode == Op_MinReductionV || 1403 opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity"); 1404 assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity"); 1405 assert_different_registers(dst, a, atmp, btmp); 1406 assert_different_registers(dst, b, atmp, btmp); 1407 1408 bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV); 1409 bool is_double_word = is_double_word_type(elem_bt); 1410 bool merge = true; 1411 1412 if (!is_double_word && is_min) { 1413 evpmovd2m(ktmp, a, vlen_enc); 1414 evblendmps(atmp, ktmp, a, b, merge, vlen_enc); 1415 evblendmps(btmp, ktmp, b, a, merge, vlen_enc); 1416 vminps(dst, atmp, btmp, vlen_enc); 1417 evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1418 evmovdqul(dst, ktmp, atmp, merge, vlen_enc); 1419 } else if (!is_double_word && !is_min) { 1420 evpmovd2m(ktmp, b, vlen_enc); 1421 evblendmps(atmp, ktmp, a, b, merge, vlen_enc); 1422 evblendmps(btmp, ktmp, b, a, merge, vlen_enc); 1423 vmaxps(dst, atmp, btmp, vlen_enc); 1424 evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1425 evmovdqul(dst, ktmp, atmp, merge, vlen_enc); 1426 } else if (is_double_word && is_min) { 1427 evpmovq2m(ktmp, a, vlen_enc); 1428 evblendmpd(atmp, ktmp, a, b, merge, vlen_enc); 1429 evblendmpd(btmp, ktmp, b, a, merge, vlen_enc); 1430 
vminpd(dst, atmp, btmp, vlen_enc); 1431 evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1432 evmovdquq(dst, ktmp, atmp, merge, vlen_enc); 1433 } else { 1434 assert(is_double_word && !is_min, "sanity"); 1435 evpmovq2m(ktmp, b, vlen_enc); 1436 evblendmpd(atmp, ktmp, a, b, merge, vlen_enc); 1437 evblendmpd(btmp, ktmp, b, a, merge, vlen_enc); 1438 vmaxpd(dst, atmp, btmp, vlen_enc); 1439 evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1440 evmovdquq(dst, ktmp, atmp, merge, vlen_enc); 1441 } 1442 } 1443 1444 // Float/Double signum 1445 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) { 1446 assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity"); 1447 1448 Label DONE_LABEL; 1449 1450 if (opcode == Op_SignumF) { 1451 assert(UseSSE > 0, "required"); 1452 ucomiss(dst, zero); 1453 jcc(Assembler::equal, DONE_LABEL); // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument 1454 jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN 1455 movflt(dst, one); 1456 jcc(Assembler::above, DONE_LABEL); 1457 xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg); 1458 } else if (opcode == Op_SignumD) { 1459 assert(UseSSE > 1, "required"); 1460 ucomisd(dst, zero); 1461 jcc(Assembler::equal, DONE_LABEL); // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument 1462 jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN 1463 movdbl(dst, one); 1464 jcc(Assembler::above, DONE_LABEL); 1465 xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg); 1466 } 1467 1468 bind(DONE_LABEL); 1469 } 1470 1471 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) { 1472 if (sign) { 1473 pmovsxbw(dst, src); 1474 } else { 1475 pmovzxbw(dst, src); 1476 } 1477 } 1478 1479 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1480 if (sign) { 1481 vpmovsxbw(dst, src, vector_len); 1482 } else { 1483 vpmovzxbw(dst, src, vector_len); 1484 } 1485 } 1486 1487 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1488 if (sign) { 1489 vpmovsxbd(dst, src, vector_len); 1490 } else { 1491 vpmovzxbd(dst, src, vector_len); 1492 } 1493 } 1494 1495 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1496 if (sign) { 1497 vpmovsxwd(dst, src, vector_len); 1498 } else { 1499 vpmovzxwd(dst, src, vector_len); 1500 } 1501 } 1502 1503 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src, 1504 int shift, int vector_len) { 1505 if (opcode == Op_RotateLeftV) { 1506 if (etype == T_INT) { 1507 evprold(dst, src, shift, vector_len); 1508 } else { 1509 assert(etype == T_LONG, "expected type T_LONG"); 1510 evprolq(dst, src, shift, vector_len); 1511 } 1512 } else { 1513 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV"); 1514 if (etype == T_INT) { 1515 evprord(dst, src, shift, vector_len); 1516 } else { 1517 assert(etype == T_LONG, "expected type T_LONG"); 1518 evprorq(dst, src, shift, vector_len); 1519 } 1520 } 1521 } 1522 1523 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src, 1524 XMMRegister shift, int vector_len) { 1525 if (opcode == Op_RotateLeftV) { 1526 if (etype == T_INT) { 1527 evprolvd(dst, src, shift, vector_len); 1528 } else { 1529 assert(etype == 
T_LONG, "expected type T_LONG"); 1530 evprolvq(dst, src, shift, vector_len); 1531 } 1532 } else { 1533 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV"); 1534 if (etype == T_INT) { 1535 evprorvd(dst, src, shift, vector_len); 1536 } else { 1537 assert(etype == T_LONG, "expected type T_LONG"); 1538 evprorvq(dst, src, shift, vector_len); 1539 } 1540 } 1541 } 1542 1543 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) { 1544 if (opcode == Op_RShiftVI) { 1545 psrad(dst, shift); 1546 } else if (opcode == Op_LShiftVI) { 1547 pslld(dst, shift); 1548 } else { 1549 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI"); 1550 psrld(dst, shift); 1551 } 1552 } 1553 1554 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) { 1555 switch (opcode) { 1556 case Op_RShiftVI: psrad(dst, shift); break; 1557 case Op_LShiftVI: pslld(dst, shift); break; 1558 case Op_URShiftVI: psrld(dst, shift); break; 1559 1560 default: assert(false, "%s", NodeClassNames[opcode]); 1561 } 1562 } 1563 1564 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 1565 if (opcode == Op_RShiftVI) { 1566 vpsrad(dst, nds, shift, vector_len); 1567 } else if (opcode == Op_LShiftVI) { 1568 vpslld(dst, nds, shift, vector_len); 1569 } else { 1570 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI"); 1571 vpsrld(dst, nds, shift, vector_len); 1572 } 1573 } 1574 1575 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1576 switch (opcode) { 1577 case Op_RShiftVI: vpsrad(dst, src, shift, vlen_enc); break; 1578 case Op_LShiftVI: vpslld(dst, src, shift, vlen_enc); break; 1579 case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break; 1580 1581 default: assert(false, "%s", NodeClassNames[opcode]); 1582 } 1583 } 1584 1585 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) { 1586 switch (opcode) { 1587 case Op_RShiftVB: // fall-through 1588 case Op_RShiftVS: psraw(dst, shift); break; 1589 1590 case Op_LShiftVB: // fall-through 1591 case Op_LShiftVS: psllw(dst, shift); break; 1592 1593 case Op_URShiftVS: // fall-through 1594 case Op_URShiftVB: psrlw(dst, shift); break; 1595 1596 default: assert(false, "%s", NodeClassNames[opcode]); 1597 } 1598 } 1599 1600 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1601 switch (opcode) { 1602 case Op_RShiftVB: // fall-through 1603 case Op_RShiftVS: vpsraw(dst, src, shift, vlen_enc); break; 1604 1605 case Op_LShiftVB: // fall-through 1606 case Op_LShiftVS: vpsllw(dst, src, shift, vlen_enc); break; 1607 1608 case Op_URShiftVS: // fall-through 1609 case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break; 1610 1611 default: assert(false, "%s", NodeClassNames[opcode]); 1612 } 1613 } 1614 1615 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) { 1616 switch (opcode) { 1617 case Op_RShiftVL: psrlq(dst, shift); break; // using srl to implement sra on pre-avs512 systems 1618 case Op_LShiftVL: psllq(dst, shift); break; 1619 case Op_URShiftVL: psrlq(dst, shift); break; 1620 1621 default: assert(false, "%s", NodeClassNames[opcode]); 1622 } 1623 } 1624 1625 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) { 1626 if (opcode == Op_RShiftVL) { 1627 psrlq(dst, shift); // using srl to implement sra on pre-avs512 systems 1628 } else if (opcode == Op_LShiftVL) { 1629 
psllq(dst, shift); 1630 } else { 1631 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL"); 1632 psrlq(dst, shift); 1633 } 1634 } 1635 1636 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1637 switch (opcode) { 1638 case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break; 1639 case Op_LShiftVL: vpsllq(dst, src, shift, vlen_enc); break; 1640 case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break; 1641 1642 default: assert(false, "%s", NodeClassNames[opcode]); 1643 } 1644 } 1645 1646 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 1647 if (opcode == Op_RShiftVL) { 1648 evpsraq(dst, nds, shift, vector_len); 1649 } else if (opcode == Op_LShiftVL) { 1650 vpsllq(dst, nds, shift, vector_len); 1651 } else { 1652 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL"); 1653 vpsrlq(dst, nds, shift, vector_len); 1654 } 1655 } 1656 1657 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1658 switch (opcode) { 1659 case Op_RShiftVB: // fall-through 1660 case Op_RShiftVS: // fall-through 1661 case Op_RShiftVI: vpsravd(dst, src, shift, vlen_enc); break; 1662 1663 case Op_LShiftVB: // fall-through 1664 case Op_LShiftVS: // fall-through 1665 case Op_LShiftVI: vpsllvd(dst, src, shift, vlen_enc); break; 1666 1667 case Op_URShiftVB: // fall-through 1668 case Op_URShiftVS: // fall-through 1669 case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break; 1670 1671 default: assert(false, "%s", NodeClassNames[opcode]); 1672 } 1673 } 1674 1675 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1676 switch (opcode) { 1677 case Op_RShiftVB: // fall-through 1678 case Op_RShiftVS: evpsravw(dst, src, shift, vlen_enc); break; 1679 1680 case Op_LShiftVB: // fall-through 1681 case Op_LShiftVS: evpsllvw(dst, src, shift, vlen_enc); break; 1682 1683 case Op_URShiftVB: // fall-through 1684 case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break; 1685 1686 default: assert(false, "%s", NodeClassNames[opcode]); 1687 } 1688 } 1689 1690 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) { 1691 assert(UseAVX >= 2, "required"); 1692 switch (opcode) { 1693 case Op_RShiftVL: { 1694 if (UseAVX > 2) { 1695 assert(tmp == xnoreg, "not used"); 1696 if (!VM_Version::supports_avx512vl()) { 1697 vlen_enc = Assembler::AVX_512bit; 1698 } 1699 evpsravq(dst, src, shift, vlen_enc); 1700 } else { 1701 vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask())); 1702 vpsrlvq(dst, src, shift, vlen_enc); 1703 vpsrlvq(tmp, tmp, shift, vlen_enc); 1704 vpxor(dst, dst, tmp, vlen_enc); 1705 vpsubq(dst, dst, tmp, vlen_enc); 1706 } 1707 break; 1708 } 1709 case Op_LShiftVL: { 1710 assert(tmp == xnoreg, "not used"); 1711 vpsllvq(dst, src, shift, vlen_enc); 1712 break; 1713 } 1714 case Op_URShiftVL: { 1715 assert(tmp == xnoreg, "not used"); 1716 vpsrlvq(dst, src, shift, vlen_enc); 1717 break; 1718 } 1719 default: assert(false, "%s", NodeClassNames[opcode]); 1720 } 1721 } 1722 1723 // Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst 1724 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) { 1725 assert(opcode == Op_LShiftVB || 1726 opcode == Op_RShiftVB || 1727 
opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]); 1728 bool sign = (opcode != Op_URShiftVB); 1729 assert(vector_len == 0, "required"); 1730 vextendbd(sign, dst, src, 1); 1731 vpmovzxbd(vtmp, shift, 1); 1732 varshiftd(opcode, dst, dst, vtmp, 1); 1733 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg); 1734 vextracti128_high(vtmp, dst); 1735 vpackusdw(dst, dst, vtmp, 0); 1736 } 1737 1738 // Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst 1739 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) { 1740 assert(opcode == Op_LShiftVB || 1741 opcode == Op_RShiftVB || 1742 opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]); 1743 bool sign = (opcode != Op_URShiftVB); 1744 int ext_vector_len = vector_len + 1; 1745 vextendbw(sign, dst, src, ext_vector_len); 1746 vpmovzxbw(vtmp, shift, ext_vector_len); 1747 varshiftw(opcode, dst, dst, vtmp, ext_vector_len); 1748 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg); 1749 if (vector_len == 0) { 1750 vextracti128_high(vtmp, dst); 1751 vpackuswb(dst, dst, vtmp, vector_len); 1752 } else { 1753 vextracti64x4_high(vtmp, dst); 1754 vpackuswb(dst, dst, vtmp, vector_len); 1755 vpermq(dst, dst, 0xD8, vector_len); 1756 } 1757 } 1758 1759 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) { 1760 switch(typ) { 1761 case T_BYTE: 1762 pinsrb(dst, val, idx); 1763 break; 1764 case T_SHORT: 1765 pinsrw(dst, val, idx); 1766 break; 1767 case T_INT: 1768 pinsrd(dst, val, idx); 1769 break; 1770 case T_LONG: 1771 pinsrq(dst, val, idx); 1772 break; 1773 default: 1774 assert(false,"Should not reach here."); 1775 break; 1776 } 1777 } 1778 1779 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) { 1780 switch(typ) { 1781 case T_BYTE: 1782 vpinsrb(dst, src, val, idx); 1783 break; 1784 case T_SHORT: 1785 vpinsrw(dst, src, val, idx); 1786 break; 1787 case T_INT: 1788 vpinsrd(dst, src, val, idx); 1789 break; 1790 case T_LONG: 1791 vpinsrq(dst, src, val, idx); 1792 break; 1793 default: 1794 assert(false,"Should not reach here."); 1795 break; 1796 } 1797 } 1798 1799 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) { 1800 switch(typ) { 1801 case T_INT: 1802 vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len); 1803 break; 1804 case T_FLOAT: 1805 vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len); 1806 break; 1807 case T_LONG: 1808 vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len); 1809 break; 1810 case T_DOUBLE: 1811 vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len); 1812 break; 1813 default: 1814 assert(false,"Should not reach here."); 1815 break; 1816 } 1817 } 1818 1819 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) { 1820 switch(typ) { 1821 case T_INT: 1822 evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len); 1823 break; 1824 case T_FLOAT: 1825 evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len); 1826 break; 1827 case T_LONG: 1828 evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len); 1829 break; 1830 case T_DOUBLE: 1831 evgatherdpd(dst, mask, Address(base, idx, Address::times_8), 
vector_len); 1832 break; 1833 default: 1834 assert(false,"Should not reach here."); 1835 break; 1836 } 1837 } 1838 1839 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) { 1840 switch(typ) { 1841 case T_INT: 1842 evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len); 1843 break; 1844 case T_FLOAT: 1845 evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len); 1846 break; 1847 case T_LONG: 1848 evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len); 1849 break; 1850 case T_DOUBLE: 1851 evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len); 1852 break; 1853 default: 1854 assert(false,"Should not reach here."); 1855 break; 1856 } 1857 } 1858 1859 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) { 1860 if (vlen_in_bytes <= 16) { 1861 pxor (dst, dst); 1862 psubb(dst, src); 1863 switch (elem_bt) { 1864 case T_BYTE: /* nothing to do */ break; 1865 case T_SHORT: pmovsxbw(dst, dst); break; 1866 case T_INT: pmovsxbd(dst, dst); break; 1867 case T_FLOAT: pmovsxbd(dst, dst); break; 1868 case T_LONG: pmovsxbq(dst, dst); break; 1869 case T_DOUBLE: pmovsxbq(dst, dst); break; 1870 1871 default: assert(false, "%s", type2name(elem_bt)); 1872 } 1873 } else { 1874 assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, ""); 1875 int vlen_enc = vector_length_encoding(vlen_in_bytes); 1876 1877 vpxor (dst, dst, dst, vlen_enc); 1878 vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc); 1879 1880 switch (elem_bt) { 1881 case T_BYTE: /* nothing to do */ break; 1882 case T_SHORT: vpmovsxbw(dst, dst, vlen_enc); break; 1883 case T_INT: vpmovsxbd(dst, dst, vlen_enc); break; 1884 case T_FLOAT: vpmovsxbd(dst, dst, vlen_enc); break; 1885 case T_LONG: vpmovsxbq(dst, dst, vlen_enc); break; 1886 case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break; 1887 1888 default: assert(false, "%s", type2name(elem_bt)); 1889 } 1890 } 1891 } 1892 1893 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) { 1894 if (novlbwdq) { 1895 vpmovsxbd(xtmp, src, vlen_enc); 1896 evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()), 1897 Assembler::eq, true, vlen_enc, noreg); 1898 } else { 1899 vpxor(xtmp, xtmp, xtmp, vlen_enc); 1900 vpsubb(xtmp, xtmp, src, vlen_enc); 1901 evpmovb2m(dst, xtmp, vlen_enc); 1902 } 1903 } 1904 1905 void C2_MacroAssembler::load_vector(XMMRegister dst, Address src, int vlen_in_bytes) { 1906 switch (vlen_in_bytes) { 1907 case 4: movdl(dst, src); break; 1908 case 8: movq(dst, src); break; 1909 case 16: movdqu(dst, src); break; 1910 case 32: vmovdqu(dst, src); break; 1911 case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break; 1912 default: ShouldNotReachHere(); 1913 } 1914 } 1915 1916 void C2_MacroAssembler::load_vector(XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) { 1917 assert(rscratch != noreg || always_reachable(src), "missing"); 1918 1919 if (reachable(src)) { 1920 load_vector(dst, as_Address(src), vlen_in_bytes); 1921 } else { 1922 lea(rscratch, src); 1923 load_vector(dst, Address(rscratch, 0), vlen_in_bytes); 1924 } 1925 } 1926 1927 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) { 1928 int vlen_enc = vector_length_encoding(vlen); 1929 if (VM_Version::supports_avx()) { 1930 if (bt == 
T_LONG) { 1931 if (VM_Version::supports_avx2()) { 1932 vpbroadcastq(dst, src, vlen_enc); 1933 } else { 1934 vmovddup(dst, src, vlen_enc); 1935 } 1936 } else if (bt == T_DOUBLE) { 1937 if (vlen_enc != Assembler::AVX_128bit) { 1938 vbroadcastsd(dst, src, vlen_enc, noreg); 1939 } else { 1940 vmovddup(dst, src, vlen_enc); 1941 } 1942 } else { 1943 if (VM_Version::supports_avx2() && is_integral_type(bt)) { 1944 vpbroadcastd(dst, src, vlen_enc); 1945 } else { 1946 vbroadcastss(dst, src, vlen_enc); 1947 } 1948 } 1949 } else if (VM_Version::supports_sse3()) { 1950 movddup(dst, src); 1951 } else { 1952 movq(dst, src); 1953 if (vlen == 16) { 1954 punpcklqdq(dst, dst); 1955 } 1956 } 1957 } 1958 1959 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) { 1960 // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64. 1961 int offset = exact_log2(type2aelembytes(bt)) << 6; 1962 if (is_floating_point_type(bt)) { 1963 offset += 128; 1964 } 1965 ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset); 1966 load_vector(dst, addr, vlen_in_bytes); 1967 } 1968 1969 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles. 1970 1971 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) { 1972 int vector_len = Assembler::AVX_128bit; 1973 1974 switch (opcode) { 1975 case Op_AndReductionV: pand(dst, src); break; 1976 case Op_OrReductionV: por (dst, src); break; 1977 case Op_XorReductionV: pxor(dst, src); break; 1978 case Op_MinReductionV: 1979 switch (typ) { 1980 case T_BYTE: pminsb(dst, src); break; 1981 case T_SHORT: pminsw(dst, src); break; 1982 case T_INT: pminsd(dst, src); break; 1983 case T_LONG: assert(UseAVX > 2, "required"); 1984 vpminsq(dst, dst, src, Assembler::AVX_128bit); break; 1985 default: assert(false, "wrong type"); 1986 } 1987 break; 1988 case Op_MaxReductionV: 1989 switch (typ) { 1990 case T_BYTE: pmaxsb(dst, src); break; 1991 case T_SHORT: pmaxsw(dst, src); break; 1992 case T_INT: pmaxsd(dst, src); break; 1993 case T_LONG: assert(UseAVX > 2, "required"); 1994 vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break; 1995 default: assert(false, "wrong type"); 1996 } 1997 break; 1998 case Op_AddReductionVF: addss(dst, src); break; 1999 case Op_AddReductionVD: addsd(dst, src); break; 2000 case Op_AddReductionVI: 2001 switch (typ) { 2002 case T_BYTE: paddb(dst, src); break; 2003 case T_SHORT: paddw(dst, src); break; 2004 case T_INT: paddd(dst, src); break; 2005 default: assert(false, "wrong type"); 2006 } 2007 break; 2008 case Op_AddReductionVL: paddq(dst, src); break; 2009 case Op_MulReductionVF: mulss(dst, src); break; 2010 case Op_MulReductionVD: mulsd(dst, src); break; 2011 case Op_MulReductionVI: 2012 switch (typ) { 2013 case T_SHORT: pmullw(dst, src); break; 2014 case T_INT: pmulld(dst, src); break; 2015 default: assert(false, "wrong type"); 2016 } 2017 break; 2018 case Op_MulReductionVL: assert(UseAVX > 2, "required"); 2019 evpmullq(dst, dst, src, vector_len); break; 2020 default: assert(false, "wrong opcode"); 2021 } 2022 } 2023 2024 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 2025 int vector_len = Assembler::AVX_256bit; 2026 2027 switch (opcode) { 2028 case Op_AndReductionV: vpand(dst, src1, src2, vector_len); break; 2029 case Op_OrReductionV: vpor (dst, src1, src2, vector_len); break; 2030 case Op_XorReductionV: vpxor(dst, src1, src2, vector_len); 
break; 2031 case Op_MinReductionV: 2032 switch (typ) { 2033 case T_BYTE: vpminsb(dst, src1, src2, vector_len); break; 2034 case T_SHORT: vpminsw(dst, src1, src2, vector_len); break; 2035 case T_INT: vpminsd(dst, src1, src2, vector_len); break; 2036 case T_LONG: assert(UseAVX > 2, "required"); 2037 vpminsq(dst, src1, src2, vector_len); break; 2038 default: assert(false, "wrong type"); 2039 } 2040 break; 2041 case Op_MaxReductionV: 2042 switch (typ) { 2043 case T_BYTE: vpmaxsb(dst, src1, src2, vector_len); break; 2044 case T_SHORT: vpmaxsw(dst, src1, src2, vector_len); break; 2045 case T_INT: vpmaxsd(dst, src1, src2, vector_len); break; 2046 case T_LONG: assert(UseAVX > 2, "required"); 2047 vpmaxsq(dst, src1, src2, vector_len); break; 2048 default: assert(false, "wrong type"); 2049 } 2050 break; 2051 case Op_AddReductionVI: 2052 switch (typ) { 2053 case T_BYTE: vpaddb(dst, src1, src2, vector_len); break; 2054 case T_SHORT: vpaddw(dst, src1, src2, vector_len); break; 2055 case T_INT: vpaddd(dst, src1, src2, vector_len); break; 2056 default: assert(false, "wrong type"); 2057 } 2058 break; 2059 case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break; 2060 case Op_MulReductionVI: 2061 switch (typ) { 2062 case T_SHORT: vpmullw(dst, src1, src2, vector_len); break; 2063 case T_INT: vpmulld(dst, src1, src2, vector_len); break; 2064 default: assert(false, "wrong type"); 2065 } 2066 break; 2067 case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break; 2068 default: assert(false, "wrong opcode"); 2069 } 2070 } 2071 2072 void C2_MacroAssembler::reduce_fp(int opcode, int vlen, 2073 XMMRegister dst, XMMRegister src, 2074 XMMRegister vtmp1, XMMRegister vtmp2) { 2075 switch (opcode) { 2076 case Op_AddReductionVF: 2077 case Op_MulReductionVF: 2078 reduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 2079 break; 2080 2081 case Op_AddReductionVD: 2082 case Op_MulReductionVD: 2083 reduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 2084 break; 2085 2086 default: assert(false, "wrong opcode"); 2087 } 2088 } 2089 2090 void C2_MacroAssembler::reduceB(int opcode, int vlen, 2091 Register dst, Register src1, XMMRegister src2, 2092 XMMRegister vtmp1, XMMRegister vtmp2) { 2093 switch (vlen) { 2094 case 8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2095 case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2096 case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2097 case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2098 2099 default: assert(false, "wrong vector length"); 2100 } 2101 } 2102 2103 void C2_MacroAssembler::mulreduceB(int opcode, int vlen, 2104 Register dst, Register src1, XMMRegister src2, 2105 XMMRegister vtmp1, XMMRegister vtmp2) { 2106 switch (vlen) { 2107 case 8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2108 case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2109 case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2110 case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2111 2112 default: assert(false, "wrong vector length"); 2113 } 2114 } 2115 2116 void C2_MacroAssembler::reduceS(int opcode, int vlen, 2117 Register dst, Register src1, XMMRegister src2, 2118 XMMRegister vtmp1, XMMRegister vtmp2) { 2119 switch (vlen) { 2120 case 4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2121 case 8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2122 case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2123 case 32: reduce32S(opcode, dst, src1, 
src2, vtmp1, vtmp2); break; 2124 2125 default: assert(false, "wrong vector length"); 2126 } 2127 } 2128 2129 void C2_MacroAssembler::reduceI(int opcode, int vlen, 2130 Register dst, Register src1, XMMRegister src2, 2131 XMMRegister vtmp1, XMMRegister vtmp2) { 2132 switch (vlen) { 2133 case 2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2134 case 4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2135 case 8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 2136 case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2137 2138 default: assert(false, "wrong vector length"); 2139 } 2140 } 2141 2142 #ifdef _LP64 2143 void C2_MacroAssembler::reduceL(int opcode, int vlen, 2144 Register dst, Register src1, XMMRegister src2, 2145 XMMRegister vtmp1, XMMRegister vtmp2) { 2146 switch (vlen) { 2147 case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2148 case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2149 case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 2150 2151 default: assert(false, "wrong vector length"); 2152 } 2153 } 2154 #endif // _LP64 2155 2156 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2157 switch (vlen) { 2158 case 2: 2159 assert(vtmp2 == xnoreg, ""); 2160 reduce2F(opcode, dst, src, vtmp1); 2161 break; 2162 case 4: 2163 assert(vtmp2 == xnoreg, ""); 2164 reduce4F(opcode, dst, src, vtmp1); 2165 break; 2166 case 8: 2167 reduce8F(opcode, dst, src, vtmp1, vtmp2); 2168 break; 2169 case 16: 2170 reduce16F(opcode, dst, src, vtmp1, vtmp2); 2171 break; 2172 default: assert(false, "wrong vector length"); 2173 } 2174 } 2175 2176 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2177 switch (vlen) { 2178 case 2: 2179 assert(vtmp2 == xnoreg, ""); 2180 reduce2D(opcode, dst, src, vtmp1); 2181 break; 2182 case 4: 2183 reduce4D(opcode, dst, src, vtmp1, vtmp2); 2184 break; 2185 case 8: 2186 reduce8D(opcode, dst, src, vtmp1, vtmp2); 2187 break; 2188 default: assert(false, "wrong vector length"); 2189 } 2190 } 2191 2192 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2193 if (opcode == Op_AddReductionVI) { 2194 if (vtmp1 != src2) { 2195 movdqu(vtmp1, src2); 2196 } 2197 phaddd(vtmp1, vtmp1); 2198 } else { 2199 pshufd(vtmp1, src2, 0x1); 2200 reduce_operation_128(T_INT, opcode, vtmp1, src2); 2201 } 2202 movdl(vtmp2, src1); 2203 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2204 movdl(dst, vtmp1); 2205 } 2206 2207 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2208 if (opcode == Op_AddReductionVI) { 2209 if (vtmp1 != src2) { 2210 movdqu(vtmp1, src2); 2211 } 2212 phaddd(vtmp1, src2); 2213 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2214 } else { 2215 pshufd(vtmp2, src2, 0xE); 2216 reduce_operation_128(T_INT, opcode, vtmp2, src2); 2217 reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2218 } 2219 } 2220 2221 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2222 if (opcode == Op_AddReductionVI) { 2223 vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit); 2224 vextracti128_high(vtmp2, vtmp1); 2225 vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit); 2226 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2227 } else { 2228 
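// For the remaining (non-additive) reduction ops: fold the upper 128-bit half of src2 into the lower half with the reduction operation, then reduce the resulting four elements via reduce4I.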
vextracti128_high(vtmp1, src2); 2229 reduce_operation_128(T_INT, opcode, vtmp1, src2); 2230 reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2231 } 2232 } 2233 2234 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2235 vextracti64x4_high(vtmp2, src2); 2236 reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2); 2237 reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2238 } 2239 2240 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2241 pshufd(vtmp2, src2, 0x1); 2242 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 2243 movdqu(vtmp1, vtmp2); 2244 psrldq(vtmp1, 2); 2245 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 2246 movdqu(vtmp2, vtmp1); 2247 psrldq(vtmp2, 1); 2248 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 2249 movdl(vtmp2, src1); 2250 pmovsxbd(vtmp1, vtmp1); 2251 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2252 pextrb(dst, vtmp1, 0x0); 2253 movsbl(dst, dst); 2254 } 2255 2256 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2257 pshufd(vtmp1, src2, 0xE); 2258 reduce_operation_128(T_BYTE, opcode, vtmp1, src2); 2259 reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2260 } 2261 2262 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2263 vextracti128_high(vtmp2, src2); 2264 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 2265 reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2266 } 2267 2268 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2269 vextracti64x4_high(vtmp1, src2); 2270 reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2); 2271 reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2272 } 2273 2274 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2275 pmovsxbw(vtmp2, src2); 2276 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2277 } 2278 2279 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2280 if (UseAVX > 1) { 2281 int vector_len = Assembler::AVX_256bit; 2282 vpmovsxbw(vtmp1, src2, vector_len); 2283 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2284 } else { 2285 pmovsxbw(vtmp2, src2); 2286 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2287 pshufd(vtmp2, src2, 0x1); 2288 pmovsxbw(vtmp2, src2); 2289 reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2290 } 2291 } 2292 2293 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2294 if (UseAVX > 2 && VM_Version::supports_avx512bw()) { 2295 int vector_len = Assembler::AVX_512bit; 2296 vpmovsxbw(vtmp1, src2, vector_len); 2297 reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2298 } else { 2299 assert(UseAVX >= 2,"Should not reach here."); 2300 mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); 2301 vextracti128_high(vtmp2, src2); 2302 mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2303 } 2304 } 2305 2306 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2307 mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); 2308 vextracti64x4_high(vtmp2, 
src2); 2309 mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2310 } 2311 2312 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2313 if (opcode == Op_AddReductionVI) { 2314 if (vtmp1 != src2) { 2315 movdqu(vtmp1, src2); 2316 } 2317 phaddw(vtmp1, vtmp1); 2318 phaddw(vtmp1, vtmp1); 2319 } else { 2320 pshufd(vtmp2, src2, 0x1); 2321 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2322 movdqu(vtmp1, vtmp2); 2323 psrldq(vtmp1, 2); 2324 reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2); 2325 } 2326 movdl(vtmp2, src1); 2327 pmovsxwd(vtmp1, vtmp1); 2328 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2329 pextrw(dst, vtmp1, 0x0); 2330 movswl(dst, dst); 2331 } 2332 2333 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2334 if (opcode == Op_AddReductionVI) { 2335 if (vtmp1 != src2) { 2336 movdqu(vtmp1, src2); 2337 } 2338 phaddw(vtmp1, src2); 2339 } else { 2340 pshufd(vtmp1, src2, 0xE); 2341 reduce_operation_128(T_SHORT, opcode, vtmp1, src2); 2342 } 2343 reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2344 } 2345 2346 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2347 if (opcode == Op_AddReductionVI) { 2348 int vector_len = Assembler::AVX_256bit; 2349 vphaddw(vtmp2, src2, src2, vector_len); 2350 vpermq(vtmp2, vtmp2, 0xD8, vector_len); 2351 } else { 2352 vextracti128_high(vtmp2, src2); 2353 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2354 } 2355 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2356 } 2357 2358 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2359 int vector_len = Assembler::AVX_256bit; 2360 vextracti64x4_high(vtmp1, src2); 2361 reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2); 2362 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2363 } 2364 2365 #ifdef _LP64 2366 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2367 pshufd(vtmp2, src2, 0xE); 2368 reduce_operation_128(T_LONG, opcode, vtmp2, src2); 2369 movdq(vtmp1, src1); 2370 reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2); 2371 movdq(dst, vtmp1); 2372 } 2373 2374 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2375 vextracti128_high(vtmp1, src2); 2376 reduce_operation_128(T_LONG, opcode, vtmp1, src2); 2377 reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2378 } 2379 2380 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2381 vextracti64x4_high(vtmp2, src2); 2382 reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2); 2383 reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2384 } 2385 2386 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) { 2387 mov64(temp, -1L); 2388 bzhiq(temp, temp, len); 2389 kmovql(dst, temp); 2390 } 2391 #endif // _LP64 2392 2393 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2394 reduce_operation_128(T_FLOAT, opcode, dst, src); 2395 pshufd(vtmp, src, 0x1); 2396 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2397 } 2398 2399 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) 
{ 2400 reduce2F(opcode, dst, src, vtmp); 2401 pshufd(vtmp, src, 0x2); 2402 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2403 pshufd(vtmp, src, 0x3); 2404 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2405 } 2406 2407 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2408 reduce4F(opcode, dst, src, vtmp2); 2409 vextractf128_high(vtmp2, src); 2410 reduce4F(opcode, dst, vtmp2, vtmp1); 2411 } 2412 2413 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2414 reduce8F(opcode, dst, src, vtmp1, vtmp2); 2415 vextracti64x4_high(vtmp1, src); 2416 reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2); 2417 } 2418 2419 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2420 reduce_operation_128(T_DOUBLE, opcode, dst, src); 2421 pshufd(vtmp, src, 0xE); 2422 reduce_operation_128(T_DOUBLE, opcode, dst, vtmp); 2423 } 2424 2425 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2426 reduce2D(opcode, dst, src, vtmp2); 2427 vextractf128_high(vtmp2, src); 2428 reduce2D(opcode, dst, vtmp2, vtmp1); 2429 } 2430 2431 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2432 reduce4D(opcode, dst, src, vtmp1, vtmp2); 2433 vextracti64x4_high(vtmp1, src); 2434 reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2); 2435 } 2436 2437 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) { 2438 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2439 } 2440 2441 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) { 2442 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2443 } 2444 2445 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask, 2446 int vec_enc) { 2447 switch(elem_bt) { 2448 case T_INT: 2449 case T_FLOAT: 2450 vmaskmovps(dst, src, mask, vec_enc); 2451 break; 2452 case T_LONG: 2453 case T_DOUBLE: 2454 vmaskmovpd(dst, src, mask, vec_enc); 2455 break; 2456 default: 2457 fatal("Unsupported type %s", type2name(elem_bt)); 2458 break; 2459 } 2460 } 2461 2462 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask, 2463 int vec_enc) { 2464 switch(elem_bt) { 2465 case T_INT: 2466 case T_FLOAT: 2467 vmaskmovps(dst, src, mask, vec_enc); 2468 break; 2469 case T_LONG: 2470 case T_DOUBLE: 2471 vmaskmovpd(dst, src, mask, vec_enc); 2472 break; 2473 default: 2474 fatal("Unsupported type %s", type2name(elem_bt)); 2475 break; 2476 } 2477 } 2478 2479 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid, 2480 XMMRegister dst, XMMRegister src, 2481 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2482 XMMRegister xmm_0, XMMRegister xmm_1) { 2483 const int permconst[] = {1, 14}; 2484 XMMRegister wsrc = src; 2485 XMMRegister wdst = xmm_0; 2486 XMMRegister wtmp = (xmm_1 == xnoreg) ? 
xmm_0: xmm_1; 2487 2488 int vlen_enc = Assembler::AVX_128bit; 2489 if (vlen == 16) { 2490 vlen_enc = Assembler::AVX_256bit; 2491 } 2492 2493 for (int i = log2(vlen) - 1; i >=0; i--) { 2494 if (i == 0 && !is_dst_valid) { 2495 wdst = dst; 2496 } 2497 if (i == 3) { 2498 vextracti64x4_high(wtmp, wsrc); 2499 } else if (i == 2) { 2500 vextracti128_high(wtmp, wsrc); 2501 } else { // i = [0,1] 2502 vpermilps(wtmp, wsrc, permconst[i], vlen_enc); 2503 } 2504 vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2505 wsrc = wdst; 2506 vlen_enc = Assembler::AVX_128bit; 2507 } 2508 if (is_dst_valid) { 2509 vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2510 } 2511 } 2512 2513 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src, 2514 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2515 XMMRegister xmm_0, XMMRegister xmm_1) { 2516 XMMRegister wsrc = src; 2517 XMMRegister wdst = xmm_0; 2518 XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1; 2519 int vlen_enc = Assembler::AVX_128bit; 2520 if (vlen == 8) { 2521 vlen_enc = Assembler::AVX_256bit; 2522 } 2523 for (int i = log2(vlen) - 1; i >=0; i--) { 2524 if (i == 0 && !is_dst_valid) { 2525 wdst = dst; 2526 } 2527 if (i == 1) { 2528 vextracti128_high(wtmp, wsrc); 2529 } else if (i == 2) { 2530 vextracti64x4_high(wtmp, wsrc); 2531 } else { 2532 assert(i == 0, "%d", i); 2533 vpermilpd(wtmp, wsrc, 1, vlen_enc); 2534 } 2535 vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2536 wsrc = wdst; 2537 vlen_enc = Assembler::AVX_128bit; 2538 } 2539 if (is_dst_valid) { 2540 vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2541 } 2542 } 2543 2544 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) { 2545 switch (bt) { 2546 case T_BYTE: pextrb(dst, src, idx); break; 2547 case T_SHORT: pextrw(dst, src, idx); break; 2548 case T_INT: pextrd(dst, src, idx); break; 2549 case T_LONG: pextrq(dst, src, idx); break; 2550 2551 default: 2552 assert(false,"Should not reach here."); 2553 break; 2554 } 2555 } 2556 2557 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) { 2558 int esize = type2aelembytes(typ); 2559 int elem_per_lane = 16/esize; 2560 int lane = elemindex / elem_per_lane; 2561 int eindex = elemindex % elem_per_lane; 2562 2563 if (lane >= 2) { 2564 assert(UseAVX > 2, "required"); 2565 vextractf32x4(dst, src, lane & 3); 2566 return dst; 2567 } else if (lane > 0) { 2568 assert(UseAVX > 0, "required"); 2569 vextractf128(dst, src, lane); 2570 return dst; 2571 } else { 2572 return src; 2573 } 2574 } 2575 2576 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) { 2577 if (typ == T_BYTE) { 2578 movsbl(dst, dst); 2579 } else if (typ == T_SHORT) { 2580 movswl(dst, dst); 2581 } 2582 } 2583 2584 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) { 2585 int esize = type2aelembytes(typ); 2586 int elem_per_lane = 16/esize; 2587 int eindex = elemindex % elem_per_lane; 2588 assert(is_integral_type(typ),"required"); 2589 2590 if (eindex == 0) { 2591 if (typ == T_LONG) { 2592 movq(dst, src); 2593 } else { 2594 movdl(dst, src); 2595 movsxl(typ, dst); 2596 } 2597 } else { 2598 extract(typ, dst, src, eindex); 2599 movsxl(typ, dst); 2600 } 2601 } 2602 2603 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) { 
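// Extracts the float/double element at 'elemindex' into dst: element 0 is just a 64-bit move, other elements are shuffled (T_FLOAT) or byte-shifted (T_DOUBLE) down into the low position; for T_FLOAT the upper bits of dst are then masked off. The index is taken modulo the 128-bit lane, so the caller selects the lane first (see get_lane above).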
2604 int esize = type2aelembytes(typ); 2605 int elem_per_lane = 16/esize; 2606 int eindex = elemindex % elem_per_lane; 2607 assert((typ == T_FLOAT || typ == T_DOUBLE),"required"); 2608 2609 if (eindex == 0) { 2610 movq(dst, src); 2611 } else { 2612 if (typ == T_FLOAT) { 2613 if (UseAVX == 0) { 2614 movdqu(dst, src); 2615 shufps(dst, dst, eindex); 2616 } else { 2617 vshufps(dst, src, src, eindex, Assembler::AVX_128bit); 2618 } 2619 } else { 2620 if (UseAVX == 0) { 2621 movdqu(dst, src); 2622 psrldq(dst, eindex*esize); 2623 } else { 2624 vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit); 2625 } 2626 movq(dst, dst); 2627 } 2628 } 2629 // Zero upper bits 2630 if (typ == T_FLOAT) { 2631 if (UseAVX == 0) { 2632 assert(vtmp != xnoreg, "required."); 2633 movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg); 2634 pand(dst, vtmp); 2635 } else { 2636 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg); 2637 } 2638 } 2639 } 2640 2641 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) { 2642 switch(typ) { 2643 case T_BYTE: 2644 case T_BOOLEAN: 2645 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2646 break; 2647 case T_SHORT: 2648 case T_CHAR: 2649 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2650 break; 2651 case T_INT: 2652 case T_FLOAT: 2653 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2654 break; 2655 case T_LONG: 2656 case T_DOUBLE: 2657 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2658 break; 2659 default: 2660 assert(false,"Should not reach here."); 2661 break; 2662 } 2663 } 2664 2665 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) { 2666 assert(rscratch != noreg || always_reachable(src2), "missing"); 2667 2668 switch(typ) { 2669 case T_BOOLEAN: 2670 case T_BYTE: 2671 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2672 break; 2673 case T_CHAR: 2674 case T_SHORT: 2675 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2676 break; 2677 case T_INT: 2678 case T_FLOAT: 2679 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2680 break; 2681 case T_LONG: 2682 case T_DOUBLE: 2683 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2684 break; 2685 default: 2686 assert(false,"Should not reach here."); 2687 break; 2688 } 2689 } 2690 2691 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) { 2692 switch(typ) { 2693 case T_BYTE: 2694 evpblendmb(dst, kmask, src1, src2, merge, vector_len); 2695 break; 2696 case T_SHORT: 2697 evpblendmw(dst, kmask, src1, src2, merge, vector_len); 2698 break; 2699 case T_INT: 2700 case T_FLOAT: 2701 evpblendmd(dst, kmask, src1, src2, merge, vector_len); 2702 break; 2703 case T_LONG: 2704 case T_DOUBLE: 2705 evpblendmq(dst, kmask, src1, src2, merge, vector_len); 2706 break; 2707 default: 2708 assert(false,"Should not reach here."); 2709 break; 2710 } 2711 } 2712 2713 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) { 2714 
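// Sets EFLAGS from a vector test of src1 against src2: 32-byte vectors use vtestps/vptest directly; 16-byte vectors use vtestps (with AVX) or ptest; vectors shorter than 16 bytes first replicate the low 4 or 8 bytes of src1 with pshufd so the unused upper bytes cannot affect the result.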
assert(vlen_in_bytes <= 32, ""); 2715 int esize = type2aelembytes(bt); 2716 if (vlen_in_bytes == 32) { 2717 assert(vtmp == xnoreg, "required."); 2718 if (esize >= 4) { 2719 vtestps(src1, src2, AVX_256bit); 2720 } else { 2721 vptest(src1, src2, AVX_256bit); 2722 } 2723 return; 2724 } 2725 if (vlen_in_bytes < 16) { 2726 // Duplicate the lower part to fill the whole register, 2727 // Don't need to do so for src2 2728 assert(vtmp != xnoreg, "required"); 2729 int shuffle_imm = (vlen_in_bytes == 4) ? 0x00 : 0x04; 2730 pshufd(vtmp, src1, shuffle_imm); 2731 } else { 2732 assert(vtmp == xnoreg, "required"); 2733 vtmp = src1; 2734 } 2735 if (esize >= 4 && VM_Version::supports_avx()) { 2736 vtestps(vtmp, src2, AVX_128bit); 2737 } else { 2738 ptest(vtmp, src2); 2739 } 2740 } 2741 2742 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) { 2743 assert(UseAVX >= 2, "required"); 2744 #ifdef ASSERT 2745 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2746 bool is_bw_supported = VM_Version::supports_avx512bw(); 2747 if (is_bw && !is_bw_supported) { 2748 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2749 assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16), 2750 "XMM register should be 0-15"); 2751 } 2752 #endif // ASSERT 2753 switch (elem_bt) { 2754 case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return; 2755 case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return; 2756 case T_INT: vpaddd(dst, src1, src2, vlen_enc); return; 2757 case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return; 2758 case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return; 2759 case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return; 2760 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2761 } 2762 } 2763 2764 #ifdef _LP64 2765 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) { 2766 assert(UseAVX >= 2, "required"); 2767 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2768 bool is_vl = vlen_enc != Assembler::AVX_512bit; 2769 if ((UseAVX > 2) && 2770 (!is_bw || VM_Version::supports_avx512bw()) && 2771 (!is_vl || VM_Version::supports_avx512vl())) { 2772 switch (elem_bt) { 2773 case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return; 2774 case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return; 2775 case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return; 2776 case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return; 2777 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2778 } 2779 } else { 2780 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2781 assert((dst->encoding() < 16),"XMM register should be 0-15"); 2782 switch (elem_bt) { 2783 case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return; 2784 case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return; 2785 case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return; 2786 case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return; 2787 case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return; 2788 case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return; 2789 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2790 } 2791 } 2792 } 2793 #endif 2794 2795 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) { 2796 switch (to_elem_bt) { 2797 case T_SHORT: 2798 vpmovsxbw(dst, src, vlen_enc); 2799 
break; 2800 case T_INT: 2801 vpmovsxbd(dst, src, vlen_enc); 2802 break; 2803 case T_FLOAT: 2804 vpmovsxbd(dst, src, vlen_enc); 2805 vcvtdq2ps(dst, dst, vlen_enc); 2806 break; 2807 case T_LONG: 2808 vpmovsxbq(dst, src, vlen_enc); 2809 break; 2810 case T_DOUBLE: { 2811 int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit; 2812 vpmovsxbd(dst, src, mid_vlen_enc); 2813 vcvtdq2pd(dst, dst, vlen_enc); 2814 break; 2815 } 2816 default: 2817 fatal("Unsupported type %s", type2name(to_elem_bt)); 2818 break; 2819 } 2820 } 2821 2822 //------------------------------------------------------------------------------------------- 2823 2824 // IndexOf for constant substrings with size >= 8 chars 2825 // which don't need to be loaded through stack. 2826 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2, 2827 Register cnt1, Register cnt2, 2828 int int_cnt2, Register result, 2829 XMMRegister vec, Register tmp, 2830 int ae) { 2831 ShortBranchVerifier sbv(this); 2832 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2833 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2834 2835 // This method uses the pcmpestri instruction with bound registers 2836 // inputs: 2837 // xmm - substring 2838 // rax - substring length (elements count) 2839 // mem - scanned string 2840 // rdx - string length (elements count) 2841 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2842 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2843 // outputs: 2844 // rcx - matched index in string 2845 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2846 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2847 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2848 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2849 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1; 2850 2851 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, 2852 RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR, 2853 MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE; 2854 2855 // Note, inline_string_indexOf() generates checks: 2856 // if (substr.count > string.count) return -1; 2857 // if (substr.count == 0) return 0; 2858 assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars"); 2859 2860 // Load substring. 2861 if (ae == StrIntrinsicNode::UL) { 2862 pmovzxbw(vec, Address(str2, 0)); 2863 } else { 2864 movdqu(vec, Address(str2, 0)); 2865 } 2866 movl(cnt2, int_cnt2); 2867 movptr(result, str1); // string addr 2868 2869 if (int_cnt2 > stride) { 2870 jmpb(SCAN_TO_SUBSTR); 2871 2872 // Reload substr for rescan, this code 2873 // is executed only for large substrings (> 8 chars) 2874 bind(RELOAD_SUBSTR); 2875 if (ae == StrIntrinsicNode::UL) { 2876 pmovzxbw(vec, Address(str2, 0)); 2877 } else { 2878 movdqu(vec, Address(str2, 0)); 2879 } 2880 negptr(cnt2); // Jumped here with negative cnt2, convert to positive 2881 2882 bind(RELOAD_STR); 2883 // We came here after the beginning of the substring was 2884 // matched but the rest of it was not so we need to search 2885 // again. Start from the next element after the previous match. 2886 2887 // cnt2 is number of substring reminding elements and 2888 // cnt1 is number of string reminding elements when cmp failed. 
2889 // Restored cnt1 = cnt1 - cnt2 + int_cnt2 2890 subl(cnt1, cnt2); 2891 addl(cnt1, int_cnt2); 2892 movl(cnt2, int_cnt2); // Now restore cnt2 2893 2894 decrementl(cnt1); // Shift to next element 2895 cmpl(cnt1, cnt2); 2896 jcc(Assembler::negative, RET_NOT_FOUND); // Left less than substring 2897 2898 addptr(result, (1<<scale1)); 2899 2900 } // (int_cnt2 > 8) 2901 2902 // Scan string for start of substr in 16-byte vectors 2903 bind(SCAN_TO_SUBSTR); 2904 pcmpestri(vec, Address(result, 0), mode); 2905 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 2906 subl(cnt1, stride); 2907 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 2908 cmpl(cnt1, cnt2); 2909 jccb(Assembler::negative, RET_NOT_FOUND); // Left less than substring 2910 addptr(result, 16); 2911 jmpb(SCAN_TO_SUBSTR); 2912 2913 // Found a potential substr 2914 bind(FOUND_CANDIDATE); 2915 // Matched whole vector if first element matched (tmp(rcx) == 0). 2916 if (int_cnt2 == stride) { 2917 jccb(Assembler::overflow, RET_FOUND); // OF == 1 2918 } else { // int_cnt2 > 8 2919 jccb(Assembler::overflow, FOUND_SUBSTR); 2920 } 2921 // After pcmpestri tmp(rcx) contains matched element index 2922 // Compute start addr of substr 2923 lea(result, Address(result, tmp, scale1)); 2924 2925 // Make sure string is still long enough 2926 subl(cnt1, tmp); 2927 cmpl(cnt1, cnt2); 2928 if (int_cnt2 == stride) { 2929 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 2930 } else { // int_cnt2 > 8 2931 jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD); 2932 } 2933 // Left less than substring. 2934 2935 bind(RET_NOT_FOUND); 2936 movl(result, -1); 2937 jmp(EXIT); 2938 2939 if (int_cnt2 > stride) { 2940 // This code is optimized for the case when whole substring 2941 // is matched if its head is matched. 2942 bind(MATCH_SUBSTR_HEAD); 2943 pcmpestri(vec, Address(result, 0), mode); 2944 // Reload only string if it does not match 2945 jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0 2946 2947 Label CONT_SCAN_SUBSTR; 2948 // Compare the rest of substring (> 8 chars). 2949 bind(FOUND_SUBSTR); 2950 // First 8 chars are already matched.
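// The loop below scans the remaining substring with a negative index: cnt2 is set to -(cnt2 - stride), advanced by +stride each iteration, and scanning stops once it becomes non-negative, i.e. after the final chunk of the substring has been compared.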
2951 negptr(cnt2); 2952 addptr(cnt2, stride); 2953 2954 bind(SCAN_SUBSTR); 2955 subl(cnt1, stride); 2956 cmpl(cnt2, -stride); // Do not read beyond substring 2957 jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR); 2958 // Back-up strings to avoid reading beyond substring: 2959 // cnt1 = cnt1 - cnt2 + 8 2960 addl(cnt1, cnt2); // cnt2 is negative 2961 addl(cnt1, stride); 2962 movl(cnt2, stride); negptr(cnt2); 2963 bind(CONT_SCAN_SUBSTR); 2964 if (int_cnt2 < (int)G) { 2965 int tail_off1 = int_cnt2<<scale1; 2966 int tail_off2 = int_cnt2<<scale2; 2967 if (ae == StrIntrinsicNode::UL) { 2968 pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2)); 2969 } else { 2970 movdqu(vec, Address(str2, cnt2, scale2, tail_off2)); 2971 } 2972 pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode); 2973 } else { 2974 // calculate index in register to avoid integer overflow (int_cnt2*2) 2975 movl(tmp, int_cnt2); 2976 addptr(tmp, cnt2); 2977 if (ae == StrIntrinsicNode::UL) { 2978 pmovzxbw(vec, Address(str2, tmp, scale2, 0)); 2979 } else { 2980 movdqu(vec, Address(str2, tmp, scale2, 0)); 2981 } 2982 pcmpestri(vec, Address(result, tmp, scale1, 0), mode); 2983 } 2984 // Need to reload strings pointers if not matched whole vector 2985 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 2986 addptr(cnt2, stride); 2987 jcc(Assembler::negative, SCAN_SUBSTR); 2988 // Fall through if found full substring 2989 2990 } // (int_cnt2 > 8) 2991 2992 bind(RET_FOUND); 2993 // Found result if we matched full small substring. 2994 // Compute substr offset 2995 subptr(result, str1); 2996 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 2997 shrl(result, 1); // index 2998 } 2999 bind(EXIT); 3000 3001 } // string_indexofC8 3002 3003 // Small strings are loaded through stack if they cross page boundary. 3004 void C2_MacroAssembler::string_indexof(Register str1, Register str2, 3005 Register cnt1, Register cnt2, 3006 int int_cnt2, Register result, 3007 XMMRegister vec, Register tmp, 3008 int ae) { 3009 ShortBranchVerifier sbv(this); 3010 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3011 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 3012 3013 // 3014 // int_cnt2 is length of small (< 8 chars) constant substring 3015 // or (-1) for non constant substring in which case its length 3016 // is in cnt2 register. 3017 // 3018 // Note, inline_string_indexOf() generates checks: 3019 // if (substr.count > string.count) return -1; 3020 // if (substr.count == 0) return 0; 3021 // 3022 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 3023 assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0"); 3024 // This method uses the pcmpestri instruction with bound registers 3025 // inputs: 3026 // xmm - substring 3027 // rax - substring length (elements count) 3028 // mem - scanned string 3029 // rdx - string length (elements count) 3030 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 3031 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 3032 // outputs: 3033 // rcx - matched index in string 3034 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 3035 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 3036 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 3037 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? 
Address::times_1 : scale1; 3038 3039 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR, 3040 RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR, 3041 FOUND_CANDIDATE; 3042 3043 { //======================================================== 3044 // We don't know where these strings are located 3045 // and we can't read beyond them. Load them through the stack. 3046 Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR; 3047 3048 movptr(tmp, rsp); // save old SP 3049 3050 if (int_cnt2 > 0) { // small (< 8 chars) constant substring 3051 if (int_cnt2 == (1>>scale2)) { // One byte 3052 assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding"); 3053 load_unsigned_byte(result, Address(str2, 0)); 3054 movdl(vec, result); // move 32 bits 3055 } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) { // Three bytes 3056 // Not enough header space in 32-bit VM: 12+3 = 15. 3057 movl(result, Address(str2, -1)); 3058 shrl(result, 8); 3059 movdl(vec, result); // move 32 bits 3060 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) { // One char 3061 load_unsigned_short(result, Address(str2, 0)); 3062 movdl(vec, result); // move 32 bits 3063 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars 3064 movdl(vec, Address(str2, 0)); // move 32 bits 3065 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars 3066 movq(vec, Address(str2, 0)); // move 64 bits 3067 } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 = {2, ..., 7}) 3068 // Array header size is 12 bytes in 32-bit VM 3069 // + 6 bytes for 3 chars == 18 bytes, 3070 // enough space to load vec and shift. 3071 assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity"); 3072 if (ae == StrIntrinsicNode::UL) { 3073 int tail_off = int_cnt2-8; 3074 pmovzxbw(vec, Address(str2, tail_off)); 3075 psrldq(vec, -2*tail_off); 3076 } 3077 else { 3078 int tail_off = int_cnt2*(1<<scale2); 3079 movdqu(vec, Address(str2, tail_off-16)); 3080 psrldq(vec, 16-tail_off); 3081 } 3082 } 3083 } else { // not constant substring 3084 cmpl(cnt2, stride); 3085 jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough 3086 3087 // We can read beyond string if str+16 does not cross page boundary 3088 // since heaps are aligned and mapped by pages. 3089 assert(os::vm_page_size() < (int)G, "default page should be small"); 3090 movl(result, str2); // We need only low 32 bits 3091 andl(result, ((int)os::vm_page_size()-1)); 3092 cmpl(result, ((int)os::vm_page_size()-16)); 3093 jccb(Assembler::belowEqual, CHECK_STR); 3094 3095 // Move small strings to stack to allow load 16 bytes into vec. 3096 subptr(rsp, 16); 3097 int stk_offset = wordSize-(1<<scale2); 3098 push(cnt2); 3099 3100 bind(COPY_SUBSTR); 3101 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) { 3102 load_unsigned_byte(result, Address(str2, cnt2, scale2, -1)); 3103 movb(Address(rsp, cnt2, scale2, stk_offset), result); 3104 } else if (ae == StrIntrinsicNode::UU) { 3105 load_unsigned_short(result, Address(str2, cnt2, scale2, -2)); 3106 movw(Address(rsp, cnt2, scale2, stk_offset), result); 3107 } 3108 decrement(cnt2); 3109 jccb(Assembler::notZero, COPY_SUBSTR); 3110 3111 pop(cnt2); 3112 movptr(str2, rsp); // New substring address 3113 } // non constant 3114 3115 bind(CHECK_STR); 3116 cmpl(cnt1, stride); 3117 jccb(Assembler::aboveEqual, BIG_STRINGS); 3118 3119 // Check cross page boundary.
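// Added commentary (illustrative, assuming 4 KiB pages): the check below keeps only the
// page-offset bits of the address; if that offset is at most page_size-16, a 16-byte load
// starting there cannot spill into the next page (e.g. offset 0xFF0 reads bytes 0xFF0..0xFFF),
// so the short string can be read in place and the copy to the stack is skipped.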
3120 movl(result, str1); // We need only low 32 bits 3121 andl(result, ((int)os::vm_page_size()-1)); 3122 cmpl(result, ((int)os::vm_page_size()-16)); 3123 jccb(Assembler::belowEqual, BIG_STRINGS); 3124 3125 subptr(rsp, 16); 3126 int stk_offset = -(1<<scale1); 3127 if (int_cnt2 < 0) { // not constant 3128 push(cnt2); 3129 stk_offset += wordSize; 3130 } 3131 movl(cnt2, cnt1); 3132 3133 bind(COPY_STR); 3134 if (ae == StrIntrinsicNode::LL) { 3135 load_unsigned_byte(result, Address(str1, cnt2, scale1, -1)); 3136 movb(Address(rsp, cnt2, scale1, stk_offset), result); 3137 } else { 3138 load_unsigned_short(result, Address(str1, cnt2, scale1, -2)); 3139 movw(Address(rsp, cnt2, scale1, stk_offset), result); 3140 } 3141 decrement(cnt2); 3142 jccb(Assembler::notZero, COPY_STR); 3143 3144 if (int_cnt2 < 0) { // not constant 3145 pop(cnt2); 3146 } 3147 movptr(str1, rsp); // New string address 3148 3149 bind(BIG_STRINGS); 3150 // Load substring. 3151 if (int_cnt2 < 0) { // -1 3152 if (ae == StrIntrinsicNode::UL) { 3153 pmovzxbw(vec, Address(str2, 0)); 3154 } else { 3155 movdqu(vec, Address(str2, 0)); 3156 } 3157 push(cnt2); // substr count 3158 push(str2); // substr addr 3159 push(str1); // string addr 3160 } else { 3161 // Small (< 8 chars) constant substrings are loaded already. 3162 movl(cnt2, int_cnt2); 3163 } 3164 push(tmp); // original SP 3165 3166 } // Finished loading 3167 3168 //======================================================== 3169 // Start search 3170 // 3171 3172 movptr(result, str1); // string addr 3173 3174 if (int_cnt2 < 0) { // Only for non constant substring 3175 jmpb(SCAN_TO_SUBSTR); 3176 3177 // SP saved at sp+0 3178 // String saved at sp+1*wordSize 3179 // Substr saved at sp+2*wordSize 3180 // Substr count saved at sp+3*wordSize 3181 3182 // Reload substr for rescan, this code 3183 // is executed only for large substrings (> 8 chars) 3184 bind(RELOAD_SUBSTR); 3185 movptr(str2, Address(rsp, 2*wordSize)); 3186 movl(cnt2, Address(rsp, 3*wordSize)); 3187 if (ae == StrIntrinsicNode::UL) { 3188 pmovzxbw(vec, Address(str2, 0)); 3189 } else { 3190 movdqu(vec, Address(str2, 0)); 3191 } 3192 // We came here after the beginning of the substring was 3193 // matched but the rest of it was not so we need to search 3194 // again. Start from the next element after the previous match. 3195 subptr(str1, result); // Restore counter 3196 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3197 shrl(str1, 1); 3198 } 3199 addl(cnt1, str1); 3200 decrementl(cnt1); // Shift to next element 3201 cmpl(cnt1, cnt2); 3202 jcc(Assembler::negative, RET_NOT_FOUND); // Left less than substring 3203 3204 addptr(result, (1<<scale1)); 3205 } // non constant 3206 3207 // Scan string for start of substr in 16-byte vectors 3208 bind(SCAN_TO_SUBSTR); 3209 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 3210 pcmpestri(vec, Address(result, 0), mode); 3211 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 3212 subl(cnt1, stride); 3213 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 3214 cmpl(cnt1, cnt2); 3215 jccb(Assembler::negative, RET_NOT_FOUND); // Left less than substring 3216 addptr(result, 16); 3217 3218 bind(ADJUST_STR); 3219 cmpl(cnt1, stride); // Do not read beyond string 3220 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 3221 // Back-up string to avoid reading beyond string.
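// Added commentary (illustrative): instead of reading past the end of the string, result is
// rewound below so that the next 16-byte scan window ends exactly at the string's last
// element (result += cnt1*element_size - 16), and cnt1 is reset to a full stride; a few
// already scanned elements get re-examined, which does not affect correctness.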
3222 lea(result, Address(result, cnt1, scale1, -16)); 3223 movl(cnt1, stride); 3224 jmpb(SCAN_TO_SUBSTR); 3225 3226 // Found a potential substr 3227 bind(FOUND_CANDIDATE); 3228 // After pcmpestri tmp(rcx) contains matched element index 3229 3230 // Make sure string is still long enough 3231 subl(cnt1, tmp); 3232 cmpl(cnt1, cnt2); 3233 jccb(Assembler::greaterEqual, FOUND_SUBSTR); 3234 // Left less than substring. 3235 3236 bind(RET_NOT_FOUND); 3237 movl(result, -1); 3238 jmp(CLEANUP); 3239 3240 bind(FOUND_SUBSTR); 3241 // Compute start addr of substr 3242 lea(result, Address(result, tmp, scale1)); 3243 if (int_cnt2 > 0) { // Constant substring 3244 // Repeat search for small substring (< 8 chars) 3245 // from new point without reloading substring. 3246 // Have to check that we don't read beyond string. 3247 cmpl(tmp, stride-int_cnt2); 3248 jccb(Assembler::greater, ADJUST_STR); 3249 // Fall through if matched whole substring. 3250 } else { // non constant 3251 assert(int_cnt2 == -1, "should be != 0"); 3252 3253 addl(tmp, cnt2); 3254 // Found result if we matched whole substring. 3255 cmpl(tmp, stride); 3256 jcc(Assembler::lessEqual, RET_FOUND); 3257 3258 // Repeat search for small substring (<= 8 chars) 3259 // from new point 'str1' without reloading substring. 3260 cmpl(cnt2, stride); 3261 // Have to check that we don't read beyond string. 3262 jccb(Assembler::lessEqual, ADJUST_STR); 3263 3264 Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG; 3265 // Compare the rest of substring (> 8 chars). 3266 movptr(str1, result); 3267 3268 cmpl(tmp, cnt2); 3269 // First 8 chars are already matched. 3270 jccb(Assembler::equal, CHECK_NEXT); 3271 3272 bind(SCAN_SUBSTR); 3273 pcmpestri(vec, Address(str1, 0), mode); 3274 // Need to reload the string pointers if the whole vector did not match 3275 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 3276 3277 bind(CHECK_NEXT); 3278 subl(cnt2, stride); 3279 jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring 3280 addptr(str1, 16); 3281 if (ae == StrIntrinsicNode::UL) { 3282 addptr(str2, 8); 3283 } else { 3284 addptr(str2, 16); 3285 } 3286 subl(cnt1, stride); 3287 cmpl(cnt2, stride); // Do not read beyond substring 3288 jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR); 3289 // Back-up strings to avoid reading beyond substring.
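// Added commentary (illustrative): fewer than 'stride' substring elements remain, so both
// pointers are rewound below such that the final full-width load ends exactly at the last
// substring element (str1/str2 move back by (stride - cnt2) elements), with cnt1 and cnt2
// adjusted to match; re-comparing a few already matched elements is harmless.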
3290 3291 if (ae == StrIntrinsicNode::UL) { 3292 lea(str2, Address(str2, cnt2, scale2, -8)); 3293 lea(str1, Address(str1, cnt2, scale1, -16)); 3294 } else { 3295 lea(str2, Address(str2, cnt2, scale2, -16)); 3296 lea(str1, Address(str1, cnt2, scale1, -16)); 3297 } 3298 subl(cnt1, cnt2); 3299 movl(cnt2, stride); 3300 addl(cnt1, stride); 3301 bind(CONT_SCAN_SUBSTR); 3302 if (ae == StrIntrinsicNode::UL) { 3303 pmovzxbw(vec, Address(str2, 0)); 3304 } else { 3305 movdqu(vec, Address(str2, 0)); 3306 } 3307 jmp(SCAN_SUBSTR); 3308 3309 bind(RET_FOUND_LONG); 3310 movptr(str1, Address(rsp, wordSize)); 3311 } // non constant 3312 3313 bind(RET_FOUND); 3314 // Compute substr offset 3315 subptr(result, str1); 3316 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3317 shrl(result, 1); // index 3318 } 3319 bind(CLEANUP); 3320 pop(rsp); // restore SP 3321 3322 } // string_indexof 3323 3324 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3325 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3326 ShortBranchVerifier sbv(this); 3327 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3328 3329 int stride = 8; 3330 3331 Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP, 3332 SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP, 3333 RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT, 3334 FOUND_SEQ_CHAR, DONE_LABEL; 3335 3336 movptr(result, str1); 3337 if (UseAVX >= 2) { 3338 cmpl(cnt1, stride); 3339 jcc(Assembler::less, SCAN_TO_CHAR); 3340 cmpl(cnt1, 2*stride); 3341 jcc(Assembler::less, SCAN_TO_8_CHAR_INIT); 3342 movdl(vec1, ch); 3343 vpbroadcastw(vec1, vec1, Assembler::AVX_256bit); 3344 vpxor(vec2, vec2); 3345 movl(tmp, cnt1); 3346 andl(tmp, 0xFFFFFFF0); //vector count (in chars) 3347 andl(cnt1,0x0000000F); //tail count (in chars) 3348 3349 bind(SCAN_TO_16_CHAR_LOOP); 3350 vmovdqu(vec3, Address(result, 0)); 3351 vpcmpeqw(vec3, vec3, vec1, 1); 3352 vptest(vec2, vec3); 3353 jcc(Assembler::carryClear, FOUND_CHAR); 3354 addptr(result, 32); 3355 subl(tmp, 2*stride); 3356 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); 3357 jmp(SCAN_TO_8_CHAR); 3358 bind(SCAN_TO_8_CHAR_INIT); 3359 movdl(vec1, ch); 3360 pshuflw(vec1, vec1, 0x00); 3361 pshufd(vec1, vec1, 0); 3362 pxor(vec2, vec2); 3363 } 3364 bind(SCAN_TO_8_CHAR); 3365 cmpl(cnt1, stride); 3366 jcc(Assembler::less, SCAN_TO_CHAR); 3367 if (UseAVX < 2) { 3368 movdl(vec1, ch); 3369 pshuflw(vec1, vec1, 0x00); 3370 pshufd(vec1, vec1, 0); 3371 pxor(vec2, vec2); 3372 } 3373 movl(tmp, cnt1); 3374 andl(tmp, 0xFFFFFFF8); //vector count (in chars) 3375 andl(cnt1,0x00000007); //tail count (in chars) 3376 3377 bind(SCAN_TO_8_CHAR_LOOP); 3378 movdqu(vec3, Address(result, 0)); 3379 pcmpeqw(vec3, vec1); 3380 ptest(vec2, vec3); 3381 jcc(Assembler::carryClear, FOUND_CHAR); 3382 addptr(result, 16); 3383 subl(tmp, stride); 3384 jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP); 3385 bind(SCAN_TO_CHAR); 3386 testl(cnt1, cnt1); 3387 jcc(Assembler::zero, RET_NOT_FOUND); 3388 bind(SCAN_TO_CHAR_LOOP); 3389 load_unsigned_short(tmp, Address(result, 0)); 3390 cmpl(ch, tmp); 3391 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3392 addptr(result, 2); 3393 subl(cnt1, 1); 3394 jccb(Assembler::zero, RET_NOT_FOUND); 3395 jmp(SCAN_TO_CHAR_LOOP); 3396 3397 bind(RET_NOT_FOUND); 3398 movl(result, -1); 3399 jmpb(DONE_LABEL); 3400 3401 bind(FOUND_CHAR); 3402 if (UseAVX >= 2) { 3403 vpmovmskb(tmp, vec3); 3404 } else { 3405 pmovmskb(tmp, vec3); 3406 } 3407 bsfl(ch, tmp); 3408 addptr(result, ch); 3409 3410 bind(FOUND_SEQ_CHAR); 3411 
subptr(result, str1); 3412 shrl(result, 1); 3413 3414 bind(DONE_LABEL); 3415 } // string_indexof_char 3416 3417 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3418 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3419 ShortBranchVerifier sbv(this); 3420 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3421 3422 int stride = 16; 3423 3424 Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP, 3425 SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP, 3426 RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT, 3427 FOUND_SEQ_CHAR, DONE_LABEL; 3428 3429 movptr(result, str1); 3430 if (UseAVX >= 2) { 3431 cmpl(cnt1, stride); 3432 jcc(Assembler::less, SCAN_TO_CHAR_INIT); 3433 cmpl(cnt1, stride*2); 3434 jcc(Assembler::less, SCAN_TO_16_CHAR_INIT); 3435 movdl(vec1, ch); 3436 vpbroadcastb(vec1, vec1, Assembler::AVX_256bit); 3437 vpxor(vec2, vec2); 3438 movl(tmp, cnt1); 3439 andl(tmp, 0xFFFFFFE0); //vector count (in chars) 3440 andl(cnt1,0x0000001F); //tail count (in chars) 3441 3442 bind(SCAN_TO_32_CHAR_LOOP); 3443 vmovdqu(vec3, Address(result, 0)); 3444 vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit); 3445 vptest(vec2, vec3); 3446 jcc(Assembler::carryClear, FOUND_CHAR); 3447 addptr(result, 32); 3448 subl(tmp, stride*2); 3449 jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP); 3450 jmp(SCAN_TO_16_CHAR); 3451 3452 bind(SCAN_TO_16_CHAR_INIT); 3453 movdl(vec1, ch); 3454 pxor(vec2, vec2); 3455 pshufb(vec1, vec2); 3456 } 3457 3458 bind(SCAN_TO_16_CHAR); 3459 cmpl(cnt1, stride); 3460 jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left 3461 if (UseAVX < 2) { 3462 movdl(vec1, ch); 3463 pxor(vec2, vec2); 3464 pshufb(vec1, vec2); 3465 } 3466 movl(tmp, cnt1); 3467 andl(tmp, 0xFFFFFFF0); //vector count (in bytes) 3468 andl(cnt1,0x0000000F); //tail count (in bytes) 3469 3470 bind(SCAN_TO_16_CHAR_LOOP); 3471 movdqu(vec3, Address(result, 0)); 3472 pcmpeqb(vec3, vec1); 3473 ptest(vec2, vec3); 3474 jcc(Assembler::carryClear, FOUND_CHAR); 3475 addptr(result, 16); 3476 subl(tmp, stride); 3477 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items... 
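// Added commentary (PTEST/VPTEST semantics, stated for clarity): vec2 is kept all-zero, so
// ptest(vec2, vec3) sets CF only when vec3 is all-zero, i.e. when pcmpeqb/pcmpeqw found no
// matching lane; the 'carryClear' branches above therefore fire exactly when at least one
// lane of the scanned chunk equals the searched character.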
3478 3479 bind(SCAN_TO_CHAR_INIT); 3480 testl(cnt1, cnt1); 3481 jcc(Assembler::zero, RET_NOT_FOUND); 3482 bind(SCAN_TO_CHAR_LOOP); 3483 load_unsigned_byte(tmp, Address(result, 0)); 3484 cmpl(ch, tmp); 3485 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3486 addptr(result, 1); 3487 subl(cnt1, 1); 3488 jccb(Assembler::zero, RET_NOT_FOUND); 3489 jmp(SCAN_TO_CHAR_LOOP); 3490 3491 bind(RET_NOT_FOUND); 3492 movl(result, -1); 3493 jmpb(DONE_LABEL); 3494 3495 bind(FOUND_CHAR); 3496 if (UseAVX >= 2) { 3497 vpmovmskb(tmp, vec3); 3498 } else { 3499 pmovmskb(tmp, vec3); 3500 } 3501 bsfl(ch, tmp); 3502 addptr(result, ch); 3503 3504 bind(FOUND_SEQ_CHAR); 3505 subptr(result, str1); 3506 3507 bind(DONE_LABEL); 3508 } // stringL_indexof_char 3509 3510 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) { 3511 switch (eltype) { 3512 case T_BOOLEAN: return sizeof(jboolean); 3513 case T_BYTE: return sizeof(jbyte); 3514 case T_SHORT: return sizeof(jshort); 3515 case T_CHAR: return sizeof(jchar); 3516 case T_INT: return sizeof(jint); 3517 default: 3518 ShouldNotReachHere(); 3519 return -1; 3520 } 3521 } 3522 3523 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) { 3524 switch (eltype) { 3525 // T_BOOLEAN used as surrogate for unsigned byte 3526 case T_BOOLEAN: movzbl(dst, src); break; 3527 case T_BYTE: movsbl(dst, src); break; 3528 case T_SHORT: movswl(dst, src); break; 3529 case T_CHAR: movzwl(dst, src); break; 3530 case T_INT: movl(dst, src); break; 3531 default: 3532 ShouldNotReachHere(); 3533 } 3534 } 3535 3536 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) { 3537 load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8); 3538 } 3539 3540 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) { 3541 load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8); 3542 } 3543 3544 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) { 3545 const int vlen = Assembler::AVX_256bit; 3546 switch (eltype) { 3547 case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3548 case T_BYTE: vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3549 case T_SHORT: vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3550 case T_CHAR: vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3551 case T_INT: 3552 // do nothing 3553 break; 3554 default: 3555 ShouldNotReachHere(); 3556 } 3557 } 3558 3559 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result, 3560 Register index, Register tmp2, Register tmp3, XMMRegister vnext, 3561 XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3, 3562 XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3, 3563 XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3, 3564 BasicType eltype) { 3565 ShortBranchVerifier sbv(this); 3566 assert(UseAVX >= 2, "AVX2 intrinsics are required"); 3567 assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3); 3568 assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3); 3569 3570 Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN, 3571 SHORT_UNROLLED_LOOP_EXIT, 3572 UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME, 3573 UNROLLED_VECTOR_LOOP_BEGIN, 3574 END; 3575 switch (eltype) { 3576 case T_BOOLEAN: 
BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break; 3577 case T_CHAR: BLOCK_COMMENT("arrays_hashcode(char) {"); break; 3578 case T_BYTE: BLOCK_COMMENT("arrays_hashcode(byte) {"); break; 3579 case T_SHORT: BLOCK_COMMENT("arrays_hashcode(short) {"); break; 3580 case T_INT: BLOCK_COMMENT("arrays_hashcode(int) {"); break; 3581 default: BLOCK_COMMENT("arrays_hashcode {"); break; 3582 } 3583 3584 // For "renaming" for readibility of the code 3585 const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 }, 3586 vresult[] = { vresult0, vresult1, vresult2, vresult3 }, 3587 vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 }; 3588 3589 const int elsize = arrays_hashcode_elsize(eltype); 3590 3591 /* 3592 if (cnt1 >= 2) { 3593 if (cnt1 >= 32) { 3594 UNROLLED VECTOR LOOP 3595 } 3596 UNROLLED SCALAR LOOP 3597 } 3598 SINGLE SCALAR 3599 */ 3600 3601 cmpl(cnt1, 32); 3602 jcc(Assembler::less, SHORT_UNROLLED_BEGIN); 3603 3604 // cnt1 >= 32 && generate_vectorized_loop 3605 xorl(index, index); 3606 3607 // vresult = IntVector.zero(I256); 3608 for (int idx = 0; idx < 4; idx++) { 3609 vpxor(vresult[idx], vresult[idx]); 3610 } 3611 // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]); 3612 Register bound = tmp2; 3613 Register next = tmp3; 3614 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint)))); 3615 movl(next, Address(tmp2, 0)); 3616 movdl(vnext, next); 3617 vpbroadcastd(vnext, vnext, Assembler::AVX_256bit); 3618 3619 // index = 0; 3620 // bound = cnt1 & ~(32 - 1); 3621 movl(bound, cnt1); 3622 andl(bound, ~(32 - 1)); 3623 // for (; index < bound; index += 32) { 3624 bind(UNROLLED_VECTOR_LOOP_BEGIN); 3625 // result *= next; 3626 imull(result, next); 3627 // loop fission to upfront the cost of fetching from memory, OOO execution 3628 // can then hopefully do a better job of prefetching 3629 for (int idx = 0; idx < 4; idx++) { 3630 arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype); 3631 } 3632 // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7]; 3633 for (int idx = 0; idx < 4; idx++) { 3634 vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit); 3635 arrays_hashcode_elvcast(vtmp[idx], eltype); 3636 vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit); 3637 } 3638 // index += 32; 3639 addl(index, 32); 3640 // index < bound; 3641 cmpl(index, bound); 3642 jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN); 3643 // } 3644 3645 lea(ary1, Address(ary1, bound, Address::times(elsize))); 3646 subl(cnt1, bound); 3647 // release bound 3648 3649 // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1); 3650 for (int idx = 0; idx < 4; idx++) { 3651 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint)))); 3652 arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT); 3653 vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit); 3654 } 3655 // result += vresult.reduceLanes(ADD); 3656 for (int idx = 0; idx < 4; idx++) { 3657 reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]); 3658 } 3659 3660 // } else if (cnt1 < 32) { 3661 3662 bind(SHORT_UNROLLED_BEGIN); 3663 // int i = 1; 3664 movl(index, 1); 3665 cmpl(index, cnt1); 3666 jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT); 3667 3668 // for (; i < cnt1 ; i += 2) { 3669 bind(SHORT_UNROLLED_LOOP_BEGIN); 3670 movl(tmp3, 961); 3671 imull(result, tmp3); 3672 
arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3673 movl(tmp3, tmp2); 3674 shll(tmp3, 5); 3675 subl(tmp3, tmp2); 3676 addl(result, tmp3); 3677 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype); 3678 addl(result, tmp3); 3679 addl(index, 2); 3680 cmpl(index, cnt1); 3681 jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN); 3682 3683 // } 3684 // if (i >= cnt1) { 3685 bind(SHORT_UNROLLED_LOOP_EXIT); 3686 jccb(Assembler::greater, END); 3687 movl(tmp2, result); 3688 shll(result, 5); 3689 subl(result, tmp2); 3690 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3691 addl(result, tmp3); 3692 // } 3693 bind(END); 3694 3695 BLOCK_COMMENT("} // arrays_hashcode"); 3696 3697 } // arrays_hashcode 3698 3699 // helper function for string_compare 3700 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2, 3701 Address::ScaleFactor scale, Address::ScaleFactor scale1, 3702 Address::ScaleFactor scale2, Register index, int ae) { 3703 if (ae == StrIntrinsicNode::LL) { 3704 load_unsigned_byte(elem1, Address(str1, index, scale, 0)); 3705 load_unsigned_byte(elem2, Address(str2, index, scale, 0)); 3706 } else if (ae == StrIntrinsicNode::UU) { 3707 load_unsigned_short(elem1, Address(str1, index, scale, 0)); 3708 load_unsigned_short(elem2, Address(str2, index, scale, 0)); 3709 } else { 3710 load_unsigned_byte(elem1, Address(str1, index, scale1, 0)); 3711 load_unsigned_short(elem2, Address(str2, index, scale2, 0)); 3712 } 3713 } 3714 3715 // Compare strings, used for char[] and byte[]. 3716 void C2_MacroAssembler::string_compare(Register str1, Register str2, 3717 Register cnt1, Register cnt2, Register result, 3718 XMMRegister vec1, int ae, KRegister mask) { 3719 ShortBranchVerifier sbv(this); 3720 Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL; 3721 Label COMPARE_WIDE_VECTORS_LOOP_FAILED; // used only _LP64 && AVX3 3722 int stride, stride2, adr_stride, adr_stride1, adr_stride2; 3723 int stride2x2 = 0x40; 3724 Address::ScaleFactor scale = Address::no_scale; 3725 Address::ScaleFactor scale1 = Address::no_scale; 3726 Address::ScaleFactor scale2 = Address::no_scale; 3727 3728 if (ae != StrIntrinsicNode::LL) { 3729 stride2x2 = 0x20; 3730 } 3731 3732 if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) { 3733 shrl(cnt2, 1); 3734 } 3735 // Compute the minimum of the string lengths and the 3736 // difference of the string lengths (stack). 3737 // Do the conditional move stuff 3738 movl(result, cnt1); 3739 subl(cnt1, cnt2); 3740 push(cnt1); 3741 cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2) 3742 3743 // Is the minimum length zero? 
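// Added commentary (rough Java-like sketch, not part of the original): the remainder of this
// routine computes, in vectorized form, approximately
//   for (int i = 0; i < min(len1, len2); i++) {
//     if (a[i] != b[i]) return a[i] - b[i];
//   }
//   return len1 - len2;   // the difference pushed above and popped at LENGTH_DIFF_LABEL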
3744 testl(cnt2, cnt2); 3745 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3746 if (ae == StrIntrinsicNode::LL) { 3747 // Load first bytes 3748 load_unsigned_byte(result, Address(str1, 0)); // result = str1[0] 3749 load_unsigned_byte(cnt1, Address(str2, 0)); // cnt1 = str2[0] 3750 } else if (ae == StrIntrinsicNode::UU) { 3751 // Load first characters 3752 load_unsigned_short(result, Address(str1, 0)); 3753 load_unsigned_short(cnt1, Address(str2, 0)); 3754 } else { 3755 load_unsigned_byte(result, Address(str1, 0)); 3756 load_unsigned_short(cnt1, Address(str2, 0)); 3757 } 3758 subl(result, cnt1); 3759 jcc(Assembler::notZero, POP_LABEL); 3760 3761 if (ae == StrIntrinsicNode::UU) { 3762 // Divide length by 2 to get number of chars 3763 shrl(cnt2, 1); 3764 } 3765 cmpl(cnt2, 1); 3766 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3767 3768 // Check if the strings start at the same location and setup scale and stride 3769 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3770 cmpptr(str1, str2); 3771 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3772 if (ae == StrIntrinsicNode::LL) { 3773 scale = Address::times_1; 3774 stride = 16; 3775 } else { 3776 scale = Address::times_2; 3777 stride = 8; 3778 } 3779 } else { 3780 scale1 = Address::times_1; 3781 scale2 = Address::times_2; 3782 // scale not used 3783 stride = 8; 3784 } 3785 3786 if (UseAVX >= 2 && UseSSE42Intrinsics) { 3787 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR; 3788 Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR; 3789 Label COMPARE_WIDE_VECTORS_LOOP_AVX2; 3790 Label COMPARE_TAIL_LONG; 3791 Label COMPARE_WIDE_VECTORS_LOOP_AVX3; // used only _LP64 && AVX3 3792 3793 int pcmpmask = 0x19; 3794 if (ae == StrIntrinsicNode::LL) { 3795 pcmpmask &= ~0x01; 3796 } 3797 3798 // Setup to compare 16-chars (32-bytes) vectors, 3799 // start from first character again because it has aligned address. 3800 if (ae == StrIntrinsicNode::LL) { 3801 stride2 = 32; 3802 } else { 3803 stride2 = 16; 3804 } 3805 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3806 adr_stride = stride << scale; 3807 } else { 3808 adr_stride1 = 8; //stride << scale1; 3809 adr_stride2 = 16; //stride << scale2; 3810 } 3811 3812 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3813 // rax and rdx are used by pcmpestri as elements counters 3814 movl(result, cnt2); 3815 andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count 3816 jcc(Assembler::zero, COMPARE_TAIL_LONG); 3817 3818 // fast path : compare first 2 8-char vectors. 
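// Added commentary (illustrative): the two pcmpestri probes below examine the first two
// stride-sized chunks of both strings before entering the wide vpxor/vptest loop; on a
// mismatch, rcx receives the index of the first differing element and control jumps to
// COMPARE_INDEX_CHAR to return the signed difference of the two elements.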
3819 bind(COMPARE_16_CHARS); 3820 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3821 movdqu(vec1, Address(str1, 0)); 3822 } else { 3823 pmovzxbw(vec1, Address(str1, 0)); 3824 } 3825 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3826 jccb(Assembler::below, COMPARE_INDEX_CHAR); 3827 3828 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3829 movdqu(vec1, Address(str1, adr_stride)); 3830 pcmpestri(vec1, Address(str2, adr_stride), pcmpmask); 3831 } else { 3832 pmovzxbw(vec1, Address(str1, adr_stride1)); 3833 pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask); 3834 } 3835 jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS); 3836 addl(cnt1, stride); 3837 3838 // Compare the characters at index in cnt1 3839 bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character 3840 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 3841 subl(result, cnt2); 3842 jmp(POP_LABEL); 3843 3844 // Setup the registers to start vector comparison loop 3845 bind(COMPARE_WIDE_VECTORS); 3846 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3847 lea(str1, Address(str1, result, scale)); 3848 lea(str2, Address(str2, result, scale)); 3849 } else { 3850 lea(str1, Address(str1, result, scale1)); 3851 lea(str2, Address(str2, result, scale2)); 3852 } 3853 subl(result, stride2); 3854 subl(cnt2, stride2); 3855 jcc(Assembler::zero, COMPARE_WIDE_TAIL); 3856 negptr(result); 3857 3858 // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest) 3859 bind(COMPARE_WIDE_VECTORS_LOOP); 3860 3861 #ifdef _LP64 3862 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 3863 cmpl(cnt2, stride2x2); 3864 jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2); 3865 testl(cnt2, stride2x2-1); // cnt2 holds the vector count 3866 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2); // means we cannot subtract by 0x40 3867 3868 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 3869 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3870 evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit); 3871 evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3872 } else { 3873 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit); 3874 evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3875 } 3876 kortestql(mask, mask); 3877 jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED); // miscompare 3878 addptr(result, stride2x2); // update since we already compared at this addr 3879 subl(cnt2, stride2x2); // and sub the size too 3880 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3); 3881 3882 vpxor(vec1, vec1); 3883 jmpb(COMPARE_WIDE_TAIL); 3884 }//if (VM_Version::supports_avx512vlbw()) 3885 #endif // _LP64 3886 3887 3888 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3889 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3890 vmovdqu(vec1, Address(str1, result, scale)); 3891 vpxor(vec1, Address(str2, result, scale)); 3892 } else { 3893 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit); 3894 vpxor(vec1, Address(str2, result, scale2)); 3895 } 3896 vptest(vec1, vec1); 3897 jcc(Assembler::notZero, VECTOR_NOT_EQUAL); 3898 addptr(result, stride2); 3899 subl(cnt2, stride2); 3900 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP); 3901 // clean upper bits of YMM registers 
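// Added commentary (assumed rationale, not stated in the original): clearing the full YMM
// register here avoids leaving dirty upper bits live across the legacy-SSE pcmpestri/movdqu
// sequences used elsewhere in this method, which could otherwise incur AVX/SSE transition
// penalties on some microarchitectures.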
3902 vpxor(vec1, vec1); 3903 3904 // compare wide vectors tail 3905 bind(COMPARE_WIDE_TAIL); 3906 testptr(result, result); 3907 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3908 3909 movl(result, stride2); 3910 movl(cnt2, result); 3911 negptr(result); 3912 jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3913 3914 // Identifies the mismatching (higher or lower) 16 bytes in the 32-byte vectors. 3915 bind(VECTOR_NOT_EQUAL); 3916 // clean upper bits of YMM registers 3917 vpxor(vec1, vec1); 3918 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3919 lea(str1, Address(str1, result, scale)); 3920 lea(str2, Address(str2, result, scale)); 3921 } else { 3922 lea(str1, Address(str1, result, scale1)); 3923 lea(str2, Address(str2, result, scale2)); 3924 } 3925 jmp(COMPARE_16_CHARS); 3926 3927 // Compare tail chars, length between 1 and 15 chars 3928 bind(COMPARE_TAIL_LONG); 3929 movl(cnt2, result); 3930 cmpl(cnt2, stride); 3931 jcc(Assembler::less, COMPARE_SMALL_STR); 3932 3933 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3934 movdqu(vec1, Address(str1, 0)); 3935 } else { 3936 pmovzxbw(vec1, Address(str1, 0)); 3937 } 3938 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3939 jcc(Assembler::below, COMPARE_INDEX_CHAR); 3940 subptr(cnt2, stride); 3941 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3942 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3943 lea(str1, Address(str1, result, scale)); 3944 lea(str2, Address(str2, result, scale)); 3945 } else { 3946 lea(str1, Address(str1, result, scale1)); 3947 lea(str2, Address(str2, result, scale2)); 3948 } 3949 negptr(cnt2); 3950 jmpb(WHILE_HEAD_LABEL); 3951 3952 bind(COMPARE_SMALL_STR); 3953 } else if (UseSSE42Intrinsics) { 3954 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL; 3955 int pcmpmask = 0x19; 3956 // Setup to compare 8-char (16-byte) vectors, 3957 // start from first character again because it has aligned address.
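// Added commentary (PCMPESTRI imm8 decoding, for reference): 0x19 = 0b011001 selects unsigned
// words (bits 1:0 = 01), the 'equal each' string compare (bits 3:2 = 10) and negative polarity
// (bits 5:4 = 01), so rcx receives the index of the first element that does NOT match;
// clearing bit 0 (giving 0x18) switches the element type to unsigned bytes for LL.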
3958 movl(result, cnt2); 3959 andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count 3960 if (ae == StrIntrinsicNode::LL) { 3961 pcmpmask &= ~0x01; 3962 } 3963 jcc(Assembler::zero, COMPARE_TAIL); 3964 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3965 lea(str1, Address(str1, result, scale)); 3966 lea(str2, Address(str2, result, scale)); 3967 } else { 3968 lea(str1, Address(str1, result, scale1)); 3969 lea(str2, Address(str2, result, scale2)); 3970 } 3971 negptr(result); 3972 3973 // pcmpestri 3974 // inputs: 3975 // vec1- substring 3976 // rax - negative string length (elements count) 3977 // mem - scanned string 3978 // rdx - string length (elements count) 3979 // pcmpmask - cmp mode: 11000 (string compare with negated result) 3980 // + 00 (unsigned bytes) or + 01 (unsigned shorts) 3981 // outputs: 3982 // rcx - first mismatched element index 3983 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3984 3985 bind(COMPARE_WIDE_VECTORS); 3986 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3987 movdqu(vec1, Address(str1, result, scale)); 3988 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 3989 } else { 3990 pmovzxbw(vec1, Address(str1, result, scale1)); 3991 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 3992 } 3993 // After pcmpestri cnt1(rcx) contains mismatched element index 3994 3995 jccb(Assembler::below, VECTOR_NOT_EQUAL); // CF==1 3996 addptr(result, stride); 3997 subptr(cnt2, stride); 3998 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 3999 4000 // compare wide vectors tail 4001 testptr(result, result); 4002 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 4003 4004 movl(cnt2, stride); 4005 movl(result, stride); 4006 negptr(result); 4007 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 4008 movdqu(vec1, Address(str1, result, scale)); 4009 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 4010 } else { 4011 pmovzxbw(vec1, Address(str1, result, scale1)); 4012 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 4013 } 4014 jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL); 4015 4016 // Mismatched characters in the vectors 4017 bind(VECTOR_NOT_EQUAL); 4018 addptr(cnt1, result); 4019 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 4020 subl(result, cnt2); 4021 jmpb(POP_LABEL); 4022 4023 bind(COMPARE_TAIL); // limit is zero 4024 movl(cnt2, result); 4025 // Fallthru to tail compare 4026 } 4027 // Shift str2 and str1 to the end of the arrays, negate min 4028 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 4029 lea(str1, Address(str1, cnt2, scale)); 4030 lea(str2, Address(str2, cnt2, scale)); 4031 } else { 4032 lea(str1, Address(str1, cnt2, scale1)); 4033 lea(str2, Address(str2, cnt2, scale2)); 4034 } 4035 decrementl(cnt2); // first character was compared already 4036 negptr(cnt2); 4037 4038 // Compare the rest of the elements 4039 bind(WHILE_HEAD_LABEL); 4040 load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae); 4041 subl(result, cnt1); 4042 jccb(Assembler::notZero, POP_LABEL); 4043 increment(cnt2); 4044 jccb(Assembler::notZero, WHILE_HEAD_LABEL); 4045 4046 // Strings are equal up to min length. Return the length difference. 
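// Added commentary (illustrative example): for latin1 "abcd" vs "ab" no mismatch is found in
// the first min(4, 2) characters, so the value popped below is the pushed length difference
// 4 - 2 = 2; for UU the pushed difference was counted in bytes, hence the sarl that follows
// to convert it into a character count.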
4047 bind(LENGTH_DIFF_LABEL); 4048 pop(result); 4049 if (ae == StrIntrinsicNode::UU) { 4050 // Divide diff by 2 to get number of chars 4051 sarl(result, 1); 4052 } 4053 jmpb(DONE_LABEL); 4054 4055 #ifdef _LP64 4056 if (VM_Version::supports_avx512vlbw()) { 4057 4058 bind(COMPARE_WIDE_VECTORS_LOOP_FAILED); 4059 4060 kmovql(cnt1, mask); 4061 notq(cnt1); 4062 bsfq(cnt2, cnt1); 4063 if (ae != StrIntrinsicNode::LL) { 4064 // Divide diff by 2 to get number of chars 4065 sarl(cnt2, 1); 4066 } 4067 addq(result, cnt2); 4068 if (ae == StrIntrinsicNode::LL) { 4069 load_unsigned_byte(cnt1, Address(str2, result)); 4070 load_unsigned_byte(result, Address(str1, result)); 4071 } else if (ae == StrIntrinsicNode::UU) { 4072 load_unsigned_short(cnt1, Address(str2, result, scale)); 4073 load_unsigned_short(result, Address(str1, result, scale)); 4074 } else { 4075 load_unsigned_short(cnt1, Address(str2, result, scale2)); 4076 load_unsigned_byte(result, Address(str1, result, scale1)); 4077 } 4078 subl(result, cnt1); 4079 jmpb(POP_LABEL); 4080 }//if (VM_Version::supports_avx512vlbw()) 4081 #endif // _LP64 4082 4083 // Discard the stored length difference 4084 bind(POP_LABEL); 4085 pop(cnt1); 4086 4087 // That's it 4088 bind(DONE_LABEL); 4089 if(ae == StrIntrinsicNode::UL) { 4090 negl(result); 4091 } 4092 4093 } 4094 4095 // Search for Non-ASCII character (Negative byte value) in a byte array, 4096 // return the index of the first such character, otherwise the length 4097 // of the array segment searched. 4098 // ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java 4099 // @IntrinsicCandidate 4100 // public static int countPositives(byte[] ba, int off, int len) { 4101 // for (int i = off; i < off + len; i++) { 4102 // if (ba[i] < 0) { 4103 // return i - off; 4104 // } 4105 // } 4106 // return len; 4107 // } 4108 void C2_MacroAssembler::count_positives(Register ary1, Register len, 4109 Register result, Register tmp1, 4110 XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) { 4111 // rsi: byte array 4112 // rcx: len 4113 // rax: result 4114 ShortBranchVerifier sbv(this); 4115 assert_different_registers(ary1, len, result, tmp1); 4116 assert_different_registers(vec1, vec2); 4117 Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE; 4118 4119 movl(result, len); // copy 4120 // len == 0 4121 testl(len, len); 4122 jcc(Assembler::zero, DONE); 4123 4124 if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512 4125 VM_Version::supports_avx512vlbw() && 4126 VM_Version::supports_bmi2()) { 4127 4128 Label test_64_loop, test_tail, BREAK_LOOP; 4129 movl(tmp1, len); 4130 vpxor(vec2, vec2, vec2, Assembler::AVX_512bit); 4131 4132 andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F 4133 andl(len, 0xffffffc0); // vector count (in chars) 4134 jccb(Assembler::zero, test_tail); 4135 4136 lea(ary1, Address(ary1, len, Address::times_1)); 4137 negptr(len); 4138 4139 bind(test_64_loop); 4140 // Check whether our 64 elements of size byte contain negatives 4141 evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit); 4142 kortestql(mask1, mask1); 4143 jcc(Assembler::notZero, BREAK_LOOP); 4144 4145 addptr(len, 64); 4146 jccb(Assembler::notZero, test_64_loop); 4147 4148 bind(test_tail); 4149 // bail out when there is nothing to be done 4150 testl(tmp1, -1); 4151 jcc(Assembler::zero, DONE); 4152 4153 4154 // check the tail for absense of negatives 4155 // ~(~0 << len) applied up to two times (for 32-bit scenario) 4156 #ifdef _LP64 4157 { 4158 
Register tmp3_aliased = len; 4159 mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF); 4160 shlxq(tmp3_aliased, tmp3_aliased, tmp1); 4161 notq(tmp3_aliased); 4162 kmovql(mask2, tmp3_aliased); 4163 } 4164 #else 4165 Label k_init; 4166 jmp(k_init); 4167 4168 // We cannot load 64 bits from a general purpose register here, so the data 4169 // needed to compose the 64-bit mask is placed in the instruction stream instead. 4170 // We emit a 64-byte-wide series of the values 0..63, which is later used as the 4171 // compare target against the tail count held in the tmp1 register. 4172 // The result is a k register holding tmp1 consecutive 1's, counting from the 4173 // least significant bit. 4174 address tmp = pc(); 4175 emit_int64(0x0706050403020100); 4176 emit_int64(0x0F0E0D0C0B0A0908); 4177 emit_int64(0x1716151413121110); 4178 emit_int64(0x1F1E1D1C1B1A1918); 4179 emit_int64(0x2726252423222120); 4180 emit_int64(0x2F2E2D2C2B2A2928); 4181 emit_int64(0x3736353433323130); 4182 emit_int64(0x3F3E3D3C3B3A3938); 4183 4184 bind(k_init); 4185 lea(len, InternalAddress(tmp)); 4186 // create mask to test for negative byte inside a vector 4187 evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit); 4188 evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit); 4189 4190 #endif 4191 evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit); 4192 ktestq(mask1, mask2); 4193 jcc(Assembler::zero, DONE); 4194 4195 // do a full check for negative bytes in the tail 4196 movl(len, tmp1); // tmp1 holds the low 6 bits of the original len; 4197 // ary1 already pointing to the right place 4198 jmpb(TAIL_START); 4199 4200 bind(BREAK_LOOP); 4201 // At least one byte in the last 64 byte block was negative. 4202 // Set up to look at the last 64 bytes as if they were a tail 4203 lea(ary1, Address(ary1, len, Address::times_1)); 4204 addptr(result, len); 4205 // Ignore the very last byte: if all others are positive, 4206 // it must be negative, so we can skip right to the 2+1 byte 4207 // end comparison at this point 4208 orl(result, 63); 4209 movl(len, 63); 4210 // Fallthru to tail compare 4211 } else { 4212 4213 if (UseAVX >= 2 && UseSSE >= 2) { 4214 // With AVX2, use 32-byte vector compare 4215 Label COMPARE_WIDE_VECTORS, BREAK_LOOP; 4216 4217 // Compare 32-byte vectors 4218 testl(len, 0xffffffe0); // vector count (in bytes) 4219 jccb(Assembler::zero, TAIL_START); 4220 4221 andl(len, 0xffffffe0); 4222 lea(ary1, Address(ary1, len, Address::times_1)); 4223 negptr(len); 4224 4225 movl(tmp1, 0x80808080); // create mask to test for negative bytes in vector 4226 movdl(vec2, tmp1); 4227 vpbroadcastd(vec2, vec2, Assembler::AVX_256bit); 4228 4229 bind(COMPARE_WIDE_VECTORS); 4230 vmovdqu(vec1, Address(ary1, len, Address::times_1)); 4231 vptest(vec1, vec2); 4232 jccb(Assembler::notZero, BREAK_LOOP); 4233 addptr(len, 32); 4234 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 4235 4236 testl(result, 0x0000001f); // any bytes remaining? 4237 jcc(Assembler::zero, DONE); 4238 4239 // Quick test using the already prepared vector mask 4240 movl(len, result); 4241 andl(len, 0x0000001f); 4242 vmovdqu(vec1, Address(ary1, len, Address::times_1, -32)); 4243 vptest(vec1, vec2); 4244 jcc(Assembler::zero, DONE); 4245 // There is a negative byte somewhere, jump to the tail to determine exactly where 4246 jmpb(TAIL_START); 4247 4248 bind(BREAK_LOOP); 4249 // At least one byte in the last 32-byte vector is negative.
4250 // Set up to look at the last 32 bytes as if they were a tail 4251 lea(ary1, Address(ary1, len, Address::times_1)); 4252 addptr(result, len); 4253 // Ignore the very last byte: if all others are positive, 4254 // it must be negative, so we can skip right to the 2+1 byte 4255 // end comparison at this point 4256 orl(result, 31); 4257 movl(len, 31); 4258 // Fallthru to tail compare 4259 } else if (UseSSE42Intrinsics) { 4260 // With SSE4.2, use double quad vector compare 4261 Label COMPARE_WIDE_VECTORS, BREAK_LOOP; 4262 4263 // Compare 16-byte vectors 4264 testl(len, 0xfffffff0); // vector count (in bytes) 4265 jcc(Assembler::zero, TAIL_START); 4266 4267 andl(len, 0xfffffff0); 4268 lea(ary1, Address(ary1, len, Address::times_1)); 4269 negptr(len); 4270 4271 movl(tmp1, 0x80808080); 4272 movdl(vec2, tmp1); 4273 pshufd(vec2, vec2, 0); 4274 4275 bind(COMPARE_WIDE_VECTORS); 4276 movdqu(vec1, Address(ary1, len, Address::times_1)); 4277 ptest(vec1, vec2); 4278 jccb(Assembler::notZero, BREAK_LOOP); 4279 addptr(len, 16); 4280 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 4281 4282 testl(result, 0x0000000f); // len is zero, any bytes remaining? 4283 jcc(Assembler::zero, DONE); 4284 4285 // Quick test using the already prepared vector mask 4286 movl(len, result); 4287 andl(len, 0x0000000f); // tail count (in bytes) 4288 movdqu(vec1, Address(ary1, len, Address::times_1, -16)); 4289 ptest(vec1, vec2); 4290 jcc(Assembler::zero, DONE); 4291 jmpb(TAIL_START); 4292 4293 bind(BREAK_LOOP); 4294 // At least one byte in the last 16-byte vector is negative. 4295 // Set up and look at the last 16 bytes as if they were a tail 4296 lea(ary1, Address(ary1, len, Address::times_1)); 4297 addptr(result, len); 4298 // Ignore the very last byte: if all others are positive, 4299 // it must be negative, so we can skip right to the 2+1 byte 4300 // end comparison at this point 4301 orl(result, 15); 4302 movl(len, 15); 4303 // Fallthru to tail compare 4304 } 4305 } 4306 4307 bind(TAIL_START); 4308 // Compare 4-byte vectors 4309 andl(len, 0xfffffffc); // vector count (in bytes) 4310 jccb(Assembler::zero, COMPARE_CHAR); 4311 4312 lea(ary1, Address(ary1, len, Address::times_1)); 4313 negptr(len); 4314 4315 bind(COMPARE_VECTORS); 4316 movl(tmp1, Address(ary1, len, Address::times_1)); 4317 andl(tmp1, 0x80808080); 4318 jccb(Assembler::notZero, TAIL_ADJUST); 4319 addptr(len, 4); 4320 jccb(Assembler::notZero, COMPARE_VECTORS); 4321 4322 // Compare trailing char (final 2-3 bytes), if any 4323 bind(COMPARE_CHAR); 4324 4325 testl(result, 0x2); // tail char 4326 jccb(Assembler::zero, COMPARE_BYTE); 4327 load_unsigned_short(tmp1, Address(ary1, 0)); 4328 andl(tmp1, 0x00008080); 4329 jccb(Assembler::notZero, CHAR_ADJUST); 4330 lea(ary1, Address(ary1, 2)); 4331 4332 bind(COMPARE_BYTE); 4333 testl(result, 0x1); // tail byte 4334 jccb(Assembler::zero, DONE); 4335 load_unsigned_byte(tmp1, Address(ary1, 0)); 4336 testl(tmp1, 0x00000080); 4337 jccb(Assembler::zero, DONE); 4338 subptr(result, 1); 4339 jmpb(DONE); 4340 4341 bind(TAIL_ADJUST); 4342 // there are negative bits in the last 4 byte block. 4343 // Adjust result and check the next three bytes 4344 addptr(result, len); 4345 orl(result, 3); 4346 lea(ary1, Address(ary1, len, Address::times_1)); 4347 jmpb(COMPARE_CHAR); 4348 4349 bind(CHAR_ADJUST); 4350 // We are looking at a char + optional byte tail, and found that one 4351 // of the bytes in the char is negative. Adjust the result, check the 4352 // first byte and readjust if needed. 
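// Added commentary (illustrative): at this point the low two bits of result only describe the
// tail layout, so the andl below rounds result down to the byte index of the char; tmp1 kept
// just the two sign bits (mask 0x8080), and since the low byte is the earlier byte in memory,
// a clear low sign bit means the negative byte is the char's second byte, hence the +1.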
4353 andl(result, 0xfffffffc); 4354 testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first 4355 jccb(Assembler::notZero, DONE); 4356 addptr(result, 1); 4357 4358 // That's it 4359 bind(DONE); 4360 if (UseAVX >= 2 && UseSSE >= 2) { 4361 // clean upper bits of YMM registers 4362 vpxor(vec1, vec1); 4363 vpxor(vec2, vec2); 4364 } 4365 } 4366 4367 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings. 4368 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2, 4369 Register limit, Register result, Register chr, 4370 XMMRegister vec1, XMMRegister vec2, bool is_char, KRegister mask) { 4371 ShortBranchVerifier sbv(this); 4372 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE; 4373 4374 int length_offset = arrayOopDesc::length_offset_in_bytes(); 4375 int base_offset = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE); 4376 4377 if (is_array_equ) { 4378 // Check the input args 4379 cmpoop(ary1, ary2); 4380 jcc(Assembler::equal, TRUE_LABEL); 4381 4382 // Need additional checks for arrays_equals. 4383 testptr(ary1, ary1); 4384 jcc(Assembler::zero, FALSE_LABEL); 4385 testptr(ary2, ary2); 4386 jcc(Assembler::zero, FALSE_LABEL); 4387 4388 // Check the lengths 4389 movl(limit, Address(ary1, length_offset)); 4390 cmpl(limit, Address(ary2, length_offset)); 4391 jcc(Assembler::notEqual, FALSE_LABEL); 4392 } 4393 4394 // count == 0 4395 testl(limit, limit); 4396 jcc(Assembler::zero, TRUE_LABEL); 4397 4398 if (is_array_equ) { 4399 // Load array address 4400 lea(ary1, Address(ary1, base_offset)); 4401 lea(ary2, Address(ary2, base_offset)); 4402 } 4403 4404 if (is_array_equ && is_char) { 4405 // arrays_equals when used for char[]. 4406 shll(limit, 1); // byte count != 0 4407 } 4408 movl(result, limit); // copy 4409 4410 if (UseAVX >= 2) { 4411 // With AVX2, use 32-byte vector compare 4412 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 4413 4414 // Compare 32-byte vectors 4415 andl(result, 0x0000001f); // tail count (in bytes) 4416 andl(limit, 0xffffffe0); // vector count (in bytes) 4417 jcc(Assembler::zero, COMPARE_TAIL); 4418 4419 lea(ary1, Address(ary1, limit, Address::times_1)); 4420 lea(ary2, Address(ary2, limit, Address::times_1)); 4421 negptr(limit); 4422 4423 #ifdef _LP64 4424 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 4425 Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3; 4426 4427 cmpl(limit, -64); 4428 jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2); 4429 4430 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 4431 4432 evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit); 4433 evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit); 4434 kortestql(mask, mask); 4435 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4436 addptr(limit, 64); // update since we already compared at this addr 4437 cmpl(limit, -64); 4438 jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3); 4439 4440 // At this point we may still need to compare -limit+result bytes. 4441 // We could execute the next two instruction and just continue via non-wide path: 4442 // cmpl(limit, 0); 4443 // jcc(Assembler::equal, COMPARE_TAIL); // true 4444 // But since we stopped at the points ary{1,2}+limit which are 4445 // not farther than 64 bytes from the ends of arrays ary{1,2}+result 4446 // (|limit| <= 32 and result < 32), 4447 // we may just compare the last 64 bytes. 
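// Added commentary (worked example): for 100-byte arrays, limit starts at -96, the loop above
// exits at limit == -32 with result == 4, and 36 bytes are still unchecked; the single 64-byte
// compare below then covers bytes [36, 100), i.e. everything unchecked plus 28 bytes already
// known to be equal.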
4448 // 4449 addptr(result, -64); // it is safe, bc we just came from this area 4450 evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit); 4451 evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit); 4452 kortestql(mask, mask); 4453 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4454 4455 jmp(TRUE_LABEL); 4456 4457 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 4458 4459 }//if (VM_Version::supports_avx512vlbw()) 4460 #endif //_LP64 4461 bind(COMPARE_WIDE_VECTORS); 4462 vmovdqu(vec1, Address(ary1, limit, Address::times_1)); 4463 vmovdqu(vec2, Address(ary2, limit, Address::times_1)); 4464 vpxor(vec1, vec2); 4465 4466 vptest(vec1, vec1); 4467 jcc(Assembler::notZero, FALSE_LABEL); 4468 addptr(limit, 32); 4469 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4470 4471 testl(result, result); 4472 jcc(Assembler::zero, TRUE_LABEL); 4473 4474 vmovdqu(vec1, Address(ary1, result, Address::times_1, -32)); 4475 vmovdqu(vec2, Address(ary2, result, Address::times_1, -32)); 4476 vpxor(vec1, vec2); 4477 4478 vptest(vec1, vec1); 4479 jccb(Assembler::notZero, FALSE_LABEL); 4480 jmpb(TRUE_LABEL); 4481 4482 bind(COMPARE_TAIL); // limit is zero 4483 movl(limit, result); 4484 // Fallthru to tail compare 4485 } else if (UseSSE42Intrinsics) { 4486 // With SSE4.2, use double quad vector compare 4487 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 4488 4489 // Compare 16-byte vectors 4490 andl(result, 0x0000000f); // tail count (in bytes) 4491 andl(limit, 0xfffffff0); // vector count (in bytes) 4492 jcc(Assembler::zero, COMPARE_TAIL); 4493 4494 lea(ary1, Address(ary1, limit, Address::times_1)); 4495 lea(ary2, Address(ary2, limit, Address::times_1)); 4496 negptr(limit); 4497 4498 bind(COMPARE_WIDE_VECTORS); 4499 movdqu(vec1, Address(ary1, limit, Address::times_1)); 4500 movdqu(vec2, Address(ary2, limit, Address::times_1)); 4501 pxor(vec1, vec2); 4502 4503 ptest(vec1, vec1); 4504 jcc(Assembler::notZero, FALSE_LABEL); 4505 addptr(limit, 16); 4506 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4507 4508 testl(result, result); 4509 jcc(Assembler::zero, TRUE_LABEL); 4510 4511 movdqu(vec1, Address(ary1, result, Address::times_1, -16)); 4512 movdqu(vec2, Address(ary2, result, Address::times_1, -16)); 4513 pxor(vec1, vec2); 4514 4515 ptest(vec1, vec1); 4516 jccb(Assembler::notZero, FALSE_LABEL); 4517 jmpb(TRUE_LABEL); 4518 4519 bind(COMPARE_TAIL); // limit is zero 4520 movl(limit, result); 4521 // Fallthru to tail compare 4522 } 4523 4524 // Compare 4-byte vectors 4525 andl(limit, 0xfffffffc); // vector count (in bytes) 4526 jccb(Assembler::zero, COMPARE_CHAR); 4527 4528 lea(ary1, Address(ary1, limit, Address::times_1)); 4529 lea(ary2, Address(ary2, limit, Address::times_1)); 4530 negptr(limit); 4531 4532 bind(COMPARE_VECTORS); 4533 movl(chr, Address(ary1, limit, Address::times_1)); 4534 cmpl(chr, Address(ary2, limit, Address::times_1)); 4535 jccb(Assembler::notEqual, FALSE_LABEL); 4536 addptr(limit, 4); 4537 jcc(Assembler::notZero, COMPARE_VECTORS); 4538 4539 // Compare trailing char (final 2 bytes), if any 4540 bind(COMPARE_CHAR); 4541 testl(result, 0x2); // tail char 4542 jccb(Assembler::zero, COMPARE_BYTE); 4543 load_unsigned_short(chr, Address(ary1, 0)); 4544 load_unsigned_short(limit, Address(ary2, 0)); 4545 cmpl(chr, limit); 4546 jccb(Assembler::notEqual, FALSE_LABEL); 4547 4548 if (is_array_equ && is_char) { 4549 bind(COMPARE_BYTE); 4550 } else { 4551 lea(ary1, Address(ary1, 2)); 4552 lea(ary2, Address(ary2, 2)); 4553 4554 bind(COMPARE_BYTE); 4555 testl(result, 0x1); 
// tail byte 4556 jccb(Assembler::zero, TRUE_LABEL); 4557 load_unsigned_byte(chr, Address(ary1, 0)); 4558 load_unsigned_byte(limit, Address(ary2, 0)); 4559 cmpl(chr, limit); 4560 jccb(Assembler::notEqual, FALSE_LABEL); 4561 } 4562 bind(TRUE_LABEL); 4563 movl(result, 1); // return true 4564 jmpb(DONE); 4565 4566 bind(FALSE_LABEL); 4567 xorl(result, result); // return false 4568 4569 // That's it 4570 bind(DONE); 4571 if (UseAVX >= 2) { 4572 // clean upper bits of YMM registers 4573 vpxor(vec1, vec1); 4574 vpxor(vec2, vec2); 4575 } 4576 } 4577 4578 #ifdef _LP64 4579 4580 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) { 4581 #define __ masm. 4582 Register dst = stub.data<0>(); 4583 XMMRegister src = stub.data<1>(); 4584 address target = stub.data<2>(); 4585 __ bind(stub.entry()); 4586 __ subptr(rsp, 8); 4587 __ movdbl(Address(rsp), src); 4588 __ call(RuntimeAddress(target)); 4589 __ pop(dst); 4590 __ jmp(stub.continuation()); 4591 #undef __ 4592 } 4593 4594 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) { 4595 assert(dst_bt == T_INT || dst_bt == T_LONG, ""); 4596 assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, ""); 4597 4598 address slowpath_target; 4599 if (dst_bt == T_INT) { 4600 if (src_bt == T_FLOAT) { 4601 cvttss2sil(dst, src); 4602 cmpl(dst, 0x80000000); 4603 slowpath_target = StubRoutines::x86::f2i_fixup(); 4604 } else { 4605 cvttsd2sil(dst, src); 4606 cmpl(dst, 0x80000000); 4607 slowpath_target = StubRoutines::x86::d2i_fixup(); 4608 } 4609 } else { 4610 if (src_bt == T_FLOAT) { 4611 cvttss2siq(dst, src); 4612 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip())); 4613 slowpath_target = StubRoutines::x86::f2l_fixup(); 4614 } else { 4615 cvttsd2siq(dst, src); 4616 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip())); 4617 slowpath_target = StubRoutines::x86::d2l_fixup(); 4618 } 4619 } 4620 4621 auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, 23, convertF2I_slowpath); 4622 jcc(Assembler::equal, stub->entry()); 4623 bind(stub->continuation()); 4624 } 4625 4626 #endif // _LP64 4627 4628 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4629 XMMRegister src1, int imm8, bool merge, int vlen_enc) { 4630 switch(ideal_opc) { 4631 case Op_LShiftVS: 4632 Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break; 4633 case Op_LShiftVI: 4634 Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break; 4635 case Op_LShiftVL: 4636 Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break; 4637 case Op_RShiftVS: 4638 Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break; 4639 case Op_RShiftVI: 4640 Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break; 4641 case Op_RShiftVL: 4642 Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break; 4643 case Op_URShiftVS: 4644 Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break; 4645 case Op_URShiftVI: 4646 Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break; 4647 case Op_URShiftVL: 4648 Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break; 4649 case Op_RotateRightV: 4650 evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4651 case Op_RotateLeftV: 4652 evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4653 default: 4654 fatal("Unsupported masked operation"); break; 4655 } 4656 } 4657 4658 void 
C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4659 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc, 4660 bool is_varshift) { 4661 switch (ideal_opc) { 4662 case Op_AddVB: 4663 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4664 case Op_AddVS: 4665 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4666 case Op_AddVI: 4667 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4668 case Op_AddVL: 4669 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4670 case Op_AddVF: 4671 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4672 case Op_AddVD: 4673 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4674 case Op_SubVB: 4675 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4676 case Op_SubVS: 4677 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4678 case Op_SubVI: 4679 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4680 case Op_SubVL: 4681 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4682 case Op_SubVF: 4683 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4684 case Op_SubVD: 4685 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4686 case Op_MulVS: 4687 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4688 case Op_MulVI: 4689 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4690 case Op_MulVL: 4691 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4692 case Op_MulVF: 4693 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4694 case Op_MulVD: 4695 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4696 case Op_DivVF: 4697 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4698 case Op_DivVD: 4699 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4700 case Op_SqrtVF: 4701 evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break; 4702 case Op_SqrtVD: 4703 evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break; 4704 case Op_AbsVB: 4705 evpabsb(dst, mask, src2, merge, vlen_enc); break; 4706 case Op_AbsVS: 4707 evpabsw(dst, mask, src2, merge, vlen_enc); break; 4708 case Op_AbsVI: 4709 evpabsd(dst, mask, src2, merge, vlen_enc); break; 4710 case Op_AbsVL: 4711 evpabsq(dst, mask, src2, merge, vlen_enc); break; 4712 case Op_FmaVF: 4713 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4714 case Op_FmaVD: 4715 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4716 case Op_VectorRearrange: 4717 evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break; 4718 case Op_LShiftVS: 4719 evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4720 case Op_LShiftVI: 4721 evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4722 case Op_LShiftVL: 4723 evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4724 case Op_RShiftVS: 4725 evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4726 case Op_RShiftVI: 4727 evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4728 case Op_RShiftVL: 4729 evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4730 case Op_URShiftVS: 4731 evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4732 case Op_URShiftVI: 4733 evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4734 case Op_URShiftVL: 4735 evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4736 case Op_RotateLeftV: 4737 evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4738 case Op_RotateRightV: 4739 evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break; 
4740 case Op_MaxV: 4741 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4742 case Op_MinV: 4743 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4744 case Op_XorV: 4745 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4746 case Op_OrV: 4747 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4748 case Op_AndV: 4749 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4750 default: 4751 fatal("Unsupported masked operation"); break; 4752 } 4753 } 4754 4755 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4756 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4757 switch (ideal_opc) { 4758 case Op_AddVB: 4759 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4760 case Op_AddVS: 4761 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4762 case Op_AddVI: 4763 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4764 case Op_AddVL: 4765 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4766 case Op_AddVF: 4767 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4768 case Op_AddVD: 4769 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4770 case Op_SubVB: 4771 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4772 case Op_SubVS: 4773 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4774 case Op_SubVI: 4775 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4776 case Op_SubVL: 4777 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4778 case Op_SubVF: 4779 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4780 case Op_SubVD: 4781 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4782 case Op_MulVS: 4783 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4784 case Op_MulVI: 4785 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4786 case Op_MulVL: 4787 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4788 case Op_MulVF: 4789 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4790 case Op_MulVD: 4791 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4792 case Op_DivVF: 4793 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4794 case Op_DivVD: 4795 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4796 case Op_FmaVF: 4797 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4798 case Op_FmaVD: 4799 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4800 case Op_MaxV: 4801 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4802 case Op_MinV: 4803 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4804 case Op_XorV: 4805 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4806 case Op_OrV: 4807 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4808 case Op_AndV: 4809 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4810 default: 4811 fatal("Unsupported masked operation"); break; 4812 } 4813 } 4814 4815 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst, 4816 KRegister src1, KRegister src2) { 4817 BasicType etype = T_ILLEGAL; 4818 switch(mask_len) { 4819 case 2: 4820 case 4: 4821 case 8: etype = T_BYTE; break; 4822 case 16: etype = T_SHORT; break; 4823 case 32: etype = T_INT; break; 4824 case 64: etype = T_LONG; break; 4825 default: fatal("Unsupported type"); break; 4826 } 4827 assert(etype != T_ILLEGAL, ""); 4828 switch(ideal_opc) { 4829 case Op_AndVMask: 4830 kand(etype, dst, src1, src2); break; 4831 case Op_OrVMask: 4832 kor(etype, dst, src1, src2); break; 4833 case Op_XorVMask: 
4834 kxor(etype, dst, src1, src2); break; 4835 default: 4836 fatal("Unsupported masked operation"); break; 4837 } 4838 } 4839 4840 /* 4841 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 4842 * If src is NaN, the result is 0. 4843 * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE, 4844 * the result is equal to the value of Integer.MIN_VALUE. 4845 * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE, 4846 * the result is equal to the value of Integer.MAX_VALUE. 4847 */ 4848 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4849 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 4850 Register rscratch, AddressLiteral float_sign_flip, 4851 int vec_enc) { 4852 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4853 Label done; 4854 vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch); 4855 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 4856 vptest(xtmp2, xtmp2, vec_enc); 4857 jccb(Assembler::equal, done); 4858 4859 vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc); 4860 vpxor(xtmp1, xtmp1, xtmp4, vec_enc); 4861 4862 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 4863 vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc); 4864 vblendvps(dst, dst, xtmp4, xtmp3, vec_enc); 4865 4866 // Recompute the mask for remaining special value. 4867 vpxor(xtmp2, xtmp2, xtmp3, vec_enc); 4868 // Extract SRC values corresponding to TRUE mask lanes. 4869 vpand(xtmp4, xtmp2, src, vec_enc); 4870 // Flip mask bits so that MSB bit of MASK lanes corresponding to +ve special 4871 // values are set. 4872 vpxor(xtmp3, xtmp2, xtmp4, vec_enc); 4873 4874 vblendvps(dst, dst, xtmp1, xtmp3, vec_enc); 4875 bind(done); 4876 } 4877 4878 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4879 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4880 Register rscratch, AddressLiteral float_sign_flip, 4881 int vec_enc) { 4882 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4883 Label done; 4884 evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 4885 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 4886 kortestwl(ktmp1, ktmp1); 4887 jccb(Assembler::equal, done); 4888 4889 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4890 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4891 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 4892 4893 kxorwl(ktmp1, ktmp1, ktmp2); 4894 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4895 vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4896 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 4897 bind(done); 4898 } 4899 4900 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4901 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4902 Register rscratch, AddressLiteral double_sign_flip, 4903 int vec_enc) { 4904 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing"); 4905 4906 Label done; 4907 evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch); 4908 Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc); 4909 kortestwl(ktmp1, ktmp1); 4910 jccb(Assembler::equal, done); 4911 4912 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4913 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4914 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc); 4915 4916 kxorwl(ktmp1, ktmp1, ktmp2); 
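  // At this point ktmp1 flags the special-result lanes whose source is not NaN,
  // i.e. the overflowed lanes. The masked compare below keeps only those whose
  // source is not less than zero; the ternary-logic op then materializes
  // ~double_sign_flip (Long.MAX_VALUE) so it can be merged into exactly those lanes.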
4917 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4918 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4919 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc); 4920 bind(done); 4921 } 4922 4923 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4924 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4925 Register rscratch, AddressLiteral float_sign_flip, 4926 int vec_enc) { 4927 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4928 Label done; 4929 evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 4930 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 4931 kortestwl(ktmp1, ktmp1); 4932 jccb(Assembler::equal, done); 4933 4934 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4935 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4936 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 4937 4938 kxorwl(ktmp1, ktmp1, ktmp2); 4939 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4940 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4941 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 4942 bind(done); 4943 } 4944 4945 /* 4946 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 4947 * If src is NaN, the result is 0. 4948 * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE, 4949 * the result is equal to the value of Long.MIN_VALUE. 4950 * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE, 4951 * the result is equal to the value of Long.MAX_VALUE. 4952 */ 4953 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4954 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4955 Register rscratch, AddressLiteral double_sign_flip, 4956 int vec_enc) { 4957 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing"); 4958 4959 Label done; 4960 evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch); 4961 evpcmpeqq(ktmp1, xtmp1, dst, vec_enc); 4962 kortestwl(ktmp1, ktmp1); 4963 jccb(Assembler::equal, done); 4964 4965 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4966 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4967 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc); 4968 4969 kxorwl(ktmp1, ktmp1, ktmp2); 4970 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4971 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4972 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc); 4973 bind(done); 4974 } 4975 4976 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero, 4977 XMMRegister xtmp, int index, int vec_enc) { 4978 assert(vec_enc < Assembler::AVX_512bit, ""); 4979 if (vec_enc == Assembler::AVX_256bit) { 4980 vextractf128_high(xtmp, src); 4981 vshufps(dst, src, xtmp, index, vec_enc); 4982 } else { 4983 vshufps(dst, src, zero, index, vec_enc); 4984 } 4985 } 4986 4987 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 4988 XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch, 4989 AddressLiteral float_sign_flip, int src_vec_enc) { 4990 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4991 4992 Label done; 4993 // Compare the destination lanes with float_sign_flip 4994 // value to get mask for all special values. 
  movdqu(xtmp1, float_sign_flip, rscratch);
  vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
  ptest(xtmp2, xtmp2);
  jccb(Assembler::equal, done);

  // Flip float_sign_flip to get max integer value.
  vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
  pxor(xtmp1, xtmp4);

  // Set destination lanes corresponding to unordered source lanes to zero.
  vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
  vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);

  // Shuffle the mask vector and pack the lower double word from each quadword lane.
  vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
  vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);

  // Recompute the mask for the remaining special values.
  pxor(xtmp2, xtmp3);
  // Extract the mask corresponding to non-negative source lanes.
  vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);

  // Shuffle the mask vector and pack the lower double word from each quadword lane.
  vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
  pand(xtmp3, xtmp2);

  // Replace destination lanes holding the special value (0x80000000) with max int
  // if the corresponding source lane holds a +ve value.
  vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
  bind(done);
}

void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
                                                   XMMRegister xtmp, Register rscratch, int vec_enc) {
  switch(to_elem_bt) {
    case T_SHORT:
      assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
      vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
      vpackusdw(dst, dst, zero, vec_enc);
      if (vec_enc == Assembler::AVX_256bit) {
        vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
      }
      break;
    case T_BYTE:
      assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
      vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
      vpackusdw(dst, dst, zero, vec_enc);
      if (vec_enc == Assembler::AVX_256bit) {
        vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
      }
      vpackuswb(dst, dst, zero, vec_enc);
      break;
    default: assert(false, "%s", type2name(to_elem_bt));
  }
}

/*
 * Algorithm for vector D2L and F2I conversions:-
 * a) Perform vector D2L/F2I cast.
 * b) Choose the fast path if none of the result vector lanes contains the 0x80000000 value.
 *    A lane holding that value signifies that the source could have been one of the special
 *    floating point values (NaN, -Inf, Inf, Max, -Min).
 * c) Set the destination to zero if the source is a NaN value.
 * d) Replace 0x80000000 with MaxInt if the source lane contains a +ve value.
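 *    For example, with F2I semantics: NaN -> 0, -1.0e30f -> Integer.MIN_VALUE,
 *    1.0e30f -> Integer.MAX_VALUE, while an in-range value such as 3.9f simply
 *    truncates to 3 on the fast path.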
5060 */ 5061 5062 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5063 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 5064 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 5065 int to_elem_sz = type2aelembytes(to_elem_bt); 5066 assert(to_elem_sz <= 4, ""); 5067 vcvttps2dq(dst, src, vec_enc); 5068 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc); 5069 if (to_elem_sz < 4) { 5070 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 5071 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc); 5072 } 5073 } 5074 5075 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5076 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip, 5077 Register rscratch, int vec_enc) { 5078 int to_elem_sz = type2aelembytes(to_elem_bt); 5079 assert(to_elem_sz <= 4, ""); 5080 vcvttps2dq(dst, src, vec_enc); 5081 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc); 5082 switch(to_elem_bt) { 5083 case T_INT: 5084 break; 5085 case T_SHORT: 5086 evpmovdw(dst, dst, vec_enc); 5087 break; 5088 case T_BYTE: 5089 evpmovdb(dst, dst, vec_enc); 5090 break; 5091 default: assert(false, "%s", type2name(to_elem_bt)); 5092 } 5093 } 5094 5095 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 5096 KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip, 5097 Register rscratch, int vec_enc) { 5098 evcvttps2qq(dst, src, vec_enc); 5099 vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc); 5100 } 5101 5102 // Handling for downcasting from double to integer or sub-word types on AVX2. 5103 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5104 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, 5105 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 5106 int to_elem_sz = type2aelembytes(to_elem_bt); 5107 assert(to_elem_sz < 8, ""); 5108 vcvttpd2dq(dst, src, vec_enc); 5109 vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch, 5110 float_sign_flip, vec_enc); 5111 if (to_elem_sz < 4) { 5112 // xtmp4 holds all zero lanes. 
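    // After vcvttpd2dq the converted ints occupy at most the low 128 bits
    // (a 256-bit double source yields only four ints), so the subword pack
    // below can run with a 128-bit encoding.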
5113 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit); 5114 } 5115 } 5116 5117 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, 5118 XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, 5119 KRegister ktmp2, AddressLiteral sign_flip, 5120 Register rscratch, int vec_enc) { 5121 if (VM_Version::supports_avx512dq()) { 5122 evcvttpd2qq(dst, src, vec_enc); 5123 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 5124 switch(to_elem_bt) { 5125 case T_LONG: 5126 break; 5127 case T_INT: 5128 evpmovsqd(dst, dst, vec_enc); 5129 break; 5130 case T_SHORT: 5131 evpmovsqd(dst, dst, vec_enc); 5132 evpmovdw(dst, dst, vec_enc); 5133 break; 5134 case T_BYTE: 5135 evpmovsqd(dst, dst, vec_enc); 5136 evpmovdb(dst, dst, vec_enc); 5137 break; 5138 default: assert(false, "%s", type2name(to_elem_bt)); 5139 } 5140 } else { 5141 assert(type2aelembytes(to_elem_bt) <= 4, ""); 5142 vcvttpd2dq(dst, src, vec_enc); 5143 vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 5144 switch(to_elem_bt) { 5145 case T_INT: 5146 break; 5147 case T_SHORT: 5148 evpmovdw(dst, dst, vec_enc); 5149 break; 5150 case T_BYTE: 5151 evpmovdb(dst, dst, vec_enc); 5152 break; 5153 default: assert(false, "%s", type2name(to_elem_bt)); 5154 } 5155 } 5156 } 5157 5158 #ifdef _LP64 5159 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src, 5160 AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5161 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 5162 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5163 // and re-instantiate original MXCSR.RC mode after that. 5164 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5165 5166 mov64(tmp, julong_cast(0.5L)); 5167 evpbroadcastq(xtmp1, tmp, vec_enc); 5168 vaddpd(xtmp1, src , xtmp1, vec_enc); 5169 evcvtpd2qq(dst, xtmp1, vec_enc); 5170 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 5171 double_sign_flip, vec_enc);; 5172 5173 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5174 } 5175 5176 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src, 5177 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5178 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 5179 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5180 // and re-instantiate original MXCSR.RC mode after that. 
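  // A scalar sketch of the same idea, assuming <cfenv> rounding-mode control
  // (illustrative only, this is not the code emitted below):
  //
  //   #include <cfenv>
  //   #include <cmath>
  //   int round_float(float x) {
  //     int old_mode = std::fegetround();
  //     std::fesetround(FE_DOWNWARD);            // MXCSR.RC = round toward -inf
  //     int r = (int)std::nearbyintf(x + 0.5f);  // floor(x + 0.5)
  //     std::fesetround(old_mode);               // restore the original mode
  //     return r;                                // NaN/overflow lanes are patched separately
  //   }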
5181 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5182 5183 movl(tmp, jint_cast(0.5)); 5184 movq(xtmp1, tmp); 5185 vbroadcastss(xtmp1, xtmp1, vec_enc); 5186 vaddps(xtmp1, src , xtmp1, vec_enc); 5187 vcvtps2dq(dst, xtmp1, vec_enc); 5188 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 5189 float_sign_flip, vec_enc); 5190 5191 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5192 } 5193 5194 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src, 5195 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 5196 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) { 5197 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 5198 // and re-instantiate original MXCSR.RC mode after that. 5199 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 5200 5201 movl(tmp, jint_cast(0.5)); 5202 movq(xtmp1, tmp); 5203 vbroadcastss(xtmp1, xtmp1, vec_enc); 5204 vaddps(xtmp1, src , xtmp1, vec_enc); 5205 vcvtps2dq(dst, xtmp1, vec_enc); 5206 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc); 5207 5208 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 5209 } 5210 #endif // _LP64 5211 5212 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 5213 BasicType from_elem_bt, BasicType to_elem_bt) { 5214 switch (from_elem_bt) { 5215 case T_BYTE: 5216 switch (to_elem_bt) { 5217 case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break; 5218 case T_INT: vpmovzxbd(dst, src, vlen_enc); break; 5219 case T_LONG: vpmovzxbq(dst, src, vlen_enc); break; 5220 default: ShouldNotReachHere(); 5221 } 5222 break; 5223 case T_SHORT: 5224 switch (to_elem_bt) { 5225 case T_INT: vpmovzxwd(dst, src, vlen_enc); break; 5226 case T_LONG: vpmovzxwq(dst, src, vlen_enc); break; 5227 default: ShouldNotReachHere(); 5228 } 5229 break; 5230 case T_INT: 5231 assert(to_elem_bt == T_LONG, ""); 5232 vpmovzxdq(dst, src, vlen_enc); 5233 break; 5234 default: 5235 ShouldNotReachHere(); 5236 } 5237 } 5238 5239 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 5240 BasicType from_elem_bt, BasicType to_elem_bt) { 5241 switch (from_elem_bt) { 5242 case T_BYTE: 5243 switch (to_elem_bt) { 5244 case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break; 5245 case T_INT: vpmovsxbd(dst, src, vlen_enc); break; 5246 case T_LONG: vpmovsxbq(dst, src, vlen_enc); break; 5247 default: ShouldNotReachHere(); 5248 } 5249 break; 5250 case T_SHORT: 5251 switch (to_elem_bt) { 5252 case T_INT: vpmovsxwd(dst, src, vlen_enc); break; 5253 case T_LONG: vpmovsxwq(dst, src, vlen_enc); break; 5254 default: ShouldNotReachHere(); 5255 } 5256 break; 5257 case T_INT: 5258 assert(to_elem_bt == T_LONG, ""); 5259 vpmovsxdq(dst, src, vlen_enc); 5260 break; 5261 default: 5262 ShouldNotReachHere(); 5263 } 5264 } 5265 5266 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src, 5267 BasicType dst_bt, BasicType src_bt, int vlen) { 5268 int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen); 5269 assert(vlen_enc != AVX_512bit, ""); 5270 5271 int dst_bt_size = type2aelembytes(dst_bt); 5272 int src_bt_size = type2aelembytes(src_bt); 5273 if (dst_bt_size > src_bt_size) { 5274 switch (dst_bt_size / src_bt_size) { 5275 case 2: vpmovsxbw(dst, src, vlen_enc); break; 5276 case 4: vpmovsxbd(dst, src, 
vlen_enc); break; 5277 case 8: vpmovsxbq(dst, src, vlen_enc); break; 5278 default: ShouldNotReachHere(); 5279 } 5280 } else { 5281 assert(dst_bt_size < src_bt_size, ""); 5282 switch (src_bt_size / dst_bt_size) { 5283 case 2: { 5284 if (vlen_enc == AVX_128bit) { 5285 vpacksswb(dst, src, src, vlen_enc); 5286 } else { 5287 vpacksswb(dst, src, src, vlen_enc); 5288 vpermq(dst, dst, 0x08, vlen_enc); 5289 } 5290 break; 5291 } 5292 case 4: { 5293 if (vlen_enc == AVX_128bit) { 5294 vpackssdw(dst, src, src, vlen_enc); 5295 vpacksswb(dst, dst, dst, vlen_enc); 5296 } else { 5297 vpackssdw(dst, src, src, vlen_enc); 5298 vpermq(dst, dst, 0x08, vlen_enc); 5299 vpacksswb(dst, dst, dst, AVX_128bit); 5300 } 5301 break; 5302 } 5303 case 8: { 5304 if (vlen_enc == AVX_128bit) { 5305 vpshufd(dst, src, 0x08, vlen_enc); 5306 vpackssdw(dst, dst, dst, vlen_enc); 5307 vpacksswb(dst, dst, dst, vlen_enc); 5308 } else { 5309 vpshufd(dst, src, 0x08, vlen_enc); 5310 vpermq(dst, dst, 0x08, vlen_enc); 5311 vpackssdw(dst, dst, dst, AVX_128bit); 5312 vpacksswb(dst, dst, dst, AVX_128bit); 5313 } 5314 break; 5315 } 5316 default: ShouldNotReachHere(); 5317 } 5318 } 5319 } 5320 5321 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3, 5322 bool merge, BasicType bt, int vlen_enc) { 5323 if (bt == T_INT) { 5324 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 5325 } else { 5326 assert(bt == T_LONG, ""); 5327 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5328 } 5329 } 5330 5331 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3, 5332 bool merge, BasicType bt, int vlen_enc) { 5333 if (bt == T_INT) { 5334 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 5335 } else { 5336 assert(bt == T_LONG, ""); 5337 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5338 } 5339 } 5340 5341 #ifdef _LP64 5342 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1, 5343 Register rtmp2, XMMRegister xtmp, int mask_len, 5344 int vec_enc) { 5345 int index = 0; 5346 int vindex = 0; 5347 mov64(rtmp1, 0x0101010101010101L); 5348 pdepq(rtmp1, src, rtmp1); 5349 if (mask_len > 8) { 5350 movq(rtmp2, src); 5351 vpxor(xtmp, xtmp, xtmp, vec_enc); 5352 movq(xtmp, rtmp1); 5353 } 5354 movq(dst, rtmp1); 5355 5356 mask_len -= 8; 5357 while (mask_len > 0) { 5358 assert ((mask_len & 0x7) == 0, "mask must be multiple of 8"); 5359 index++; 5360 if ((index % 2) == 0) { 5361 pxor(xtmp, xtmp); 5362 } 5363 mov64(rtmp1, 0x0101010101010101L); 5364 shrq(rtmp2, 8); 5365 pdepq(rtmp1, rtmp2, rtmp1); 5366 pinsrq(xtmp, rtmp1, index % 2); 5367 vindex = index / 2; 5368 if (vindex) { 5369 // Write entire 16 byte vector when both 64 bit 5370 // lanes are update to save redundant instructions. 
5371 if (index % 2) { 5372 vinsertf128(dst, dst, xtmp, vindex); 5373 } 5374 } else { 5375 vmovdqu(dst, xtmp); 5376 } 5377 mask_len -= 8; 5378 } 5379 } 5380 5381 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) { 5382 switch(opc) { 5383 case Op_VectorMaskTrueCount: 5384 popcntq(dst, tmp); 5385 break; 5386 case Op_VectorMaskLastTrue: 5387 if (VM_Version::supports_lzcnt()) { 5388 lzcntq(tmp, tmp); 5389 movl(dst, 63); 5390 subl(dst, tmp); 5391 } else { 5392 movl(dst, -1); 5393 bsrq(tmp, tmp); 5394 cmov32(Assembler::notZero, dst, tmp); 5395 } 5396 break; 5397 case Op_VectorMaskFirstTrue: 5398 if (VM_Version::supports_bmi1()) { 5399 if (masklen < 32) { 5400 orl(tmp, 1 << masklen); 5401 tzcntl(dst, tmp); 5402 } else if (masklen == 32) { 5403 tzcntl(dst, tmp); 5404 } else { 5405 assert(masklen == 64, ""); 5406 tzcntq(dst, tmp); 5407 } 5408 } else { 5409 if (masklen < 32) { 5410 orl(tmp, 1 << masklen); 5411 bsfl(dst, tmp); 5412 } else { 5413 assert(masklen == 32 || masklen == 64, ""); 5414 movl(dst, masklen); 5415 if (masklen == 32) { 5416 bsfl(tmp, tmp); 5417 } else { 5418 bsfq(tmp, tmp); 5419 } 5420 cmov32(Assembler::notZero, dst, tmp); 5421 } 5422 } 5423 break; 5424 case Op_VectorMaskToLong: 5425 assert(dst == tmp, "Dst and tmp should be the same for toLong operations"); 5426 break; 5427 default: assert(false, "Unhandled mask operation"); 5428 } 5429 } 5430 5431 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp, 5432 int masklen, int masksize, int vec_enc) { 5433 assert(VM_Version::supports_popcnt(), ""); 5434 5435 if(VM_Version::supports_avx512bw()) { 5436 kmovql(tmp, mask); 5437 } else { 5438 assert(masklen <= 16, ""); 5439 kmovwl(tmp, mask); 5440 } 5441 5442 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5443 // operations needs to be clipped. 5444 if (masksize < 16 && opc != Op_VectorMaskFirstTrue) { 5445 andq(tmp, (1 << masklen) - 1); 5446 } 5447 5448 vector_mask_operation_helper(opc, dst, tmp, masklen); 5449 } 5450 5451 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp, 5452 Register tmp, int masklen, BasicType bt, int vec_enc) { 5453 assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) || 5454 (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), ""); 5455 assert(VM_Version::supports_popcnt(), ""); 5456 5457 bool need_clip = false; 5458 switch(bt) { 5459 case T_BOOLEAN: 5460 // While masks of other types contain 0, -1; boolean masks contain lane values of 0, 1 5461 vpxor(xtmp, xtmp, xtmp, vec_enc); 5462 vpsubb(xtmp, xtmp, mask, vec_enc); 5463 vpmovmskb(tmp, xtmp, vec_enc); 5464 need_clip = masklen < 16; 5465 break; 5466 case T_BYTE: 5467 vpmovmskb(tmp, mask, vec_enc); 5468 need_clip = masklen < 16; 5469 break; 5470 case T_SHORT: 5471 vpacksswb(xtmp, mask, mask, vec_enc); 5472 if (masklen >= 16) { 5473 vpermpd(xtmp, xtmp, 8, vec_enc); 5474 } 5475 vpmovmskb(tmp, xtmp, Assembler::AVX_128bit); 5476 need_clip = masklen < 16; 5477 break; 5478 case T_INT: 5479 case T_FLOAT: 5480 vmovmskps(tmp, mask, vec_enc); 5481 need_clip = masklen < 4; 5482 break; 5483 case T_LONG: 5484 case T_DOUBLE: 5485 vmovmskpd(tmp, mask, vec_enc); 5486 need_clip = masklen < 2; 5487 break; 5488 default: assert(false, "Unhandled type, %s", type2name(bt)); 5489 } 5490 5491 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5492 // operations needs to be clipped. 
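  // FirstTrue is exempt because, for masklen < 32, vector_mask_operation_helper
  // ORs in a sentinel bit at position masklen before the tzcnt/bsf, so stray
  // bits at or above masklen cannot change the answer.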
5493 if (need_clip && opc != Op_VectorMaskFirstTrue) { 5494 // need_clip implies masklen < 32 5495 andq(tmp, (1 << masklen) - 1); 5496 } 5497 5498 vector_mask_operation_helper(opc, dst, tmp, masklen); 5499 } 5500 5501 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1, 5502 Register rtmp2, int mask_len) { 5503 kmov(rtmp1, src); 5504 andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len))); 5505 mov64(rtmp2, -1L); 5506 pextq(rtmp2, rtmp2, rtmp1); 5507 kmov(dst, rtmp2); 5508 } 5509 5510 #ifdef _LP64 5511 void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src, 5512 XMMRegister mask, Register rtmp, Register rscratch, 5513 XMMRegister permv, XMMRegister xtmp, BasicType bt, 5514 int vec_enc) { 5515 assert(type2aelembytes(bt) >= 4, ""); 5516 assert(opcode == Op_CompressV || opcode == Op_ExpandV, ""); 5517 address compress_perm_table = nullptr; 5518 address expand_perm_table = nullptr; 5519 if (type2aelembytes(bt) == 8) { 5520 compress_perm_table = StubRoutines::x86::compress_perm_table64(); 5521 expand_perm_table = StubRoutines::x86::expand_perm_table64(); 5522 vmovmskpd(rtmp, mask, vec_enc); 5523 } else { 5524 compress_perm_table = StubRoutines::x86::compress_perm_table32(); 5525 expand_perm_table = StubRoutines::x86::expand_perm_table32(); 5526 vmovmskps(rtmp, mask, vec_enc); 5527 } 5528 shlq(rtmp, 5); // for 32 byte permute row. 5529 if (opcode == Op_CompressV) { 5530 lea(rscratch, ExternalAddress(compress_perm_table)); 5531 } else { 5532 lea(rscratch, ExternalAddress(expand_perm_table)); 5533 } 5534 addptr(rtmp, rscratch); 5535 vmovdqu(permv, Address(rtmp)); 5536 vpermps(dst, permv, src, Assembler::AVX_256bit); 5537 vpxor(xtmp, xtmp, xtmp, vec_enc); 5538 // Blend the result with zero vector using permute mask, each column entry 5539 // in a permute table row contains either a valid permute index or a -1 (default) 5540 // value, this can potentially be used as a blending mask after 5541 // compressing/expanding the source vector lanes. 
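  // For example, compressing eight ints with mask lanes {1,0,1,0,0,0,0,0}
  // (vmovmskps = 0b101) presumably selects a table row of the form
  // {0, 2, -1, -1, -1, -1, -1, -1}: vpermps gathers lanes 0 and 2 to the front,
  // and the -1 entries, having their sign bit set, make the blend below zero
  // the remaining lanes.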
5542 vblendvps(dst, dst, xtmp, permv, vec_enc, false, permv); 5543 } 5544 #endif 5545 5546 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask, 5547 bool merge, BasicType bt, int vec_enc) { 5548 if (opcode == Op_CompressV) { 5549 switch(bt) { 5550 case T_BYTE: 5551 evpcompressb(dst, mask, src, merge, vec_enc); 5552 break; 5553 case T_CHAR: 5554 case T_SHORT: 5555 evpcompressw(dst, mask, src, merge, vec_enc); 5556 break; 5557 case T_INT: 5558 evpcompressd(dst, mask, src, merge, vec_enc); 5559 break; 5560 case T_FLOAT: 5561 evcompressps(dst, mask, src, merge, vec_enc); 5562 break; 5563 case T_LONG: 5564 evpcompressq(dst, mask, src, merge, vec_enc); 5565 break; 5566 case T_DOUBLE: 5567 evcompresspd(dst, mask, src, merge, vec_enc); 5568 break; 5569 default: 5570 fatal("Unsupported type %s", type2name(bt)); 5571 break; 5572 } 5573 } else { 5574 assert(opcode == Op_ExpandV, ""); 5575 switch(bt) { 5576 case T_BYTE: 5577 evpexpandb(dst, mask, src, merge, vec_enc); 5578 break; 5579 case T_CHAR: 5580 case T_SHORT: 5581 evpexpandw(dst, mask, src, merge, vec_enc); 5582 break; 5583 case T_INT: 5584 evpexpandd(dst, mask, src, merge, vec_enc); 5585 break; 5586 case T_FLOAT: 5587 evexpandps(dst, mask, src, merge, vec_enc); 5588 break; 5589 case T_LONG: 5590 evpexpandq(dst, mask, src, merge, vec_enc); 5591 break; 5592 case T_DOUBLE: 5593 evexpandpd(dst, mask, src, merge, vec_enc); 5594 break; 5595 default: 5596 fatal("Unsupported type %s", type2name(bt)); 5597 break; 5598 } 5599 } 5600 } 5601 #endif 5602 5603 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5604 KRegister ktmp1, int vec_enc) { 5605 if (opcode == Op_SignumVD) { 5606 vsubpd(dst, zero, one, vec_enc); 5607 // if src < 0 ? -1 : 1 5608 evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5609 evblendmpd(dst, ktmp1, one, dst, true, vec_enc); 5610 // if src == NaN, -0.0 or 0.0 return src. 5611 evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5612 evblendmpd(dst, ktmp1, dst, src, true, vec_enc); 5613 } else { 5614 assert(opcode == Op_SignumVF, ""); 5615 vsubps(dst, zero, one, vec_enc); 5616 // if src < 0 ? -1 : 1 5617 evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5618 evblendmps(dst, ktmp1, one, dst, true, vec_enc); 5619 // if src == NaN, -0.0 or 0.0 return src. 5620 evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5621 evblendmps(dst, ktmp1, dst, src, true, vec_enc); 5622 } 5623 } 5624 5625 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5626 XMMRegister xtmp1, int vec_enc) { 5627 if (opcode == Op_SignumVD) { 5628 vsubpd(dst, zero, one, vec_enc); 5629 // if src < 0 ? -1 : 1 5630 vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1); 5631 // if src == NaN, -0.0 or 0.0 return src. 5632 vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc); 5633 vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1); 5634 } else { 5635 assert(opcode == Op_SignumVF, ""); 5636 vsubps(dst, zero, one, vec_enc); 5637 // if src < 0 ? -1 : 1 5638 vblendvps(dst, one, dst, src, vec_enc, true, xtmp1); 5639 // if src == NaN, -0.0 or 0.0 return src. 
    vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
    vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
  }
}

void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
  if (VM_Version::supports_avx512bw()) {
    if (mask_len > 32) {
      kmovql(dst, src);
    } else {
      kmovdl(dst, src);
      if (mask_len != 32) {
        kshiftrdl(dst, dst, 32 - mask_len);
      }
    }
  } else {
    assert(mask_len <= 16, "");
    kmovwl(dst, src);
    if (mask_len != 16) {
      kshiftrwl(dst, dst, 16 - mask_len);
    }
  }
}

void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
  int lane_size = type2aelembytes(bt);
  bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
  if ((is_LP64 || lane_size < 8) &&
      ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
       (is_subword_type(bt) && VM_Version::supports_avx512vlbw()))) {
    movptr(rtmp, imm32);
    switch(lane_size) {
      case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
      case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
      case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
      case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
      default : fatal("Unsupported lane size %d", lane_size); break;
    }
  } else {
    movptr(rtmp, imm32);
    LP64_ONLY(movq(dst, rtmp)) NOT_LP64(movdl(dst, rtmp));
    switch(lane_size) {
      case 1 : vpbroadcastb(dst, dst, vec_enc); break;
      case 2 : vpbroadcastw(dst, dst, vec_enc); break;
      case 4 : vpbroadcastd(dst, dst, vec_enc); break;
      case 8 : vpbroadcastq(dst, dst, vec_enc); break;
      default : fatal("Unsupported lane size %d", lane_size); break;
    }
  }
}

//
// The following is a lookup table based popcount computation algorithm:-
//           Index   Bit set count
//     [ 0000 ->   0,
//       0001 ->   1,
//       0010 ->   1,
//       0011 ->   2,
//       0100 ->   1,
//       0101 ->   2,
//       0110 ->   2,
//       0111 ->   3,
//       1000 ->   1,
//       1001 ->   2,
//       1010 ->   2,
//       1011 ->   3,
//       1100 ->   2,
//       1101 ->   3,
//       1110 ->   3,
//       1111 ->   4 ]
// a. Count the number of 1s in the 4 LSB bits of each byte. These bits are used as
//    shuffle indices for lookup table access.
// b. Right shift each byte of the vector lane by 4 positions.
// c. Count the number of 1s in the 4 MSB bits of each byte. These bits are used as
//    shuffle indices for lookup table access.
// d. Add the bitset count of the upper and lower 4 bits of each byte.
// e. Unpack double words to quad words and compute the sum of absolute differences of the
//    bitset counts of all the bytes of a quadword.
// f. Perform step e. for the upper 128bit vector lane.
// g. Pack the bitset count of quadwords back to double word.
// h. Unpacking and packing operations are not needed for 64bit vector lane.
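//
// A scalar sketch of the per-byte lookup step (a. through d. above); this is
// illustrative only, the vector code performs the same 16-entry table lookup
// with vpshufb:
//
//   static const uint8_t popcnt_lut[16] = {0, 1, 1, 2, 1, 2, 2, 3,
//                                          1, 2, 2, 3, 2, 3, 3, 4};
//   uint8_t popcount_byte(uint8_t b) {
//     return popcnt_lut[b & 0x0F] + popcnt_lut[b >> 4];
//   }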
5722 5723 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5724 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5725 assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), ""); 5726 vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc); 5727 vpsrlw(dst, src, 4, vec_enc); 5728 vpand(dst, dst, xtmp1, vec_enc); 5729 vpand(xtmp1, src, xtmp1, vec_enc); 5730 vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg); 5731 vpshufb(xtmp1, xtmp2, xtmp1, vec_enc); 5732 vpshufb(dst, xtmp2, dst, vec_enc); 5733 vpaddb(dst, dst, xtmp1, vec_enc); 5734 } 5735 5736 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5737 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5738 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5739 // Following code is as per steps e,f,g and h of above algorithm. 5740 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5741 vpunpckhdq(dst, xtmp1, xtmp2, vec_enc); 5742 vpsadbw(dst, dst, xtmp2, vec_enc); 5743 vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc); 5744 vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc); 5745 vpackuswb(dst, xtmp1, dst, vec_enc); 5746 } 5747 5748 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5749 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5750 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5751 // Add the popcount of upper and lower bytes of word. 5752 vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc); 5753 vpsrlw(dst, xtmp1, 8, vec_enc); 5754 vpand(xtmp1, xtmp1, xtmp2, vec_enc); 5755 vpaddw(dst, dst, xtmp1, vec_enc); 5756 } 5757 5758 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5759 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5760 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5761 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5762 vpsadbw(dst, xtmp1, xtmp2, vec_enc); 5763 } 5764 5765 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5766 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5767 switch(bt) { 5768 case T_LONG: 5769 vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5770 break; 5771 case T_INT: 5772 vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5773 break; 5774 case T_CHAR: 5775 case T_SHORT: 5776 vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5777 break; 5778 case T_BYTE: 5779 case T_BOOLEAN: 5780 vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5781 break; 5782 default: 5783 fatal("Unsupported type %s", type2name(bt)); 5784 break; 5785 } 5786 } 5787 5788 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src, 5789 KRegister mask, bool merge, int vec_enc) { 5790 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 5791 switch(bt) { 5792 case T_LONG: 5793 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 5794 evpopcntq(dst, mask, src, merge, vec_enc); 5795 break; 5796 case T_INT: 5797 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 5798 evpopcntd(dst, mask, src, merge, vec_enc); 5799 break; 5800 case T_CHAR: 5801 case T_SHORT: 5802 assert(VM_Version::supports_avx512_bitalg(), ""); 5803 evpopcntw(dst, mask, src, merge, vec_enc); 5804 break; 5805 case T_BYTE: 5806 case T_BOOLEAN: 5807 assert(VM_Version::supports_avx512_bitalg(), ""); 5808 evpopcntb(dst, mask, 
src, merge, vec_enc); 5809 break; 5810 default: 5811 fatal("Unsupported type %s", type2name(bt)); 5812 break; 5813 } 5814 } 5815 5816 #ifndef _LP64 5817 void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) { 5818 assert(VM_Version::supports_avx512bw(), ""); 5819 kmovdl(tmp, src); 5820 kunpckdql(dst, tmp, tmp); 5821 } 5822 #endif 5823 5824 // Bit reversal algorithm first reverses the bits of each byte followed by 5825 // a byte level reversal for multi-byte primitive types (short/int/long). 5826 // Algorithm performs a lookup table access to get reverse bit sequence 5827 // corresponding to a 4 bit value. Thus a reverse bit sequence for a byte 5828 // is obtained by swapping the reverse bit sequences of upper and lower 5829 // nibble of a byte. 5830 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5831 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5832 if (VM_Version::supports_avx512vlbw()) { 5833 5834 // Get the reverse bit sequence of lower nibble of each byte. 5835 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg); 5836 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 5837 evpandq(dst, xtmp2, src, vec_enc); 5838 vpshufb(dst, xtmp1, dst, vec_enc); 5839 vpsllq(dst, dst, 4, vec_enc); 5840 5841 // Get the reverse bit sequence of upper nibble of each byte. 5842 vpandn(xtmp2, xtmp2, src, vec_enc); 5843 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 5844 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 5845 5846 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 5847 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 5848 evporq(xtmp2, dst, xtmp2, vec_enc); 5849 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 5850 5851 } else if(vec_enc == Assembler::AVX_512bit) { 5852 // Shift based bit reversal. 5853 assert(bt == T_LONG || bt == T_INT, ""); 5854 5855 // Swap lower and upper nibble of each byte. 5856 vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc); 5857 5858 // Swap two least and most significant bits of each nibble. 5859 vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc); 5860 5861 // Swap adjacent pair of bits. 5862 evmovdqul(xtmp1, k0, dst, true, vec_enc); 5863 vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc); 5864 5865 evmovdqul(xtmp1, k0, dst, true, vec_enc); 5866 vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc); 5867 } else { 5868 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp); 5869 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 5870 5871 // Get the reverse bit sequence of lower nibble of each byte. 5872 vpand(dst, xtmp2, src, vec_enc); 5873 vpshufb(dst, xtmp1, dst, vec_enc); 5874 vpsllq(dst, dst, 4, vec_enc); 5875 5876 // Get the reverse bit sequence of upper nibble of each byte. 5877 vpandn(xtmp2, xtmp2, src, vec_enc); 5878 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 5879 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 5880 5881 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 5882 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 
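    // A scalar sketch of the per-byte step being completed here (illustrative
    // only; rev4 is a hypothetical name for a 4-bit bit-reversal table, the
    // vector code reads its equivalent via vpshufb):
    //
    //   static const uint8_t rev4[16] = {0x0, 0x8, 0x4, 0xC, 0x2, 0xA, 0x6, 0xE,
    //                                    0x1, 0x9, 0x5, 0xD, 0x3, 0xB, 0x7, 0xF};
    //   uint8_t reverse_byte(uint8_t b) {
    //     return (uint8_t)((rev4[b & 0x0F] << 4) | rev4[b >> 4]);
    //   }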
5883 vpor(xtmp2, dst, xtmp2, vec_enc); 5884 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 5885 } 5886 } 5887 5888 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc, 5889 XMMRegister xtmp, Register rscratch) { 5890 assert(VM_Version::supports_gfni(), ""); 5891 assert(rscratch != noreg || always_reachable(mask), "missing"); 5892 5893 // Galois field instruction based bit reversal based on following algorithm. 5894 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 5895 vpbroadcastq(xtmp, mask, vec_enc, rscratch); 5896 vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc); 5897 vector_reverse_byte(bt, dst, xtmp, vec_enc); 5898 } 5899 5900 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src, 5901 XMMRegister xtmp1, Register rtmp, int vec_enc) { 5902 vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc); 5903 evpandq(dst, xtmp1, src, vec_enc); 5904 vpsllq(dst, dst, nbits, vec_enc); 5905 vpandn(xtmp1, xtmp1, src, vec_enc); 5906 vpsrlq(xtmp1, xtmp1, nbits, vec_enc); 5907 evporq(dst, dst, xtmp1, vec_enc); 5908 } 5909 5910 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5911 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5912 // Shift based bit reversal. 5913 assert(VM_Version::supports_evex(), ""); 5914 switch(bt) { 5915 case T_LONG: 5916 // Swap upper and lower double word of each quad word. 5917 evprorq(xtmp1, k0, src, 32, true, vec_enc); 5918 evprord(xtmp1, k0, xtmp1, 16, true, vec_enc); 5919 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 5920 break; 5921 case T_INT: 5922 // Swap upper and lower word of each double word. 5923 evprord(xtmp1, k0, src, 16, true, vec_enc); 5924 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 5925 break; 5926 case T_CHAR: 5927 case T_SHORT: 5928 // Swap upper and lower byte of each word. 5929 vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc); 5930 break; 5931 case T_BYTE: 5932 evmovdquq(dst, k0, src, true, vec_enc); 5933 break; 5934 default: 5935 fatal("Unsupported type %s", type2name(bt)); 5936 break; 5937 } 5938 } 5939 5940 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) { 5941 if (bt == T_BYTE) { 5942 if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) { 5943 evmovdquq(dst, k0, src, true, vec_enc); 5944 } else { 5945 vmovdqu(dst, src); 5946 } 5947 return; 5948 } 5949 // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using 5950 // pre-computed shuffle indices. 
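  // The masks presumably reverse the bytes within each element; for T_INT, for
  // instance, the per-lane shuffle indices follow the usual bswap pattern
  // {3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12}.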
5951 switch(bt) { 5952 case T_LONG: 5953 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg); 5954 break; 5955 case T_INT: 5956 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg); 5957 break; 5958 case T_CHAR: 5959 case T_SHORT: 5960 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg); 5961 break; 5962 default: 5963 fatal("Unsupported type %s", type2name(bt)); 5964 break; 5965 } 5966 vpshufb(dst, src, dst, vec_enc); 5967 } 5968 5969 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src, 5970 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 5971 KRegister ktmp, Register rtmp, bool merge, int vec_enc) { 5972 assert(is_integral_type(bt), ""); 5973 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 5974 assert(VM_Version::supports_avx512cd(), ""); 5975 switch(bt) { 5976 case T_LONG: 5977 evplzcntq(dst, ktmp, src, merge, vec_enc); 5978 break; 5979 case T_INT: 5980 evplzcntd(dst, ktmp, src, merge, vec_enc); 5981 break; 5982 case T_SHORT: 5983 vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc); 5984 vpunpcklwd(xtmp2, xtmp1, src, vec_enc); 5985 evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc); 5986 vpunpckhwd(dst, xtmp1, src, vec_enc); 5987 evplzcntd(dst, ktmp, dst, merge, vec_enc); 5988 vpackusdw(dst, xtmp2, dst, vec_enc); 5989 break; 5990 case T_BYTE: 5991 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 5992 // accessing the lookup table. 5993 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 5994 // accessing the lookup table. 5995 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 5996 assert(VM_Version::supports_avx512bw(), ""); 5997 evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp); 5998 vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc); 5999 vpand(xtmp2, dst, src, vec_enc); 6000 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 6001 vpsrlw(xtmp3, src, 4, vec_enc); 6002 vpand(xtmp3, dst, xtmp3, vec_enc); 6003 vpshufb(dst, xtmp1, xtmp3, vec_enc); 6004 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 6005 evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc); 6006 evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc); 6007 break; 6008 default: 6009 fatal("Unsupported type %s", type2name(bt)); 6010 break; 6011 } 6012 } 6013 6014 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6015 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6016 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp); 6017 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 6018 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 6019 // accessing the lookup table. 6020 vpand(dst, xtmp2, src, vec_enc); 6021 vpshufb(dst, xtmp1, dst, vec_enc); 6022 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 6023 // accessing the lookup table. 6024 vpsrlw(xtmp3, src, 4, vec_enc); 6025 vpand(xtmp3, xtmp2, xtmp3, vec_enc); 6026 vpshufb(xtmp2, xtmp1, xtmp3, vec_enc); 6027 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 
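  // A scalar sketch of this per-byte combine (illustrative only; clz4 is a
  // hypothetical stand-in for the lookup table read via vpshufb, with clz4[0] == 4):
  //
  //   static const uint8_t clz4[16] = {4, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0};
  //   uint8_t clz_byte(uint8_t b) {
  //     uint8_t hi = b >> 4;
  //     return hi ? clz4[hi] : (uint8_t)(clz4[hi] + clz4[b & 0x0F]);  // hi == 0 adds 4
  //   }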
6028 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 6029 vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc); 6030 vpaddb(dst, dst, xtmp2, vec_enc); 6031 vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc); 6032 } 6033 6034 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6035 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6036 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6037 // Add zero counts of lower byte and upper byte of a word if 6038 // upper byte holds a zero value. 6039 vpsrlw(xtmp3, src, 8, vec_enc); 6040 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx. 6041 vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc); 6042 vpsllw(xtmp2, dst, 8, vec_enc); 6043 vpaddw(xtmp2, xtmp2, dst, vec_enc); 6044 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 6045 vpsrlw(dst, dst, 8, vec_enc); 6046 } 6047 6048 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6049 XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) { 6050 // Since IEEE 754 floating point format represents mantissa in 1.0 format 6051 // hence biased exponent can be used to compute leading zero count as per 6052 // following formula:- 6053 // LZCNT = 32 - (biased_exp - 127) 6054 // Special handling has been introduced for Zero, Max_Int and -ve source values. 6055 6056 // Broadcast 0xFF 6057 vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc); 6058 vpsrld(xtmp1, xtmp1, 24, vec_enc); 6059 6060 // Extract biased exponent. 6061 vcvtdq2ps(dst, src, vec_enc); 6062 vpsrld(dst, dst, 23, vec_enc); 6063 vpand(dst, dst, xtmp1, vec_enc); 6064 6065 // Broadcast 127. 6066 vpsrld(xtmp1, xtmp1, 1, vec_enc); 6067 // Exponent = biased_exp - 127 6068 vpsubd(dst, dst, xtmp1, vec_enc); 6069 6070 // Exponent = Exponent + 1 6071 vpsrld(xtmp3, xtmp1, 6, vec_enc); 6072 vpaddd(dst, dst, xtmp3, vec_enc); 6073 6074 // Replace -ve exponent with zero, exponent is -ve when src 6075 // lane contains a zero value. 6076 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 6077 vblendvps(dst, dst, xtmp2, dst, vec_enc); 6078 6079 // Rematerialize broadcast 32. 6080 vpslld(xtmp1, xtmp3, 5, vec_enc); 6081 // Exponent is 32 if corresponding source lane contains max_int value. 6082 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 6083 // LZCNT = 32 - exponent 6084 vpsubd(dst, xtmp1, dst, vec_enc); 6085 6086 // Replace LZCNT with a value 1 if corresponding source lane 6087 // contains max_int value. 6088 vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc); 6089 6090 // Replace biased_exp with 0 if source lane value is less than zero. 6091 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 6092 vblendvps(dst, dst, xtmp2, src, vec_enc); 6093 } 6094 6095 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 6096 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 6097 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 6098 // Add zero counts of lower word and upper word of a double word if 6099 // upper word holds a zero value. 6100 vpsrld(xtmp3, src, 16, vec_enc); 6101 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx. 6102 vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc); 6103 vpslld(xtmp2, dst, 16, vec_enc); 6104 vpaddd(xtmp2, xtmp2, dst, vec_enc); 6105 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 6106 vpsrld(dst, dst, 16, vec_enc); 6107 // Add zero counts of lower doubleword and upper doubleword of a 6108 // quadword if upper doubleword holds a zero value. 
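  // i.e. clz64(x) = (upper dword == 0) ? 32 + clz32(lower dword) : clz32(upper dword),
  // computed branch-free with a compare-and-blend.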
  vpsrlq(xtmp3, src, 32, vec_enc);
  vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc);
  vpsllq(xtmp2, dst, 32, vec_enc);
  vpaddq(xtmp2, xtmp2, dst, vec_enc);
  vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
  vpsrlq(dst, dst, 32, vec_enc);
}

void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
                                                       XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
                                                       Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "unexpected type");
  assert(vec_enc < Assembler::AVX_512bit, "");
  switch(bt) {
    case T_LONG:
      vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
      break;
    case T_INT:
      vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
      break;
    case T_SHORT:
      vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
      break;
    case T_BYTE:
      vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
}

void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
  switch(bt) {
    case T_BYTE:
      vpsubb(dst, src1, src2, vec_enc);
      break;
    case T_SHORT:
      vpsubw(dst, src1, src2, vec_enc);
      break;
    case T_INT:
      vpsubd(dst, src1, src2, vec_enc);
      break;
    case T_LONG:
      vpsubq(dst, src1, src2, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
}

// Trailing zero count computation is based on the leading zero count operation, as per
// the following identity. All AVX3 targets support the AVX512CD feature, which offers
// a direct vector instruction to compute the leading zero count.
// CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
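// For example (illustrative, for a 32-bit lane): x = 0b1000 gives (x - 1) & ~x = 0b0111,
// whose leading zero count is 29, so CTZ = 32 - 29 = 3.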
void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
                                                         XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "");
  // xtmp = -1
  vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
  // xtmp = xtmp + src
  vpadd(bt, xtmp4, xtmp4, src, vec_enc);
  // xtmp = xtmp & ~src
  vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
  vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
  vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
  vpsub(bt, dst, xtmp4, dst, vec_enc);
}

// Trailing zero count computation for AVX2 targets is based on the popcount operation,
// as per the following identity:
// CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                        XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "");
  // xtmp = 0
  vpxor(xtmp3, xtmp3, xtmp3, vec_enc);
  // xtmp = 0 - src
  vpsub(bt, xtmp3, xtmp3, src, vec_enc);
  // xtmp = xtmp | src
  vpor(xtmp3, xtmp3, src, vec_enc);
  vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
  vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
  vpsub(bt, dst, xtmp1, dst, vec_enc);
}

void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
  // See Hacker's Delight (2nd ed), section 9.3, which is implemented in java.lang.Long.divideUnsigned()
  movl(rdx, rax);
  subl(rdx, divisor);
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rdx, rax);
  } else {
    notl(rdx);
    andl(rax, rdx);
  }
  shrl(rax, 31);
  bind(done);
}

void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath when divisor < 0:
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3, which is implemented in java.lang.Long.remainderUnsigned()
  movl(rdx, rax);
  subl(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rax, rdx);
  } else {
    notl(rax);
    andl(rax, rdx);
  }
  sarl(rax, 31);
  andl(rax, divisor);
  subl(rdx, rax);
  bind(done);
}
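// Note on the divisor < 0 fastpaths used by the unsigned division helpers here: a divisor
// with its sign bit set is at least 2^31 (2^63 for the long variants) as an unsigned value,
// so the unsigned quotient can only be 0 or 1, and it is 1 exactly when
// dividend >= divisor (unsigned comparison), which is what the branch-free expression
// (dividend & ~(dividend - divisor)) >>> (SIZE - 1) computes.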
void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
  Label done;
  Label neg_divisor_fastpath;

  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3, which is implemented in
  // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
  movl(rdx, rax);
  subl(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rax, rdx);
  } else {
    notl(rax);
    andl(rax, rdx);
  }
  movl(tmp, rax);
  shrl(rax, 31); // quotient
  sarl(tmp, 31);
  andl(tmp, divisor);
  subl(rdx, tmp); // remainder
  bind(done);
}

#ifdef _LP64
void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
                                 XMMRegister xtmp2, Register rtmp) {
  if (VM_Version::supports_gfni()) {
    // Bit reversal using the Galois field affine instruction, based on the following algorithm:
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
    mov64(rtmp, 0x8040201008040201L);
    movq(xtmp1, src);
    movq(xtmp2, rtmp);
    gf2p8affineqb(xtmp1, xtmp2, 0);
    movq(dst, xtmp1);
  } else {
    // Swap even and odd numbered bits.
    movl(rtmp, src);
    andl(rtmp, 0x55555555);
    shll(rtmp, 1);
    movl(dst, src);
    andl(dst, 0xAAAAAAAA);
    shrl(dst, 1);
    orl(dst, rtmp);

    // Swap LSB and MSB 2 bits of each nibble.
    movl(rtmp, dst);
    andl(rtmp, 0x33333333);
    shll(rtmp, 2);
    andl(dst, 0xCCCCCCCC);
    shrl(dst, 2);
    orl(dst, rtmp);

    // Swap LSB and MSB 4 bits of each byte.
    movl(rtmp, dst);
    andl(rtmp, 0x0F0F0F0F);
    shll(rtmp, 4);
    andl(dst, 0xF0F0F0F0);
    shrl(dst, 4);
    orl(dst, rtmp);
  }
  bswapl(dst);
}
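// The non-GFNI path above (and its 64-bit counterpart below) is the classic SWAR bit
// reversal. Illustrative 32-bit scalar form of the same steps (x unsigned; byte_swap()
// stands for the final bswap step):
//   x = ((x & 0x55555555) << 1) | ((x >> 1) & 0x55555555);  // swap adjacent bits
//   x = ((x & 0x33333333) << 2) | ((x >> 2) & 0x33333333);  // swap 2-bit pairs
//   x = ((x & 0x0F0F0F0F) << 4) | ((x >> 4) & 0x0F0F0F0F);  // swap nibbles
//   x = byte_swap(x);                                       // reverse byte order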
void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
                                 XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
  if (VM_Version::supports_gfni()) {
    // Bit reversal using the Galois field affine instruction, based on the following algorithm:
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
    mov64(rtmp1, 0x8040201008040201L);
    movq(xtmp1, src);
    movq(xtmp2, rtmp1);
    gf2p8affineqb(xtmp1, xtmp2, 0);
    movq(dst, xtmp1);
  } else {
    // Swap even and odd numbered bits.
    movq(rtmp1, src);
    mov64(rtmp2, 0x5555555555555555L);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 1);
    movq(dst, src);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 1);
    orq(dst, rtmp1);

    // Swap LSB and MSB 2 bits of each nibble.
    movq(rtmp1, dst);
    mov64(rtmp2, 0x3333333333333333L);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 2);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 2);
    orq(dst, rtmp1);

    // Swap LSB and MSB 4 bits of each byte.
    movq(rtmp1, dst);
    mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 4);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 4);
    orq(dst, rtmp1);
  }
  bswapq(dst);
}

void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divq(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
  // See Hacker's Delight (2nd ed), section 9.3, which is implemented in java.lang.Long.divideUnsigned()
  movq(rdx, rax);
  subq(rdx, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rdx, rax);
  } else {
    notq(rdx);
    andq(rax, rdx);
  }
  shrq(rax, 63);
  bind(done);
}

void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorq(rdx, rdx);
  divq(divisor);
  jmp(done);
  bind(neg_divisor_fastpath);
  // Fastpath when divisor < 0:
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3, which is implemented in java.lang.Long.remainderUnsigned()
  movq(rdx, rax);
  subq(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rax, rdx);
  } else {
    notq(rax);
    andq(rax, rdx);
  }
  sarq(rax, 63);
  andq(rax, divisor);
  subq(rdx, rax);
  bind(done);
}

void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorq(rdx, rdx);
  divq(divisor);
  jmp(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3, which is implemented in
  // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
  movq(rdx, rax);
  subq(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rax, rdx);
  } else {
    notq(rax);
    andq(rax, rdx);
  }
  movq(tmp, rax);
  shrq(rax, 63); // quotient
  sarq(tmp, 63);
  andq(tmp, divisor);
  subq(rdx, tmp); // remainder
  bind(done);
}
#endif

void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
                                        XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
                                        int vlen_enc) {
  assert(VM_Version::supports_avx512bw(), "");
  // Byte shuffles are in-lane operations and the indices are determined using the
  // lower 4 bits of each shuffle lane, so all shuffle indices are effectively
  // normalized to the index range 0-15. This means that indices which differ by a
  // multiple of 16 select the same relative position within a 128-bit lane, e.g.
  // shuffle indices 16, 32 and 48 all select element 0 of their respective 128-bit lanes.
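  // Outline of the sequence below (illustrative): for each 128-bit source lane L in
  // {0, 1, 2, 3}, broadcast lane L across the whole vector, build a mask of the shuffle
  // lanes whose index lies in [16*L, 16*L + 16), and merge the in-lane VPSHUFB result
  // into dst for exactly those lanes.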
  movl(rtmp, 16);
  evpbroadcastb(xtmp1, rtmp, vlen_enc);

  // Compute a mask for the shuffle vector by comparing the indices with the expression INDEX < 16.
  // Broadcast the first 128-bit lane across the entire vector, shuffle the vector lanes using the
  // original shuffle indices and move the shuffled lanes corresponding to a true
  // mask to the destination vector.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
  evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);

  // Perform the above steps with the lane comparison expression INDEX >= 16 && INDEX < 32,
  // broadcasting the second 128-bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
  vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Perform the above steps with the lane comparison expression INDEX >= 32 && INDEX < 48,
  // broadcasting the third 128-bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp2, Assembler::nlt, true, vlen_enc);
  vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Perform the above steps with the lane comparison expression INDEX >= 48 && INDEX < 64,
  // broadcasting the fourth 128-bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
  vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
}

void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
                                                   XMMRegister shuffle, XMMRegister src, int vlen_enc) {
  if (vlen_enc == AVX_128bit) {
    vpermilps(dst, src, shuffle, vlen_enc);
  } else if (bt == T_INT) {
    vpermd(dst, shuffle, src, vlen_enc);
  } else {
    assert(bt == T_FLOAT, "");
    vpermps(dst, shuffle, src, vlen_enc);
  }
}