/*
 * Copyright (c) 2020, 2023, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "oops/methodData.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/output.hpp"
#include "opto/opcodes.hpp"
#include "opto/subnode.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/checkedCast.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

// C2 compiled method's prolog code.
void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {

  // WARNING: Initial instruction MUST be 5 bytes or longer so that
  // NativeJump::patch_verified_entry will be able to patch out the entry
  // code safely. The push to verify stack depth is ok at 5 bytes,
  // the frame allocation can be either 3 or 6 bytes. So if we don't do
  // stack bang then we must use the 6 byte frame allocation even if
  // we have no frame. :-(
  assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");

  assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  // Remove word for return addr
  framesize -= wordSize;
  stack_bang_size -= wordSize;

  // Calls to C2R adapters often do not accept exceptional returns.
  // We require that their callers must bang for them. But be careful, because
  // some VM calls (such as call site linkage) can use several kilobytes of
  // stack. But the stack safety zone should account for that.
  // See bugs 4446381, 4468289, 4497237.
  if (stack_bang_size > 0) {
    generate_stack_overflow_check(stack_bang_size);

    // We always push rbp so that on return to the interpreter rbp will be
    // restored correctly, allowing us to correct the stack.
    push(rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
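    // (With PreserveFramePointer, rbp then points at the just-saved caller
    // rbp, so compiled frames form a conventional rbp-linked frame chain
    // that native stack walkers such as profilers and debuggers can follow.)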
    if (PreserveFramePointer) {
      mov(rbp, rsp);
    }
    // Remove word for ebp
    framesize -= wordSize;

    // Create frame
    if (framesize) {
      subptr(rsp, framesize);
    }
  } else {
    // Create frame (force generation of a 4 byte immediate value)
    subptr_imm32(rsp, framesize);

    // Save RBP register now.
    framesize -= wordSize;
    movptr(Address(rsp, framesize), rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      movptr(rbp, rsp);
      if (framesize > 0) {
        addptr(rbp, framesize);
      }
    }
  }

  if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
    framesize -= wordSize;
    movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
  }

#ifndef _LP64
  // If method sets FPU control word do it now
  if (fp_mode_24b) {
    fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24()));
  }
  if (UseSSE >= 2 && VerifyFPU) {
    verify_FPU(0, "FPU stack must be clean on entry");
  }
#endif

#ifdef ASSERT
  if (VerifyStackAtCalls) {
    Label L;
    push(rax);
    mov(rax, rsp);
    andptr(rax, StackAlignmentInBytes-1);
    cmpptr(rax, StackAlignmentInBytes-wordSize);
    pop(rax);
    jcc(Assembler::equal, L);
    STOP("Stack is not properly aligned!");
    bind(L);
  }
#endif

  if (!is_stub) {
    BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
#ifdef _LP64
    if (BarrierSet::barrier_set()->barrier_set_nmethod() != nullptr) {
      // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
      Label dummy_slow_path;
      Label dummy_continuation;
      Label* slow_path = &dummy_slow_path;
      Label* continuation = &dummy_continuation;
      if (!Compile::current()->output()->in_scratch_emit_size()) {
        // Use real labels from actual stub when not emitting code for the purpose of measuring its size
        C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
        Compile::current()->output()->add_stub(stub);
        slow_path = &stub->entry();
        continuation = &stub->continuation();
      }
      bs->nmethod_entry_barrier(this, slow_path, continuation);
    }
#else
    // Don't bother with out-of-line nmethod entry barrier stub for x86_32.
    bs->nmethod_entry_barrier(this, nullptr /* slow_path */, nullptr /* continuation */);
#endif
  }
}
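
// Map a vector length in bytes onto the AVX length encoding used by the
// assembler; 4- and 8-byte vectors still use the 128-bit (XMM) encoding,
// since there is no narrower SIMD form.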
inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
  switch (vlen_in_bytes) {
    case  4: // fall-through
    case  8: // fall-through
    case 16: return Assembler::AVX_128bit;
    case 32: return Assembler::AVX_256bit;
    case 64: return Assembler::AVX_512bit;

    default: {
      ShouldNotReachHere();
      return Assembler::AVX_NoVec;
    }
  }
}

#if INCLUDE_RTM_OPT

// Update rtm_counters based on abort status
// input: abort_status
//        rtm_counters (RTMLockingCounters*)
// flags are killed
void C2_MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {

  atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
  if (PrintPreciseRTMLockingStatistics) {
    for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
      Label check_abort;
      testl(abort_status, (1<<i));
      jccb(Assembler::equal, check_abort);
      atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
      bind(check_abort);
    }
  }
}

// Branch if (random & (count-1) != 0), count is 2^n
// tmp, scr and flags are killed
void C2_MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
  assert(tmp == rax, "");
  assert(scr == rdx, "");
  rdtsc(); // modifies EDX:EAX
  andptr(tmp, count-1);
  jccb(Assembler::notZero, brLabel);
}

// Perform abort ratio calculation, set no_rtm bit if high ratio
// input:  rtm_counters_Reg (RTMLockingCounters* address)
// tmpReg, rtm_counters_Reg and flags are killed
void C2_MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
                                                    Register rtm_counters_Reg,
                                                    RTMLockingCounters* rtm_counters,
                                                    Metadata* method_data) {
  Label L_done, L_check_always_rtm1, L_check_always_rtm2;

  if (RTMLockingCalculationDelay > 0) {
    // Delay calculation
    movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()));
    testptr(tmpReg, tmpReg);
    jccb(Assembler::equal, L_done);
  }
  // Abort ratio calculation only if abort_count > RTMAbortThreshold
  //   Aborted transactions = abort_count * 100
  //   All transactions = total_count * RTMTotalCountIncrRate
  //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)

  movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
  cmpptr(tmpReg, RTMAbortThreshold);
  jccb(Assembler::below, L_check_always_rtm2);
  imulptr(tmpReg, tmpReg, 100);

  Register scrReg = rtm_counters_Reg;
  movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
  imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
  imulptr(scrReg, scrReg, RTMAbortRatio);
  cmpptr(tmpReg, scrReg);
  jccb(Assembler::below, L_check_always_rtm1);
  if (method_data != nullptr) {
    // set rtm_state to "no rtm" in MDO
    mov_metadata(tmpReg, method_data);
    lock();
    orl(Address(tmpReg, MethodData::rtm_state_offset()), NoRTM);
  }
  jmpb(L_done);
  bind(L_check_always_rtm1);
  // Reload RTMLockingCounters* address
  lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
  bind(L_check_always_rtm2);
  movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
  cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
  jccb(Assembler::below, L_done);
  if (method_data != nullptr) {
    // set rtm_state to "always rtm" in MDO
    mov_metadata(tmpReg, method_data);
    lock();
    orl(Address(tmpReg, MethodData::rtm_state_offset()), UseRTM);
  }
  bind(L_done);
}

// Update counters and perform abort ratio calculation
// input: abort_status_Reg
// rtm_counters_Reg, flags are killed
void C2_MacroAssembler::rtm_profiling(Register abort_status_Reg,
                                      Register rtm_counters_Reg,
                                      RTMLockingCounters* rtm_counters,
                                      Metadata* method_data,
                                      bool profile_rtm) {

  assert(rtm_counters != nullptr, "should not be null when profiling RTM");
  // update rtm counters based on rax value at abort
  // reads abort_status_Reg, updates flags
  lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
  rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
  if (profile_rtm) {
    // Save abort status because abort_status_Reg is used by following code.
    if (RTMRetryCount > 0) {
      push(abort_status_Reg);
    }
    assert(rtm_counters != nullptr, "should not be null when profiling RTM");
    rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
    // restore abort status
    if (RTMRetryCount > 0) {
      pop(abort_status_Reg);
    }
  }
}

// Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
// inputs: retry_count_Reg
//       : abort_status_Reg
// output: retry_count_Reg decremented by 1
// flags are killed
void C2_MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
  Label doneRetry;
  assert(abort_status_Reg == rax, "");
  // The abort reason bits are in eax (see all states in rtmLocking.hpp)
  // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
  // if reason is in 0x6 and retry count != 0 then retry
  andptr(abort_status_Reg, 0x6);
  jccb(Assembler::zero, doneRetry);
  testl(retry_count_Reg, retry_count_Reg);
  jccb(Assembler::zero, doneRetry);
  pause();
  decrementl(retry_count_Reg);
  jmp(retryLabel);
  bind(doneRetry);
}

// Spin and retry if lock is busy.
// inputs: box_Reg (monitor address)
//       : retry_count_Reg
// output: retry_count_Reg decremented by 1
//       : clear z flag if retry count exceeded
// tmp_Reg, scr_Reg, flags are killed
void C2_MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
                                               Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
  Label SpinLoop, SpinExit, doneRetry;
  int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);

  testl(retry_count_Reg, retry_count_Reg);
  jccb(Assembler::zero, doneRetry);
  decrementl(retry_count_Reg);
  movptr(scr_Reg, RTMSpinLoopCount);

  bind(SpinLoop);
  pause();
  decrementl(scr_Reg);
  jccb(Assembler::lessEqual, SpinExit);
  movptr(tmp_Reg, Address(box_Reg, owner_offset));
  testptr(tmp_Reg, tmp_Reg);
  jccb(Assembler::notZero, SpinLoop);

  bind(SpinExit);
  jmp(retryLabel);
  bind(doneRetry);
  incrementl(retry_count_Reg); // clear z flag
}
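
// A note on the RTM sequences below: xbegin() opens a hardware transaction
// and records the abort handler address; on abort the CPU rolls back all
// memory effects of the transaction, deposits an abort-status bitmask in
// EAX, and resumes at the handler. The emitted shape is therefore roughly:
//   xbegin(L_on_abort);  // speculative region begins
//   ...                  // loads/stores tracked by the hardware
//   xend();              // commit
//   bind(L_on_abort);    // abort path, EAX = status bits
// (a sketch of the Intel TSX protocol these helpers target, not extra
// emitted code).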
// Use RTM for normal stack locks
// Input: objReg (object to lock)
void C2_MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
                                          Register retry_on_abort_count_Reg,
                                          RTMLockingCounters* stack_rtm_counters,
                                          Metadata* method_data, bool profile_rtm,
                                          Label& DONE_LABEL, Label& IsInflated) {
  assert(UseRTMForStackLocks, "why call this otherwise?");
  assert(tmpReg == rax, "");
  assert(scrReg == rdx, "");
  Label L_rtm_retry, L_decrement_retry, L_on_abort;

  if (RTMRetryCount > 0) {
    movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
    bind(L_rtm_retry);
  }
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
  testptr(tmpReg, markWord::monitor_value);  // inflated vs stack-locked|neutral
  jcc(Assembler::notZero, IsInflated);

  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    Label L_noincrement;
    if (RTMTotalCountIncrRate > 1) {
      // tmpReg, scrReg and flags are killed
      branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
    }
    assert(stack_rtm_counters != nullptr, "should not be null when profiling RTM");
    atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
    bind(L_noincrement);
  }
  xbegin(L_on_abort);
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
  andptr(tmpReg, markWord::lock_mask_in_place);                     // look at 2 lock bits
  cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 01 unlocked
  jcc(Assembler::equal, DONE_LABEL);                                // all done if unlocked

  Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
  if (UseRTMXendForLockBusy) {
    xend();
    movptr(abort_status_Reg, 0x2); // Set the abort status to 2 (so we can retry)
    jmp(L_decrement_retry);
  }
  else {
    xabort(0);
  }
  bind(L_on_abort);
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
  }
  bind(L_decrement_retry);
  if (RTMRetryCount > 0) {
    // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
  }
}
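
// For inflated locks, RTM elides the monitor: inside the transaction the
// code below only loads ObjectMonitor::_owner and proceeds when it is null.
// Because nothing is stored to _owner on the elided path, concurrent elided
// lockers do not conflict with each other; hardware conflict detection
// aborts the transaction if some thread really acquires the monitor.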
// Use RTM for inflating locks
// inputs: objReg (object to lock)
//         boxReg (on-stack box address (displaced header location) - KILLED)
//         tmpReg (ObjectMonitor address + markWord::monitor_value)
void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
                                             Register scrReg, Register retry_on_busy_count_Reg,
                                             Register retry_on_abort_count_Reg,
                                             RTMLockingCounters* rtm_counters,
                                             Metadata* method_data, bool profile_rtm,
                                             Label& DONE_LABEL) {
  assert(UseRTMLocking, "why call this otherwise?");
  assert(tmpReg == rax, "");
  assert(scrReg == rdx, "");
  Label L_rtm_retry, L_decrement_retry, L_on_abort;
  int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);

  movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
  movptr(boxReg, tmpReg); // Save ObjectMonitor address

  if (RTMRetryCount > 0) {
    movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
    movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
    bind(L_rtm_retry);
  }
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    Label L_noincrement;
    if (RTMTotalCountIncrRate > 1) {
      // tmpReg, scrReg and flags are killed
      branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
    }
    assert(rtm_counters != nullptr, "should not be null when profiling RTM");
    atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
    bind(L_noincrement);
  }
  xbegin(L_on_abort);
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
  movptr(tmpReg, Address(tmpReg, owner_offset));
  testptr(tmpReg, tmpReg);
  jcc(Assembler::zero, DONE_LABEL);
  if (UseRTMXendForLockBusy) {
    xend();
    jmp(L_decrement_retry);
  }
  else {
    xabort(0);
  }
  bind(L_on_abort);
  Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
  }
  if (RTMRetryCount > 0) {
    // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
  }

  movptr(tmpReg, Address(boxReg, owner_offset));
  testptr(tmpReg, tmpReg);
  jccb(Assembler::notZero, L_decrement_retry);

  // Appears unlocked - try to swing _owner from null to non-null.
  // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
#ifdef _LP64
  Register threadReg = r15_thread;
#else
  get_thread(scrReg);
  Register threadReg = scrReg;
#endif
  lock();
  cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg

  if (RTMRetryCount > 0) {
    // success done else retry
    jccb(Assembler::equal, DONE_LABEL);
    bind(L_decrement_retry);
    // Spin and retry if lock is busy.
    rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
  }
  else {
    bind(L_decrement_retry);
  }
}

#endif // INCLUDE_RTM_OPT

// fast_lock and fast_unlock used by C2

// Because the transitions from emitted code to the runtime
// monitorenter/exit helper stubs are so slow it's critical that
// we inline both the stack-locking fast path and the inflated fast path.
//
// See also: cmpFastLock and cmpFastUnlock.
//
// What follows is a specialized inline transliteration of the code
// in enter() and exit(). If we're concerned about I$ bloat another
// option would be to emit TrySlowEnter and TrySlowExit methods
// at startup-time. These methods would accept arguments as
// (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
// indications in the icc.ZFlag. fast_lock and fast_unlock would simply
// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
// In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
//
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods. That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
// to park() or unpark() threads. We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) explicit barriers or fence operations.
//
// TODO:
//
// *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
//    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
//    Given TLAB allocation, Self is usually manifested in a register, so passing it into
//    the lock operators would typically be faster than reifying Self.
//
// *  Ideally I'd define the primitives as:
//      fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
//      fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
//    Unfortunately ADLC bugs prevent us from expressing the ideal form.
//    Instead, we're stuck with the rather awkward and brittle register assignments below.
//    Furthermore the register assignments are overconstrained, possibly resulting in
//    sub-optimal code near the synchronization site.
//
// *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
//    Alternately, use a better sp-proximity test.
//
// *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
//    Either one is sufficient to uniquely identify a thread.
//    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
//
// *  Intrinsify notify() and notifyAll() for the common cases where the
//    object is locked by the calling thread but the waitlist is empty.
//    Avoid the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
//
// *  Use jccb and jmpb instead of jcc and jmp to improve code density.
//    But beware of excessive branch density on AMD Opterons.
//
// *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
//    or failure of the fast path. If the fast path fails then we pass
//    control to the slow path, typically in C. In fast_lock and
//    fast_unlock we often branch to DONE_LABEL, just to find that C2
//    will emit a conditional branch immediately after the node.
//    So we have branches to branches and lots of ICC.ZF games.
//    Instead, it might be better to have C2 pass a "FailureLabel"
//    into fast_lock and fast_unlock. In the case of success, control
//    will drop through the node. ICC.ZF is undefined at exit.
//    In the case of failure, the node will branch directly to the
//    FailureLabel.


// obj: object to lock
// box: on-stack box address (displaced header location) - KILLED
// rax: tmp -- KILLED
// scr: tmp -- KILLED
void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
                                  Register scrReg, Register cx1Reg, Register cx2Reg, Register thread,
                                  RTMLockingCounters* rtm_counters,
                                  RTMLockingCounters* stack_rtm_counters,
                                  Metadata* method_data,
                                  bool use_rtm, bool profile_rtm) {
  // Ensure the register assignments are disjoint
  assert(tmpReg == rax, "");

  if (use_rtm) {
    assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
  } else {
    assert(cx1Reg == noreg, "");
    assert(cx2Reg == noreg, "");
    assert_different_registers(objReg, boxReg, tmpReg, scrReg);
  }

  // Possible cases that we'll encounter in fast_lock
  // ------------------------------------------------
  // * Inflated
  //    -- unlocked
  //    -- Locked
  //       = by self
  //       = by other
  // * neutral
  // * stack-locked
  //    -- by self
  //       = sp-proximity test hits
  //       = sp-proximity test generates false-negative
  //    -- by other
  //

  Label IsInflated, DONE_LABEL, NO_COUNT, COUNT;

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(tmpReg, objReg, scrReg);
    movl(tmpReg, Address(tmpReg, Klass::access_flags_offset()));
    testl(tmpReg, JVM_ACC_IS_VALUE_BASED_CLASS);
    jcc(Assembler::notZero, DONE_LABEL);
  }

#if INCLUDE_RTM_OPT
  if (UseRTMForStackLocks && use_rtm) {
    assert(LockingMode != LM_MONITOR, "LockingMode == 0 (LM_MONITOR) and +UseRTMForStackLocks are mutually exclusive");
    rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
                      stack_rtm_counters, method_data, profile_rtm,
                      DONE_LABEL, IsInflated);
  }
#endif // INCLUDE_RTM_OPT

  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // [FETCH]
  testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral
  jcc(Assembler::notZero, IsInflated);

  if (LockingMode == LM_MONITOR) {
    // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
    testptr(objReg, objReg);
  } else if (LockingMode == LM_LEGACY) {
    // Attempt stack-locking ...
    orptr (tmpReg, markWord::unlocked_value);
    movptr(Address(boxReg, 0), tmpReg); // Anticipate successful CAS
    lock();
    cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Updates tmpReg
    jcc(Assembler::equal, COUNT);       // Success

    // Recursive locking.
    // The object is stack-locked: markword contains stack pointer to BasicLock.
    // Locked by current thread if difference with current SP is less than one page.
    subptr(tmpReg, rsp);
    // Next instruction sets ZFlag == 1 (Success) if difference is less than one page.
    andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - (int)os::vm_page_size())) );
    movptr(Address(boxReg, 0), tmpReg);
  } else {
    assert(LockingMode == LM_LIGHTWEIGHT, "");
    lightweight_lock(objReg, tmpReg, thread, scrReg, NO_COUNT);
    jmp(COUNT);
  }
  jmp(DONE_LABEL);

  bind(IsInflated);
  // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value

#if INCLUDE_RTM_OPT
  // Use the same RTM locking code in 32- and 64-bit VM.
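  // Either path below turns on the monitor's _owner field: RTM elides the
  // store entirely on its fast path, while the non-RTM path CASes _owner
  // from null to the locking thread (rax is the implicit cmpxchg comparand).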
  if (use_rtm) {
    rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
                         rtm_counters, method_data, profile_rtm, DONE_LABEL);
  } else {
#endif // INCLUDE_RTM_OPT

#ifndef _LP64
  // The object is inflated.

  // boxReg refers to the on-stack BasicLock in the current frame.
  // We'd like to write:
  //   set box->_displaced_header = markWord::unused_mark(). Any non-0 value suffices.
  // This is convenient but results in a ST-before-CAS penalty. The following CAS suffers
  // additional latency as we have another ST in the store buffer that must drain.

  // avoid ST-before-CAS
  // register juggle because we need tmpReg for cmpxchgptr below
  movptr(scrReg, boxReg);
  movptr(boxReg, tmpReg); // consider: LEA box, [tmp-2]

  // Optimistic form: consider XORL tmpReg,tmpReg
  movptr(tmpReg, NULL_WORD);

  // Appears unlocked - try to swing _owner from null to non-null.
  // Ideally, I'd manifest "Self" with get_thread and then attempt
  // to CAS the register containing Self into m->Owner.
  // But we don't have enough registers, so instead we can either try to CAS
  // rsp or the address of the box (in scr) into &m->owner. If the CAS succeeds
  // we later store "Self" into m->Owner. Transiently storing a stack address
  // (rsp or the address of the box) into m->owner is harmless.
  // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
  lock();
  cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  movptr(Address(scrReg, 0), 3); // box->_displaced_header = 3
  // If we weren't able to swing _owner from null to the BasicLock
  // then take the slow path.
  jccb  (Assembler::notZero, NO_COUNT);
  // update _owner from BasicLock to thread
  get_thread (scrReg);  // beware: clobbers ICCs
  movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
  xorptr(boxReg, boxReg); // set icc.ZFlag = 1 to indicate success

  // If the CAS fails we can either retry or pass control to the slow path.
  // We use the latter tactic.
  // Pass the CAS result in the icc.ZFlag into DONE_LABEL
  // If the CAS was successful ...
  //   Self has acquired the lock
  //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
  // Intentional fall-through into DONE_LABEL ...
#else // _LP64
  // It's inflated and we use scrReg for ObjectMonitor* in this section.
  movq(scrReg, tmpReg);
  xorq(tmpReg, tmpReg);
  lock();
  cmpxchgptr(thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // Unconditionally set box->_displaced_header = markWord::unused_mark().
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
  // Propagate ICC.ZF from CAS above into DONE_LABEL.
  jccb(Assembler::equal, COUNT);       // CAS above succeeded; propagate ZF = 1 (success)

  cmpptr(thread, rax);                 // Check if we are already the owner (recursive lock)
  jccb(Assembler::notEqual, NO_COUNT); // If not recursive, ZF = 0 at this point (fail)
  incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
#endif // _LP64
#if INCLUDE_RTM_OPT
  } // use_rtm()
#endif
  bind(DONE_LABEL);

  // ZFlag == 1 count in fast path
  // ZFlag == 0 count in slow path
  jccb(Assembler::notZero, NO_COUNT); // jump if ZFlag == 0

  bind(COUNT);
  // Count monitors in fast path
  increment(Address(thread, JavaThread::held_monitor_count_offset()));

  xorl(tmpReg, tmpReg); // Set ZF == 1

  bind(NO_COUNT);

  // At NO_COUNT the icc ZFlag is set as follows ...
  // fast_unlock uses the same protocol.
  // ZFlag == 1 -> Success
  // ZFlag == 0 -> Failure - force control through the slow path
}

// obj: object to unlock
// box: box address (displaced header location), killed. Must be EAX.
// tmp: killed, cannot be obj nor box.
//
// Some commentary on balanced locking:
//
// fast_lock and fast_unlock are emitted only for provably balanced lock sites.
// Methods that don't have provably balanced locking are forced to run in the
// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
// The interpreter provides two properties:
// I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired by the current activation (frame). Recall that the
//      interpreter maintains an on-stack list of locks currently held by
//      a frame.
// I2:  If a method attempts to unlock an object that is not held by the
//      frame the interpreter throws IMSX.
//
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
// B() doesn't have provably balanced locking so it runs in the interpreter.
// Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O
// is still locked by A().
//
// The only other source of unbalanced locking would be JNI. The "Java Native Interface:
// Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
// should not be unlocked by "normal" java-level locking and vice-versa. The specification
// doesn't say what will occur if a program engages in such mixed-mode locking, however.
// Arguably, given that the spec legislates the JNI case as undefined, our implementation
// could reasonably *avoid* checking the owner in fast_unlock().
// In the interest of performance we elide the m->Owner == Self check in unlock.
// A perfectly viable alternative is to elide the owner check except when
// Xcheck:jni is enabled.
void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
  assert(boxReg == rax, "");
  assert_different_registers(objReg, boxReg, tmpReg);

  Label DONE_LABEL, Stacked, COUNT, NO_COUNT;

#if INCLUDE_RTM_OPT
  if (UseRTMForStackLocks && use_rtm) {
    assert(LockingMode != LM_MONITOR, "LockingMode == 0 (LM_MONITOR) and +UseRTMForStackLocks are mutually exclusive");
    Label L_regular_unlock;
    movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
    andptr(tmpReg, markWord::lock_mask_in_place);                     // look at 2 lock bits
    cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 01 unlocked
    jccb(Assembler::notEqual, L_regular_unlock);                      // if !HLE RegularLock
    xend();                                                           // otherwise end...
    jmp(DONE_LABEL);                                                  // ... and we're done
    bind(L_regular_unlock);
  }
#endif

  if (LockingMode == LM_LEGACY) {
    cmpptr(Address(boxReg, 0), NULL_WORD);                            // Examine the displaced header
    jcc   (Assembler::zero, COUNT);                                   // 0 indicates recursive stack-lock
  }
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));   // Examine the object's markword
  if (LockingMode != LM_MONITOR) {
    testptr(tmpReg, markWord::monitor_value);                         // Inflated?
    jcc(Assembler::zero, Stacked);
  }

  // It's inflated.
  if (LockingMode == LM_LIGHTWEIGHT) {
    // If the owner is ANONYMOUS, we need to fix it - in an outline stub.
    testb(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t) ObjectMonitor::ANONYMOUS_OWNER);
#ifdef _LP64
    if (!Compile::current()->output()->in_scratch_emit_size()) {
      C2HandleAnonOMOwnerStub* stub = new (Compile::current()->comp_arena()) C2HandleAnonOMOwnerStub(tmpReg, boxReg);
      Compile::current()->output()->add_stub(stub);
      jcc(Assembler::notEqual, stub->entry());
      bind(stub->continuation());
    } else
#endif
    {
      // We can't easily implement this optimization on 32 bit because we don't have a thread register.
      // Call the slow-path instead.
      jcc(Assembler::notEqual, NO_COUNT);
    }
  }

#if INCLUDE_RTM_OPT
  if (use_rtm) {
    Label L_regular_inflated_unlock;
    int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
    movptr(boxReg, Address(tmpReg, owner_offset));
    testptr(boxReg, boxReg);
    jccb(Assembler::notZero, L_regular_inflated_unlock);
    xend();
    jmp(DONE_LABEL);
    bind(L_regular_inflated_unlock);
  }
#endif

  // Despite our balanced locking property we still check that m->_owner == Self
  // as java routines or native JNI code called by this thread might
  // have released the lock.
  // Refer to the comments in synchronizer.cpp for how we might encode extra
  // state in _succ so we can avoid fetching EntryList|cxq.
  //
  // If there's no contention try a 1-0 exit. That is, exit without
  // a costly MEMBAR or CAS. See synchronizer.cpp for details on how
  // we detect and recover from the race that the 1-0 exit admits.
  //
  // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
  // before it STs null into _owner, releasing the lock. Updates
  // to data protected by the critical section must be visible before
  // we drop the lock (and thus before any other thread could acquire
  // the lock and observe the fields protected by the lock).
  // IA32's memory model is TSO, so STs are ordered with respect to
  // each other and there's no need for an explicit barrier (fence).
  // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
#ifndef _LP64
  // Note that we could employ various encoding schemes to reduce
  // the number of loads below (currently 4) to just 2 or 3.
  // Refer to the comments in synchronizer.cpp.
  // In practice the chain of fetches doesn't seem to impact performance, however.
  xorptr(boxReg, boxReg);
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jccb  (Assembler::notZero, DONE_LABEL);
  movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
  jccb  (Assembler::notZero, DONE_LABEL);
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
  jmpb  (DONE_LABEL);
#else // _LP64
  // It's inflated
  Label CheckSucc, LNotRecursive, LSuccess, LGoSlowPath;

  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
  jccb(Assembler::equal, LNotRecursive);

  // Recursive inflated unlock
  decq(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jmpb(LSuccess);

  bind(LNotRecursive);
  movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
  jccb  (Assembler::notZero, CheckSucc);
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
  jmpb  (DONE_LABEL);

  // Try to avoid passing control into the slow_path ...
  bind  (CheckSucc);

  // The following optional optimization can be elided if necessary.
  // Effectively: if (succ == null) goto slow path
  // The code reduces the window for a race, however,
  // and thus benefits performance.
  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
  jccb  (Assembler::zero, LGoSlowPath);

  xorptr(boxReg, boxReg);
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);

  // Memory barrier/fence
  // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
  // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
  // This is faster on Nehalem and AMD Shanghai/Barcelona.
  // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
  // We might also restructure (ST Owner=0;barrier;LD _Succ) to
  // (mov box,0; xchgq box, &m->Owner; LD _succ) .
  lock(); addl(Address(rsp, 0), 0);

  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
  jccb  (Assembler::notZero, LSuccess);

  // Rare inopportune interleaving - race.
  // The successor vanished in the small window above.
  // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
  // We need to ensure progress and succession.
  // Try to reacquire the lock.
  // If that fails then the new owner is responsible for succession and this
  // thread needs to take no further action and can exit via the fast path (success).
  // If the re-acquire succeeds then pass control into the slow path.
  // As implemented, this latter mode is horrible because we generate more
  // coherence traffic on the lock *and* artificially extend the critical section
  // length by virtue of passing control into the slow path.

  // box is really RAX -- the following CMPXCHG depends on that binding
  // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
  lock();
  cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // There's no successor so we tried to regrab the lock.
  // If that didn't work, then another thread grabbed the
  // lock so we're done (and exit was a success).
  jccb  (Assembler::notEqual, LSuccess);
  // Intentional fall-through into slow path

  bind  (LGoSlowPath);
  orl   (boxReg, 1);   // set ICC.ZF=0 to indicate failure
  jmpb  (DONE_LABEL);

  bind  (LSuccess);
  testl (boxReg, 0);   // set ICC.ZF=1 to indicate success
  jmpb  (DONE_LABEL);

#endif
  if (LockingMode != LM_MONITOR) {
    bind  (Stacked);
    if (LockingMode == LM_LIGHTWEIGHT) {
      mov(boxReg, tmpReg);
      lightweight_unlock(objReg, boxReg, tmpReg, NO_COUNT);
      jmp(COUNT);
    } else if (LockingMode == LM_LEGACY) {
      movptr(tmpReg, Address (boxReg, 0)); // re-fetch
      lock();
      cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
    }
    // Intentional fall-thru into DONE_LABEL
  }
  bind(DONE_LABEL);

  // ZFlag == 1 count in fast path
  // ZFlag == 0 count in slow path
  jccb(Assembler::notZero, NO_COUNT);

  bind(COUNT);
  // Count monitors in fast path
#ifndef _LP64
  get_thread(tmpReg);
  decrementl(Address(tmpReg, JavaThread::held_monitor_count_offset()));
#else // _LP64
  decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
#endif

  xorl(tmpReg, tmpReg); // Set ZF == 1

  bind(NO_COUNT);
}

//-------------------------------------------------------------------------------------------
// Generic instructions support for use in .ad files C2 code generation

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVD) {
    andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
  } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
  }
}

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
  if (opcode == Op_AbsVD) {
    vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
  } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
    vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVF) {
    andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
  } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
  if (opcode == Op_AbsVF) {
    vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
  } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
    vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
  }
}
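
// The abs/neg helpers above are IEEE-754 bit tricks: AND with the sign mask
// (0x7fffffff per float lane, 0x7fff...ffff per double lane) clears the sign
// bit and yields |x|, while XOR with the sign-flip constant (0x80000000 /
// 0x8000...0000) negates, with no data-dependent branches. For example,
// 0xBF800000 (-1.0f) AND 0x7FFFFFFF gives 0x3F800000 (+1.0f).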

void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
  assert(tmp == xnoreg || elem_bt == T_LONG, "unused");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      pminsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pminsw(dst, src);
    } else if (elem_bt == T_INT) {
      pminsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      movdqu(xmm0, dst);
      pcmpgtq(xmm0, src);
      blendvpd(dst, src); // xmm0 as mask
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      pmaxsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pmaxsw(dst, src);
    } else if (elem_bt == T_INT) {
      pmaxsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      movdqu(xmm0, src);
      pcmpgtq(xmm0, dst);
      blendvpd(dst, src); // xmm0 as mask
    }
  }
}

void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
                                 XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                 int vlen_enc) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      vpminsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpminsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpminsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpminsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src1, src2, dst, vlen_enc);
      }
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      vpmaxsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpmaxsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpmaxsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpmaxsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src2, src1, dst, vlen_enc);
      }
    }
  }
}

// Float/Double min max

void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
                                   XMMRegister dst, XMMRegister a, XMMRegister b,
                                   XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                   int vlen_enc) {
  assert(UseAVX > 0, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(a, b, tmp, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);

  /* Note on 'non-obvious' assembly sequence:
   *
   * While there are vminps/vmaxps instructions, there are two important differences between hardware
   * and Java on how they handle floats:
   *  a. -0.0 and +0.0 are considered equal (vminps/vmaxps will return second parameter when inputs are equal)
   *  b. NaN is not necessarily propagated (vminps/vmaxps will return second parameter when either input is NaN)
   *
   * It is still more efficient to use vminps/vmaxps, but with some pre/post-processing:
   *  a. -0.0/+0.0: Bias negative (positive) numbers to second parameter before vminps (vmaxps)
   *     (only useful when signs differ, noop otherwise)
   *  b. NaN: Check if it was the first parameter that had the NaN (with vcmp[UNORD_Q])
   *
   * Following pseudo code describes the algorithm for max[FD] (Min algorithm is on similar lines):
   *   btmp = (b < +0.0) ? a : b
   *   atmp = (b < +0.0) ? b : a
   *   Tmp  = Max_Float(atmp, btmp)
   *   Res  = (atmp == NaN) ? atmp : Tmp
   */

  void (MacroAssembler::*vblend)(XMMRegister, XMMRegister, XMMRegister, XMMRegister, int, bool, XMMRegister);
  void (MacroAssembler::*vmaxmin)(XMMRegister, XMMRegister, XMMRegister, int);
  void (MacroAssembler::*vcmp)(XMMRegister, XMMRegister, XMMRegister, int, int);
  XMMRegister mask;

  if (!is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vminps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (!is_double_word && !is_min) {
    mask = b;
    vblend = &MacroAssembler::vblendvps;
    vmaxmin = &MacroAssembler::vmaxps;
    vcmp = &MacroAssembler::vcmpps;
  } else if (is_double_word && is_min) {
    mask = a;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vminpd;
    vcmp = &MacroAssembler::vcmppd;
  } else {
    assert(is_double_word && !is_min, "sanity");
    mask = b;
    vblend = &MacroAssembler::vblendvpd;
    vmaxmin = &MacroAssembler::vmaxpd;
    vcmp = &MacroAssembler::vcmppd;
  }

  // Make sure EnableX86ECoreOpts isn't disabled on register overlaps
  XMMRegister maxmin, scratch;
  if (dst == btmp) {
    maxmin = btmp;
    scratch = tmp;
  } else {
    maxmin = tmp;
    scratch = btmp;
  }

  bool precompute_mask = EnableX86ECoreOpts && UseAVX > 1;
  if (precompute_mask && !is_double_word) {
    vpsrad(tmp, mask, 32, vlen_enc);
    mask = tmp;
  } else if (precompute_mask && is_double_word) {
    vpxor(tmp, tmp, tmp, vlen_enc);
    vpcmpgtq(tmp, tmp, mask, vlen_enc);
    mask = tmp;
  }

  (this->*vblend)(atmp, a, b, mask, vlen_enc, !precompute_mask, btmp);
  (this->*vblend)(btmp, b, a, mask, vlen_enc, !precompute_mask, tmp);
  (this->*vmaxmin)(maxmin, atmp, btmp, vlen_enc);
  (this->*vcmp)(scratch, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
  (this->*vblend)(dst, maxmin, atmp, scratch, vlen_enc, false, scratch);
}
void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
                                    XMMRegister dst, XMMRegister a, XMMRegister b,
                                    KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
                                    int vlen_enc) {
  assert(UseAVX > 2, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(dst, a, b, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);
  bool merge = true;

  if (!is_double_word && is_min) {
    evpmovd2m(ktmp, a, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vminps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (!is_double_word && !is_min) {
    evpmovd2m(ktmp, b, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (is_double_word && is_min) {
    evpmovq2m(ktmp, a, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vminpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  } else {
    assert(is_double_word && !is_min, "sanity");
    evpmovq2m(ktmp, b, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  }
}

// Float/Double signum
void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
  assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");

  Label DONE_LABEL;

  if (opcode == Op_SignumF) {
    assert(UseSSE > 0, "required");
    ucomiss(dst, zero);
    jcc(Assembler::equal, DONE_LABEL);  // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
    jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN
    movflt(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
  } else if (opcode == Op_SignumD) {
    assert(UseSSE > 1, "required");
    ucomisd(dst, zero);
    jcc(Assembler::equal, DONE_LABEL);  // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
    jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN
    movdbl(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
  }

  bind(DONE_LABEL);
}

void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
  if (sign) {
    pmovsxbw(dst, src);
  } else {
    pmovzxbw(dst, src);
  }
}

void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbw(dst, src, vector_len);
  } else {
    vpmovzxbw(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbd(dst, src, vector_len);
  } else {
    vpmovzxbd(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxwd(dst, src, vector_len);
  } else {
    vpmovzxwd(dst, src, vector_len);
  }
}
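
// The rotate helpers below emit the AVX-512 vprol/vpror instruction family,
// which exists only in EVEX-encoded form; plain SSE/AVX2 offer no packed
// rotate, so callers are expected to guard these with AVX-512 predicates.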
void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
                                     int shift, int vector_len) {
  if (opcode == Op_RotateLeftV) {
    if (etype == T_INT) {
      evprold(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprolq(dst, src, shift, vector_len);
    }
  } else {
    assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
    if (etype == T_INT) {
      evprord(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprorq(dst, src, shift, vector_len);
    }
  }
}

void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
                                     XMMRegister shift, int vector_len) {
  if (opcode == Op_RotateLeftV) {
    if (etype == T_INT) {
      evprolvd(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprolvq(dst, src, shift, vector_len);
    }
  } else {
    assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
    if (etype == T_INT) {
      evprorvd(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprorvq(dst, src, shift, vector_len);
    }
  }
}

void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
  if (opcode == Op_RShiftVI) {
    psrad(dst, shift);
  } else if (opcode == Op_LShiftVI) {
    pslld(dst, shift);
  } else {
    assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
    psrld(dst, shift);
  }
}

void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVI:  psrad(dst, shift); break;
    case Op_LShiftVI:  pslld(dst, shift); break;
    case Op_URShiftVI: psrld(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
  if (opcode == Op_RShiftVI) {
    vpsrad(dst, nds, shift, vector_len);
  } else if (opcode == Op_LShiftVI) {
    vpslld(dst, nds, shift, vector_len);
  } else {
    assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
    vpsrld(dst, nds, shift, vector_len);
  }
}

void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
    case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
    case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  psraw(dst, shift); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  psllw(dst, shift); break;

    case Op_URShiftVS: // fall-through
    case Op_URShiftVB: psrlw(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}
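
// x86 has no SIMD shift instructions that operate on 8-bit lanes, which is
// why the Op_*ShiftVB cases in these word-shift helpers simply reuse the
// 16-bit forms; varshiftbw and evarshiftb further down get exact byte
// semantics by widening to words, shifting, masking, and re-packing.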
void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;

    case Op_URShiftVS: // fall-through
    case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-AVX512 systems
    case Op_LShiftVL:  psllq(dst, shift); break;
    case Op_URShiftVL: psrlq(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
  if (opcode == Op_RShiftVL) {
    psrlq(dst, shift); // using srl to implement sra on pre-AVX512 systems
  } else if (opcode == Op_LShiftVL) {
    psllq(dst, shift);
  } else {
    assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
    psrlq(dst, shift);
  }
}

void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVL:  evpsraq(dst, src, shift, vlen_enc); break;
    case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
    case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
  if (opcode == Op_RShiftVL) {
    evpsraq(dst, nds, shift, vector_len);
  } else if (opcode == Op_LShiftVL) {
    vpsllq(dst, nds, shift, vector_len);
  } else {
    assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
    vpsrlq(dst, nds, shift, vector_len);
  }
}

void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  // fall-through
    case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  // fall-through
    case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;

    case Op_URShiftVB: // fall-through
    case Op_URShiftVS: // fall-through
    case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;

    case Op_URShiftVB: // fall-through
    case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
  assert(UseAVX >= 2, "required");
  switch (opcode) {
    case Op_RShiftVL: {
      if (UseAVX > 2) {
        assert(tmp == xnoreg, "not used");
        if (!VM_Version::supports_avx512vl()) {
          vlen_enc = Assembler::AVX_512bit;
        }
        evpsravq(dst, src, shift, vlen_enc);
      } else {
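        // AVX2 lacks a variable 64-bit arithmetic right shift, so emulate it
        // with logical shifts: with m = (sign-bit mask >>> s) per lane,
        //   sra(x, s) == (srl(x, s) ^ m) - m
        // The xor/sub pair sign-extends exactly the lanes whose original
        // sign bit was set and leaves non-negative lanes unchanged.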
vlen_enc); 1477 vpxor(dst, dst, tmp, vlen_enc); 1478 vpsubq(dst, dst, tmp, vlen_enc); 1479 } 1480 break; 1481 } 1482 case Op_LShiftVL: { 1483 assert(tmp == xnoreg, "not used"); 1484 vpsllvq(dst, src, shift, vlen_enc); 1485 break; 1486 } 1487 case Op_URShiftVL: { 1488 assert(tmp == xnoreg, "not used"); 1489 vpsrlvq(dst, src, shift, vlen_enc); 1490 break; 1491 } 1492 default: assert(false, "%s", NodeClassNames[opcode]); 1493 } 1494 } 1495 1496 // Variable shift src by shift using vtmp as a TEMP, giving word result in dst 1497 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) { 1498 assert(opcode == Op_LShiftVB || 1499 opcode == Op_RShiftVB || 1500 opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]); 1501 bool sign = (opcode != Op_URShiftVB); 1502 assert(vector_len == 0, "required"); 1503 vextendbd(sign, dst, src, 1); 1504 vpmovzxbd(vtmp, shift, 1); 1505 varshiftd(opcode, dst, dst, vtmp, 1); 1506 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg); 1507 vextracti128_high(vtmp, dst); 1508 vpackusdw(dst, dst, vtmp, 0); 1509 } 1510 1511 // Variable shift src by shift using vtmp as a TEMP, giving byte result in dst 1512 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) { 1513 assert(opcode == Op_LShiftVB || 1514 opcode == Op_RShiftVB || 1515 opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]); 1516 bool sign = (opcode != Op_URShiftVB); 1517 int ext_vector_len = vector_len + 1; 1518 vextendbw(sign, dst, src, ext_vector_len); 1519 vpmovzxbw(vtmp, shift, ext_vector_len); 1520 varshiftw(opcode, dst, dst, vtmp, ext_vector_len); 1521 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg); 1522 if (vector_len == 0) { 1523 vextracti128_high(vtmp, dst); 1524 vpackuswb(dst, dst, vtmp, vector_len); 1525 } else { 1526 vextracti64x4_high(vtmp, dst); 1527 vpackuswb(dst, dst, vtmp, vector_len); 1528 vpermq(dst, dst, 0xD8, vector_len); 1529 } 1530 } 1531 1532 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) { 1533 switch(typ) { 1534 case T_BYTE: 1535 pinsrb(dst, val, idx); 1536 break; 1537 case T_SHORT: 1538 pinsrw(dst, val, idx); 1539 break; 1540 case T_INT: 1541 pinsrd(dst, val, idx); 1542 break; 1543 case T_LONG: 1544 pinsrq(dst, val, idx); 1545 break; 1546 default: 1547 assert(false,"Should not reach here."); 1548 break; 1549 } 1550 } 1551 1552 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) { 1553 switch(typ) { 1554 case T_BYTE: 1555 vpinsrb(dst, src, val, idx); 1556 break; 1557 case T_SHORT: 1558 vpinsrw(dst, src, val, idx); 1559 break; 1560 case T_INT: 1561 vpinsrd(dst, src, val, idx); 1562 break; 1563 case T_LONG: 1564 vpinsrq(dst, src, val, idx); 1565 break; 1566 default: 1567 assert(false,"Should not reach here."); 1568 break; 1569 } 1570 } 1571 1572 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) { 1573 switch(typ) { 1574 case T_INT: 1575 vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len); 1576 break; 1577 case T_FLOAT: 1578 vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len); 1579 break; 1580 case T_LONG: 1581 vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len); 1582 break; 1583 case T_DOUBLE:
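      // Note: the 'd' in vgatherdpd means the index vector holds dword (32-bit)
      // indices even though the gathered elements are 8-byte doubles, hence the
      // Address::times_8 scale applied to 32-bit indices below.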
1584 vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len); 1585 break; 1586 default: 1587 assert(false,"Should not reach here."); 1588 break; 1589 } 1590 } 1591 1592 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) { 1593 switch(typ) { 1594 case T_INT: 1595 evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len); 1596 break; 1597 case T_FLOAT: 1598 evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len); 1599 break; 1600 case T_LONG: 1601 evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len); 1602 break; 1603 case T_DOUBLE: 1604 evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len); 1605 break; 1606 default: 1607 assert(false,"Should not reach here."); 1608 break; 1609 } 1610 } 1611 1612 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) { 1613 switch(typ) { 1614 case T_INT: 1615 evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len); 1616 break; 1617 case T_FLOAT: 1618 evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len); 1619 break; 1620 case T_LONG: 1621 evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len); 1622 break; 1623 case T_DOUBLE: 1624 evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len); 1625 break; 1626 default: 1627 assert(false,"Should not reach here."); 1628 break; 1629 } 1630 } 1631 1632 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) { 1633 if (vlen_in_bytes <= 16) { 1634 pxor (dst, dst); 1635 psubb(dst, src); 1636 switch (elem_bt) { 1637 case T_BYTE: /* nothing to do */ break; 1638 case T_SHORT: pmovsxbw(dst, dst); break; 1639 case T_INT: pmovsxbd(dst, dst); break; 1640 case T_FLOAT: pmovsxbd(dst, dst); break; 1641 case T_LONG: pmovsxbq(dst, dst); break; 1642 case T_DOUBLE: pmovsxbq(dst, dst); break; 1643 1644 default: assert(false, "%s", type2name(elem_bt)); 1645 } 1646 } else { 1647 assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, ""); 1648 int vlen_enc = vector_length_encoding(vlen_in_bytes); 1649 1650 vpxor (dst, dst, dst, vlen_enc); 1651 vpsubb(dst, dst, src, is_legacy ? 
AVX_256bit : vlen_enc); 1652 1653 switch (elem_bt) { 1654 case T_BYTE: /* nothing to do */ break; 1655 case T_SHORT: vpmovsxbw(dst, dst, vlen_enc); break; 1656 case T_INT: vpmovsxbd(dst, dst, vlen_enc); break; 1657 case T_FLOAT: vpmovsxbd(dst, dst, vlen_enc); break; 1658 case T_LONG: vpmovsxbq(dst, dst, vlen_enc); break; 1659 case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break; 1660 1661 default: assert(false, "%s", type2name(elem_bt)); 1662 } 1663 } 1664 } 1665 1666 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) { 1667 if (novlbwdq) { 1668 vpmovsxbd(xtmp, src, vlen_enc); 1669 evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()), 1670 Assembler::eq, true, vlen_enc, noreg); 1671 } else { 1672 vpxor(xtmp, xtmp, xtmp, vlen_enc); 1673 vpsubb(xtmp, xtmp, src, vlen_enc); 1674 evpmovb2m(dst, xtmp, vlen_enc); 1675 } 1676 } 1677 1678 void C2_MacroAssembler::load_vector(XMMRegister dst, Address src, int vlen_in_bytes) { 1679 switch (vlen_in_bytes) { 1680 case 4: movdl(dst, src); break; 1681 case 8: movq(dst, src); break; 1682 case 16: movdqu(dst, src); break; 1683 case 32: vmovdqu(dst, src); break; 1684 case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break; 1685 default: ShouldNotReachHere(); 1686 } 1687 } 1688 1689 void C2_MacroAssembler::load_vector(XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) { 1690 assert(rscratch != noreg || always_reachable(src), "missing"); 1691 1692 if (reachable(src)) { 1693 load_vector(dst, as_Address(src), vlen_in_bytes); 1694 } else { 1695 lea(rscratch, src); 1696 load_vector(dst, Address(rscratch, 0), vlen_in_bytes); 1697 } 1698 } 1699 1700 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) { 1701 int vlen_enc = vector_length_encoding(vlen); 1702 if (VM_Version::supports_avx()) { 1703 if (bt == T_LONG) { 1704 if (VM_Version::supports_avx2()) { 1705 vpbroadcastq(dst, src, vlen_enc); 1706 } else { 1707 vmovddup(dst, src, vlen_enc); 1708 } 1709 } else if (bt == T_DOUBLE) { 1710 if (vlen_enc != Assembler::AVX_128bit) { 1711 vbroadcastsd(dst, src, vlen_enc, noreg); 1712 } else { 1713 vmovddup(dst, src, vlen_enc); 1714 } 1715 } else { 1716 if (VM_Version::supports_avx2() && is_integral_type(bt)) { 1717 vpbroadcastd(dst, src, vlen_enc); 1718 } else { 1719 vbroadcastss(dst, src, vlen_enc); 1720 } 1721 } 1722 } else if (VM_Version::supports_sse3()) { 1723 movddup(dst, src); 1724 } else { 1725 movq(dst, src); 1726 if (vlen == 16) { 1727 punpcklqdq(dst, dst); 1728 } 1729 } 1730 } 1731 1732 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) { 1733 // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64. 1734 int offset = exact_log2(type2aelembytes(bt)) << 6; 1735 if (is_floating_point_type(bt)) { 1736 offset += 128; 1737 } 1738 ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset); 1739 load_vector(dst, addr, vlen_in_bytes); 1740 } 1741 1742 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles. 
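// All of the reduce* helpers below share one folding strategy: repeatedly
// combine the upper half of the vector with its lower half until a single
// lane is left, then fold in the scalar input (src1). A rough scalar model
// of that shape (illustrative sketch only; reduce_model, op and lanes are
// not part of this file, and n is assumed to be a power of two):
//
//   static int reduce_model(int (*op)(int, int), int scalar, int* lanes, int n) {
//     for (int half = n / 2; half > 0; half /= 2) {
//       for (int i = 0; i < half; i++) {
//         lanes[i] = op(lanes[i], lanes[i + half]);  // vextract*_high + reduce_operation_*
//       }
//     }
//     return op(scalar, lanes[0]);                   // finally fold in src1
//   }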
1743 1744 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) { 1745 int vector_len = Assembler::AVX_128bit; 1746 1747 switch (opcode) { 1748 case Op_AndReductionV: pand(dst, src); break; 1749 case Op_OrReductionV: por (dst, src); break; 1750 case Op_XorReductionV: pxor(dst, src); break; 1751 case Op_MinReductionV: 1752 switch (typ) { 1753 case T_BYTE: pminsb(dst, src); break; 1754 case T_SHORT: pminsw(dst, src); break; 1755 case T_INT: pminsd(dst, src); break; 1756 case T_LONG: assert(UseAVX > 2, "required"); 1757 vpminsq(dst, dst, src, Assembler::AVX_128bit); break; 1758 default: assert(false, "wrong type"); 1759 } 1760 break; 1761 case Op_MaxReductionV: 1762 switch (typ) { 1763 case T_BYTE: pmaxsb(dst, src); break; 1764 case T_SHORT: pmaxsw(dst, src); break; 1765 case T_INT: pmaxsd(dst, src); break; 1766 case T_LONG: assert(UseAVX > 2, "required"); 1767 vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break; 1768 default: assert(false, "wrong type"); 1769 } 1770 break; 1771 case Op_AddReductionVF: addss(dst, src); break; 1772 case Op_AddReductionVD: addsd(dst, src); break; 1773 case Op_AddReductionVI: 1774 switch (typ) { 1775 case T_BYTE: paddb(dst, src); break; 1776 case T_SHORT: paddw(dst, src); break; 1777 case T_INT: paddd(dst, src); break; 1778 default: assert(false, "wrong type"); 1779 } 1780 break; 1781 case Op_AddReductionVL: paddq(dst, src); break; 1782 case Op_MulReductionVF: mulss(dst, src); break; 1783 case Op_MulReductionVD: mulsd(dst, src); break; 1784 case Op_MulReductionVI: 1785 switch (typ) { 1786 case T_SHORT: pmullw(dst, src); break; 1787 case T_INT: pmulld(dst, src); break; 1788 default: assert(false, "wrong type"); 1789 } 1790 break; 1791 case Op_MulReductionVL: assert(UseAVX > 2, "required"); 1792 evpmullq(dst, dst, src, vector_len); break; 1793 default: assert(false, "wrong opcode"); 1794 } 1795 } 1796 1797 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 1798 int vector_len = Assembler::AVX_256bit; 1799 1800 switch (opcode) { 1801 case Op_AndReductionV: vpand(dst, src1, src2, vector_len); break; 1802 case Op_OrReductionV: vpor (dst, src1, src2, vector_len); break; 1803 case Op_XorReductionV: vpxor(dst, src1, src2, vector_len); break; 1804 case Op_MinReductionV: 1805 switch (typ) { 1806 case T_BYTE: vpminsb(dst, src1, src2, vector_len); break; 1807 case T_SHORT: vpminsw(dst, src1, src2, vector_len); break; 1808 case T_INT: vpminsd(dst, src1, src2, vector_len); break; 1809 case T_LONG: assert(UseAVX > 2, "required"); 1810 vpminsq(dst, src1, src2, vector_len); break; 1811 default: assert(false, "wrong type"); 1812 } 1813 break; 1814 case Op_MaxReductionV: 1815 switch (typ) { 1816 case T_BYTE: vpmaxsb(dst, src1, src2, vector_len); break; 1817 case T_SHORT: vpmaxsw(dst, src1, src2, vector_len); break; 1818 case T_INT: vpmaxsd(dst, src1, src2, vector_len); break; 1819 case T_LONG: assert(UseAVX > 2, "required"); 1820 vpmaxsq(dst, src1, src2, vector_len); break; 1821 default: assert(false, "wrong type"); 1822 } 1823 break; 1824 case Op_AddReductionVI: 1825 switch (typ) { 1826 case T_BYTE: vpaddb(dst, src1, src2, vector_len); break; 1827 case T_SHORT: vpaddw(dst, src1, src2, vector_len); break; 1828 case T_INT: vpaddd(dst, src1, src2, vector_len); break; 1829 default: assert(false, "wrong type"); 1830 } 1831 break; 1832 case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break; 1833 case Op_MulReductionVI: 1834 switch (typ) { 1835 
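      // Note: x86 has no packed byte multiply, so T_BYTE is absent here; byte
      // MulReductionV requests go through the mulreduce*B helpers below, which
      // widen bytes to words first.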
case T_SHORT: vpmullw(dst, src1, src2, vector_len); break; 1836 case T_INT: vpmulld(dst, src1, src2, vector_len); break; 1837 default: assert(false, "wrong type"); 1838 } 1839 break; 1840 case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break; 1841 default: assert(false, "wrong opcode"); 1842 } 1843 } 1844 1845 void C2_MacroAssembler::reduce_fp(int opcode, int vlen, 1846 XMMRegister dst, XMMRegister src, 1847 XMMRegister vtmp1, XMMRegister vtmp2) { 1848 switch (opcode) { 1849 case Op_AddReductionVF: 1850 case Op_MulReductionVF: 1851 reduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 1852 break; 1853 1854 case Op_AddReductionVD: 1855 case Op_MulReductionVD: 1856 reduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 1857 break; 1858 1859 default: assert(false, "wrong opcode"); 1860 } 1861 } 1862 1863 void C2_MacroAssembler::reduceB(int opcode, int vlen, 1864 Register dst, Register src1, XMMRegister src2, 1865 XMMRegister vtmp1, XMMRegister vtmp2) { 1866 switch (vlen) { 1867 case 8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1868 case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1869 case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1870 case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1871 1872 default: assert(false, "wrong vector length"); 1873 } 1874 } 1875 1876 void C2_MacroAssembler::mulreduceB(int opcode, int vlen, 1877 Register dst, Register src1, XMMRegister src2, 1878 XMMRegister vtmp1, XMMRegister vtmp2) { 1879 switch (vlen) { 1880 case 8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1881 case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1882 case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1883 case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1884 1885 default: assert(false, "wrong vector length"); 1886 } 1887 } 1888 1889 void C2_MacroAssembler::reduceS(int opcode, int vlen, 1890 Register dst, Register src1, XMMRegister src2, 1891 XMMRegister vtmp1, XMMRegister vtmp2) { 1892 switch (vlen) { 1893 case 4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1894 case 8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1895 case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1896 case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1897 1898 default: assert(false, "wrong vector length"); 1899 } 1900 } 1901 1902 void C2_MacroAssembler::reduceI(int opcode, int vlen, 1903 Register dst, Register src1, XMMRegister src2, 1904 XMMRegister vtmp1, XMMRegister vtmp2) { 1905 switch (vlen) { 1906 case 2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1907 case 4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1908 case 8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1909 case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1910 1911 default: assert(false, "wrong vector length"); 1912 } 1913 } 1914 1915 #ifdef _LP64 1916 void C2_MacroAssembler::reduceL(int opcode, int vlen, 1917 Register dst, Register src1, XMMRegister src2, 1918 XMMRegister vtmp1, XMMRegister vtmp2) { 1919 switch (vlen) { 1920 case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1921 case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1922 case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1923 1924 default: assert(false, "wrong vector length"); 1925 } 1926 } 1927 #endif // _LP64 1928 1929 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister 
src, XMMRegister vtmp1, XMMRegister vtmp2) { 1930 switch (vlen) { 1931 case 2: 1932 assert(vtmp2 == xnoreg, ""); 1933 reduce2F(opcode, dst, src, vtmp1); 1934 break; 1935 case 4: 1936 assert(vtmp2 == xnoreg, ""); 1937 reduce4F(opcode, dst, src, vtmp1); 1938 break; 1939 case 8: 1940 reduce8F(opcode, dst, src, vtmp1, vtmp2); 1941 break; 1942 case 16: 1943 reduce16F(opcode, dst, src, vtmp1, vtmp2); 1944 break; 1945 default: assert(false, "wrong vector length"); 1946 } 1947 } 1948 1949 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 1950 switch (vlen) { 1951 case 2: 1952 assert(vtmp2 == xnoreg, ""); 1953 reduce2D(opcode, dst, src, vtmp1); 1954 break; 1955 case 4: 1956 reduce4D(opcode, dst, src, vtmp1, vtmp2); 1957 break; 1958 case 8: 1959 reduce8D(opcode, dst, src, vtmp1, vtmp2); 1960 break; 1961 default: assert(false, "wrong vector length"); 1962 } 1963 } 1964 1965 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1966 if (opcode == Op_AddReductionVI) { 1967 if (vtmp1 != src2) { 1968 movdqu(vtmp1, src2); 1969 } 1970 phaddd(vtmp1, vtmp1); 1971 } else { 1972 pshufd(vtmp1, src2, 0x1); 1973 reduce_operation_128(T_INT, opcode, vtmp1, src2); 1974 } 1975 movdl(vtmp2, src1); 1976 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 1977 movdl(dst, vtmp1); 1978 } 1979 1980 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1981 if (opcode == Op_AddReductionVI) { 1982 if (vtmp1 != src2) { 1983 movdqu(vtmp1, src2); 1984 } 1985 phaddd(vtmp1, src2); 1986 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1987 } else { 1988 pshufd(vtmp2, src2, 0xE); 1989 reduce_operation_128(T_INT, opcode, vtmp2, src2); 1990 reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1991 } 1992 } 1993 1994 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1995 if (opcode == Op_AddReductionVI) { 1996 vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit); 1997 vextracti128_high(vtmp2, vtmp1); 1998 vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit); 1999 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2000 } else { 2001 vextracti128_high(vtmp1, src2); 2002 reduce_operation_128(T_INT, opcode, vtmp1, src2); 2003 reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2004 } 2005 } 2006 2007 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2008 vextracti64x4_high(vtmp2, src2); 2009 reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2); 2010 reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2011 } 2012 2013 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2014 pshufd(vtmp2, src2, 0x1); 2015 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 2016 movdqu(vtmp1, vtmp2); 2017 psrldq(vtmp1, 2); 2018 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 2019 movdqu(vtmp2, vtmp1); 2020 psrldq(vtmp2, 1); 2021 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 2022 movdl(vtmp2, src1); 2023 pmovsxbd(vtmp1, vtmp1); 2024 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2025 pextrb(dst, vtmp1, 0x0); 2026 movsbl(dst, dst); 2027 } 2028 2029 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2030 
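  // pshufd imm 0xE selects source dwords {2, 3, 0, 0}, i.e. it copies the
  // upper 8 bytes of src2 into the low half of vtmp1 so the two halves can be
  // combined lane-wise (the upper dwords of the shuffle are don't-cares).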
pshufd(vtmp1, src2, 0xE); 2031 reduce_operation_128(T_BYTE, opcode, vtmp1, src2); 2032 reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2033 } 2034 2035 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2036 vextracti128_high(vtmp2, src2); 2037 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 2038 reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2039 } 2040 2041 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2042 vextracti64x4_high(vtmp1, src2); 2043 reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2); 2044 reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2045 } 2046 2047 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2048 pmovsxbw(vtmp2, src2); 2049 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2050 } 2051 2052 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2053 if (UseAVX > 1) { 2054 int vector_len = Assembler::AVX_256bit; 2055 vpmovsxbw(vtmp1, src2, vector_len); 2056 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2057 } else { 2058 pmovsxbw(vtmp2, src2); 2059 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2060 pshufd(vtmp2, src2, 0xE); // bring the upper 8 bytes of src2 down to the low qword 2061 pmovsxbw(vtmp2, vtmp2); 2062 reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2063 } 2064 } 2065 2066 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2067 if (UseAVX > 2 && VM_Version::supports_avx512bw()) { 2068 int vector_len = Assembler::AVX_512bit; 2069 vpmovsxbw(vtmp1, src2, vector_len); 2070 reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2071 } else { 2072 assert(UseAVX >= 2,"Should not reach here."); 2073 mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); 2074 vextracti128_high(vtmp2, src2); 2075 mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2076 } 2077 } 2078 2079 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2080 mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); 2081 vextracti64x4_high(vtmp2, src2); 2082 mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2083 } 2084 2085 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2086 if (opcode == Op_AddReductionVI) { 2087 if (vtmp1 != src2) { 2088 movdqu(vtmp1, src2); 2089 } 2090 phaddw(vtmp1, vtmp1); 2091 phaddw(vtmp1, vtmp1); 2092 } else { 2093 pshufd(vtmp2, src2, 0x1); 2094 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2095 movdqu(vtmp1, vtmp2); 2096 psrldq(vtmp1, 2); 2097 reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2); 2098 } 2099 movdl(vtmp2, src1); 2100 pmovsxwd(vtmp1, vtmp1); 2101 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2102 pextrw(dst, vtmp1, 0x0); 2103 movswl(dst, dst); 2104 } 2105 2106 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2107 if (opcode == Op_AddReductionVI) { 2108 if (vtmp1 != src2) { 2109 movdqu(vtmp1, src2); 2110 } 2111 phaddw(vtmp1, src2); 2112 } else { 2113 pshufd(vtmp1, src2, 0xE); 2114 reduce_operation_128(T_SHORT, opcode, vtmp1, src2); 2115 } 2116 reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2117 } 2118 2119 void
C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2120 if (opcode == Op_AddReductionVI) { 2121 int vector_len = Assembler::AVX_256bit; 2122 vphaddw(vtmp2, src2, src2, vector_len); 2123 vpermq(vtmp2, vtmp2, 0xD8, vector_len); 2124 } else { 2125 vextracti128_high(vtmp2, src2); 2126 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2127 } 2128 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2129 } 2130 2131 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2132 int vector_len = Assembler::AVX_256bit; 2133 vextracti64x4_high(vtmp1, src2); 2134 reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2); 2135 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2136 } 2137 2138 #ifdef _LP64 2139 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2140 pshufd(vtmp2, src2, 0xE); 2141 reduce_operation_128(T_LONG, opcode, vtmp2, src2); 2142 movdq(vtmp1, src1); 2143 reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2); 2144 movdq(dst, vtmp1); 2145 } 2146 2147 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2148 vextracti128_high(vtmp1, src2); 2149 reduce_operation_128(T_LONG, opcode, vtmp1, src2); 2150 reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2151 } 2152 2153 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2154 vextracti64x4_high(vtmp2, src2); 2155 reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2); 2156 reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2157 } 2158 2159 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) { 2160 mov64(temp, -1L); 2161 bzhiq(temp, temp, len); 2162 kmovql(dst, temp); 2163 } 2164 #endif // _LP64 2165 2166 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2167 reduce_operation_128(T_FLOAT, opcode, dst, src); 2168 pshufd(vtmp, src, 0x1); 2169 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2170 } 2171 2172 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2173 reduce2F(opcode, dst, src, vtmp); 2174 pshufd(vtmp, src, 0x2); 2175 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2176 pshufd(vtmp, src, 0x3); 2177 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2178 } 2179 2180 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2181 reduce4F(opcode, dst, src, vtmp2); 2182 vextractf128_high(vtmp2, src); 2183 reduce4F(opcode, dst, vtmp2, vtmp1); 2184 } 2185 2186 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2187 reduce8F(opcode, dst, src, vtmp1, vtmp2); 2188 vextracti64x4_high(vtmp1, src); 2189 reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2); 2190 } 2191 2192 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2193 reduce_operation_128(T_DOUBLE, opcode, dst, src); 2194 pshufd(vtmp, src, 0xE); 2195 reduce_operation_128(T_DOUBLE, opcode, dst, vtmp); 2196 } 2197 2198 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2199 reduce2D(opcode, dst, src, vtmp2); 2200 vextractf128_high(vtmp2, src); 2201 
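  // Fold the extracted upper 128-bit half (now in vtmp2) into the running result.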
reduce2D(opcode, dst, vtmp2, vtmp1); 2202 } 2203 2204 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2205 reduce4D(opcode, dst, src, vtmp1, vtmp2); 2206 vextracti64x4_high(vtmp1, src); 2207 reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2); 2208 } 2209 2210 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) { 2211 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2212 } 2213 2214 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) { 2215 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2216 } 2217 2218 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask, 2219 int vec_enc) { 2220 switch(elem_bt) { 2221 case T_INT: 2222 case T_FLOAT: 2223 vmaskmovps(dst, src, mask, vec_enc); 2224 break; 2225 case T_LONG: 2226 case T_DOUBLE: 2227 vmaskmovpd(dst, src, mask, vec_enc); 2228 break; 2229 default: 2230 fatal("Unsupported type %s", type2name(elem_bt)); 2231 break; 2232 } 2233 } 2234 2235 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask, 2236 int vec_enc) { 2237 switch(elem_bt) { 2238 case T_INT: 2239 case T_FLOAT: 2240 vmaskmovps(dst, src, mask, vec_enc); 2241 break; 2242 case T_LONG: 2243 case T_DOUBLE: 2244 vmaskmovpd(dst, src, mask, vec_enc); 2245 break; 2246 default: 2247 fatal("Unsupported type %s", type2name(elem_bt)); 2248 break; 2249 } 2250 } 2251 2252 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid, 2253 XMMRegister dst, XMMRegister src, 2254 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2255 XMMRegister xmm_0, XMMRegister xmm_1) { 2256 const int permconst[] = {1, 14}; 2257 XMMRegister wsrc = src; 2258 XMMRegister wdst = xmm_0; 2259 XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1; 2260 2261 int vlen_enc = Assembler::AVX_128bit; 2262 if (vlen == 16) { 2263 vlen_enc = Assembler::AVX_256bit; 2264 } 2265 2266 for (int i = log2(vlen) - 1; i >=0; i--) { 2267 if (i == 0 && !is_dst_valid) { 2268 wdst = dst; 2269 } 2270 if (i == 3) { 2271 vextracti64x4_high(wtmp, wsrc); 2272 } else if (i == 2) { 2273 vextracti128_high(wtmp, wsrc); 2274 } else { // i = [0,1] 2275 vpermilps(wtmp, wsrc, permconst[i], vlen_enc); 2276 } 2277 vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2278 wsrc = wdst; 2279 vlen_enc = Assembler::AVX_128bit; 2280 } 2281 if (is_dst_valid) { 2282 vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2283 } 2284 } 2285 2286 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src, 2287 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2288 XMMRegister xmm_0, XMMRegister xmm_1) { 2289 XMMRegister wsrc = src; 2290 XMMRegister wdst = xmm_0; 2291 XMMRegister wtmp = (xmm_1 == xnoreg) ? 
xmm_0: xmm_1; 2292 int vlen_enc = Assembler::AVX_128bit; 2293 if (vlen == 8) { 2294 vlen_enc = Assembler::AVX_256bit; 2295 } 2296 for (int i = log2(vlen) - 1; i >=0; i--) { 2297 if (i == 0 && !is_dst_valid) { 2298 wdst = dst; 2299 } 2300 if (i == 1) { 2301 vextracti128_high(wtmp, wsrc); 2302 } else if (i == 2) { 2303 vextracti64x4_high(wtmp, wsrc); 2304 } else { 2305 assert(i == 0, "%d", i); 2306 vpermilpd(wtmp, wsrc, 1, vlen_enc); 2307 } 2308 vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2309 wsrc = wdst; 2310 vlen_enc = Assembler::AVX_128bit; 2311 } 2312 if (is_dst_valid) { 2313 vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2314 } 2315 } 2316 2317 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) { 2318 switch (bt) { 2319 case T_BYTE: pextrb(dst, src, idx); break; 2320 case T_SHORT: pextrw(dst, src, idx); break; 2321 case T_INT: pextrd(dst, src, idx); break; 2322 case T_LONG: pextrq(dst, src, idx); break; 2323 2324 default: 2325 assert(false,"Should not reach here."); 2326 break; 2327 } 2328 } 2329 2330 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) { 2331 int esize = type2aelembytes(typ); 2332 int elem_per_lane = 16/esize; 2333 int lane = elemindex / elem_per_lane; 2334 int eindex = elemindex % elem_per_lane; 2335 2336 if (lane >= 2) { 2337 assert(UseAVX > 2, "required"); 2338 vextractf32x4(dst, src, lane & 3); 2339 return dst; 2340 } else if (lane > 0) { 2341 assert(UseAVX > 0, "required"); 2342 vextractf128(dst, src, lane); 2343 return dst; 2344 } else { 2345 return src; 2346 } 2347 } 2348 2349 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) { 2350 if (typ == T_BYTE) { 2351 movsbl(dst, dst); 2352 } else if (typ == T_SHORT) { 2353 movswl(dst, dst); 2354 } 2355 } 2356 2357 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) { 2358 int esize = type2aelembytes(typ); 2359 int elem_per_lane = 16/esize; 2360 int eindex = elemindex % elem_per_lane; 2361 assert(is_integral_type(typ),"required"); 2362 2363 if (eindex == 0) { 2364 if (typ == T_LONG) { 2365 movq(dst, src); 2366 } else { 2367 movdl(dst, src); 2368 movsxl(typ, dst); 2369 } 2370 } else { 2371 extract(typ, dst, src, eindex); 2372 movsxl(typ, dst); 2373 } 2374 } 2375 2376 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) { 2377 int esize = type2aelembytes(typ); 2378 int elem_per_lane = 16/esize; 2379 int eindex = elemindex % elem_per_lane; 2380 assert((typ == T_FLOAT || typ == T_DOUBLE),"required"); 2381 2382 if (eindex == 0) { 2383 movq(dst, src); 2384 } else { 2385 if (typ == T_FLOAT) { 2386 if (UseAVX == 0) { 2387 movdqu(dst, src); 2388 shufps(dst, dst, eindex); 2389 } else { 2390 vshufps(dst, src, src, eindex, Assembler::AVX_128bit); 2391 } 2392 } else { 2393 if (UseAVX == 0) { 2394 movdqu(dst, src); 2395 psrldq(dst, eindex*esize); 2396 } else { 2397 vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit); 2398 } 2399 movq(dst, dst); 2400 } 2401 } 2402 // Zero upper bits 2403 if (typ == T_FLOAT) { 2404 if (UseAVX == 0) { 2405 assert(vtmp != xnoreg, "required."); 2406 movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg); 2407 pand(dst, vtmp); 2408 } else { 2409 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg); 2410 } 2411 } 2412 } 2413 2414 void 
C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) { 2415 switch(typ) { 2416 case T_BYTE: 2417 case T_BOOLEAN: 2418 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2419 break; 2420 case T_SHORT: 2421 case T_CHAR: 2422 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2423 break; 2424 case T_INT: 2425 case T_FLOAT: 2426 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2427 break; 2428 case T_LONG: 2429 case T_DOUBLE: 2430 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2431 break; 2432 default: 2433 assert(false,"Should not reach here."); 2434 break; 2435 } 2436 } 2437 2438 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) { 2439 assert(rscratch != noreg || always_reachable(src2), "missing"); 2440 2441 switch(typ) { 2442 case T_BOOLEAN: 2443 case T_BYTE: 2444 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2445 break; 2446 case T_CHAR: 2447 case T_SHORT: 2448 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2449 break; 2450 case T_INT: 2451 case T_FLOAT: 2452 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2453 break; 2454 case T_LONG: 2455 case T_DOUBLE: 2456 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2457 break; 2458 default: 2459 assert(false,"Should not reach here."); 2460 break; 2461 } 2462 } 2463 2464 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) { 2465 switch(typ) { 2466 case T_BYTE: 2467 evpblendmb(dst, kmask, src1, src2, merge, vector_len); 2468 break; 2469 case T_SHORT: 2470 evpblendmw(dst, kmask, src1, src2, merge, vector_len); 2471 break; 2472 case T_INT: 2473 case T_FLOAT: 2474 evpblendmd(dst, kmask, src1, src2, merge, vector_len); 2475 break; 2476 case T_LONG: 2477 case T_DOUBLE: 2478 evpblendmq(dst, kmask, src1, src2, merge, vector_len); 2479 break; 2480 default: 2481 assert(false,"Should not reach here."); 2482 break; 2483 } 2484 } 2485 2486 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) { 2487 assert(vlen_in_bytes <= 32, ""); 2488 int esize = type2aelembytes(bt); 2489 if (vlen_in_bytes == 32) { 2490 assert(vtmp == xnoreg, "required."); 2491 if (esize >= 4) { 2492 vtestps(src1, src2, AVX_256bit); 2493 } else { 2494 vptest(src1, src2, AVX_256bit); 2495 } 2496 return; 2497 } 2498 if (vlen_in_bytes < 16) { 2499 // Duplicate the lower part to fill the whole register, 2500 // Don't need to do so for src2 2501 assert(vtmp != xnoreg, "required"); 2502 int shuffle_imm = (vlen_in_bytes == 4) ? 
0x00 : 0x04; 2503 pshufd(vtmp, src1, shuffle_imm); 2504 } else { 2505 assert(vtmp == xnoreg, "required"); 2506 vtmp = src1; 2507 } 2508 if (esize >= 4 && VM_Version::supports_avx()) { 2509 vtestps(vtmp, src2, AVX_128bit); 2510 } else { 2511 ptest(vtmp, src2); 2512 } 2513 } 2514 2515 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) { 2516 assert(UseAVX >= 2, "required"); 2517 #ifdef ASSERT 2518 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2519 bool is_bw_supported = VM_Version::supports_avx512bw(); 2520 if (is_bw && !is_bw_supported) { 2521 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2522 assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16), 2523 "XMM register should be 0-15"); 2524 } 2525 #endif // ASSERT 2526 switch (elem_bt) { 2527 case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return; 2528 case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return; 2529 case T_INT: vpaddd(dst, src1, src2, vlen_enc); return; 2530 case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return; 2531 case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return; 2532 case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return; 2533 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2534 } 2535 } 2536 2537 #ifdef _LP64 2538 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) { 2539 assert(UseAVX >= 2, "required"); 2540 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2541 bool is_vl = vlen_enc != Assembler::AVX_512bit; 2542 if ((UseAVX > 2) && 2543 (!is_bw || VM_Version::supports_avx512bw()) && 2544 (!is_vl || VM_Version::supports_avx512vl())) { 2545 switch (elem_bt) { 2546 case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return; 2547 case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return; 2548 case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return; 2549 case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return; 2550 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2551 } 2552 } else { 2553 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2554 assert((dst->encoding() < 16),"XMM register should be 0-15"); 2555 switch (elem_bt) { 2556 case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return; 2557 case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return; 2558 case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return; 2559 case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return; 2560 case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return; 2561 case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return; 2562 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2563 } 2564 } 2565 } 2566 #endif 2567 2568 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) { 2569 switch (to_elem_bt) { 2570 case T_SHORT: 2571 vpmovsxbw(dst, src, vlen_enc); 2572 break; 2573 case T_INT: 2574 vpmovsxbd(dst, src, vlen_enc); 2575 break; 2576 case T_FLOAT: 2577 vpmovsxbd(dst, src, vlen_enc); 2578 vcvtdq2ps(dst, dst, vlen_enc); 2579 break; 2580 case T_LONG: 2581 vpmovsxbq(dst, src, vlen_enc); 2582 break; 2583 case T_DOUBLE: { 2584 int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? 
Assembler::AVX_256bit : Assembler::AVX_128bit; 2585 vpmovsxbd(dst, src, mid_vlen_enc); 2586 vcvtdq2pd(dst, dst, vlen_enc); 2587 break; 2588 } 2589 default: 2590 fatal("Unsupported type %s", type2name(to_elem_bt)); 2591 break; 2592 } 2593 } 2594 2595 //------------------------------------------------------------------------------------------- 2596 2597 // IndexOf for constant substrings with size >= 8 chars 2598 // which don't need to be loaded through the stack. 2599 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2, 2600 Register cnt1, Register cnt2, 2601 int int_cnt2, Register result, 2602 XMMRegister vec, Register tmp, 2603 int ae) { 2604 ShortBranchVerifier sbv(this); 2605 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2606 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2607 2608 // This method uses the pcmpestri instruction with bound registers 2609 // inputs: 2610 // xmm - substring 2611 // rax - substring length (elements count) 2612 // mem - scanned string 2613 // rdx - string length (elements count) 2614 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2615 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2616 // outputs: 2617 // rcx - matched index in string 2618 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2619 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2620 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2621 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2622 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1; 2623 2624 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, 2625 RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR, 2626 MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE; 2627 2628 // Note, inline_string_indexOf() generates checks: 2629 // if (substr.count > string.count) return -1; 2630 // if (substr.count == 0) return 0; 2631 assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars"); 2632 2633 // Load substring. 2634 if (ae == StrIntrinsicNode::UL) { 2635 pmovzxbw(vec, Address(str2, 0)); 2636 } else { 2637 movdqu(vec, Address(str2, 0)); 2638 } 2639 movl(cnt2, int_cnt2); 2640 movptr(result, str1); // string addr 2641 2642 if (int_cnt2 > stride) { 2643 jmpb(SCAN_TO_SUBSTR); 2644 2645 // Reload substr for rescan; this code 2646 // is executed only for large substrings (> 8 chars) 2647 bind(RELOAD_SUBSTR); 2648 if (ae == StrIntrinsicNode::UL) { 2649 pmovzxbw(vec, Address(str2, 0)); 2650 } else { 2651 movdqu(vec, Address(str2, 0)); 2652 } 2653 negptr(cnt2); // Jumped here with negative cnt2, convert to positive 2654 2655 bind(RELOAD_STR); 2656 // We came here after the beginning of the substring was 2657 // matched but the rest of it was not, so we need to search 2658 // again. Start from the next element after the previous match. 2659 2660 // cnt2 is the number of substring elements remaining and 2661 // cnt1 is the number of string elements remaining when cmp failed.
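    // Illustrative numbers only: with int_cnt2 == 12, a mismatch with
    // cnt2 == 4 substring elements and cnt1 == 20 string elements still
    // unread means the candidate started int_cnt2 - cnt2 == 8 elements back,
    // so counted from that start cnt1 - cnt2 + int_cnt2 == 20 - 4 + 12 == 28
    // elements remain.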
2662 // Restored cnt1 = cnt1 - cnt2 + int_cnt2 2663 subl(cnt1, cnt2); 2664 addl(cnt1, int_cnt2); 2665 movl(cnt2, int_cnt2); // Now restore cnt2 2666 2667 decrementl(cnt1); // Shift to next element 2668 cmpl(cnt1, cnt2); 2669 jcc(Assembler::negative, RET_NOT_FOUND); // Left less than substring 2670 2671 addptr(result, (1<<scale1)); 2672 2673 } // (int_cnt2 > 8) 2674 2675 // Scan string for start of substr in 16-byte vectors 2676 bind(SCAN_TO_SUBSTR); 2677 pcmpestri(vec, Address(result, 0), mode); 2678 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 2679 subl(cnt1, stride); 2680 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 2681 cmpl(cnt1, cnt2); 2682 jccb(Assembler::negative, RET_NOT_FOUND); // Left less than substring 2683 addptr(result, 16); 2684 jmpb(SCAN_TO_SUBSTR); 2685 2686 // Found a potential substr 2687 bind(FOUND_CANDIDATE); 2688 // Matched whole vector if first element matched (tmp(rcx) == 0). 2689 if (int_cnt2 == stride) { 2690 jccb(Assembler::overflow, RET_FOUND); // OF == 1 2691 } else { // int_cnt2 > 8 2692 jccb(Assembler::overflow, FOUND_SUBSTR); 2693 } 2694 // After pcmpestri tmp(rcx) contains matched element index 2695 // Compute start addr of substr 2696 lea(result, Address(result, tmp, scale1)); 2697 2698 // Make sure string is still long enough 2699 subl(cnt1, tmp); 2700 cmpl(cnt1, cnt2); 2701 if (int_cnt2 == stride) { 2702 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 2703 } else { // int_cnt2 > 8 2704 jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD); 2705 } 2706 // Left less than substring. 2707 2708 bind(RET_NOT_FOUND); 2709 movl(result, -1); 2710 jmp(EXIT); 2711 2712 if (int_cnt2 > stride) { 2713 // This code is optimized for the case when the whole substring 2714 // is matched if its head is matched. 2715 bind(MATCH_SUBSTR_HEAD); 2716 pcmpestri(vec, Address(result, 0), mode); 2717 // Reload only the string if it does not match 2718 jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0 2719 2720 Label CONT_SCAN_SUBSTR; 2721 // Compare the rest of substring (> 8 chars). 2722 bind(FOUND_SUBSTR); 2723 // First 8 chars are already matched.
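    // Turn cnt2 into the negative offset stride - cnt2, so the tail-compare
    // loop below can simply add stride each round; once the value becomes
    // non-negative, the whole substring has been compared.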
2724 negptr(cnt2); 2725 addptr(cnt2, stride); 2726 2727 bind(SCAN_SUBSTR); 2728 subl(cnt1, stride); 2729 cmpl(cnt2, -stride); // Do not read beyond substring 2730 jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR); 2731 // Back-up strings to avoid reading beyond substring: 2732 // cnt1 = cnt1 - cnt2 + 8 2733 addl(cnt1, cnt2); // cnt2 is negative 2734 addl(cnt1, stride); 2735 movl(cnt2, stride); negptr(cnt2); 2736 bind(CONT_SCAN_SUBSTR); 2737 if (int_cnt2 < (int)G) { 2738 int tail_off1 = int_cnt2<<scale1; 2739 int tail_off2 = int_cnt2<<scale2; 2740 if (ae == StrIntrinsicNode::UL) { 2741 pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2)); 2742 } else { 2743 movdqu(vec, Address(str2, cnt2, scale2, tail_off2)); 2744 } 2745 pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode); 2746 } else { 2747 // calculate index in register to avoid integer overflow (int_cnt2*2) 2748 movl(tmp, int_cnt2); 2749 addptr(tmp, cnt2); 2750 if (ae == StrIntrinsicNode::UL) { 2751 pmovzxbw(vec, Address(str2, tmp, scale2, 0)); 2752 } else { 2753 movdqu(vec, Address(str2, tmp, scale2, 0)); 2754 } 2755 pcmpestri(vec, Address(result, tmp, scale1, 0), mode); 2756 } 2757 // Need to reload string pointers if the whole vector did not match 2758 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 2759 addptr(cnt2, stride); 2760 jcc(Assembler::negative, SCAN_SUBSTR); 2761 // Fall through if found full substring 2762 2763 } // (int_cnt2 > 8) 2764 2765 bind(RET_FOUND); 2766 // Found result if we matched full small substring. 2767 // Compute substr offset 2768 subptr(result, str1); 2769 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 2770 shrl(result, 1); // index 2771 } 2772 bind(EXIT); 2773 2774 } // string_indexofC8 2775 2776 // Small strings are loaded through the stack if they cross a page boundary. 2777 void C2_MacroAssembler::string_indexof(Register str1, Register str2, 2778 Register cnt1, Register cnt2, 2779 int int_cnt2, Register result, 2780 XMMRegister vec, Register tmp, 2781 int ae) { 2782 ShortBranchVerifier sbv(this); 2783 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2784 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2785 2786 // 2787 // int_cnt2 is the length of a small (< 8 chars) constant substring 2788 // or (-1) for a non constant substring, in which case its length 2789 // is in the cnt2 register. 2790 // 2791 // Note, inline_string_indexOf() generates checks: 2792 // if (substr.count > string.count) return -1; 2793 // if (substr.count == 0) return 0; 2794 // 2795 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2796 assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0"); 2797 // This method uses the pcmpestri instruction with bound registers 2798 // inputs: 2799 // xmm - substring 2800 // rax - substring length (elements count) 2801 // mem - scanned string 2802 // rdx - string length (elements count) 2803 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2804 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2805 // outputs: 2806 // rcx - matched index in string 2807 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2808 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2809 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2810 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ?
Address::times_1 : scale1; 2811 2812 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR, 2813 RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR, 2814 FOUND_CANDIDATE; 2815 2816 { //======================================================== 2817 // We don't know where these strings are located 2818 // and we can't read beyond them. Load them through the stack. 2819 Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR; 2820 2821 movptr(tmp, rsp); // save old SP 2822 2823 if (int_cnt2 > 0) { // small (< 8 chars) constant substring 2824 if (int_cnt2 == (1>>scale2)) { // One byte 2825 assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding"); 2826 load_unsigned_byte(result, Address(str2, 0)); 2827 movdl(vec, result); // move 32 bits 2828 } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) { // Three bytes 2829 // Not enough header space in 32-bit VM: 12+3 = 15. 2830 movl(result, Address(str2, -1)); 2831 shrl(result, 8); 2832 movdl(vec, result); // move 32 bits 2833 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) { // One char 2834 load_unsigned_short(result, Address(str2, 0)); 2835 movdl(vec, result); // move 32 bits 2836 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars 2837 movdl(vec, Address(str2, 0)); // move 32 bits 2838 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars 2839 movq(vec, Address(str2, 0)); // move 64 bits 2840 } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 = {2, ..., 7}) 2841 // Array header size is 12 bytes in 32-bit VM 2842 // + 6 bytes for 3 chars == 18 bytes, 2843 // enough space to load vec and shift. 2844 assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity"); 2845 if (ae == StrIntrinsicNode::UL) { 2846 int tail_off = int_cnt2-8; 2847 pmovzxbw(vec, Address(str2, tail_off)); 2848 psrldq(vec, -2*tail_off); 2849 } 2850 else { 2851 int tail_off = int_cnt2*(1<<scale2); 2852 movdqu(vec, Address(str2, tail_off-16)); 2853 psrldq(vec, 16-tail_off); 2854 } 2855 } 2856 } else { // not constant substring 2857 cmpl(cnt2, stride); 2858 jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough 2859 2860 // We can read beyond the string if str+16 does not cross a page boundary, 2861 // since heaps are aligned and mapped by pages. 2862 assert(os::vm_page_size() < (int)G, "default page should be small"); 2863 movl(result, str2); // We need only low 32 bits 2864 andl(result, ((int)os::vm_page_size()-1)); 2865 cmpl(result, ((int)os::vm_page_size()-16)); 2866 jccb(Assembler::belowEqual, CHECK_STR); 2867 2868 // Move small strings to the stack to allow loading 16 bytes into vec. 2869 subptr(rsp, 16); 2870 int stk_offset = wordSize-(1<<scale2); 2871 push(cnt2); 2872 2873 bind(COPY_SUBSTR); 2874 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) { 2875 load_unsigned_byte(result, Address(str2, cnt2, scale2, -1)); 2876 movb(Address(rsp, cnt2, scale2, stk_offset), result); 2877 } else if (ae == StrIntrinsicNode::UU) { 2878 load_unsigned_short(result, Address(str2, cnt2, scale2, -2)); 2879 movw(Address(rsp, cnt2, scale2, stk_offset), result); 2880 } 2881 decrement(cnt2); 2882 jccb(Assembler::notZero, COPY_SUBSTR); 2883 2884 pop(cnt2); 2885 movptr(str2, rsp); // New substring address 2886 } // non constant 2887 2888 bind(CHECK_STR); 2889 cmpl(cnt1, stride); 2890 jccb(Assembler::aboveEqual, BIG_STRINGS); 2891 2892 // Check cross page boundary.
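    // Illustrative example (assuming 4K pages): a string whose address ends
    // in 0xFF8 has (addr & 0xFFF) == 0xFF8 > 0x1000 - 16 == 0xFF0, so a
    // 16-byte load could touch the next, possibly unmapped, page; such short
    // strings are copied to the stack below instead.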
2893 movl(result, str1); // We need only low 32 bits 2894 andl(result, ((int)os::vm_page_size()-1)); 2895 cmpl(result, ((int)os::vm_page_size()-16)); 2896 jccb(Assembler::belowEqual, BIG_STRINGS); 2897 2898 subptr(rsp, 16); 2899 int stk_offset = -(1<<scale1); 2900 if (int_cnt2 < 0) { // not constant 2901 push(cnt2); 2902 stk_offset += wordSize; 2903 } 2904 movl(cnt2, cnt1); 2905 2906 bind(COPY_STR); 2907 if (ae == StrIntrinsicNode::LL) { 2908 load_unsigned_byte(result, Address(str1, cnt2, scale1, -1)); 2909 movb(Address(rsp, cnt2, scale1, stk_offset), result); 2910 } else { 2911 load_unsigned_short(result, Address(str1, cnt2, scale1, -2)); 2912 movw(Address(rsp, cnt2, scale1, stk_offset), result); 2913 } 2914 decrement(cnt2); 2915 jccb(Assembler::notZero, COPY_STR); 2916 2917 if (int_cnt2 < 0) { // not constant 2918 pop(cnt2); 2919 } 2920 movptr(str1, rsp); // New string address 2921 2922 bind(BIG_STRINGS); 2923 // Load substring. 2924 if (int_cnt2 < 0) { // -1 2925 if (ae == StrIntrinsicNode::UL) { 2926 pmovzxbw(vec, Address(str2, 0)); 2927 } else { 2928 movdqu(vec, Address(str2, 0)); 2929 } 2930 push(cnt2); // substr count 2931 push(str2); // substr addr 2932 push(str1); // string addr 2933 } else { 2934 // Small (< 8 chars) constant substrings are loaded already. 2935 movl(cnt2, int_cnt2); 2936 } 2937 push(tmp); // original SP 2938 2939 } // Finished loading 2940 2941 //======================================================== 2942 // Start search 2943 // 2944 2945 movptr(result, str1); // string addr 2946 2947 if (int_cnt2 < 0) { // Only for non constant substring 2948 jmpb(SCAN_TO_SUBSTR); 2949 2950 // SP saved at sp+0 2951 // String saved at sp+1*wordSize 2952 // Substr saved at sp+2*wordSize 2953 // Substr count saved at sp+3*wordSize 2954 2955 // Reload substr for rescan; this code 2956 // is executed only for large substrings (> 8 chars) 2957 bind(RELOAD_SUBSTR); 2958 movptr(str2, Address(rsp, 2*wordSize)); 2959 movl(cnt2, Address(rsp, 3*wordSize)); 2960 if (ae == StrIntrinsicNode::UL) { 2961 pmovzxbw(vec, Address(str2, 0)); 2962 } else { 2963 movdqu(vec, Address(str2, 0)); 2964 } 2965 // We came here after the beginning of the substring was 2966 // matched but the rest of it was not, so we need to search 2967 // again. Start from the next element after the previous match. 2968 subptr(str1, result); // Restore counter 2969 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 2970 shrl(str1, 1); 2971 } 2972 addl(cnt1, str1); 2973 decrementl(cnt1); // Shift to next element 2974 cmpl(cnt1, cnt2); 2975 jcc(Assembler::negative, RET_NOT_FOUND); // Left less than substring 2976 2977 addptr(result, (1<<scale1)); 2978 } // non constant 2979 2980 // Scan string for start of substr in 16-byte vectors 2981 bind(SCAN_TO_SUBSTR); 2982 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2983 pcmpestri(vec, Address(result, 0), mode); 2984 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 2985 subl(cnt1, stride); 2986 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 2987 cmpl(cnt1, cnt2); 2988 jccb(Assembler::negative, RET_NOT_FOUND); // Left less than substring 2989 addptr(result, 16); 2990 2991 bind(ADJUST_STR); 2992 cmpl(cnt1, stride); // Do not read beyond string 2993 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 2994 // Back-up string to avoid reading beyond string.
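    // Fewer than 'stride' elements are left, so re-position 'result' so that
    // the final 16-byte load ends exactly at the string's last element; some
    // already scanned elements get compared again, which is harmless.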
2995 lea(result, Address(result, cnt1, scale1, -16)); 2996 movl(cnt1, stride); 2997 jmpb(SCAN_TO_SUBSTR); 2998 2999 // Found a potential substr 3000 bind(FOUND_CANDIDATE); 3001 // After pcmpestri tmp(rcx) contains matched element index 3002 3003 // Make sure string is still long enough 3004 subl(cnt1, tmp); 3005 cmpl(cnt1, cnt2); 3006 jccb(Assembler::greaterEqual, FOUND_SUBSTR); 3007 // Left less than substring. 3008 3009 bind(RET_NOT_FOUND); 3010 movl(result, -1); 3011 jmp(CLEANUP); 3012 3013 bind(FOUND_SUBSTR); 3014 // Compute start addr of substr 3015 lea(result, Address(result, tmp, scale1)); 3016 if (int_cnt2 > 0) { // Constant substring 3017 // Repeat search for small substring (< 8 chars) 3018 // from the new point without reloading the substring. 3019 // Have to check that we don't read beyond the string. 3020 cmpl(tmp, stride-int_cnt2); 3021 jccb(Assembler::greater, ADJUST_STR); 3022 // Fall through if matched whole substring. 3023 } else { // non constant 3024 assert(int_cnt2 == -1, "should be != 0"); 3025 3026 addl(tmp, cnt2); 3027 // Found result if we matched whole substring. 3028 cmpl(tmp, stride); 3029 jcc(Assembler::lessEqual, RET_FOUND); 3030 3031 // Repeat search for small substring (<= 8 chars) 3032 // from the new point 'str1' without reloading the substring. 3033 cmpl(cnt2, stride); 3034 // Have to check that we don't read beyond the string. 3035 jccb(Assembler::lessEqual, ADJUST_STR); 3036 3037 Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG; 3038 // Compare the rest of substring (> 8 chars). 3039 movptr(str1, result); 3040 3041 cmpl(tmp, cnt2); 3042 // First 8 chars are already matched. 3043 jccb(Assembler::equal, CHECK_NEXT); 3044 3045 bind(SCAN_SUBSTR); 3046 pcmpestri(vec, Address(str1, 0), mode); 3047 // Need to reload string pointers if the whole vector did not match 3048 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 3049 3050 bind(CHECK_NEXT); 3051 subl(cnt2, stride); 3052 jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring 3053 addptr(str1, 16); 3054 if (ae == StrIntrinsicNode::UL) { 3055 addptr(str2, 8); 3056 } else { 3057 addptr(str2, 16); 3058 } 3059 subl(cnt1, stride); 3060 cmpl(cnt2, stride); // Do not read beyond substring 3061 jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR); 3062 // Back-up strings to avoid reading beyond substring.
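    // Same back-up idea applied to both pointers at once: rewind str1/str2 so
    // the next load ends at the substring's last element (8 bytes for UL,
    // 16 otherwise), then rebuild the counters below as
    // cnt1 = cnt1 - cnt2 + stride and cnt2 = stride.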
3063 3064 if (ae == StrIntrinsicNode::UL) { 3065 lea(str2, Address(str2, cnt2, scale2, -8)); 3066 lea(str1, Address(str1, cnt2, scale1, -16)); 3067 } else { 3068 lea(str2, Address(str2, cnt2, scale2, -16)); 3069 lea(str1, Address(str1, cnt2, scale1, -16)); 3070 } 3071 subl(cnt1, cnt2); 3072 movl(cnt2, stride); 3073 addl(cnt1, stride); 3074 bind(CONT_SCAN_SUBSTR); 3075 if (ae == StrIntrinsicNode::UL) { 3076 pmovzxbw(vec, Address(str2, 0)); 3077 } else { 3078 movdqu(vec, Address(str2, 0)); 3079 } 3080 jmp(SCAN_SUBSTR); 3081 3082 bind(RET_FOUND_LONG); 3083 movptr(str1, Address(rsp, wordSize)); 3084 } // non constant 3085 3086 bind(RET_FOUND); 3087 // Compute substr offset 3088 subptr(result, str1); 3089 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3090 shrl(result, 1); // index 3091 } 3092 bind(CLEANUP); 3093 pop(rsp); // restore SP 3094 3095 } // string_indexof 3096 3097 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3098 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3099 ShortBranchVerifier sbv(this); 3100 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3101 3102 int stride = 8; 3103 3104 Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP, 3105 SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP, 3106 RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT, 3107 FOUND_SEQ_CHAR, DONE_LABEL; 3108 3109 movptr(result, str1); 3110 if (UseAVX >= 2) { 3111 cmpl(cnt1, stride); 3112 jcc(Assembler::less, SCAN_TO_CHAR); 3113 cmpl(cnt1, 2*stride); 3114 jcc(Assembler::less, SCAN_TO_8_CHAR_INIT); 3115 movdl(vec1, ch); 3116 vpbroadcastw(vec1, vec1, Assembler::AVX_256bit); 3117 vpxor(vec2, vec2); 3118 movl(tmp, cnt1); 3119 andl(tmp, 0xFFFFFFF0); //vector count (in chars) 3120 andl(cnt1,0x0000000F); //tail count (in chars) 3121 3122 bind(SCAN_TO_16_CHAR_LOOP); 3123 vmovdqu(vec3, Address(result, 0)); 3124 vpcmpeqw(vec3, vec3, vec1, 1); 3125 vptest(vec2, vec3); 3126 jcc(Assembler::carryClear, FOUND_CHAR); 3127 addptr(result, 32); 3128 subl(tmp, 2*stride); 3129 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); 3130 jmp(SCAN_TO_8_CHAR); 3131 bind(SCAN_TO_8_CHAR_INIT); 3132 movdl(vec1, ch); 3133 pshuflw(vec1, vec1, 0x00); 3134 pshufd(vec1, vec1, 0); 3135 pxor(vec2, vec2); 3136 } 3137 bind(SCAN_TO_8_CHAR); 3138 cmpl(cnt1, stride); 3139 jcc(Assembler::less, SCAN_TO_CHAR); 3140 if (UseAVX < 2) { 3141 movdl(vec1, ch); 3142 pshuflw(vec1, vec1, 0x00); 3143 pshufd(vec1, vec1, 0); 3144 pxor(vec2, vec2); 3145 } 3146 movl(tmp, cnt1); 3147 andl(tmp, 0xFFFFFFF8); //vector count (in chars) 3148 andl(cnt1,0x00000007); //tail count (in chars) 3149 3150 bind(SCAN_TO_8_CHAR_LOOP); 3151 movdqu(vec3, Address(result, 0)); 3152 pcmpeqw(vec3, vec1); 3153 ptest(vec2, vec3); 3154 jcc(Assembler::carryClear, FOUND_CHAR); 3155 addptr(result, 16); 3156 subl(tmp, stride); 3157 jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP); 3158 bind(SCAN_TO_CHAR); 3159 testl(cnt1, cnt1); 3160 jcc(Assembler::zero, RET_NOT_FOUND); 3161 bind(SCAN_TO_CHAR_LOOP); 3162 load_unsigned_short(tmp, Address(result, 0)); 3163 cmpl(ch, tmp); 3164 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3165 addptr(result, 2); 3166 subl(cnt1, 1); 3167 jccb(Assembler::zero, RET_NOT_FOUND); 3168 jmp(SCAN_TO_CHAR_LOOP); 3169 3170 bind(RET_NOT_FOUND); 3171 movl(result, -1); 3172 jmpb(DONE_LABEL); 3173 3174 bind(FOUND_CHAR); 3175 if (UseAVX >= 2) { 3176 vpmovmskb(tmp, vec3); 3177 } else { 3178 pmovmskb(tmp, vec3); 3179 } 3180 bsfl(ch, tmp); 3181 addptr(result, ch); 3182 3183 bind(FOUND_SEQ_CHAR); 3184 
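// At this point 'result' holds the address of the matching character; the
// two instructions below turn it into a zero-based jchar index. Conceptually
// (illustrative only):
//
//   index = (match_addr - base_addr) >> 1;   // byte offset -> char index
//
// e.g. a hit at byte offset 6 from the base yields char index 3.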
subptr(result, str1); 3185 shrl(result, 1); 3186 3187 bind(DONE_LABEL); 3188 } // string_indexof_char 3189 3190 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3191 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3192 ShortBranchVerifier sbv(this); 3193 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3194 3195 int stride = 16; 3196 3197 Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP, 3198 SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP, 3199 RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT, 3200 FOUND_SEQ_CHAR, DONE_LABEL; 3201 3202 movptr(result, str1); 3203 if (UseAVX >= 2) { 3204 cmpl(cnt1, stride); 3205 jcc(Assembler::less, SCAN_TO_CHAR_INIT); 3206 cmpl(cnt1, stride*2); 3207 jcc(Assembler::less, SCAN_TO_16_CHAR_INIT); 3208 movdl(vec1, ch); 3209 vpbroadcastb(vec1, vec1, Assembler::AVX_256bit); 3210 vpxor(vec2, vec2); 3211 movl(tmp, cnt1); 3212 andl(tmp, 0xFFFFFFE0); //vector count (in chars) 3213 andl(cnt1,0x0000001F); //tail count (in chars) 3214 3215 bind(SCAN_TO_32_CHAR_LOOP); 3216 vmovdqu(vec3, Address(result, 0)); 3217 vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit); 3218 vptest(vec2, vec3); 3219 jcc(Assembler::carryClear, FOUND_CHAR); 3220 addptr(result, 32); 3221 subl(tmp, stride*2); 3222 jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP); 3223 jmp(SCAN_TO_16_CHAR); 3224 3225 bind(SCAN_TO_16_CHAR_INIT); 3226 movdl(vec1, ch); 3227 pxor(vec2, vec2); 3228 pshufb(vec1, vec2); 3229 } 3230 3231 bind(SCAN_TO_16_CHAR); 3232 cmpl(cnt1, stride); 3233 jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left 3234 if (UseAVX < 2) { 3235 movdl(vec1, ch); 3236 pxor(vec2, vec2); 3237 pshufb(vec1, vec2); 3238 } 3239 movl(tmp, cnt1); 3240 andl(tmp, 0xFFFFFFF0); //vector count (in bytes) 3241 andl(cnt1,0x0000000F); //tail count (in bytes) 3242 3243 bind(SCAN_TO_16_CHAR_LOOP); 3244 movdqu(vec3, Address(result, 0)); 3245 pcmpeqb(vec3, vec1); 3246 ptest(vec2, vec3); 3247 jcc(Assembler::carryClear, FOUND_CHAR); 3248 addptr(result, 16); 3249 subl(tmp, stride); 3250 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items... 
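// Once the vector loop is exhausted, the remaining 0..15 bytes are scanned
// one at a time. A rough scalar equivalent of the tail handled next
// (illustrative, not the emitted code):
//
//   while (cnt1-- > 0) {
//     if ((unsigned char)*result == ch) break;  // FOUND_SEQ_CHAR
//     result++;
//   }
//   // loop exhausted => result = -1 (RET_NOT_FOUND),
//   // otherwise result - str1 is returned as the index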
3251 3252 bind(SCAN_TO_CHAR_INIT); 3253 testl(cnt1, cnt1); 3254 jcc(Assembler::zero, RET_NOT_FOUND); 3255 bind(SCAN_TO_CHAR_LOOP); 3256 load_unsigned_byte(tmp, Address(result, 0)); 3257 cmpl(ch, tmp); 3258 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3259 addptr(result, 1); 3260 subl(cnt1, 1); 3261 jccb(Assembler::zero, RET_NOT_FOUND); 3262 jmp(SCAN_TO_CHAR_LOOP); 3263 3264 bind(RET_NOT_FOUND); 3265 movl(result, -1); 3266 jmpb(DONE_LABEL); 3267 3268 bind(FOUND_CHAR); 3269 if (UseAVX >= 2) { 3270 vpmovmskb(tmp, vec3); 3271 } else { 3272 pmovmskb(tmp, vec3); 3273 } 3274 bsfl(ch, tmp); 3275 addptr(result, ch); 3276 3277 bind(FOUND_SEQ_CHAR); 3278 subptr(result, str1); 3279 3280 bind(DONE_LABEL); 3281 } // stringL_indexof_char 3282 3283 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) { 3284 switch (eltype) { 3285 case T_BOOLEAN: return sizeof(jboolean); 3286 case T_BYTE: return sizeof(jbyte); 3287 case T_SHORT: return sizeof(jshort); 3288 case T_CHAR: return sizeof(jchar); 3289 case T_INT: return sizeof(jint); 3290 default: 3291 ShouldNotReachHere(); 3292 return -1; 3293 } 3294 } 3295 3296 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) { 3297 switch (eltype) { 3298 // T_BOOLEAN used as surrogate for unsigned byte 3299 case T_BOOLEAN: movzbl(dst, src); break; 3300 case T_BYTE: movsbl(dst, src); break; 3301 case T_SHORT: movswl(dst, src); break; 3302 case T_CHAR: movzwl(dst, src); break; 3303 case T_INT: movl(dst, src); break; 3304 default: 3305 ShouldNotReachHere(); 3306 } 3307 } 3308 3309 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) { 3310 load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8); 3311 } 3312 3313 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) { 3314 load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8); 3315 } 3316 3317 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) { 3318 const int vlen = Assembler::AVX_256bit; 3319 switch (eltype) { 3320 case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3321 case T_BYTE: vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3322 case T_SHORT: vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3323 case T_CHAR: vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3324 case T_INT: 3325 // do nothing 3326 break; 3327 default: 3328 ShouldNotReachHere(); 3329 } 3330 } 3331 3332 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result, 3333 Register index, Register tmp2, Register tmp3, XMMRegister vnext, 3334 XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3, 3335 XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3, 3336 XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3, 3337 BasicType eltype) { 3338 ShortBranchVerifier sbv(this); 3339 assert(UseAVX >= 2, "AVX2 intrinsics are required"); 3340 assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3); 3341 assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3); 3342 3343 Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN, 3344 SHORT_UNROLLED_LOOP_EXIT, 3345 UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME, 3346 UNROLLED_VECTOR_LOOP_BEGIN, 3347 END; 3348 switch (eltype) { 3349 case T_BOOLEAN: 
BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break; 3350 case T_CHAR: BLOCK_COMMENT("arrays_hashcode(char) {"); break; 3351 case T_BYTE: BLOCK_COMMENT("arrays_hashcode(byte) {"); break; 3352 case T_SHORT: BLOCK_COMMENT("arrays_hashcode(short) {"); break; 3353 case T_INT: BLOCK_COMMENT("arrays_hashcode(int) {"); break; 3354 default: BLOCK_COMMENT("arrays_hashcode {"); break; 3355 } 3356 3357 // For "renaming" for readibility of the code 3358 const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 }, 3359 vresult[] = { vresult0, vresult1, vresult2, vresult3 }, 3360 vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 }; 3361 3362 const int elsize = arrays_hashcode_elsize(eltype); 3363 3364 /* 3365 if (cnt1 >= 2) { 3366 if (cnt1 >= 32) { 3367 UNROLLED VECTOR LOOP 3368 } 3369 UNROLLED SCALAR LOOP 3370 } 3371 SINGLE SCALAR 3372 */ 3373 3374 cmpl(cnt1, 32); 3375 jcc(Assembler::less, SHORT_UNROLLED_BEGIN); 3376 3377 // cnt1 >= 32 && generate_vectorized_loop 3378 xorl(index, index); 3379 3380 // vresult = IntVector.zero(I256); 3381 for (int idx = 0; idx < 4; idx++) { 3382 vpxor(vresult[idx], vresult[idx]); 3383 } 3384 // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]); 3385 Register bound = tmp2; 3386 Register next = tmp3; 3387 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint)))); 3388 movl(next, Address(tmp2, 0)); 3389 movdl(vnext, next); 3390 vpbroadcastd(vnext, vnext, Assembler::AVX_256bit); 3391 3392 // index = 0; 3393 // bound = cnt1 & ~(32 - 1); 3394 movl(bound, cnt1); 3395 andl(bound, ~(32 - 1)); 3396 // for (; index < bound; index += 32) { 3397 bind(UNROLLED_VECTOR_LOOP_BEGIN); 3398 // result *= next; 3399 imull(result, next); 3400 // loop fission to upfront the cost of fetching from memory, OOO execution 3401 // can then hopefully do a better job of prefetching 3402 for (int idx = 0; idx < 4; idx++) { 3403 arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype); 3404 } 3405 // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7]; 3406 for (int idx = 0; idx < 4; idx++) { 3407 vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit); 3408 arrays_hashcode_elvcast(vtmp[idx], eltype); 3409 vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit); 3410 } 3411 // index += 32; 3412 addl(index, 32); 3413 // index < bound; 3414 cmpl(index, bound); 3415 jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN); 3416 // } 3417 3418 lea(ary1, Address(ary1, bound, Address::times(elsize))); 3419 subl(cnt1, bound); 3420 // release bound 3421 3422 // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1); 3423 for (int idx = 0; idx < 4; idx++) { 3424 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint)))); 3425 arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT); 3426 vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit); 3427 } 3428 // result += vresult.reduceLanes(ADD); 3429 for (int idx = 0; idx < 4; idx++) { 3430 reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]); 3431 } 3432 3433 // } else if (cnt1 < 32) { 3434 3435 bind(SHORT_UNROLLED_BEGIN); 3436 // int i = 1; 3437 movl(index, 1); 3438 cmpl(index, cnt1); 3439 jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT); 3440 3441 // for (; i < cnt1 ; i += 2) { 3442 bind(SHORT_UNROLLED_LOOP_BEGIN); 3443 movl(tmp3, 961); 3444 imull(result, tmp3); 3445 
arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3446 movl(tmp3, tmp2); 3447 shll(tmp3, 5); 3448 subl(tmp3, tmp2); 3449 addl(result, tmp3); 3450 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype); 3451 addl(result, tmp3); 3452 addl(index, 2); 3453 cmpl(index, cnt1); 3454 jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN); 3455 3456 // } 3457 // if (i >= cnt1) { 3458 bind(SHORT_UNROLLED_LOOP_EXIT); 3459 jccb(Assembler::greater, END); 3460 movl(tmp2, result); 3461 shll(result, 5); 3462 subl(result, tmp2); 3463 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3464 addl(result, tmp3); 3465 // } 3466 bind(END); 3467 3468 BLOCK_COMMENT("} // arrays_hashcode"); 3469 3470 } // arrays_hashcode 3471 3472 // helper function for string_compare 3473 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2, 3474 Address::ScaleFactor scale, Address::ScaleFactor scale1, 3475 Address::ScaleFactor scale2, Register index, int ae) { 3476 if (ae == StrIntrinsicNode::LL) { 3477 load_unsigned_byte(elem1, Address(str1, index, scale, 0)); 3478 load_unsigned_byte(elem2, Address(str2, index, scale, 0)); 3479 } else if (ae == StrIntrinsicNode::UU) { 3480 load_unsigned_short(elem1, Address(str1, index, scale, 0)); 3481 load_unsigned_short(elem2, Address(str2, index, scale, 0)); 3482 } else { 3483 load_unsigned_byte(elem1, Address(str1, index, scale1, 0)); 3484 load_unsigned_short(elem2, Address(str2, index, scale2, 0)); 3485 } 3486 } 3487 3488 // Compare strings, used for char[] and byte[]. 3489 void C2_MacroAssembler::string_compare(Register str1, Register str2, 3490 Register cnt1, Register cnt2, Register result, 3491 XMMRegister vec1, int ae, KRegister mask) { 3492 ShortBranchVerifier sbv(this); 3493 Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL; 3494 Label COMPARE_WIDE_VECTORS_LOOP_FAILED; // used only _LP64 && AVX3 3495 int stride, stride2, adr_stride, adr_stride1, adr_stride2; 3496 int stride2x2 = 0x40; 3497 Address::ScaleFactor scale = Address::no_scale; 3498 Address::ScaleFactor scale1 = Address::no_scale; 3499 Address::ScaleFactor scale2 = Address::no_scale; 3500 3501 if (ae != StrIntrinsicNode::LL) { 3502 stride2x2 = 0x20; 3503 } 3504 3505 if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) { 3506 shrl(cnt2, 1); 3507 } 3508 // Compute the minimum of the string lengths and the 3509 // difference of the string lengths (stack). 3510 // Do the conditional move stuff 3511 movl(result, cnt1); 3512 subl(cnt1, cnt2); 3513 push(cnt1); 3514 cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2) 3515 3516 // Is the minimum length zero? 
3517 testl(cnt2, cnt2); 3518 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3519 if (ae == StrIntrinsicNode::LL) { 3520 // Load first bytes 3521 load_unsigned_byte(result, Address(str1, 0)); // result = str1[0] 3522 load_unsigned_byte(cnt1, Address(str2, 0)); // cnt1 = str2[0] 3523 } else if (ae == StrIntrinsicNode::UU) { 3524 // Load first characters 3525 load_unsigned_short(result, Address(str1, 0)); 3526 load_unsigned_short(cnt1, Address(str2, 0)); 3527 } else { 3528 load_unsigned_byte(result, Address(str1, 0)); 3529 load_unsigned_short(cnt1, Address(str2, 0)); 3530 } 3531 subl(result, cnt1); 3532 jcc(Assembler::notZero, POP_LABEL); 3533 3534 if (ae == StrIntrinsicNode::UU) { 3535 // Divide length by 2 to get number of chars 3536 shrl(cnt2, 1); 3537 } 3538 cmpl(cnt2, 1); 3539 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3540 3541 // Check if the strings start at the same location and setup scale and stride 3542 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3543 cmpptr(str1, str2); 3544 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3545 if (ae == StrIntrinsicNode::LL) { 3546 scale = Address::times_1; 3547 stride = 16; 3548 } else { 3549 scale = Address::times_2; 3550 stride = 8; 3551 } 3552 } else { 3553 scale1 = Address::times_1; 3554 scale2 = Address::times_2; 3555 // scale not used 3556 stride = 8; 3557 } 3558 3559 if (UseAVX >= 2 && UseSSE42Intrinsics) { 3560 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR; 3561 Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR; 3562 Label COMPARE_WIDE_VECTORS_LOOP_AVX2; 3563 Label COMPARE_TAIL_LONG; 3564 Label COMPARE_WIDE_VECTORS_LOOP_AVX3; // used only _LP64 && AVX3 3565 3566 int pcmpmask = 0x19; 3567 if (ae == StrIntrinsicNode::LL) { 3568 pcmpmask &= ~0x01; 3569 } 3570 3571 // Setup to compare 16-chars (32-bytes) vectors, 3572 // start from first character again because it has aligned address. 3573 if (ae == StrIntrinsicNode::LL) { 3574 stride2 = 32; 3575 } else { 3576 stride2 = 16; 3577 } 3578 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3579 adr_stride = stride << scale; 3580 } else { 3581 adr_stride1 = 8; //stride << scale1; 3582 adr_stride2 = 16; //stride << scale2; 3583 } 3584 3585 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3586 // rax and rdx are used by pcmpestri as elements counters 3587 movl(result, cnt2); 3588 andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count 3589 jcc(Assembler::zero, COMPARE_TAIL_LONG); 3590 3591 // fast path : compare first 2 8-char vectors. 
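// A hedged decode of the pcmpestri control byte used in this block
// (pcmpmask = 0x19, with bit 0 cleared for LL), following the SSE4.2 imm8
// encoding:
//
//   bits [1:0] = 01  unsigned words (00 = unsigned bytes for LL)
//   bits [3:2] = 10  "equal each", i.e. element-wise string compare
//   bits [5:4] = 01  negative polarity, mismatches become 1-bits
//   bit  [6]   = 0   rcx receives the least significant set bit index
//
// so after each pcmpestri, CF==1 means some element mismatched and rcx holds
// the index of the first mismatch.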
3592 bind(COMPARE_16_CHARS); 3593 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3594 movdqu(vec1, Address(str1, 0)); 3595 } else { 3596 pmovzxbw(vec1, Address(str1, 0)); 3597 } 3598 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3599 jccb(Assembler::below, COMPARE_INDEX_CHAR); 3600 3601 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3602 movdqu(vec1, Address(str1, adr_stride)); 3603 pcmpestri(vec1, Address(str2, adr_stride), pcmpmask); 3604 } else { 3605 pmovzxbw(vec1, Address(str1, adr_stride1)); 3606 pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask); 3607 } 3608 jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS); 3609 addl(cnt1, stride); 3610 3611 // Compare the characters at index in cnt1 3612 bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character 3613 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 3614 subl(result, cnt2); 3615 jmp(POP_LABEL); 3616 3617 // Setup the registers to start vector comparison loop 3618 bind(COMPARE_WIDE_VECTORS); 3619 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3620 lea(str1, Address(str1, result, scale)); 3621 lea(str2, Address(str2, result, scale)); 3622 } else { 3623 lea(str1, Address(str1, result, scale1)); 3624 lea(str2, Address(str2, result, scale2)); 3625 } 3626 subl(result, stride2); 3627 subl(cnt2, stride2); 3628 jcc(Assembler::zero, COMPARE_WIDE_TAIL); 3629 negptr(result); 3630 3631 // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest) 3632 bind(COMPARE_WIDE_VECTORS_LOOP); 3633 3634 #ifdef _LP64 3635 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 3636 cmpl(cnt2, stride2x2); 3637 jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2); 3638 testl(cnt2, stride2x2-1); // cnt2 holds the vector count 3639 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2); // means we cannot subtract by 0x40 3640 3641 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 3642 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3643 evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit); 3644 evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3645 } else { 3646 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit); 3647 evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3648 } 3649 kortestql(mask, mask); 3650 jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED); // miscompare 3651 addptr(result, stride2x2); // update since we already compared at this addr 3652 subl(cnt2, stride2x2); // and sub the size too 3653 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3); 3654 3655 vpxor(vec1, vec1); 3656 jmpb(COMPARE_WIDE_TAIL); 3657 }//if (VM_Version::supports_avx512vlbw()) 3658 #endif // _LP64 3659 3660 3661 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3662 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3663 vmovdqu(vec1, Address(str1, result, scale)); 3664 vpxor(vec1, Address(str2, result, scale)); 3665 } else { 3666 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit); 3667 vpxor(vec1, Address(str2, result, scale2)); 3668 } 3669 vptest(vec1, vec1); 3670 jcc(Assembler::notZero, VECTOR_NOT_EQUAL); 3671 addptr(result, stride2); 3672 subl(cnt2, stride2); 3673 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP); 3674 // clean upper bits of YMM registers 
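// Note on the vpxor below: rewriting the register zeroes its upper YMM bits
// as well, leaving the register clean before any following SSE-only code.
// Mixing legacy SSE instructions with dirty YMM upper halves can incur
// AVX/SSE transition stalls on some Intel microarchitectures, so this acts
// as a per-register vzeroupper (a hedged reading of the intent).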
3675 vpxor(vec1, vec1);
3676
3677 // compare wide vectors tail
3678 bind(COMPARE_WIDE_TAIL);
3679 testptr(result, result);
3680 jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3681
3682 movl(result, stride2);
3683 movl(cnt2, result);
3684 negptr(result);
3685 jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
3686
3687 // Identifies the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
3688 bind(VECTOR_NOT_EQUAL);
3689 // clean upper bits of YMM registers
3690 vpxor(vec1, vec1);
3691 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3692 lea(str1, Address(str1, result, scale));
3693 lea(str2, Address(str2, result, scale));
3694 } else {
3695 lea(str1, Address(str1, result, scale1));
3696 lea(str2, Address(str2, result, scale2));
3697 }
3698 jmp(COMPARE_16_CHARS);
3699
3700 // Compare tail chars, length between 1 and 15 chars
3701 bind(COMPARE_TAIL_LONG);
3702 movl(cnt2, result);
3703 cmpl(cnt2, stride);
3704 jcc(Assembler::less, COMPARE_SMALL_STR);
3705
3706 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3707 movdqu(vec1, Address(str1, 0));
3708 } else {
3709 pmovzxbw(vec1, Address(str1, 0));
3710 }
3711 pcmpestri(vec1, Address(str2, 0), pcmpmask);
3712 jcc(Assembler::below, COMPARE_INDEX_CHAR);
3713 subptr(cnt2, stride);
3714 jcc(Assembler::zero, LENGTH_DIFF_LABEL);
3715 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3716 lea(str1, Address(str1, result, scale));
3717 lea(str2, Address(str2, result, scale));
3718 } else {
3719 lea(str1, Address(str1, result, scale1));
3720 lea(str2, Address(str2, result, scale2));
3721 }
3722 negptr(cnt2);
3723 jmpb(WHILE_HEAD_LABEL);
3724
3725 bind(COMPARE_SMALL_STR);
3726 } else if (UseSSE42Intrinsics) {
3727 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
3728 int pcmpmask = 0x19;
3729 // Setup to compare 8-char (16-byte) vectors,
3730 // start from first character again because it has aligned address.
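// The SSE4.2 block below uses the negative-index loop idiom common in this
// file: both pointers are advanced past the region to compare and the index
// climbs from -count toward zero, so one add serves as both induction step
// and exit test. The shape, as an illustrative C-style sketch:
//
//   p1 += n; p2 += n;
//   for (long i = -n; i != 0; i += stride) {
//     compare_vector(p1 + i, p2 + i);   // hypothetical helper
//   }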
3731 movl(result, cnt2); 3732 andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count 3733 if (ae == StrIntrinsicNode::LL) { 3734 pcmpmask &= ~0x01; 3735 } 3736 jcc(Assembler::zero, COMPARE_TAIL); 3737 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3738 lea(str1, Address(str1, result, scale)); 3739 lea(str2, Address(str2, result, scale)); 3740 } else { 3741 lea(str1, Address(str1, result, scale1)); 3742 lea(str2, Address(str2, result, scale2)); 3743 } 3744 negptr(result); 3745 3746 // pcmpestri 3747 // inputs: 3748 // vec1- substring 3749 // rax - negative string length (elements count) 3750 // mem - scanned string 3751 // rdx - string length (elements count) 3752 // pcmpmask - cmp mode: 11000 (string compare with negated result) 3753 // + 00 (unsigned bytes) or + 01 (unsigned shorts) 3754 // outputs: 3755 // rcx - first mismatched element index 3756 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3757 3758 bind(COMPARE_WIDE_VECTORS); 3759 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3760 movdqu(vec1, Address(str1, result, scale)); 3761 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 3762 } else { 3763 pmovzxbw(vec1, Address(str1, result, scale1)); 3764 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 3765 } 3766 // After pcmpestri cnt1(rcx) contains mismatched element index 3767 3768 jccb(Assembler::below, VECTOR_NOT_EQUAL); // CF==1 3769 addptr(result, stride); 3770 subptr(cnt2, stride); 3771 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 3772 3773 // compare wide vectors tail 3774 testptr(result, result); 3775 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3776 3777 movl(cnt2, stride); 3778 movl(result, stride); 3779 negptr(result); 3780 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3781 movdqu(vec1, Address(str1, result, scale)); 3782 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 3783 } else { 3784 pmovzxbw(vec1, Address(str1, result, scale1)); 3785 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 3786 } 3787 jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL); 3788 3789 // Mismatched characters in the vectors 3790 bind(VECTOR_NOT_EQUAL); 3791 addptr(cnt1, result); 3792 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 3793 subl(result, cnt2); 3794 jmpb(POP_LABEL); 3795 3796 bind(COMPARE_TAIL); // limit is zero 3797 movl(cnt2, result); 3798 // Fallthru to tail compare 3799 } 3800 // Shift str2 and str1 to the end of the arrays, negate min 3801 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3802 lea(str1, Address(str1, cnt2, scale)); 3803 lea(str2, Address(str2, cnt2, scale)); 3804 } else { 3805 lea(str1, Address(str1, cnt2, scale1)); 3806 lea(str2, Address(str2, cnt2, scale2)); 3807 } 3808 decrementl(cnt2); // first character was compared already 3809 negptr(cnt2); 3810 3811 // Compare the rest of the elements 3812 bind(WHILE_HEAD_LABEL); 3813 load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae); 3814 subl(result, cnt1); 3815 jccb(Assembler::notZero, POP_LABEL); 3816 increment(cnt2); 3817 jccb(Assembler::notZero, WHILE_HEAD_LABEL); 3818 3819 // Strings are equal up to min length. Return the length difference. 
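// The value popped at LENGTH_DIFF_LABEL below is the length difference
// pushed on entry (cnt1 - cnt2). In Java-like terms the routine computes,
// roughly (an illustrative sketch; mismatchIndex/elemAt are hypothetical
// helpers, and for UU the stored difference is halved to count chars):
//
//   int k = mismatchIndex(s1, s2, min(len1, len2));
//   return (k >= 0) ? elemAt(s1, k) - elemAt(s2, k) : len1 - len2;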
3820 bind(LENGTH_DIFF_LABEL);
3821 pop(result);
3822 if (ae == StrIntrinsicNode::UU) {
3823 // Divide diff by 2 to get number of chars
3824 sarl(result, 1);
3825 }
3826 jmpb(DONE_LABEL);
3827
3828 #ifdef _LP64
3829 if (VM_Version::supports_avx512vlbw()) {
3830
3831 bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
3832
3833 kmovql(cnt1, mask);
3834 notq(cnt1);
3835 bsfq(cnt2, cnt1);
3836 if (ae != StrIntrinsicNode::LL) {
3837 // Divide diff by 2 to get number of chars
3838 sarl(cnt2, 1);
3839 }
3840 addq(result, cnt2);
3841 if (ae == StrIntrinsicNode::LL) {
3842 load_unsigned_byte(cnt1, Address(str2, result));
3843 load_unsigned_byte(result, Address(str1, result));
3844 } else if (ae == StrIntrinsicNode::UU) {
3845 load_unsigned_short(cnt1, Address(str2, result, scale));
3846 load_unsigned_short(result, Address(str1, result, scale));
3847 } else {
3848 load_unsigned_short(cnt1, Address(str2, result, scale2));
3849 load_unsigned_byte(result, Address(str1, result, scale1));
3850 }
3851 subl(result, cnt1);
3852 jmpb(POP_LABEL);
3853 }//if (VM_Version::supports_avx512vlbw())
3854 #endif // _LP64
3855
3856 // Discard the stored length difference
3857 bind(POP_LABEL);
3858 pop(cnt1);
3859
3860 // That's it
3861 bind(DONE_LABEL);
3862 if(ae == StrIntrinsicNode::UL) {
3863 negl(result);
3864 }
3865
3866 }
3867
3868 // Search for non-ASCII character (negative byte value) in a byte array,
3869 // return the index of the first such character, otherwise the length
3870 // of the array segment searched.
3871 // ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
3872 // @IntrinsicCandidate
3873 // public static int countPositives(byte[] ba, int off, int len) {
3874 // for (int i = off; i < off + len; i++) {
3875 // if (ba[i] < 0) {
3876 // return i - off;
3877 // }
3878 // }
3879 // return len;
3880 // }
3881 void C2_MacroAssembler::count_positives(Register ary1, Register len,
3882 Register result, Register tmp1,
3883 XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) {
3884 // rsi: byte array
3885 // rcx: len
3886 // rax: result
3887 ShortBranchVerifier sbv(this);
3888 assert_different_registers(ary1, len, result, tmp1);
3889 assert_different_registers(vec1, vec2);
3890 Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
3891
3892 movl(result, len); // copy
3893 // len == 0
3894 testl(len, len);
3895 jcc(Assembler::zero, DONE);
3896
3897 if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
3898 VM_Version::supports_avx512vlbw() &&
3899 VM_Version::supports_bmi2()) {
3900
3901 Label test_64_loop, test_tail, BREAK_LOOP;
3902 movl(tmp1, len);
3903 vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
3904
3905 andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F
3906 andl(len, 0xffffffc0); // vector count (in chars)
3907 jccb(Assembler::zero, test_tail);
3908
3909 lea(ary1, Address(ary1, len, Address::times_1));
3910 negptr(len);
3911
3912 bind(test_64_loop);
3913 // Check whether our 64 byte-sized elements contain negatives
3914 evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
3915 kortestql(mask1, mask1);
3916 jcc(Assembler::notZero, BREAK_LOOP);
3917
3918 addptr(len, 64);
3919 jccb(Assembler::notZero, test_64_loop);
3920
3921 bind(test_tail);
3922 // bail out when there is nothing to be done
3923 testl(tmp1, -1);
3924 jcc(Assembler::zero, DONE);
3925
3926
3927 // check the tail for absence of negatives
3928 // ~(~0 << len) applied up to two times (for 32-bit scenario)
3929 #ifdef _LP64
3930 {
3931
Register tmp3_aliased = len;
3932 mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
3933 shlxq(tmp3_aliased, tmp3_aliased, tmp1);
3934 notq(tmp3_aliased);
3935 kmovql(mask2, tmp3_aliased);
3936 }
3937 #else
3938 Label k_init;
3939 jmp(k_init);
3940
3941 // We cannot read 64 bits from a general purpose register, thus we move
3942 // the data required to compose 64 1's to the instruction stream.
3943 // We emit a 64-byte wide series of elements from 0..63 which later on will
3944 // be used as compare targets with the tail count contained in the tmp1 register.
3945 // The result will be a k register having tmp1 consecutive 1's,
3946 // counting from the least significant bit.
3947 address tmp = pc();
3948 emit_int64(0x0706050403020100);
3949 emit_int64(0x0F0E0D0C0B0A0908);
3950 emit_int64(0x1716151413121110);
3951 emit_int64(0x1F1E1D1C1B1A1918);
3952 emit_int64(0x2726252423222120);
3953 emit_int64(0x2F2E2D2C2B2A2928);
3954 emit_int64(0x3736353433323130);
3955 emit_int64(0x3F3E3D3C3B3A3938);
3956
3957 bind(k_init);
3958 lea(len, InternalAddress(tmp));
3959 // create mask to test for negative byte inside a vector
3960 evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
3961 evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit);
3962
3963 #endif
3964 evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
3965 ktestq(mask1, mask2);
3966 jcc(Assembler::zero, DONE);
3967
3968 // do a full check for negative bytes in the tail
3969 movl(len, tmp1); // tmp1 holds the low 6 bits of the original len;
3970 // ary1 already pointing to the right place
3971 jmpb(TAIL_START);
3972
3973 bind(BREAK_LOOP);
3974 // At least one byte in the last 64 byte block was negative.
3975 // Set up to look at the last 64 bytes as if they were a tail
3976 lea(ary1, Address(ary1, len, Address::times_1));
3977 addptr(result, len);
3978 // Ignore the very last byte: if all others are positive,
3979 // it must be negative, so we can skip right to the 2+1 byte
3980 // end comparison at this point
3981 orl(result, 63);
3982 movl(len, 63);
3983 // Fallthru to tail compare
3984 } else {
3985
3986 if (UseAVX >= 2 && UseSSE >= 2) {
3987 // With AVX2, use 32-byte vector compare
3988 Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
3989
3990 // Compare 32-byte vectors
3991 testl(len, 0xffffffe0); // vector count (in bytes)
3992 jccb(Assembler::zero, TAIL_START);
3993
3994 andl(len, 0xffffffe0);
3995 lea(ary1, Address(ary1, len, Address::times_1));
3996 negptr(len);
3997
3998 movl(tmp1, 0x80808080); // create mask to test for Unicode chars in vector
3999 movdl(vec2, tmp1);
4000 vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
4001
4002 bind(COMPARE_WIDE_VECTORS);
4003 vmovdqu(vec1, Address(ary1, len, Address::times_1));
4004 vptest(vec1, vec2);
4005 jccb(Assembler::notZero, BREAK_LOOP);
4006 addptr(len, 32);
4007 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
4008
4009 testl(result, 0x0000001f); // any bytes remaining?
4010 jcc(Assembler::zero, DONE);
4011
4012 // Quick test using the already prepared vector mask
4013 movl(len, result);
4014 andl(len, 0x0000001f);
4015 vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
4016 vptest(vec1, vec2);
4017 jcc(Assembler::zero, DONE);
4018 // There are negative bytes, jump to the tail to determine exactly where
4019 jmpb(TAIL_START);
4020
4021 bind(BREAK_LOOP);
4022 // At least one byte in the last 32-byte vector is negative.
4023 // Set up to look at the last 32 bytes as if they were a tail 4024 lea(ary1, Address(ary1, len, Address::times_1)); 4025 addptr(result, len); 4026 // Ignore the very last byte: if all others are positive, 4027 // it must be negative, so we can skip right to the 2+1 byte 4028 // end comparison at this point 4029 orl(result, 31); 4030 movl(len, 31); 4031 // Fallthru to tail compare 4032 } else if (UseSSE42Intrinsics) { 4033 // With SSE4.2, use double quad vector compare 4034 Label COMPARE_WIDE_VECTORS, BREAK_LOOP; 4035 4036 // Compare 16-byte vectors 4037 testl(len, 0xfffffff0); // vector count (in bytes) 4038 jcc(Assembler::zero, TAIL_START); 4039 4040 andl(len, 0xfffffff0); 4041 lea(ary1, Address(ary1, len, Address::times_1)); 4042 negptr(len); 4043 4044 movl(tmp1, 0x80808080); 4045 movdl(vec2, tmp1); 4046 pshufd(vec2, vec2, 0); 4047 4048 bind(COMPARE_WIDE_VECTORS); 4049 movdqu(vec1, Address(ary1, len, Address::times_1)); 4050 ptest(vec1, vec2); 4051 jccb(Assembler::notZero, BREAK_LOOP); 4052 addptr(len, 16); 4053 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 4054 4055 testl(result, 0x0000000f); // len is zero, any bytes remaining? 4056 jcc(Assembler::zero, DONE); 4057 4058 // Quick test using the already prepared vector mask 4059 movl(len, result); 4060 andl(len, 0x0000000f); // tail count (in bytes) 4061 movdqu(vec1, Address(ary1, len, Address::times_1, -16)); 4062 ptest(vec1, vec2); 4063 jcc(Assembler::zero, DONE); 4064 jmpb(TAIL_START); 4065 4066 bind(BREAK_LOOP); 4067 // At least one byte in the last 16-byte vector is negative. 4068 // Set up and look at the last 16 bytes as if they were a tail 4069 lea(ary1, Address(ary1, len, Address::times_1)); 4070 addptr(result, len); 4071 // Ignore the very last byte: if all others are positive, 4072 // it must be negative, so we can skip right to the 2+1 byte 4073 // end comparison at this point 4074 orl(result, 15); 4075 movl(len, 15); 4076 // Fallthru to tail compare 4077 } 4078 } 4079 4080 bind(TAIL_START); 4081 // Compare 4-byte vectors 4082 andl(len, 0xfffffffc); // vector count (in bytes) 4083 jccb(Assembler::zero, COMPARE_CHAR); 4084 4085 lea(ary1, Address(ary1, len, Address::times_1)); 4086 negptr(len); 4087 4088 bind(COMPARE_VECTORS); 4089 movl(tmp1, Address(ary1, len, Address::times_1)); 4090 andl(tmp1, 0x80808080); 4091 jccb(Assembler::notZero, TAIL_ADJUST); 4092 addptr(len, 4); 4093 jccb(Assembler::notZero, COMPARE_VECTORS); 4094 4095 // Compare trailing char (final 2-3 bytes), if any 4096 bind(COMPARE_CHAR); 4097 4098 testl(result, 0x2); // tail char 4099 jccb(Assembler::zero, COMPARE_BYTE); 4100 load_unsigned_short(tmp1, Address(ary1, 0)); 4101 andl(tmp1, 0x00008080); 4102 jccb(Assembler::notZero, CHAR_ADJUST); 4103 lea(ary1, Address(ary1, 2)); 4104 4105 bind(COMPARE_BYTE); 4106 testl(result, 0x1); // tail byte 4107 jccb(Assembler::zero, DONE); 4108 load_unsigned_byte(tmp1, Address(ary1, 0)); 4109 testl(tmp1, 0x00000080); 4110 jccb(Assembler::zero, DONE); 4111 subptr(result, 1); 4112 jmpb(DONE); 4113 4114 bind(TAIL_ADJUST); 4115 // there are negative bits in the last 4 byte block. 4116 // Adjust result and check the next three bytes 4117 addptr(result, len); 4118 orl(result, 3); 4119 lea(ary1, Address(ary1, len, Address::times_1)); 4120 jmpb(COMPARE_CHAR); 4121 4122 bind(CHAR_ADJUST); 4123 // We are looking at a char + optional byte tail, and found that one 4124 // of the bytes in the char is negative. Adjust the result, check the 4125 // first byte and readjust if needed. 
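// tmp1 still holds the 16-bit load from the char tail, and x86 is
// little-endian, so the low byte of tmp1 is the earlier array element.
// A worked example (illustrative): bytes {0x41, 0x80} load as
// tmp1 == 0x8041; bit 7 of the low byte is clear, so the negative byte is
// the second one, and after rounding 'result' down to the 4-byte boundary
// the index is readjusted by +1.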
4126 andl(result, 0xfffffffc); 4127 testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first 4128 jccb(Assembler::notZero, DONE); 4129 addptr(result, 1); 4130 4131 // That's it 4132 bind(DONE); 4133 if (UseAVX >= 2 && UseSSE >= 2) { 4134 // clean upper bits of YMM registers 4135 vpxor(vec1, vec1); 4136 vpxor(vec2, vec2); 4137 } 4138 } 4139 4140 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings. 4141 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2, 4142 Register limit, Register result, Register chr, 4143 XMMRegister vec1, XMMRegister vec2, bool is_char, KRegister mask) { 4144 ShortBranchVerifier sbv(this); 4145 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE; 4146 4147 int length_offset = arrayOopDesc::length_offset_in_bytes(); 4148 int base_offset = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE); 4149 4150 if (is_array_equ) { 4151 // Check the input args 4152 cmpoop(ary1, ary2); 4153 jcc(Assembler::equal, TRUE_LABEL); 4154 4155 // Need additional checks for arrays_equals. 4156 testptr(ary1, ary1); 4157 jcc(Assembler::zero, FALSE_LABEL); 4158 testptr(ary2, ary2); 4159 jcc(Assembler::zero, FALSE_LABEL); 4160 4161 // Check the lengths 4162 movl(limit, Address(ary1, length_offset)); 4163 cmpl(limit, Address(ary2, length_offset)); 4164 jcc(Assembler::notEqual, FALSE_LABEL); 4165 } 4166 4167 // count == 0 4168 testl(limit, limit); 4169 jcc(Assembler::zero, TRUE_LABEL); 4170 4171 if (is_array_equ) { 4172 // Load array address 4173 lea(ary1, Address(ary1, base_offset)); 4174 lea(ary2, Address(ary2, base_offset)); 4175 } 4176 4177 if (is_array_equ && is_char) { 4178 // arrays_equals when used for char[]. 4179 shll(limit, 1); // byte count != 0 4180 } 4181 movl(result, limit); // copy 4182 4183 if (UseAVX >= 2) { 4184 // With AVX2, use 32-byte vector compare 4185 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 4186 4187 // Compare 32-byte vectors 4188 andl(result, 0x0000001f); // tail count (in bytes) 4189 andl(limit, 0xffffffe0); // vector count (in bytes) 4190 jcc(Assembler::zero, COMPARE_TAIL); 4191 4192 lea(ary1, Address(ary1, limit, Address::times_1)); 4193 lea(ary2, Address(ary2, limit, Address::times_1)); 4194 negptr(limit); 4195 4196 #ifdef _LP64 4197 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 4198 Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3; 4199 4200 cmpl(limit, -64); 4201 jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2); 4202 4203 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 4204 4205 evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit); 4206 evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit); 4207 kortestql(mask, mask); 4208 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4209 addptr(limit, 64); // update since we already compared at this addr 4210 cmpl(limit, -64); 4211 jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3); 4212 4213 // At this point we may still need to compare -limit+result bytes. 4214 // We could execute the next two instruction and just continue via non-wide path: 4215 // cmpl(limit, 0); 4216 // jcc(Assembler::equal, COMPARE_TAIL); // true 4217 // But since we stopped at the points ary{1,2}+limit which are 4218 // not farther than 64 bytes from the ends of arrays ary{1,2}+result 4219 // (|limit| <= 32 and result < 32), 4220 // we may just compare the last 64 bytes. 
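// Overlapping tail windows are safe for a pure equality check: re-comparing
// bytes the vector loop already proved equal cannot change the verdict.
// The shape of the trick, as a hedged C-style sketch:
//
//   // fewer than 64 bytes remain after the 64-byte loop
//   return memcmp(a + n - 64, b + n - 64, 64) == 0;   // re-reads a few bytes
//
// trading a scalar tail loop for one more full-width compare.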
4221 // 4222 addptr(result, -64); // it is safe, bc we just came from this area 4223 evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit); 4224 evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit); 4225 kortestql(mask, mask); 4226 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4227 4228 jmp(TRUE_LABEL); 4229 4230 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 4231 4232 }//if (VM_Version::supports_avx512vlbw()) 4233 #endif //_LP64 4234 bind(COMPARE_WIDE_VECTORS); 4235 vmovdqu(vec1, Address(ary1, limit, Address::times_1)); 4236 vmovdqu(vec2, Address(ary2, limit, Address::times_1)); 4237 vpxor(vec1, vec2); 4238 4239 vptest(vec1, vec1); 4240 jcc(Assembler::notZero, FALSE_LABEL); 4241 addptr(limit, 32); 4242 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4243 4244 testl(result, result); 4245 jcc(Assembler::zero, TRUE_LABEL); 4246 4247 vmovdqu(vec1, Address(ary1, result, Address::times_1, -32)); 4248 vmovdqu(vec2, Address(ary2, result, Address::times_1, -32)); 4249 vpxor(vec1, vec2); 4250 4251 vptest(vec1, vec1); 4252 jccb(Assembler::notZero, FALSE_LABEL); 4253 jmpb(TRUE_LABEL); 4254 4255 bind(COMPARE_TAIL); // limit is zero 4256 movl(limit, result); 4257 // Fallthru to tail compare 4258 } else if (UseSSE42Intrinsics) { 4259 // With SSE4.2, use double quad vector compare 4260 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 4261 4262 // Compare 16-byte vectors 4263 andl(result, 0x0000000f); // tail count (in bytes) 4264 andl(limit, 0xfffffff0); // vector count (in bytes) 4265 jcc(Assembler::zero, COMPARE_TAIL); 4266 4267 lea(ary1, Address(ary1, limit, Address::times_1)); 4268 lea(ary2, Address(ary2, limit, Address::times_1)); 4269 negptr(limit); 4270 4271 bind(COMPARE_WIDE_VECTORS); 4272 movdqu(vec1, Address(ary1, limit, Address::times_1)); 4273 movdqu(vec2, Address(ary2, limit, Address::times_1)); 4274 pxor(vec1, vec2); 4275 4276 ptest(vec1, vec1); 4277 jcc(Assembler::notZero, FALSE_LABEL); 4278 addptr(limit, 16); 4279 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4280 4281 testl(result, result); 4282 jcc(Assembler::zero, TRUE_LABEL); 4283 4284 movdqu(vec1, Address(ary1, result, Address::times_1, -16)); 4285 movdqu(vec2, Address(ary2, result, Address::times_1, -16)); 4286 pxor(vec1, vec2); 4287 4288 ptest(vec1, vec1); 4289 jccb(Assembler::notZero, FALSE_LABEL); 4290 jmpb(TRUE_LABEL); 4291 4292 bind(COMPARE_TAIL); // limit is zero 4293 movl(limit, result); 4294 // Fallthru to tail compare 4295 } 4296 4297 // Compare 4-byte vectors 4298 andl(limit, 0xfffffffc); // vector count (in bytes) 4299 jccb(Assembler::zero, COMPARE_CHAR); 4300 4301 lea(ary1, Address(ary1, limit, Address::times_1)); 4302 lea(ary2, Address(ary2, limit, Address::times_1)); 4303 negptr(limit); 4304 4305 bind(COMPARE_VECTORS); 4306 movl(chr, Address(ary1, limit, Address::times_1)); 4307 cmpl(chr, Address(ary2, limit, Address::times_1)); 4308 jccb(Assembler::notEqual, FALSE_LABEL); 4309 addptr(limit, 4); 4310 jcc(Assembler::notZero, COMPARE_VECTORS); 4311 4312 // Compare trailing char (final 2 bytes), if any 4313 bind(COMPARE_CHAR); 4314 testl(result, 0x2); // tail char 4315 jccb(Assembler::zero, COMPARE_BYTE); 4316 load_unsigned_short(chr, Address(ary1, 0)); 4317 load_unsigned_short(limit, Address(ary2, 0)); 4318 cmpl(chr, limit); 4319 jccb(Assembler::notEqual, FALSE_LABEL); 4320 4321 if (is_array_equ && is_char) { 4322 bind(COMPARE_BYTE); 4323 } else { 4324 lea(ary1, Address(ary1, 2)); 4325 lea(ary2, Address(ary2, 2)); 4326 4327 bind(COMPARE_BYTE); 4328 testl(result, 0x1); 
// tail byte 4329 jccb(Assembler::zero, TRUE_LABEL); 4330 load_unsigned_byte(chr, Address(ary1, 0)); 4331 load_unsigned_byte(limit, Address(ary2, 0)); 4332 cmpl(chr, limit); 4333 jccb(Assembler::notEqual, FALSE_LABEL); 4334 } 4335 bind(TRUE_LABEL); 4336 movl(result, 1); // return true 4337 jmpb(DONE); 4338 4339 bind(FALSE_LABEL); 4340 xorl(result, result); // return false 4341 4342 // That's it 4343 bind(DONE); 4344 if (UseAVX >= 2) { 4345 // clean upper bits of YMM registers 4346 vpxor(vec1, vec1); 4347 vpxor(vec2, vec2); 4348 } 4349 } 4350 4351 #ifdef _LP64 4352 4353 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) { 4354 #define __ masm. 4355 Register dst = stub.data<0>(); 4356 XMMRegister src = stub.data<1>(); 4357 address target = stub.data<2>(); 4358 __ bind(stub.entry()); 4359 __ subptr(rsp, 8); 4360 __ movdbl(Address(rsp), src); 4361 __ call(RuntimeAddress(target)); 4362 __ pop(dst); 4363 __ jmp(stub.continuation()); 4364 #undef __ 4365 } 4366 4367 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) { 4368 assert(dst_bt == T_INT || dst_bt == T_LONG, ""); 4369 assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, ""); 4370 4371 address slowpath_target; 4372 if (dst_bt == T_INT) { 4373 if (src_bt == T_FLOAT) { 4374 cvttss2sil(dst, src); 4375 cmpl(dst, 0x80000000); 4376 slowpath_target = StubRoutines::x86::f2i_fixup(); 4377 } else { 4378 cvttsd2sil(dst, src); 4379 cmpl(dst, 0x80000000); 4380 slowpath_target = StubRoutines::x86::d2i_fixup(); 4381 } 4382 } else { 4383 if (src_bt == T_FLOAT) { 4384 cvttss2siq(dst, src); 4385 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip())); 4386 slowpath_target = StubRoutines::x86::f2l_fixup(); 4387 } else { 4388 cvttsd2siq(dst, src); 4389 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip())); 4390 slowpath_target = StubRoutines::x86::d2l_fixup(); 4391 } 4392 } 4393 4394 auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, 23, convertF2I_slowpath); 4395 jcc(Assembler::equal, stub->entry()); 4396 bind(stub->continuation()); 4397 } 4398 4399 #endif // _LP64 4400 4401 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4402 XMMRegister src1, int imm8, bool merge, int vlen_enc) { 4403 switch(ideal_opc) { 4404 case Op_LShiftVS: 4405 Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break; 4406 case Op_LShiftVI: 4407 Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break; 4408 case Op_LShiftVL: 4409 Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break; 4410 case Op_RShiftVS: 4411 Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break; 4412 case Op_RShiftVI: 4413 Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break; 4414 case Op_RShiftVL: 4415 Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break; 4416 case Op_URShiftVS: 4417 Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break; 4418 case Op_URShiftVI: 4419 Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break; 4420 case Op_URShiftVL: 4421 Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break; 4422 case Op_RotateRightV: 4423 evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4424 case Op_RotateLeftV: 4425 evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4426 default: 4427 fatal("Unsupported masked operation"); break; 4428 } 4429 } 4430 4431 void 
C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4432 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc, 4433 bool is_varshift) { 4434 switch (ideal_opc) { 4435 case Op_AddVB: 4436 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4437 case Op_AddVS: 4438 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4439 case Op_AddVI: 4440 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4441 case Op_AddVL: 4442 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4443 case Op_AddVF: 4444 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4445 case Op_AddVD: 4446 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4447 case Op_SubVB: 4448 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4449 case Op_SubVS: 4450 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4451 case Op_SubVI: 4452 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4453 case Op_SubVL: 4454 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4455 case Op_SubVF: 4456 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4457 case Op_SubVD: 4458 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4459 case Op_MulVS: 4460 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4461 case Op_MulVI: 4462 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4463 case Op_MulVL: 4464 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4465 case Op_MulVF: 4466 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4467 case Op_MulVD: 4468 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4469 case Op_DivVF: 4470 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4471 case Op_DivVD: 4472 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4473 case Op_SqrtVF: 4474 evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break; 4475 case Op_SqrtVD: 4476 evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break; 4477 case Op_AbsVB: 4478 evpabsb(dst, mask, src2, merge, vlen_enc); break; 4479 case Op_AbsVS: 4480 evpabsw(dst, mask, src2, merge, vlen_enc); break; 4481 case Op_AbsVI: 4482 evpabsd(dst, mask, src2, merge, vlen_enc); break; 4483 case Op_AbsVL: 4484 evpabsq(dst, mask, src2, merge, vlen_enc); break; 4485 case Op_FmaVF: 4486 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4487 case Op_FmaVD: 4488 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4489 case Op_VectorRearrange: 4490 evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break; 4491 case Op_LShiftVS: 4492 evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4493 case Op_LShiftVI: 4494 evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4495 case Op_LShiftVL: 4496 evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4497 case Op_RShiftVS: 4498 evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4499 case Op_RShiftVI: 4500 evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4501 case Op_RShiftVL: 4502 evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4503 case Op_URShiftVS: 4504 evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4505 case Op_URShiftVI: 4506 evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4507 case Op_URShiftVL: 4508 evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4509 case Op_RotateLeftV: 4510 evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4511 case Op_RotateRightV: 4512 evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break; 
4513 case Op_MaxV: 4514 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4515 case Op_MinV: 4516 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4517 case Op_XorV: 4518 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4519 case Op_OrV: 4520 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4521 case Op_AndV: 4522 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4523 default: 4524 fatal("Unsupported masked operation"); break; 4525 } 4526 } 4527 4528 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4529 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4530 switch (ideal_opc) { 4531 case Op_AddVB: 4532 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4533 case Op_AddVS: 4534 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4535 case Op_AddVI: 4536 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4537 case Op_AddVL: 4538 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4539 case Op_AddVF: 4540 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4541 case Op_AddVD: 4542 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4543 case Op_SubVB: 4544 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4545 case Op_SubVS: 4546 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4547 case Op_SubVI: 4548 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4549 case Op_SubVL: 4550 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4551 case Op_SubVF: 4552 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4553 case Op_SubVD: 4554 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4555 case Op_MulVS: 4556 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4557 case Op_MulVI: 4558 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4559 case Op_MulVL: 4560 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4561 case Op_MulVF: 4562 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4563 case Op_MulVD: 4564 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4565 case Op_DivVF: 4566 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4567 case Op_DivVD: 4568 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4569 case Op_FmaVF: 4570 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4571 case Op_FmaVD: 4572 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4573 case Op_MaxV: 4574 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4575 case Op_MinV: 4576 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4577 case Op_XorV: 4578 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4579 case Op_OrV: 4580 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4581 case Op_AndV: 4582 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4583 default: 4584 fatal("Unsupported masked operation"); break; 4585 } 4586 } 4587 4588 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst, 4589 KRegister src1, KRegister src2) { 4590 BasicType etype = T_ILLEGAL; 4591 switch(mask_len) { 4592 case 2: 4593 case 4: 4594 case 8: etype = T_BYTE; break; 4595 case 16: etype = T_SHORT; break; 4596 case 32: etype = T_INT; break; 4597 case 64: etype = T_LONG; break; 4598 default: fatal("Unsupported type"); break; 4599 } 4600 assert(etype != T_ILLEGAL, ""); 4601 switch(ideal_opc) { 4602 case Op_AndVMask: 4603 kand(etype, dst, src1, src2); break; 4604 case Op_OrVMask: 4605 kor(etype, dst, src1, src2); break; 4606 case Op_XorVMask: 
4607 kxor(etype, dst, src1, src2); break;
4608 default:
4609 fatal("Unsupported masked operation"); break;
4610 }
4611 }
4612
4613 /*
4614 * The following routine handles special floating point values (NaN/Inf/-Inf/Max/Min) for the casting operation.
4615 * If src is NaN, the result is 0.
4616 * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE,
4617 * the result is equal to the value of Integer.MIN_VALUE.
4618 * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE,
4619 * the result is equal to the value of Integer.MAX_VALUE.
4620 */
4621 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4622 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
4623 Register rscratch, AddressLiteral float_sign_flip,
4624 int vec_enc) {
4625 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4626 Label done;
4627 vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch);
4628 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
4629 vptest(xtmp2, xtmp2, vec_enc);
4630 jccb(Assembler::equal, done);
4631
4632 vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc);
4633 vpxor(xtmp1, xtmp1, xtmp4, vec_enc);
4634
4635 vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
4636 vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc);
4637 vblendvps(dst, dst, xtmp4, xtmp3, vec_enc);
4638
4639 // Recompute the mask for remaining special value.
4640 vpxor(xtmp2, xtmp2, xtmp3, vec_enc);
4641 // Extract SRC values corresponding to TRUE mask lanes.
4642 vpand(xtmp4, xtmp2, src, vec_enc);
4643 // Flip mask bits so that the MSB of the MASK lanes corresponding to +ve special
4644 // values is set.
4645 vpxor(xtmp3, xtmp2, xtmp4, vec_enc);
4646
4647 vblendvps(dst, dst, xtmp1, xtmp3, vec_enc);
4648 bind(done);
4649 }
4650
4651 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4652 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4653 Register rscratch, AddressLiteral float_sign_flip,
4654 int vec_enc) {
4655 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4656 Label done;
4657 evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
4658 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
4659 kortestwl(ktmp1, ktmp1);
4660 jccb(Assembler::equal, done);
4661
4662 vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4663 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4664 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);
4665
4666 kxorwl(ktmp1, ktmp1, ktmp2);
4667 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4668 vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4669 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
4670 bind(done);
4671 }
4672
4673 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4674 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4675 Register rscratch, AddressLiteral double_sign_flip,
4676 int vec_enc) {
4677 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4678
4679 Label done;
4680 evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
4681 Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc);
4682 kortestwl(ktmp1, ktmp1);
4683 jccb(Assembler::equal, done);
4684
4685 vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4686 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4687 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
4688
4689 kxorwl(ktmp1, ktmp1, ktmp2);
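// Mask bookkeeping for the tail of these special-case routines, read as set
// algebra (a hedged gloss, not additional emitted code):
//
//   ktmp1 = special_lanes ^ nan_lanes    // == special AND NOT NaN,
//                                        // since NaN lanes are a subset
//
// The masked NLT_UQ compare that follows selects, among the remaining
// special lanes, those with src >= 0; vpternlog with imm 0x11 then builds
// ~sign_flip (the MAX_VALUE bit pattern) to store into those lanes.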

void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
                                                                     Register rscratch, AddressLiteral double_sign_flip,
                                                                     int vec_enc) {
  assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");

  Label done;
  evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
  Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc);
  kortestwl(ktmp1, ktmp1);
  jccb(Assembler::equal, done);

  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);

  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}

void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                     XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
                                                                     Register rscratch, AddressLiteral float_sign_flip,
                                                                     int vec_enc) {
  assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
  Label done;
  evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch);
  Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
  kortestwl(ktmp1, ktmp1);
  jccb(Assembler::equal, done);

  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdqul(dst, ktmp2, xtmp2, true, vec_enc);

  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  evmovdqul(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}

/*
 * Following routine handles special floating point values (NaN/Inf/-Inf/Max/Min)
 * for casting operation.
 * If src is NaN, the result is 0.
 * If the src is negative infinity or any value less than or equal to the value of
 * Long.MIN_VALUE, the result is equal to the value of Long.MIN_VALUE.
 * If the src is positive infinity or any value greater than or equal to the value of
 * Long.MAX_VALUE, the result is equal to the value of Long.MAX_VALUE.
 */
void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
                                                                      Register rscratch, AddressLiteral double_sign_flip,
                                                                      int vec_enc) {
  assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");

  Label done;
  evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
  evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
  kortestwl(ktmp1, ktmp1);
  jccb(Assembler::equal, done);

  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
  evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);

  kxorwl(ktmp1, ktmp1, ktmp2);
  evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
  vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
  evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
  bind(done);
}

void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
                                                             XMMRegister xtmp, int index, int vec_enc) {
  assert(vec_enc < Assembler::AVX_512bit, "");
  if (vec_enc == Assembler::AVX_256bit) {
    vextractf128_high(xtmp, src);
    vshufps(dst, src, xtmp, index, vec_enc);
  } else {
    vshufps(dst, src, zero, index, vec_enc);
  }
}
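
// Editorial note (assumed vshufps encoding): the index immediate selects two
// dwords from each source via 2 bit fields. The callers below pass 0x88
// (0b10'00'10'00), which packs the even-indexed dwords {0, 2} of each source,
// i.e. the low dword of every quadword lane; 0x44 (0b01'00'01'00) packs
// dwords {0, 1} instead.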

void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                                                    XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
                                                                    AddressLiteral float_sign_flip, int src_vec_enc) {
  assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");

  Label done;
  // Compare the destination lanes with float_sign_flip
  // value to get mask for all special values.
  movdqu(xtmp1, float_sign_flip, rscratch);
  vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
  ptest(xtmp2, xtmp2);
  jccb(Assembler::equal, done);

  // Flip float_sign_flip to get max integer value.
  vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
  pxor(xtmp1, xtmp4);

  // Set destination lanes corresponding to unordered source lanes as zero.
  vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
  vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);

  // Shuffle mask vector and pack the lower double word from each quadword lane.
  vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
  vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);

  // Recompute the mask for remaining special values.
  pxor(xtmp2, xtmp3);
  // Extract mask corresponding to non-negative source lanes.
  vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);

  // Shuffle mask vector and pack the lower double word from each quadword lane.
  vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
  pand(xtmp3, xtmp2);

  // Replace destination lanes holding special value (0x80000000) with max int
  // if corresponding source lane holds a +ve value.
  vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
  bind(done);
}

void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
                                                   XMMRegister xtmp, Register rscratch, int vec_enc) {
  switch(to_elem_bt) {
    case T_SHORT:
      assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
      vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
      vpackusdw(dst, dst, zero, vec_enc);
      if (vec_enc == Assembler::AVX_256bit) {
        vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
      }
      break;
    case T_BYTE:
      assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
      vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
      vpackusdw(dst, dst, zero, vec_enc);
      if (vec_enc == Assembler::AVX_256bit) {
        vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
      }
      vpackuswb(dst, dst, zero, vec_enc);
      break;
    default: assert(false, "%s", type2name(to_elem_bt));
  }
}
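
// Editorial note: vpackusdw narrows with unsigned saturation, which is why the
// routine above first masks each int lane (assumed mask contents 0x0000FFFF
// and 0x000000FF for the short and byte stub masks). Without the vpand, a lane
// such as 0xFFFF1234 would saturate to 0xFFFF instead of truncating to 0x1234.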

/*
 * Algorithm for vector D2L and F2I conversions:-
 * a) Perform vector D2L/F2I cast.
 * b) Choose fast path if none of the result vector lanes contains the 0x80000000 value.
 *    That value signifies that the source value was one of the special floating point
 *    values (NaN, -Inf, Inf, Max, -Min).
 * c) Set destination to zero if source is a NaN value.
 * d) Replace 0x80000000 with MaxInt if source lane contains a +ve value.
 */

void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                           XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
                                           AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
  int to_elem_sz = type2aelembytes(to_elem_bt);
  assert(to_elem_sz <= 4, "");
  vcvttps2dq(dst, src, vec_enc);
  vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc);
  if (to_elem_sz < 4) {
    vpxor(xtmp4, xtmp4, xtmp4, vec_enc);
    vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc);
  }
}

void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                            XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
                                            Register rscratch, int vec_enc) {
  int to_elem_sz = type2aelembytes(to_elem_bt);
  assert(to_elem_sz <= 4, "");
  vcvttps2dq(dst, src, vec_enc);
  vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc);
  switch(to_elem_bt) {
    case T_INT:
      break;
    case T_SHORT:
      evpmovdw(dst, dst, vec_enc);
      break;
    case T_BYTE:
      evpmovdb(dst, dst, vec_enc);
      break;
    default: assert(false, "%s", type2name(to_elem_bt));
  }
}

void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
                                            KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
                                            Register rscratch, int vec_enc) {
  evcvttps2qq(dst, src, vec_enc);
  vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc);
}

// Handling for downcasting from double to integer or sub-word types on AVX2.
void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                           XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5,
                                           AddressLiteral float_sign_flip, Register rscratch, int vec_enc) {
  int to_elem_sz = type2aelembytes(to_elem_bt);
  assert(to_elem_sz < 8, "");
  vcvttpd2dq(dst, src, vec_enc);
  vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch,
                                              float_sign_flip, vec_enc);
  if (to_elem_sz < 4) {
    // xtmp4 holds all zero lanes.
    vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit);
  }
}

void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src,
                                            XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1,
                                            KRegister ktmp2, AddressLiteral sign_flip,
                                            Register rscratch, int vec_enc) {
  if (VM_Version::supports_avx512dq()) {
    evcvttpd2qq(dst, src, vec_enc);
    vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
    switch(to_elem_bt) {
      case T_LONG:
        break;
      case T_INT:
        evpmovsqd(dst, dst, vec_enc);
        break;
      case T_SHORT:
        evpmovsqd(dst, dst, vec_enc);
        evpmovdw(dst, dst, vec_enc);
        break;
      case T_BYTE:
        evpmovsqd(dst, dst, vec_enc);
        evpmovdb(dst, dst, vec_enc);
        break;
      default: assert(false, "%s", type2name(to_elem_bt));
    }
  } else {
    assert(type2aelembytes(to_elem_bt) <= 4, "");
    vcvttpd2dq(dst, src, vec_enc);
    vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc);
    switch(to_elem_bt) {
      case T_INT:
        break;
      case T_SHORT:
        evpmovdw(dst, dst, vec_enc);
        break;
      case T_BYTE:
        evpmovdb(dst, dst, vec_enc);
        break;
      default: assert(false, "%s", type2name(to_elem_bt));
    }
  }
}

#ifdef _LP64
void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src,
                                                 AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
                                                 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
  // Perform the floor(val + 0.5) operation under the influence of MXCSR.RC mode round-towards -inf,
  // and re-instate the original MXCSR.RC mode after that.
  ldmxcsr(new_mxcsr, tmp /*rscratch*/);

  mov64(tmp, julong_cast(0.5L));
  evpbroadcastq(xtmp1, tmp, vec_enc);
  vaddpd(xtmp1, src, xtmp1, vec_enc);
  evcvtpd2qq(dst, xtmp1, vec_enc);
  vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
                                                double_sign_flip, vec_enc);

  ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
}
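
// Editorial worked example: with MXCSR.RC forced to round-down, converting
// val + 0.5 computes floor(val + 0.5), which matches Math.round semantics.
// For val = 2.5 this yields floor(3.0) = 3, whereas the default
// round-to-nearest-even conversion of 2.5 would produce 2.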

void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src,
                                                AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
                                                Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) {
  // Perform the floor(val + 0.5) operation under the influence of MXCSR.RC mode round-towards -inf,
  // and re-instate the original MXCSR.RC mode after that.
  ldmxcsr(new_mxcsr, tmp /*rscratch*/);

  movl(tmp, jint_cast(0.5));
  movq(xtmp1, tmp);
  vbroadcastss(xtmp1, xtmp1, vec_enc);
  vaddps(xtmp1, src, xtmp1, vec_enc);
  vcvtps2dq(dst, xtmp1, vec_enc);
  vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/,
                                              float_sign_flip, vec_enc);

  ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
}

void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src,
                                               AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
                                               Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) {
  // Perform the floor(val + 0.5) operation under the influence of MXCSR.RC mode round-towards -inf,
  // and re-instate the original MXCSR.RC mode after that.
  ldmxcsr(new_mxcsr, tmp /*rscratch*/);

  movl(tmp, jint_cast(0.5));
  movq(xtmp1, tmp);
  vbroadcastss(xtmp1, xtmp1, vec_enc);
  vaddps(xtmp1, src, xtmp1, vec_enc);
  vcvtps2dq(dst, xtmp1, vec_enc);
  vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc);

  ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/);
}
#endif // _LP64

void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
                                             BasicType from_elem_bt, BasicType to_elem_bt) {
  switch (from_elem_bt) {
    case T_BYTE:
      switch (to_elem_bt) {
        case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break;
        case T_INT:   vpmovzxbd(dst, src, vlen_enc); break;
        case T_LONG:  vpmovzxbq(dst, src, vlen_enc); break;
        default: ShouldNotReachHere();
      }
      break;
    case T_SHORT:
      switch (to_elem_bt) {
        case T_INT:  vpmovzxwd(dst, src, vlen_enc); break;
        case T_LONG: vpmovzxwq(dst, src, vlen_enc); break;
        default: ShouldNotReachHere();
      }
      break;
    case T_INT:
      assert(to_elem_bt == T_LONG, "");
      vpmovzxdq(dst, src, vlen_enc);
      break;
    default:
      ShouldNotReachHere();
  }
}

void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc,
                                           BasicType from_elem_bt, BasicType to_elem_bt) {
  switch (from_elem_bt) {
    case T_BYTE:
      switch (to_elem_bt) {
        case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break;
        case T_INT:   vpmovsxbd(dst, src, vlen_enc); break;
        case T_LONG:  vpmovsxbq(dst, src, vlen_enc); break;
        default: ShouldNotReachHere();
      }
      break;
    case T_SHORT:
      switch (to_elem_bt) {
        case T_INT:  vpmovsxwd(dst, src, vlen_enc); break;
        case T_LONG: vpmovsxwq(dst, src, vlen_enc); break;
        default: ShouldNotReachHere();
      }
      break;
    case T_INT:
      assert(to_elem_bt == T_LONG, "");
      vpmovsxdq(dst, src, vlen_enc);
      break;
    default:
      ShouldNotReachHere();
  }
}
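
// Editorial note: the unsigned and signed variants differ only in how the upper
// bits are materialized. For a source byte 0xFF, vpmovzxbd produces the int
// lane 0x000000FF (255), while vpmovsxbd produces 0xFFFFFFFF (-1).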

void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src,
                                         BasicType dst_bt, BasicType src_bt, int vlen) {
  int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen);
  assert(vlen_enc != AVX_512bit, "");

  int dst_bt_size = type2aelembytes(dst_bt);
  int src_bt_size = type2aelembytes(src_bt);
  if (dst_bt_size > src_bt_size) {
    switch (dst_bt_size / src_bt_size) {
      case 2: vpmovsxbw(dst, src, vlen_enc); break;
      case 4: vpmovsxbd(dst, src, vlen_enc); break;
      case 8: vpmovsxbq(dst, src, vlen_enc); break;
      default: ShouldNotReachHere();
    }
  } else {
    assert(dst_bt_size < src_bt_size, "");
    switch (src_bt_size / dst_bt_size) {
      case 2: {
        if (vlen_enc == AVX_128bit) {
          vpacksswb(dst, src, src, vlen_enc);
        } else {
          vpacksswb(dst, src, src, vlen_enc);
          vpermq(dst, dst, 0x08, vlen_enc);
        }
        break;
      }
      case 4: {
        if (vlen_enc == AVX_128bit) {
          vpackssdw(dst, src, src, vlen_enc);
          vpacksswb(dst, dst, dst, vlen_enc);
        } else {
          vpackssdw(dst, src, src, vlen_enc);
          vpermq(dst, dst, 0x08, vlen_enc);
          vpacksswb(dst, dst, dst, AVX_128bit);
        }
        break;
      }
      case 8: {
        if (vlen_enc == AVX_128bit) {
          vpshufd(dst, src, 0x08, vlen_enc);
          vpackssdw(dst, dst, dst, vlen_enc);
          vpacksswb(dst, dst, dst, vlen_enc);
        } else {
          vpshufd(dst, src, 0x08, vlen_enc);
          vpermq(dst, dst, 0x08, vlen_enc);
          vpackssdw(dst, dst, dst, AVX_128bit);
          vpacksswb(dst, dst, dst, AVX_128bit);
        }
        break;
      }
      default: ShouldNotReachHere();
    }
  }
}

void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
                                   bool merge, BasicType bt, int vlen_enc) {
  if (bt == T_INT) {
    evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
  } else {
    assert(bt == T_LONG, "");
    evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
  }
}

void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
                                   bool merge, BasicType bt, int vlen_enc) {
  if (bt == T_INT) {
    evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc);
  } else {
    assert(bt == T_LONG, "");
    evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc);
  }
}
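
// Editorial note (assumed vpternlog semantics): the func immediate is a truth
// table indexed by (dst_bit << 2) | (src2_bit << 1) | src3_bit. For example,
// this file uses func 0x40 (true only for input pattern (1,1,0), i.e.
// dst & src2 & ~src3) and func 0x11 (true for (0,0,0) and (1,0,0), i.e.
// ~src2 & ~src3 independent of dst, serving as a vector NOT).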

#ifdef _LP64
void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1,
                                               Register rtmp2, XMMRegister xtmp, int mask_len,
                                               int vec_enc) {
  int index = 0;
  int vindex = 0;
  mov64(rtmp1, 0x0101010101010101L);
  pdepq(rtmp1, src, rtmp1);
  if (mask_len > 8) {
    movq(rtmp2, src);
    vpxor(xtmp, xtmp, xtmp, vec_enc);
    movq(xtmp, rtmp1);
  }
  movq(dst, rtmp1);

  mask_len -= 8;
  while (mask_len > 0) {
    assert((mask_len & 0x7) == 0, "mask must be multiple of 8");
    index++;
    if ((index % 2) == 0) {
      pxor(xtmp, xtmp);
    }
    mov64(rtmp1, 0x0101010101010101L);
    shrq(rtmp2, 8);
    pdepq(rtmp1, rtmp2, rtmp1);
    pinsrq(xtmp, rtmp1, index % 2);
    vindex = index / 2;
    if (vindex) {
      // Write the entire 16 byte vector when both 64 bit
      // lanes are updated, to save redundant instructions.
      if (index % 2) {
        vinsertf128(dst, dst, xtmp, vindex);
      }
    } else {
      vmovdqu(dst, xtmp);
    }
    mask_len -= 8;
  }
}

void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
  switch(opc) {
    case Op_VectorMaskTrueCount:
      popcntq(dst, tmp);
      break;
    case Op_VectorMaskLastTrue:
      if (VM_Version::supports_lzcnt()) {
        lzcntq(tmp, tmp);
        movl(dst, 63);
        subl(dst, tmp);
      } else {
        movl(dst, -1);
        bsrq(tmp, tmp);
        cmov32(Assembler::notZero, dst, tmp);
      }
      break;
    case Op_VectorMaskFirstTrue:
      if (VM_Version::supports_bmi1()) {
        if (masklen < 32) {
          orl(tmp, 1 << masklen);
          tzcntl(dst, tmp);
        } else if (masklen == 32) {
          tzcntl(dst, tmp);
        } else {
          assert(masklen == 64, "");
          tzcntq(dst, tmp);
        }
      } else {
        if (masklen < 32) {
          orl(tmp, 1 << masklen);
          bsfl(dst, tmp);
        } else {
          assert(masklen == 32 || masklen == 64, "");
          movl(dst, masklen);
          if (masklen == 32) {
            bsfl(tmp, tmp);
          } else {
            bsfq(tmp, tmp);
          }
          cmov32(Assembler::notZero, dst, tmp);
        }
      }
      break;
    case Op_VectorMaskToLong:
      assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
      break;
    default: assert(false, "Unhandled mask operation");
  }
}

void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
                                              int masklen, int masksize, int vec_enc) {
  assert(VM_Version::supports_popcnt(), "");

  if (VM_Version::supports_avx512bw()) {
    kmovql(tmp, mask);
  } else {
    assert(masklen <= 16, "");
    kmovwl(tmp, mask);
  }

  // Mask generated out of partial vector comparisons/replicate/mask manipulation
  // operations needs to be clipped.
  if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
    andq(tmp, (1 << masklen) - 1);
  }

  vector_mask_operation_helper(opc, dst, tmp, masklen);
}
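
// Editorial note: for Op_VectorMaskFirstTrue with masklen < 32, the
// orl(tmp, 1 << masklen) above plants a sentinel bit just past the mask, so
// tzcnt/bsf returns masklen when no mask bit is set, which is the expected
// "no true lane" result.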

void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
                                              Register tmp, int masklen, BasicType bt, int vec_enc) {
  assert(vec_enc == AVX_128bit && VM_Version::supports_avx() ||
         vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4), "");
  assert(VM_Version::supports_popcnt(), "");

  bool need_clip = false;
  switch(bt) {
    case T_BOOLEAN:
      // While masks of other types contain 0, -1; boolean masks contain lane values of 0, 1
      vpxor(xtmp, xtmp, xtmp, vec_enc);
      vpsubb(xtmp, xtmp, mask, vec_enc);
      vpmovmskb(tmp, xtmp, vec_enc);
      need_clip = masklen < 16;
      break;
    case T_BYTE:
      vpmovmskb(tmp, mask, vec_enc);
      need_clip = masklen < 16;
      break;
    case T_SHORT:
      vpacksswb(xtmp, mask, mask, vec_enc);
      if (masklen >= 16) {
        vpermpd(xtmp, xtmp, 8, vec_enc);
      }
      vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
      need_clip = masklen < 16;
      break;
    case T_INT:
    case T_FLOAT:
      vmovmskps(tmp, mask, vec_enc);
      need_clip = masklen < 4;
      break;
    case T_LONG:
    case T_DOUBLE:
      vmovmskpd(tmp, mask, vec_enc);
      need_clip = masklen < 2;
      break;
    default: assert(false, "Unhandled type, %s", type2name(bt));
  }

  // Mask generated out of partial vector comparisons/replicate/mask manipulation
  // operations needs to be clipped.
  if (need_clip && opc != Op_VectorMaskFirstTrue) {
    // need_clip implies masklen < 32
    andq(tmp, (1 << masklen) - 1);
  }

  vector_mask_operation_helper(opc, dst, tmp, masklen);
}

void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1,
                                             Register rtmp2, int mask_len) {
  kmov(rtmp1, src);
  andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len)));
  mov64(rtmp2, -1L);
  pextq(rtmp2, rtmp2, rtmp1);
  kmov(dst, rtmp2);
}
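
// Editorial note on the pext-based compression above: pextq gathers the bits of
// its second operand at the positions selected by the mask into the low end of
// the result. With an all-ones source and a selector holding n set bits, the
// result has exactly the n lowest bits set, i.e. the compressed mask.
// E.g. selector 0b1010 -> result 0b0011.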

void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
                                               bool merge, BasicType bt, int vec_enc) {
  if (opcode == Op_CompressV) {
    switch(bt) {
      case T_BYTE:
        evpcompressb(dst, mask, src, merge, vec_enc);
        break;
      case T_CHAR:
      case T_SHORT:
        evpcompressw(dst, mask, src, merge, vec_enc);
        break;
      case T_INT:
        evpcompressd(dst, mask, src, merge, vec_enc);
        break;
      case T_FLOAT:
        evcompressps(dst, mask, src, merge, vec_enc);
        break;
      case T_LONG:
        evpcompressq(dst, mask, src, merge, vec_enc);
        break;
      case T_DOUBLE:
        evcompresspd(dst, mask, src, merge, vec_enc);
        break;
      default:
        fatal("Unsupported type %s", type2name(bt));
        break;
    }
  } else {
    assert(opcode == Op_ExpandV, "");
    switch(bt) {
      case T_BYTE:
        evpexpandb(dst, mask, src, merge, vec_enc);
        break;
      case T_CHAR:
      case T_SHORT:
        evpexpandw(dst, mask, src, merge, vec_enc);
        break;
      case T_INT:
        evpexpandd(dst, mask, src, merge, vec_enc);
        break;
      case T_FLOAT:
        evexpandps(dst, mask, src, merge, vec_enc);
        break;
      case T_LONG:
        evpexpandq(dst, mask, src, merge, vec_enc);
        break;
      case T_DOUBLE:
        evexpandpd(dst, mask, src, merge, vec_enc);
        break;
      default:
        fatal("Unsupported type %s", type2name(bt));
        break;
    }
  }
}
#endif

void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
                                           KRegister ktmp1, int vec_enc) {
  if (opcode == Op_SignumVD) {
    vsubpd(dst, zero, one, vec_enc);
    // if src < 0 ? -1 : 1
    evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
    evblendmpd(dst, ktmp1, one, dst, true, vec_enc);
    // if src == NaN, -0.0 or 0.0 return src.
    evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
    evblendmpd(dst, ktmp1, dst, src, true, vec_enc);
  } else {
    assert(opcode == Op_SignumVF, "");
    vsubps(dst, zero, one, vec_enc);
    // if src < 0 ? -1 : 1
    evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
    evblendmps(dst, ktmp1, one, dst, true, vec_enc);
    // if src == NaN, -0.0 or 0.0 return src.
    evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
    evblendmps(dst, ktmp1, dst, src, true, vec_enc);
  }
}

void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
                                          XMMRegister xtmp1, int vec_enc) {
  if (opcode == Op_SignumVD) {
    vsubpd(dst, zero, one, vec_enc);
    // if src < 0 ? -1 : 1
    vblendvpd(dst, one, dst, src, vec_enc, true, xtmp1);
    // if src == NaN, -0.0 or 0.0 return src.
    vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
    vblendvpd(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
  } else {
    assert(opcode == Op_SignumVF, "");
    vsubps(dst, zero, one, vec_enc);
    // if src < 0 ? -1 : 1
    vblendvps(dst, one, dst, src, vec_enc, true, xtmp1);
    // if src == NaN, -0.0 or 0.0 return src.
    vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
    vblendvps(dst, dst, src, xtmp1, vec_enc, false, xtmp1);
  }
}

void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
  if (VM_Version::supports_avx512bw()) {
    if (mask_len > 32) {
      kmovql(dst, src);
    } else {
      kmovdl(dst, src);
      if (mask_len != 32) {
        kshiftrdl(dst, dst, 32 - mask_len);
      }
    }
  } else {
    assert(mask_len <= 16, "");
    kmovwl(dst, src);
    if (mask_len != 16) {
      kshiftrwl(dst, dst, 16 - mask_len);
    }
  }
}

void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
  int lane_size = type2aelembytes(bt);
  bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
  if ((is_LP64 || lane_size < 8) &&
      ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
       (is_subword_type(bt) && VM_Version::supports_avx512vlbw()))) {
    movptr(rtmp, imm32);
    switch(lane_size) {
      case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
      case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
      case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
      case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
      default: fatal("Unsupported lane size %d", lane_size);
               break;
    }
  } else {
    movptr(rtmp, imm32);
    LP64_ONLY(movq(dst, rtmp)) NOT_LP64(movdl(dst, rtmp));
    switch(lane_size) {
      case 1 : vpbroadcastb(dst, dst, vec_enc); break;
      case 2 : vpbroadcastw(dst, dst, vec_enc); break;
      case 4 : vpbroadcastd(dst, dst, vec_enc); break;
      case 8 : vpbroadcastq(dst, dst, vec_enc); break;
      default: fatal("Unsupported lane size %d", lane_size);
               break;
    }
  }
}

//
// Following is lookup table based popcount computation algorithm:-
//       Index   Bit set count
//     [ 0000 ->   0,
//       0001 ->   1,
//       0010 ->   1,
//       0011 ->   2,
//       0100 ->   1,
//       0101 ->   2,
//       0110 ->   2,
//       0111 ->   3,
//       1000 ->   1,
//       1001 ->   2,
//       1010 ->   2,
//       1011 ->   3,
//       1100 ->   2,
//       1101 ->   3,
//       1110 ->   3,
//       1111 ->   4 ]
//  a. Count the number of 1s in the 4 LSB bits of each byte. These bits are used as
//     shuffle indices for lookup table access.
//  b. Right shift each byte of the vector lane by 4 positions.
//  c. Count the number of 1s in the 4 MSB bits of each byte. These bits are used as
//     shuffle indices for lookup table access.
//  d. Add the bitset count of the upper and lower 4 bits of each byte.
//  e. Unpack double words to quad words and compute the sum of absolute differences of
//     the bitset counts of all the bytes of a quadword.
//  f. Perform step e. for the upper 128 bit vector lane.
//  g. Pack the bitset count of quadwords back to double words.
//  h. Unpacking and packing operations are not needed for 64 bit vector lanes.

void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
  assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), "");
  vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc);
  vpsrlw(dst, src, 4, vec_enc);
  vpand(dst, dst, xtmp1, vec_enc);
  vpand(xtmp1, src, xtmp1, vec_enc);
  vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg);
  vpshufb(xtmp1, xtmp2, xtmp1, vec_enc);
  vpshufb(dst, xtmp2, dst, vec_enc);
  vpaddb(dst, dst, xtmp1, vec_enc);
}
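
// Editorial worked example: for the byte 0xB5 (0b1011'0101), the low nibble
// 0101 indexes the table to 2 and the high nibble 1011 indexes it to 3;
// the final vpaddb yields 5 = popcount(0xB5).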

void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                            XMMRegister xtmp2, Register rtmp, int vec_enc) {
  vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
  // Following code is as per steps e, f, g and h of above algorithm.
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  vpunpckhdq(dst, xtmp1, xtmp2, vec_enc);
  vpsadbw(dst, dst, xtmp2, vec_enc);
  vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc);
  vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc);
  vpackuswb(dst, xtmp1, dst, vec_enc);
}

void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
  vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
  // Add the popcount of the upper and lower bytes of each word.
  vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc);
  vpsrlw(dst, xtmp1, 8, vec_enc);
  vpand(xtmp1, xtmp1, xtmp2, vec_enc);
  vpaddw(dst, dst, xtmp1, vec_enc);
}

void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                             XMMRegister xtmp2, Register rtmp, int vec_enc) {
  vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc);
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  vpsadbw(dst, xtmp1, xtmp2, vec_enc);
}

void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                 XMMRegister xtmp2, Register rtmp, int vec_enc) {
  switch(bt) {
    case T_LONG:
      vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
      break;
    case T_INT:
      vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
      break;
    case T_CHAR:
    case T_SHORT:
      vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
      break;
    case T_BYTE:
    case T_BOOLEAN:
      vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
}

void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src,
                                                      KRegister mask, bool merge, int vec_enc) {
  assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
  switch(bt) {
    case T_LONG:
      assert(VM_Version::supports_avx512_vpopcntdq(), "");
      evpopcntq(dst, mask, src, merge, vec_enc);
      break;
    case T_INT:
      assert(VM_Version::supports_avx512_vpopcntdq(), "");
      evpopcntd(dst, mask, src, merge, vec_enc);
      break;
    case T_CHAR:
    case T_SHORT:
      assert(VM_Version::supports_avx512_bitalg(), "");
      evpopcntw(dst, mask, src, merge, vec_enc);
      break;
    case T_BYTE:
    case T_BOOLEAN:
      assert(VM_Version::supports_avx512_bitalg(), "");
      evpopcntb(dst, mask, src, merge, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
}

#ifndef _LP64
void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) {
  assert(VM_Version::supports_avx512bw(), "");
  kmovdl(tmp, src);
  kunpckdql(dst, tmp, tmp);
}
#endif

// Bit reversal algorithm first reverses the bits of each byte followed by
// a byte level reversal for multi-byte primitive types (short/int/long).
// Algorithm performs a lookup table access to get reverse bit sequence
// corresponding to a 4 bit value. Thus a reverse bit sequence for a byte
// is obtained by swapping the reverse bit sequences of upper and lower
// nibble of a byte.
void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                           XMMRegister xtmp2, Register rtmp, int vec_enc) {
  if (VM_Version::supports_avx512vlbw()) {

    // Get the reverse bit sequence of lower nibble of each byte.
    vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
    vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
    evpandq(dst, xtmp2, src, vec_enc);
    vpshufb(dst, xtmp1, dst, vec_enc);
    vpsllq(dst, dst, 4, vec_enc);

    // Get the reverse bit sequence of upper nibble of each byte.
    vpandn(xtmp2, xtmp2, src, vec_enc);
    vpsrlq(xtmp2, xtmp2, 4, vec_enc);
    vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);

    // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
    // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
    evporq(xtmp2, dst, xtmp2, vec_enc);
    vector_reverse_byte(bt, dst, xtmp2, vec_enc);

  } else if (vec_enc == Assembler::AVX_512bit) {
    // Shift based bit reversal.
    assert(bt == T_LONG || bt == T_INT, "");

    // Swap lower and upper nibble of each byte.
    vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc);

    // Swap two least and most significant bits of each nibble.
    vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc);

    // Swap adjacent pair of bits.
    evmovdqul(xtmp1, k0, dst, true, vec_enc);
    vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc);

    evmovdqul(xtmp1, k0, dst, true, vec_enc);
    vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc);
  } else {
    vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp);
    vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);

    // Get the reverse bit sequence of lower nibble of each byte.
    vpand(dst, xtmp2, src, vec_enc);
    vpshufb(dst, xtmp1, dst, vec_enc);
    vpsllq(dst, dst, 4, vec_enc);

    // Get the reverse bit sequence of upper nibble of each byte.
    vpandn(xtmp2, xtmp2, src, vec_enc);
    vpsrlq(xtmp2, xtmp2, 4, vec_enc);
    vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);

    // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
    // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
    vpor(xtmp2, dst, xtmp2, vec_enc);
    vector_reverse_byte(bt, dst, xtmp2, vec_enc);
  }
}
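
// Editorial worked example: to reverse the byte 0xB1 (0b1011'0001), the LUT
// maps the lower nibble 0001 to 1000 and the upper nibble 1011 to 1101;
// shifting the reversed lower nibble into the upper position and OR-ing gives
// 0b1000'1101 = 0x8D, which is 0xB1 bit-reversed.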

void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc,
                                                XMMRegister xtmp, Register rscratch) {
  assert(VM_Version::supports_gfni(), "");
  assert(rscratch != noreg || always_reachable(mask), "missing");

  // Galois field instruction based bit reversal based on following algorithm.
  // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
  vpbroadcastq(xtmp, mask, vec_enc, rscratch);
  vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
  vector_reverse_byte(bt, dst, xtmp, vec_enc);
}

void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
                                          XMMRegister xtmp1, Register rtmp, int vec_enc) {
  vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
  evpandq(dst, xtmp1, src, vec_enc);
  vpsllq(dst, dst, nbits, vec_enc);
  vpandn(xtmp1, xtmp1, src, vec_enc);
  vpsrlq(xtmp1, xtmp1, nbits, vec_enc);
  evporq(dst, dst, xtmp1, vec_enc);
}

void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
  // Shift based bit reversal.
  assert(VM_Version::supports_evex(), "");
  switch(bt) {
    case T_LONG:
      // Swap upper and lower double word of each quad word.
      evprorq(xtmp1, k0, src, 32, true, vec_enc);
      evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);
      vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
      break;
    case T_INT:
      // Swap upper and lower word of each double word.
      evprord(xtmp1, k0, src, 16, true, vec_enc);
      vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
      break;
    case T_CHAR:
    case T_SHORT:
      // Swap upper and lower byte of each word.
      vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc);
      break;
    case T_BYTE:
      evmovdquq(dst, k0, src, true, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
}

void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) {
  if (bt == T_BYTE) {
    if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) {
      evmovdquq(dst, k0, src, true, vec_enc);
    } else {
      vmovdqu(dst, src);
    }
    return;
  }
  // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using
  // pre-computed shuffle indices.
  switch(bt) {
    case T_LONG:
      vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg);
      break;
    case T_INT:
      vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg);
      break;
    case T_CHAR:
    case T_SHORT:
      vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
  vpshufb(dst, src, dst, vec_enc);
}

void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
                                                        KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
  assert(is_integral_type(bt), "");
  assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
  assert(VM_Version::supports_avx512cd(), "");
  switch(bt) {
    case T_LONG:
      evplzcntq(dst, ktmp, src, merge, vec_enc);
      break;
    case T_INT:
      evplzcntd(dst, ktmp, src, merge, vec_enc);
      break;
    case T_SHORT:
      vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);
      vpunpcklwd(xtmp2, xtmp1, src, vec_enc);
      evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
      vpunpckhwd(dst, xtmp1, src, vec_enc);
      evplzcntd(dst, ktmp, dst, merge, vec_enc);
      vpackusdw(dst, xtmp2, dst, vec_enc);
      break;
    case T_BYTE:
      // T1 = Compute leading zero counts of 4 LSB bits of each byte by
      // accessing the lookup table.
      // T2 = Compute leading zero counts of 4 MSB bits of each byte by
      // accessing the lookup table.
      // Add T1 to T2 if 4 MSB bits of byte are all zeros.
      assert(VM_Version::supports_avx512bw(), "");
      evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
      vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
      vpand(xtmp2, dst, src, vec_enc);
      vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
      vpsrlw(xtmp3, src, 4, vec_enc);
      vpand(xtmp3, dst, xtmp3, vec_enc);
      vpshufb(dst, xtmp1, xtmp3, vec_enc);
      vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
      evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);
      evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
}

void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                            XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
  vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
  // T1 = Compute leading zero counts of 4 LSB bits of each byte by
  // accessing the lookup table.
  vpand(dst, xtmp2, src, vec_enc);
  vpshufb(dst, xtmp1, dst, vec_enc);
  // T2 = Compute leading zero counts of 4 MSB bits of each byte by
  // accessing the lookup table.
  vpsrlw(xtmp3, src, 4, vec_enc);
  vpand(xtmp3, xtmp2, xtmp3, vec_enc);
  vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
  // Add T1 to T2 if 4 MSB bits of byte are all zeros.
  vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
  vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
  vpaddb(dst, dst, xtmp2, vec_enc);
  vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
}

void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
  // Add the zero counts of the lower byte and upper byte of a word if the
  // upper byte holds a zero value.
  vpsrlw(xtmp3, src, 8, vec_enc);
  // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
  vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
  vpsllw(xtmp2, dst, 8, vec_enc);
  vpaddw(xtmp2, xtmp2, dst, vec_enc);
  vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
  vpsrlw(dst, dst, 8, vec_enc);
}

void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                           XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
  // Since the IEEE 754 floating point format represents the mantissa in 1.0 format,
  // the biased exponent can be used to compute the leading zero count as per the
  // following formula (note the code below adds one to the exponent first):
  // LZCNT = 31 - (biased_exp - 127)
  // Special handling has been introduced for Zero, Max_Int and -ve source values.

  // Broadcast 0xFF
  vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc);
  vpsrld(xtmp1, xtmp1, 24, vec_enc);

  // Extract biased exponent.
  vcvtdq2ps(dst, src, vec_enc);
  vpsrld(dst, dst, 23, vec_enc);
  vpand(dst, dst, xtmp1, vec_enc);

  // Broadcast 127.
  vpsrld(xtmp1, xtmp1, 1, vec_enc);
  // Exponent = biased_exp - 127
  vpsubd(dst, dst, xtmp1, vec_enc);

  // Exponent = Exponent + 1
  vpsrld(xtmp3, xtmp1, 6, vec_enc);
  vpaddd(dst, dst, xtmp3, vec_enc);

  // Replace -ve exponent with zero; the exponent is -ve when the src
  // lane contains a zero value.
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  vblendvps(dst, dst, xtmp2, dst, vec_enc);

  // Rematerialize broadcast 32.
  vpslld(xtmp1, xtmp3, 5, vec_enc);
  // Exponent is 32 if corresponding source lane contains the max_int value.
  vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
  // LZCNT = 32 - exponent
  vpsubd(dst, xtmp1, dst, vec_enc);

  // Replace LZCNT with the value 1 if the corresponding source lane
  // contains the max_int value.
  vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc);

  // Replace the computed count with 0 if the source lane value is less
  // than zero; LZCNT of a negative int is 0.
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  vblendvps(dst, dst, xtmp2, src, vec_enc);
}
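
// Editorial worked example: for a source lane holding 12, vcvtdq2ps produces
// 1.5 * 2^3 with biased exponent 130; the exponent becomes 130 - 127 + 1 = 4,
// so LZCNT = 32 - 4 = 28, matching Integer.numberOfLeadingZeros(12).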

void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                            XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
  // Add the zero counts of the lower word and upper word of a double word if
  // the upper word holds a zero value.
  vpsrld(xtmp3, src, 16, vec_enc);
  // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
  vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc);
  vpslld(xtmp2, dst, 16, vec_enc);
  vpaddd(xtmp2, xtmp2, dst, vec_enc);
  vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
  vpsrld(dst, dst, 16, vec_enc);
  // Add the zero counts of the lower doubleword and upper doubleword of a
  // quadword if the upper doubleword holds a zero value.
  vpsrlq(xtmp3, src, 32, vec_enc);
  vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc);
  vpsllq(xtmp2, dst, 32, vec_enc);
  vpaddq(xtmp2, xtmp2, dst, vec_enc);
  vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
  vpsrlq(dst, dst, 32, vec_enc);
}

void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src,
                                                       XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
                                                       Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "unexpected type");
  assert(vec_enc < Assembler::AVX_512bit, "");
  switch(bt) {
    case T_LONG:
      vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
      break;
    case T_INT:
      vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc);
      break;
    case T_SHORT:
      vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
      break;
    case T_BYTE:
      vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
}

void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) {
  switch(bt) {
    case T_BYTE:
      vpsubb(dst, src1, src2, vec_enc);
      break;
    case T_SHORT:
      vpsubw(dst, src1, src2, vec_enc);
      break;
    case T_INT:
      vpsubd(dst, src1, src2, vec_enc);
      break;
    case T_LONG:
      vpsubq(dst, src1, src2, vec_enc);
      break;
    default:
      fatal("Unsupported type %s", type2name(bt));
      break;
  }
}

// Trailing zero count computation is based on leading zero count operation as per
// the following equation. All AVX3 targets support the AVX512CD feature, which offers
// a direct vector instruction to compute leading zero count.
// CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x)
void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
                                                         XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
                                                         XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "");
  // xtmp = -1
  vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc);
  // xtmp = xtmp + src
  vpadd(bt, xtmp4, xtmp4, src, vec_enc);
  // xtmp = xtmp & ~src
  vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc);
  vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc);
  vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc);
  vpsub(bt, dst, xtmp4, dst, vec_enc);
}

// Trailing zero count computation for AVX2 targets is based on the popcount operation
// as per the following equation:
// CTZ = PRIM_TYPE_WIDTH - POPC(x | -x)
void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                        XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  assert(is_integral_type(bt), "");
  // xtmp = 0
  vpxor(xtmp3, xtmp3, xtmp3, vec_enc);
  // xtmp = 0 - src
  vpsub(bt, xtmp3, xtmp3, src, vec_enc);
  // xtmp = xtmp | src
  vpor(xtmp3, xtmp3, src, vec_enc);
  vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc);
  vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc);
  vpsub(bt, dst, xtmp1, dst, vec_enc);
}
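
// Editorial worked example: for x = 12 (0b1100), (x - 1) & ~x = 0b1011 & ~0b1100
// = 0b0011, whose 32 bit CLZ is 30, giving CTZ = 32 - 30 = 2. The AVX2 identity
// agrees: x | -x sets bit 2 and everything above, so POPC(x | -x) = 30 and
// CTZ = 32 - 30 = 2.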

void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
  movl(rdx, rax);
  subl(rdx, divisor);
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rdx, rax);
  } else {
    notl(rdx);
    andl(rax, rdx);
  }
  shrl(rax, 31);
  bind(done);
}

void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath when divisor < 0:
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
  movl(rdx, rax);
  subl(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rax, rdx);
  } else {
    notl(rax);
    andl(rax, rdx);
  }
  sarl(rax, 31);
  andl(rax, divisor);
  subl(rdx, rax);
  bind(done);
}

void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) {
  Label done;
  Label neg_divisor_fastpath;

  cmpl(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divl(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1)
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
  // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
  movl(rdx, rax);
  subl(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnl(rax, rax, rdx);
  } else {
    notl(rax);
    andl(rax, rdx);
  }
  movl(tmp, rax);
  shrl(rax, 31); // quotient
  sarl(tmp, 31);
  andl(tmp, divisor);
  subl(rdx, tmp); // remainder
  bind(done);
}
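
// Editorial note: when the divisor has its sign bit set it is at least 2^31
// as an unsigned value, so the unsigned quotient can only be 0 or 1. The
// expression (dividend & ~(dividend - divisor)) leaves the sign bit set
// exactly when dividend >= divisor (unsigned), and the logical shift by 31
// extracts that 0-or-1 quotient.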

#ifdef _LP64
void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
                                 XMMRegister xtmp2, Register rtmp) {
  if (VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal based on following algorithm.
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
    mov64(rtmp, 0x8040201008040201L);
    movq(xtmp1, src);
    movq(xtmp2, rtmp);
    gf2p8affineqb(xtmp1, xtmp2, 0);
    movq(dst, xtmp1);
  } else {
    // Swap even and odd numbered bits.
    movl(rtmp, src);
    andl(rtmp, 0x55555555);
    shll(rtmp, 1);
    movl(dst, src);
    andl(dst, 0xAAAAAAAA);
    shrl(dst, 1);
    orl(dst, rtmp);

    // Swap LSB and MSB 2 bits of each nibble.
    movl(rtmp, dst);
    andl(rtmp, 0x33333333);
    shll(rtmp, 2);
    andl(dst, 0xCCCCCCCC);
    shrl(dst, 2);
    orl(dst, rtmp);

    // Swap LSB and MSB 4 bits of each byte.
    movl(rtmp, dst);
    andl(rtmp, 0x0F0F0F0F);
    shll(rtmp, 4);
    andl(dst, 0xF0F0F0F0);
    shrl(dst, 4);
    orl(dst, rtmp);
  }
  bswapl(dst);
}

void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
                                 XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
  if (VM_Version::supports_gfni()) {
    // Galois field instruction based bit reversal based on following algorithm.
    // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
    mov64(rtmp1, 0x8040201008040201L);
    movq(xtmp1, src);
    movq(xtmp2, rtmp1);
    gf2p8affineqb(xtmp1, xtmp2, 0);
    movq(dst, xtmp1);
  } else {
    // Swap even and odd numbered bits.
    movq(rtmp1, src);
    mov64(rtmp2, 0x5555555555555555L);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 1);
    movq(dst, src);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 1);
    orq(dst, rtmp1);

    // Swap LSB and MSB 2 bits of each nibble.
    movq(rtmp1, dst);
    mov64(rtmp2, 0x3333333333333333L);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 2);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 2);
    orq(dst, rtmp1);

    // Swap LSB and MSB 4 bits of each byte.
    movq(rtmp1, dst);
    mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
    andq(rtmp1, rtmp2);
    shlq(rtmp1, 4);
    notq(rtmp2);
    andq(dst, rtmp2);
    shrq(dst, 4);
    orq(dst, rtmp1);
  }
  bswapq(dst);
}
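
// Editorial note: in the non-GFNI paths above, the three mask-shift-or passes
// reverse the bit order within every byte (1 bit, 2 bit, then 4 bit swaps);
// the final bswap reverses the byte order, completing the full-width reversal.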

void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorl(rdx, rdx);
  divq(divisor);
  jmpb(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned()
  movq(rdx, rax);
  subq(rdx, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rdx, rax);
  } else {
    notq(rdx);
    andq(rax, rdx);
  }
  shrq(rax, 63);
  bind(done);
}

void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorq(rdx, rdx);
  divq(divisor);
  jmp(done);
  bind(neg_divisor_fastpath);
  // Fastpath when divisor < 0:
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned()
  movq(rdx, rax);
  subq(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rax, rdx);
  } else {
    notq(rax);
    andq(rax, rdx);
  }
  sarq(rax, 63);
  andq(rax, divisor);
  subq(rdx, rax);
  bind(done);
}

void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) {
  Label done;
  Label neg_divisor_fastpath;
  cmpq(divisor, 0);
  jccb(Assembler::less, neg_divisor_fastpath);
  xorq(rdx, rdx);
  divq(divisor);
  jmp(done);
  bind(neg_divisor_fastpath);
  // Fastpath for divisor < 0:
  // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1)
  // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor)
  // See Hacker's Delight (2nd ed), section 9.3 which is implemented in
  // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned()
  movq(rdx, rax);
  subq(rax, divisor);
  if (VM_Version::supports_bmi1()) {
    andnq(rax, rax, rdx);
  } else {
    notq(rax);
    andq(rax, rdx);
  }
  movq(tmp, rax);
  shrq(rax, 63); // quotient
  sarq(tmp, 63);
  andq(tmp, divisor);
  subq(rdx, tmp); // remainder
  bind(done);
}
#endif

void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
                                        XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp,
                                        int vlen_enc) {
  assert(VM_Version::supports_avx512bw(), "");
  // Byte shuffles are in-lane operations and indices are determined using
  // the lower 4 bits of each shuffle lane, thus all shuffle indices are
  // normalized to the index range 0-15. This makes sure that all the multiples
  // of an index value are placed at the same relative position within a 128 bit
  // lane, i.e. elements corresponding to shuffle indices 16, 32 and 48
  // will map to element 0 of their respective 128 bit lanes.
  movl(rtmp, 16);
  evpbroadcastb(xtmp1, rtmp, vlen_enc);

  // Compute a mask for the shuffle vector by comparing indices with the expression INDEX < 16.
  // Broadcast the first 128 bit lane across the entire vector, shuffle the vector lanes using
  // the original shuffle indices, and move the shuffled lanes corresponding to true
  // mask lanes to the destination vector.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
  evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);

  // Perform the above steps with the lane comparison expression INDEX >= 16 && INDEX < 32,
  // broadcasting the second 128 bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
  vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Perform the above steps with the lane comparison expression INDEX >= 32 && INDEX < 48,
  // broadcasting the third 128 bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp2, Assembler::nlt, true, vlen_enc);
  vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Perform the above steps with the lane comparison expression INDEX >= 48 && INDEX < 64,
  // broadcasting the fourth 128 bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
  vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
}

void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
                                                   XMMRegister shuffle, XMMRegister src, int vlen_enc) {
  if (vlen_enc == AVX_128bit) {
    vpermilps(dst, src, shuffle, vlen_enc);
  } else if (bt == T_INT) {
    vpermd(dst, shuffle, src, vlen_enc);
  } else {
    assert(bt == T_FLOAT, "");
    vpermps(dst, shuffle, src, vlen_enc);
  }
}
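
// Editorial worked example: for shuffle index 37 in a 512 bit rearrange,
// 37 >= 32 && 37 < 48 selects the third pass above, so the source's third
// 128 bit lane (imm 0xAA) is broadcast and the in-lane byte shuffle uses
// 37 & 0xF = 5, i.e. byte 5 of that lane, which is byte 37 of the vector.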