/*
 * Copyright (c) 2020, 2023, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "oops/methodData.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/output.hpp"
#include "opto/opcodes.hpp"
#include "opto/subnode.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/stubRoutines.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

// C2 compiled method's prolog code.
void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {

  // WARNING: Initial instruction MUST be 5 bytes or longer so that
  // NativeJump::patch_verified_entry will be able to patch out the entry
  // code safely. The push to verify stack depth is ok at 5 bytes,
  // the frame allocation can be either 3 or 6 bytes. So if we don't do
  // stack bang then we must use the 6 byte frame allocation even if
  // we have no frame. :-(
  assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");

  assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  // Remove word for return addr
  framesize -= wordSize;
  stack_bang_size -= wordSize;

  // Calls to C2R adapters often do not accept exceptional returns.
  // We require that their callers bang for them. But be careful, because
  // some VM calls (such as call site linkage) can use several kilobytes of
  // stack. The stack safety zone should account for that.
  // See bugs 4446381, 4468289, 4497237.
  if (stack_bang_size > 0) {
    generate_stack_overflow_check(stack_bang_size);

    // We always push rbp so that on return to the interpreter rbp will be
    // restored correctly and we can correct the stack.
    push(rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      mov(rbp, rsp);
    }
    // Remove word for ebp
    framesize -= wordSize;

    // Create frame
    if (framesize) {
      subptr(rsp, framesize);
    }
  } else {
    // Create frame (force generation of a 4 byte immediate value)
    subptr_imm32(rsp, framesize);

    // Save RBP register now.
    framesize -= wordSize;
    movptr(Address(rsp, framesize), rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      movptr(rbp, rsp);
      if (framesize > 0) {
        addptr(rbp, framesize);
      }
    }
  }

  if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
    framesize -= wordSize;
    movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
  }

#ifndef _LP64
  // If method sets FPU control word do it now
  if (fp_mode_24b) {
    fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24()));
  }
  if (UseSSE >= 2 && VerifyFPU) {
    verify_FPU(0, "FPU stack must be clean on entry");
  }
#endif

#ifdef ASSERT
  if (VerifyStackAtCalls) {
    Label L;
    push(rax);
    mov(rax, rsp);
    andptr(rax, StackAlignmentInBytes-1);
    cmpptr(rax, StackAlignmentInBytes-wordSize);
    pop(rax);
    jcc(Assembler::equal, L);
    STOP("Stack is not properly aligned!");
    bind(L);
  }
#endif

  if (!is_stub) {
    BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
#ifdef _LP64
    if (BarrierSet::barrier_set()->barrier_set_nmethod() != nullptr) {
      // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
      Label dummy_slow_path;
      Label dummy_continuation;
      Label* slow_path = &dummy_slow_path;
      Label* continuation = &dummy_continuation;
      if (!Compile::current()->output()->in_scratch_emit_size()) {
        // Use real labels from actual stub when not emitting code for the purpose of measuring its size
        C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
        Compile::current()->output()->add_stub(stub);
        slow_path = &stub->entry();
        continuation = &stub->continuation();
      }
      bs->nmethod_entry_barrier(this, slow_path, continuation);
    }
#else
    // Don't bother with out-of-line nmethod entry barrier stub for x86_32.
    bs->nmethod_entry_barrier(this, nullptr /* slow_path */, nullptr /* continuation */);
#endif
  }
}

inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
  switch (vlen_in_bytes) {
    case  4: // fall-through
    case  8: // fall-through
    case 16: return Assembler::AVX_128bit;
    case 32: return Assembler::AVX_256bit;
    case 64: return Assembler::AVX_512bit;

    default: {
      ShouldNotReachHere();
      return Assembler::AVX_NoVec;
    }
  }
}

#if INCLUDE_RTM_OPT

// Update rtm_counters based on abort status
// input:  abort_status
//         rtm_counters (RTMLockingCounters*)
// flags are killed
void C2_MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {

  atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
  if (PrintPreciseRTMLockingStatistics) {
    for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
      Label check_abort;
      testl(abort_status, (1<<i));
      jccb(Assembler::equal, check_abort);
      atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
      bind(check_abort);
    }
  }
}

// Branch if (random & (count-1) != 0), count is 2^n
// tmp, scr and flags are killed
void C2_MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
  assert(tmp == rax, "");
  assert(scr == rdx, "");
  rdtsc(); // modifies EDX:EAX
  andptr(tmp, count-1);
  jccb(Assembler::notZero, brLabel);
}

// Perform abort ratio calculation, set no_rtm bit if high ratio
// input:  rtm_counters_Reg (RTMLockingCounters* address)
// tmpReg, rtm_counters_Reg and flags are killed
void C2_MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
                                                    Register rtm_counters_Reg,
                                                    RTMLockingCounters* rtm_counters,
                                                    Metadata* method_data) {
  Label L_done, L_check_always_rtm1, L_check_always_rtm2;

  if (RTMLockingCalculationDelay > 0) {
    // Delay calculation
    movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()));
    testptr(tmpReg, tmpReg);
    jccb(Assembler::equal, L_done);
  }
  // Abort ratio calculation only if abort_count > RTMAbortThreshold
  //   Aborted transactions = abort_count * 100
  //   All transactions = total_count * RTMTotalCountIncrRate
  //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)

  movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
  cmpptr(tmpReg, RTMAbortThreshold);
  jccb(Assembler::below, L_check_always_rtm2);
  imulptr(tmpReg, tmpReg, 100);

  Register scrReg = rtm_counters_Reg;
  movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
  imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
  imulptr(scrReg, scrReg, RTMAbortRatio);
  cmpptr(tmpReg, scrReg);
  jccb(Assembler::below, L_check_always_rtm1);
  if (method_data != nullptr) {
    // set rtm_state to "no rtm" in MDO
    mov_metadata(tmpReg, method_data);
    lock();
    orl(Address(tmpReg, MethodData::rtm_state_offset()), NoRTM);
  }
  jmpb(L_done);
  bind(L_check_always_rtm1);
  // Reload RTMLockingCounters* address
  lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
  bind(L_check_always_rtm2);
  movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
  cmpptr(tmpReg,
         RTMLockingThreshold / RTMTotalCountIncrRate);
  jccb(Assembler::below, L_done);
  if (method_data != nullptr) {
    // set rtm_state to "always rtm" in MDO
    mov_metadata(tmpReg, method_data);
    lock();
    orl(Address(tmpReg, MethodData::rtm_state_offset()), UseRTM);
  }
  bind(L_done);
}

// Update counters and perform abort ratio calculation
// input: abort_status_Reg
// rtm_counters_Reg, flags are killed
void C2_MacroAssembler::rtm_profiling(Register abort_status_Reg,
                                      Register rtm_counters_Reg,
                                      RTMLockingCounters* rtm_counters,
                                      Metadata* method_data,
                                      bool profile_rtm) {

  assert(rtm_counters != nullptr, "should not be null when profiling RTM");
  // update rtm counters based on rax value at abort
  // reads abort_status_Reg, updates flags
  lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
  rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
  if (profile_rtm) {
    // Save abort status because abort_status_Reg is used by following code.
    if (RTMRetryCount > 0) {
      push(abort_status_Reg);
    }
    assert(rtm_counters != nullptr, "should not be null when profiling RTM");
    rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
    // restore abort status
    if (RTMRetryCount > 0) {
      pop(abort_status_Reg);
    }
  }
}

// Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
// inputs: retry_count_Reg
//       : abort_status_Reg
// output: retry_count_Reg decremented by 1
// flags are killed
void C2_MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
  Label doneRetry;
  assert(abort_status_Reg == rax, "");
  // The abort reason bits are in eax (see all states in rtmLocking.hpp)
  // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
  // if reason is in 0x6 and retry count != 0 then retry
  andptr(abort_status_Reg, 0x6);
  jccb(Assembler::zero, doneRetry);
  testl(retry_count_Reg, retry_count_Reg);
  jccb(Assembler::zero, doneRetry);
  pause();
  decrementl(retry_count_Reg);
  jmp(retryLabel);
  bind(doneRetry);
}

// Spin and retry if lock is busy,
// inputs: box_Reg (monitor address)
//       : retry_count_Reg
// output: retry_count_Reg decremented by 1
//       : clear z flag if retry count exceeded
// tmp_Reg, scr_Reg, flags are killed
void C2_MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
                                               Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
  Label SpinLoop, SpinExit, doneRetry;
  int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);

  testl(retry_count_Reg, retry_count_Reg);
  jccb(Assembler::zero, doneRetry);
  decrementl(retry_count_Reg);
  movptr(scr_Reg, RTMSpinLoopCount);

  bind(SpinLoop);
  pause();
  decrementl(scr_Reg);
  jccb(Assembler::lessEqual, SpinExit);
  movptr(tmp_Reg, Address(box_Reg, owner_offset));
  testptr(tmp_Reg, tmp_Reg);
  jccb(Assembler::notZero, SpinLoop);

  bind(SpinExit);
  jmp(retryLabel);
  bind(doneRetry);
  incrementl(retry_count_Reg); // clear z flag
}

// Use RTM for normal stack locks
// Input: objReg (object to lock)
void C2_MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
                                          Register retry_on_abort_count_Reg,
                                          RTMLockingCounters* stack_rtm_counters,
                                          Metadata* method_data, bool
                                                                profile_rtm,
                                          Label& DONE_LABEL, Label& IsInflated) {
  assert(UseRTMForStackLocks, "why call this otherwise?");
  assert(tmpReg == rax, "");
  assert(scrReg == rdx, "");
  Label L_rtm_retry, L_decrement_retry, L_on_abort;

  if (RTMRetryCount > 0) {
    movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
    bind(L_rtm_retry);
  }
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
  testptr(tmpReg, markWord::monitor_value);  // inflated vs stack-locked|neutral
  jcc(Assembler::notZero, IsInflated);

  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    Label L_noincrement;
    if (RTMTotalCountIncrRate > 1) {
      // tmpReg, scrReg and flags are killed
      branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
    }
    assert(stack_rtm_counters != nullptr, "should not be null when profiling RTM");
    atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
    bind(L_noincrement);
  }
  xbegin(L_on_abort);
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));  // fetch markword
  andptr(tmpReg, markWord::lock_mask_in_place);                      // look at 2 lock bits
  cmpptr(tmpReg, markWord::unlocked_value);                          // bits = 01 unlocked
  jcc(Assembler::equal, DONE_LABEL);                                 // all done if unlocked

  Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
  if (UseRTMXendForLockBusy) {
    xend();
    movptr(abort_status_Reg, 0x2); // Set the abort status to 2 (so we can retry)
    jmp(L_decrement_retry);
  }
  else {
    xabort(0);
  }
  bind(L_on_abort);
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
  }
  bind(L_decrement_retry);
  if (RTMRetryCount > 0) {
    // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
  }
}

// Use RTM for inflating locks
// inputs: objReg (object to lock)
//         boxReg (on-stack box address (displaced header location) - KILLED)
//         tmpReg (ObjectMonitor address + markWord::monitor_value)
void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
                                             Register scrReg, Register retry_on_busy_count_Reg,
                                             Register retry_on_abort_count_Reg,
                                             RTMLockingCounters* rtm_counters,
                                             Metadata* method_data, bool profile_rtm,
                                             Label& DONE_LABEL) {
  assert(UseRTMLocking, "why call this otherwise?");
  assert(tmpReg == rax, "");
  assert(scrReg == rdx, "");
  Label L_rtm_retry, L_decrement_retry, L_on_abort;
  int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);

  movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
  movptr(boxReg, tmpReg); // Save ObjectMonitor address

  if (RTMRetryCount > 0) {
    movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
    movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
    bind(L_rtm_retry);
  }
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    Label L_noincrement;
    if (RTMTotalCountIncrRate > 1) {
      // tmpReg, scrReg and flags are killed
      branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
    }
    assert(rtm_counters != nullptr, "should not be null when profiling RTM");
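    // Bump the shared total RTM transaction counter; scrReg is only used here as a
    // scratch register to materialize the counter address.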
    atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
    bind(L_noincrement);
  }
  xbegin(L_on_abort);
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
  movptr(tmpReg, Address(tmpReg, owner_offset));
  testptr(tmpReg, tmpReg);
  jcc(Assembler::zero, DONE_LABEL);
  if (UseRTMXendForLockBusy) {
    xend();
    jmp(L_decrement_retry);
  }
  else {
    xabort(0);
  }
  bind(L_on_abort);
  Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
  }
  if (RTMRetryCount > 0) {
    // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
  }

  movptr(tmpReg, Address(boxReg, owner_offset));
  testptr(tmpReg, tmpReg);
  jccb(Assembler::notZero, L_decrement_retry);

  // Appears unlocked - try to swing _owner from null to non-null.
  // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
#ifdef _LP64
  Register threadReg = r15_thread;
#else
  get_thread(scrReg);
  Register threadReg = scrReg;
#endif
  lock();
  cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg

  if (RTMRetryCount > 0) {
    // success done else retry
    jccb(Assembler::equal, DONE_LABEL);
    bind(L_decrement_retry);
    // Spin and retry if lock is busy.
    rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
  }
  else {
    bind(L_decrement_retry);
  }
}

#endif // INCLUDE_RTM_OPT

// fast_lock and fast_unlock used by C2

// Because the transitions from emitted code to the runtime
// monitorenter/exit helper stubs are so slow it's critical that
// we inline both the stack-locking fast path and the inflated fast path.
//
// See also: cmpFastLock and cmpFastUnlock.
//
// What follows is a specialized inline transliteration of the code
// in enter() and exit(). If we're concerned about I$ bloat another
// option would be to emit TrySlowEnter and TrySlowExit methods
// at startup-time. These methods would accept arguments as
// (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
// indications in the icc.ZFlag. fast_lock and fast_unlock would simply
// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
// In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
//
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods. That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native would be
// to park() or unpark() threads. We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) explicit barriers or fence operations.
//
// TODO:
//
// *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
//    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
//    Given TLAB allocation, Self is usually manifested in a register, so passing it into
//    the lock operators would typically be faster than reifying Self.
//
// *  Ideally I'd define the primitives as:
//       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
//       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED.
//    Unfortunately ADLC bugs prevent us from expressing the ideal form.
//    Instead, we're stuck with the rather awkward and brittle register assignments below.
//    Furthermore the register assignments are overconstrained, possibly resulting in
//    sub-optimal code near the synchronization site.
//
// *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
//    Alternately, use a better sp-proximity test.
//
// *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
//    Either one is sufficient to uniquely identify a thread.
//    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
//
// *  Intrinsify notify() and notifyAll() for the common cases where the
//    object is locked by the calling thread but the waitlist is empty.
//    This avoids the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
//
// *  Use jccb and jmpb instead of jcc and jmp to improve code density.
//    But beware of excessive branch density on AMD Opterons.
//
// *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
//    or failure of the fast path. If the fast path fails then we pass
//    control to the slow path, typically in C. In fast_lock and
//    fast_unlock we often branch to DONE_LABEL, just to find that C2
//    will emit a conditional branch immediately after the node.
//    So we have branches to branches and lots of ICC.ZF games.
//    Instead, it might be better to have C2 pass a "FailureLabel"
//    into fast_lock and fast_unlock. In the case of success, control
//    will drop through the node. ICC.ZF is undefined at exit.
//    In the case of failure, the node will branch directly to the
//    FailureLabel.


// obj: object to lock
// box: on-stack box address (displaced header location) - KILLED
// rax: tmp -- KILLED
// scr: tmp -- KILLED
void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
                                  Register scrReg, Register cx1Reg, Register cx2Reg, Register thread,
                                  RTMLockingCounters* rtm_counters,
                                  RTMLockingCounters* stack_rtm_counters,
                                  Metadata* method_data,
                                  bool use_rtm, bool profile_rtm) {
  // Ensure the register assignments are disjoint
  assert(tmpReg == rax, "");

  if (use_rtm) {
    assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
  } else {
    assert(cx1Reg == noreg, "");
    assert(cx2Reg == noreg, "");
    assert_different_registers(objReg, boxReg, tmpReg, scrReg);
  }

  // Possible cases that we'll encounter in fast_lock
  // ------------------------------------------------
  // * Inflated
  //    -- unlocked
  //    -- Locked
  //       = by self
  //       = by other
  // * neutral
  // * stack-locked
  //    -- by self
  //       = sp-proximity test hits
  //       = sp-proximity test generates false-negative
  //    -- by other
  //

  Label IsInflated, DONE_LABEL, NO_COUNT, COUNT;

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(tmpReg, objReg, scrReg);
    movl(tmpReg, Address(tmpReg, Klass::access_flags_offset()));
    testl(tmpReg, JVM_ACC_IS_VALUE_BASED_CLASS);
    jcc(Assembler::notZero, DONE_LABEL);
  }

#if INCLUDE_RTM_OPT
  if (UseRTMForStackLocks && use_rtm) {
    assert(LockingMode != LM_MONITOR, "LockingMode == 0 (LM_MONITOR) and +UseRTMForStackLocks are mutually exclusive");
    rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
                      stack_rtm_counters, method_data, profile_rtm,
                      DONE_LABEL, IsInflated);
  }
#endif // INCLUDE_RTM_OPT

  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // [FETCH]
  testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral
  jcc(Assembler::notZero, IsInflated);

  if (LockingMode == LM_MONITOR) {
    // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
    testptr(objReg, objReg);
  } else if (LockingMode == LM_LEGACY) {
    // Attempt stack-locking ...
    orptr (tmpReg, markWord::unlocked_value);
    movptr(Address(boxReg, 0), tmpReg);  // Anticipate successful CAS
    lock();
    cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Updates tmpReg
    jcc(Assembler::equal, COUNT);        // Success

    // Recursive locking.
    // The object is stack-locked: markword contains stack pointer to BasicLock.
    // Locked by current thread if difference with current SP is less than one page.
    subptr(tmpReg, rsp);
    // Next instruction sets ZFlag == 1 (Success) if difference is less than one page.
    andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - (int)os::vm_page_size())) );
    movptr(Address(boxReg, 0), tmpReg);
  } else {
    assert(LockingMode == LM_LIGHTWEIGHT, "");
    fast_lock_impl(objReg, tmpReg, thread, scrReg, NO_COUNT);
    jmp(COUNT);
  }
  jmp(DONE_LABEL);

  bind(IsInflated);
  // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value

#if INCLUDE_RTM_OPT
  // Use the same RTM locking code in 32- and 64-bit VM.
  if (use_rtm) {
    rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
                         rtm_counters, method_data, profile_rtm, DONE_LABEL);
  } else {
#endif // INCLUDE_RTM_OPT

#ifndef _LP64
  // The object is inflated.

  // boxReg refers to the on-stack BasicLock in the current frame.
  // We'd like to write:
  //   set box->_displaced_header = markWord::unused_mark(). Any non-0 value suffices.
  // This is convenient but results in a ST-before-CAS penalty. The following CAS suffers
  // additional latency as we have another ST in the store buffer that must drain.

  // avoid ST-before-CAS
  // register juggle because we need tmpReg for cmpxchgptr below
  movptr(scrReg, boxReg);
  movptr(boxReg, tmpReg); // consider: LEA box, [tmp-2]

  // Optimistic form: consider XORL tmpReg,tmpReg
  movptr(tmpReg, NULL_WORD);

  // Appears unlocked - try to swing _owner from null to non-null.
  // Ideally, I'd manifest "Self" with get_thread and then attempt
  // to CAS the register containing Self into m->Owner.
  // But we don't have enough registers, so instead we can either try to CAS
  // rsp or the address of the box (in scr) into &m->owner. If the CAS succeeds
  // we later store "Self" into m->Owner. Transiently storing a stack address
  // (rsp or the address of the box) into m->owner is harmless.
  // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
  lock();
  cmpxchgptr(thread, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  movptr(Address(scrReg, 0), 3); // box->_displaced_header = 3

  // If the CAS fails we can either retry or pass control to the slow path.
  // We use the latter tactic.
  // Pass the CAS result in the icc.ZFlag into DONE_LABEL
  // If the CAS was successful ...
  //   Self has acquired the lock
  //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
  // Intentional fall-through into DONE_LABEL ...
#else // _LP64
  // It's inflated and we use scrReg for ObjectMonitor* in this section.
  movq(scrReg, tmpReg);
  xorq(tmpReg, tmpReg);
  lock();
  cmpxchgptr(thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // Unconditionally set box->_displaced_header = markWord::unused_mark().
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
  // Propagate ICC.ZF from CAS above into DONE_LABEL.
  jccb(Assembler::equal, COUNT);       // CAS above succeeded; propagate ZF = 1 (success)

  cmpptr(thread, rax);                 // Check if we are already the owner (recursive lock)
  jccb(Assembler::notEqual, NO_COUNT); // If not recursive, ZF = 0 at this point (fail)
  incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
#endif // _LP64
#if INCLUDE_RTM_OPT
  } // use_rtm()
#endif
  bind(DONE_LABEL);

  // ZFlag == 1 count in fast path
  // ZFlag == 0 count in slow path
  jccb(Assembler::notZero, NO_COUNT); // jump if ZFlag == 0

  bind(COUNT);
  // Count monitors in fast path
  increment(Address(thread, JavaThread::held_monitor_count_offset()));

  xorl(tmpReg, tmpReg); // Set ZF == 1

  bind(NO_COUNT);

  // At NO_COUNT the icc ZFlag is set as follows ...
  // fast_unlock uses the same protocol.
  //    ZFlag == 1 -> Success
  //    ZFlag == 0 -> Failure - force control through the slow path
}

// obj: object to unlock
// box: box address (displaced header location), killed. Must be EAX.
// tmp: killed, cannot be obj nor box.
//
// Some commentary on balanced locking:
//
// fast_lock and fast_unlock are emitted only for provably balanced lock sites.
// Methods that don't have provably balanced locking are forced to run in the
// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
// The interpreter provides two properties:
// I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired in the current activation (frame). Recall that the
//      interpreter maintains an on-stack list of locks currently held by
//      a frame.
// I2:  If a method attempts to unlock an object that is not held by the
//      frame the interpreter throws IMSX.
//
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
// B() doesn't have provably balanced locking so it runs in the interpreter.
// Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O
// is still locked by A().
//
// The only other source of unbalanced locking would be JNI. The "Java Native Interface:
// Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
// should not be unlocked by "normal" java-level locking and vice-versa. The specification
// doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably given that the spec legislates the JNI case as undefined our implementation
// could reasonably *avoid* checking owner in fast_unlock().
// In the interest of performance we elide the m->Owner==Self check in unlock.
// A perfectly viable alternative is to elide the owner check except when
// Xcheck:jni is enabled.

void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
  assert(boxReg == rax, "");
  assert_different_registers(objReg, boxReg, tmpReg);

  Label DONE_LABEL, Stacked, COUNT, NO_COUNT;

#if INCLUDE_RTM_OPT
  if (UseRTMForStackLocks && use_rtm) {
    assert(LockingMode != LM_MONITOR, "LockingMode == 0 (LM_MONITOR) and +UseRTMForStackLocks are mutually exclusive");
    Label L_regular_unlock;
    movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
    andptr(tmpReg, markWord::lock_mask_in_place);                     // look at 2 lock bits
    cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 01 unlocked
    jccb(Assembler::notEqual, L_regular_unlock);                      // if !HLE RegularLock
    xend();                                                           // otherwise end...
    jmp(DONE_LABEL);                                                  // ... and we're done
    bind(L_regular_unlock);
  }
#endif

  if (LockingMode == LM_LEGACY) {
    cmpptr(Address(boxReg, 0), NULL_WORD);                          // Examine the displaced header
    jcc   (Assembler::zero, COUNT);                                 // 0 indicates recursive stack-lock
  }
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Examine the object's markword
  if (LockingMode != LM_MONITOR) {
    testptr(tmpReg, markWord::monitor_value);                       // Inflated?
    jcc(Assembler::zero, Stacked);
  }

  // It's inflated.
  if (LockingMode == LM_LIGHTWEIGHT) {
    // If the owner is ANONYMOUS, we need to fix it - in an outline stub.
    testb(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t) ObjectMonitor::ANONYMOUS_OWNER);
#ifdef _LP64
    if (!Compile::current()->output()->in_scratch_emit_size()) {
      C2HandleAnonOMOwnerStub* stub = new (Compile::current()->comp_arena()) C2HandleAnonOMOwnerStub(tmpReg, boxReg);
      Compile::current()->output()->add_stub(stub);
      jcc(Assembler::notEqual, stub->entry());
      bind(stub->continuation());
    } else
#endif
    {
      // We can't easily implement this optimization on 32 bit because we don't have a thread register.
      // Call the slow-path instead.
      jcc(Assembler::notEqual, NO_COUNT);
    }
  }

#if INCLUDE_RTM_OPT
  if (use_rtm) {
    Label L_regular_inflated_unlock;
    int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
    movptr(boxReg, Address(tmpReg, owner_offset));
    testptr(boxReg, boxReg);
    jccb(Assembler::notZero, L_regular_inflated_unlock);
    xend();
    jmp(DONE_LABEL);
    bind(L_regular_inflated_unlock);
  }
#endif

  // Despite our balanced locking property we still check that m->_owner == Self
  // as java routines or native JNI code called by this thread might
  // have released the lock.
  // Refer to the comments in synchronizer.cpp for how we might encode extra
  // state in _succ so we can avoid fetching EntryList|cxq.
  //
  // If there's no contention try a 1-0 exit. That is, exit without
  // a costly MEMBAR or CAS. See synchronizer.cpp for details on how
  // we detect and recover from the race that the 1-0 exit admits.
  //
  // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
  // before it STs null into _owner, releasing the lock. Updates
  // to data protected by the critical section must be visible before
  // we drop the lock (and thus before any other thread could acquire
  // the lock and observe the fields protected by the lock).
  // IA32's memory-model is SPO, so STs are ordered with respect to
  // each other and there's no need for an explicit barrier (fence).
  // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
#ifndef _LP64
  // Note that we could employ various encoding schemes to reduce
  // the number of loads below (currently 4) to just 2 or 3.
  // Refer to the comments in synchronizer.cpp.
  // In practice the chain of fetches doesn't seem to impact performance, however.
  xorptr(boxReg, boxReg);
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jccb  (Assembler::notZero, DONE_LABEL);
  movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
  jccb  (Assembler::notZero, DONE_LABEL);
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
  jmpb  (DONE_LABEL);
#else // _LP64
  // It's inflated
  Label CheckSucc, LNotRecursive, LSuccess, LGoSlowPath;

  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
  jccb(Assembler::equal, LNotRecursive);

  // Recursive inflated unlock
  decq(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jmpb(LSuccess);

  bind(LNotRecursive);
  movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
  jccb  (Assembler::notZero, CheckSucc);
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
  jmpb  (DONE_LABEL);

  // Try to avoid passing control into the slow_path ...
  bind  (CheckSucc);

  // The following optional optimization can be elided if necessary
  // Effectively: if (succ == null) goto slow path
  // The code reduces the window for a race, however,
  // and thus benefits performance.
  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
  jccb  (Assembler::zero, LGoSlowPath);

  xorptr(boxReg, boxReg);
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);

  // Memory barrier/fence
  // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
  // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
  // This is faster on Nehalem and AMD Shanghai/Barcelona.
  // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
  // We might also restructure (ST Owner=0;barrier;LD _Succ) to
  // (mov box,0; xchgq box, &m->Owner; LD _succ) .
  lock(); addl(Address(rsp, 0), 0);

  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
  jccb  (Assembler::notZero, LSuccess);

  // Rare inopportune interleaving - race.
  // The successor vanished in the small window above.
  // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
  // We need to ensure progress and succession.
  // Try to reacquire the lock.
  // If that fails then the new owner is responsible for succession and this
  // thread needs to take no further action and can exit via the fast path (success).
  // If the re-acquire succeeds then pass control into the slow path.
  // As implemented, this latter mode is horrible because we generated more
  // coherence traffic on the lock *and* artificially extended the critical section
  // length by virtue of passing control into the slow path.

  // box is really RAX -- the following CMPXCHG depends on that binding
  // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
  lock();
  cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // There's no successor so we tried to regrab the lock.
  // If that didn't work, then another thread grabbed the
  // lock so we're done (and exit was a success).
  jccb  (Assembler::notEqual, LSuccess);
  // Intentional fall-through into slow path

  bind  (LGoSlowPath);
  orl   (boxReg, 1);   // set ICC.ZF=0 to indicate failure
  jmpb  (DONE_LABEL);

  bind  (LSuccess);
  testl (boxReg, 0);   // set ICC.ZF=1 to indicate success
  jmpb  (DONE_LABEL);

#endif
  if (LockingMode != LM_MONITOR) {
    bind  (Stacked);
    if (LockingMode == LM_LIGHTWEIGHT) {
      mov(boxReg, tmpReg);
      fast_unlock_impl(objReg, boxReg, tmpReg, NO_COUNT);
      jmp(COUNT);
    } else if (LockingMode == LM_LEGACY) {
      movptr(tmpReg, Address (boxReg, 0)); // re-fetch
      lock();
      cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
    }
    // Intentional fall-thru into DONE_LABEL
  }
  bind(DONE_LABEL);

  // ZFlag == 1 count in fast path
  // ZFlag == 0 count in slow path
  jccb(Assembler::notZero, NO_COUNT);

  bind(COUNT);
  // Count monitors in fast path
#ifndef _LP64
  get_thread(tmpReg);
  decrementl(Address(tmpReg, JavaThread::held_monitor_count_offset()));
#else // _LP64
  decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
#endif

  xorl(tmpReg, tmpReg); // Set ZF == 1

  bind(NO_COUNT);
}

//-------------------------------------------------------------------------------------------
// Generic instructions support for use in .ad files C2 code generation

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVD) {
    andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
  } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
  }
}

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
  if (opcode == Op_AbsVD) {
    vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
  } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
    vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVF) {
    andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
  } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
  if (opcode == Op_AbsVF) {
    vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
  } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
    vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
  }
}

void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
  assert(tmp == xnoreg || elem_bt == T_LONG, "unused");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      pminsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pminsw(dst, src);
    } else if (elem_bt == T_INT) {
      pminsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      movdqu(xmm0, dst);
      pcmpgtq(xmm0, src);
      blendvpd(dst, src); // xmm0 as mask
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      pmaxsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pmaxsw(dst, src);
    } else if (elem_bt == T_INT) {
      pmaxsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      movdqu(xmm0, src);
      pcmpgtq(xmm0, dst);
      blendvpd(dst, src); // xmm0 as mask
    }
  }
}

void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
                                 XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                 int vlen_enc) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      vpminsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpminsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpminsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpminsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src1, src2, dst, vlen_enc);
      }
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      vpmaxsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpmaxsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpmaxsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpmaxsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src2, src1, dst, vlen_enc);
      }
    }
  }
}

// Float/Double min max

void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
                                   XMMRegister dst, XMMRegister a, XMMRegister b,
                                   XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                   int vlen_enc) {
  assert(UseAVX > 0, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(a, b, tmp, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);

  if (!is_double_word && is_min) {
    vblendvps(atmp, a, b, a, vlen_enc);
    vblendvps(btmp, b, a, a, vlen_enc);
    vminps(tmp, atmp, btmp, vlen_enc);
    vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvps(dst, tmp, atmp, btmp, vlen_enc);
  } else if (!is_double_word && !is_min) {
    vblendvps(btmp, b, a, b, vlen_enc);
    vblendvps(atmp, a, b, b, vlen_enc);
    vmaxps(tmp, atmp, btmp, vlen_enc);
    vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvps(dst, tmp, atmp, btmp, vlen_enc);
  } else if (is_double_word && is_min) {
    vblendvpd(atmp, a, b, a, vlen_enc);
    vblendvpd(btmp, b, a, a, vlen_enc);
    vminpd(tmp, atmp, btmp, vlen_enc);
    vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
  } else {
    assert(is_double_word && !is_min, "sanity");
    vblendvpd(btmp, b, a, b, vlen_enc);
    vblendvpd(atmp, a, b, b, vlen_enc);
    vmaxpd(tmp, atmp, btmp, vlen_enc);
    vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
  }
}

void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
                                    XMMRegister dst, XMMRegister a, XMMRegister b,
                                    KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
                                    int vlen_enc) {
  assert(UseAVX > 2, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(dst, a, b, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);
  bool merge = true;

  if (!is_double_word && is_min) {
    evpmovd2m(ktmp, a, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vminps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (!is_double_word && !is_min) {
    evpmovd2m(ktmp, b, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (is_double_word && is_min) {
    evpmovq2m(ktmp, a, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vminpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  } else {
    assert(is_double_word && !is_min, "sanity");
    evpmovq2m(ktmp, b, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  }
}

// Float/Double signum
void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
  assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");

  Label DONE_LABEL;

  if (opcode == Op_SignumF) {
    assert(UseSSE > 0, "required");
    ucomiss(dst, zero);
    jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
    jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
    movflt(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
  } else if (opcode == Op_SignumD) {
    assert(UseSSE > 1, "required");
    ucomisd(dst, zero);
    jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
    jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
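    // Argument is non-zero and not NaN at this point: load 1.0, keep it if the compare
    // above found the argument > 0, otherwise flip the sign bit to produce -1.0.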
    movdbl(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
  }

  bind(DONE_LABEL);
}

void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
  if (sign) {
    pmovsxbw(dst, src);
  } else {
    pmovzxbw(dst, src);
  }
}

void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbw(dst, src, vector_len);
  } else {
    vpmovzxbw(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbd(dst, src, vector_len);
  } else {
    vpmovzxbd(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxwd(dst, src, vector_len);
  } else {
    vpmovzxwd(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
                                     int shift, int vector_len) {
  if (opcode == Op_RotateLeftV) {
    if (etype == T_INT) {
      evprold(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprolq(dst, src, shift, vector_len);
    }
  } else {
    assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
    if (etype == T_INT) {
      evprord(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprorq(dst, src, shift, vector_len);
    }
  }
}

void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
                                     XMMRegister shift, int vector_len) {
  if (opcode == Op_RotateLeftV) {
    if (etype == T_INT) {
      evprolvd(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprolvq(dst, src, shift, vector_len);
    }
  } else {
    assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
    if (etype == T_INT) {
      evprorvd(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprorvq(dst, src, shift, vector_len);
    }
  }
}

void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
  if (opcode == Op_RShiftVI) {
    psrad(dst, shift);
  } else if (opcode == Op_LShiftVI) {
    pslld(dst, shift);
  } else {
    assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
    psrld(dst, shift);
  }
}

void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVI:  psrad(dst, shift); break;
    case Op_LShiftVI:  pslld(dst, shift); break;
    case Op_URShiftVI: psrld(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
  if (opcode == Op_RShiftVI) {
    vpsrad(dst, nds, shift, vector_len);
  } else if (opcode == Op_LShiftVI) {
    vpslld(dst, nds, shift, vector_len);
  } else {
    assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
    vpsrld(dst, nds, shift, vector_len);
  }
}

void C2_MacroAssembler::vshiftd(int opcode, XMMRegister
                               dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
    case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
    case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  psraw(dst, shift); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  psllw(dst, shift); break;

    case Op_URShiftVS: // fall-through
    case Op_URShiftVB: psrlw(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;

    case Op_URShiftVS: // fall-through
    case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
    case Op_LShiftVL:  psllq(dst, shift); break;
    case Op_URShiftVL: psrlq(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
  if (opcode == Op_RShiftVL) {
    psrlq(dst, shift); // using srl to implement sra on pre-avx512 systems
  } else if (opcode == Op_LShiftVL) {
    psllq(dst, shift);
  } else {
    assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
    psrlq(dst, shift);
  }
}

void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVL:  evpsraq(dst, src, shift, vlen_enc); break;
    case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
    case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
  if (opcode == Op_RShiftVL) {
    evpsraq(dst, nds, shift, vector_len);
  } else if (opcode == Op_LShiftVL) {
    vpsllq(dst, nds, shift, vector_len);
  } else {
    assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
    vpsrlq(dst, nds, shift, vector_len);
  }
}

void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  // fall-through
    case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  // fall-through
    case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;

    case Op_URShiftVB: // fall-through
    case Op_URShiftVS: // fall-through
    case Op_URShiftVI:
                       vpsrlvd(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;

    case Op_URShiftVB: // fall-through
    case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
  assert(UseAVX >= 2, "required");
  switch (opcode) {
    case Op_RShiftVL: {
      if (UseAVX > 2) {
        assert(tmp == xnoreg, "not used");
        if (!VM_Version::supports_avx512vl()) {
          vlen_enc = Assembler::AVX_512bit;
        }
        evpsravq(dst, src, shift, vlen_enc);
      } else {
        vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
        vpsrlvq(dst, src, shift, vlen_enc);
        vpsrlvq(tmp, tmp, shift, vlen_enc);
        vpxor(dst, dst, tmp, vlen_enc);
        vpsubq(dst, dst, tmp, vlen_enc);
      }
      break;
    }
    case Op_LShiftVL: {
      assert(tmp == xnoreg, "not used");
      vpsllvq(dst, src, shift, vlen_enc);
      break;
    }
    case Op_URShiftVL: {
      assert(tmp == xnoreg, "not used");
      vpsrlvq(dst, src, shift, vlen_enc);
      break;
    }
    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

// Variable shift src by shift using vtmp as a TEMP, giving word result in dst
void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
  assert(opcode == Op_LShiftVB ||
         opcode == Op_RShiftVB ||
         opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
  bool sign = (opcode != Op_URShiftVB);
  assert(vector_len == 0, "required");
  vextendbd(sign, dst, src, 1);
  vpmovzxbd(vtmp, shift, 1);
  varshiftd(opcode, dst, dst, vtmp, 1);
  vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
  vextracti128_high(vtmp, dst);
  vpackusdw(dst, dst, vtmp, 0);
}

// Variable shift src by shift using vtmp as a TEMP, giving byte result in dst
void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
  assert(opcode == Op_LShiftVB ||
         opcode == Op_RShiftVB ||
         opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
  bool sign = (opcode != Op_URShiftVB);
  int ext_vector_len = vector_len + 1;
  vextendbw(sign, dst, src, ext_vector_len);
  vpmovzxbw(vtmp, shift, ext_vector_len);
  varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
  vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
  if (vector_len == 0) {
    vextracti128_high(vtmp, dst);
    vpackuswb(dst, dst, vtmp, vector_len);
  } else {
    vextracti64x4_high(vtmp, dst);
    vpackuswb(dst, dst, vtmp, vector_len);
    vpermq(dst, dst, 0xD8, vector_len);
  }
}

void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
  switch(typ) {
    case T_BYTE:
      pinsrb(dst,
val, idx); 1482 break; 1483 case T_SHORT: 1484 pinsrw(dst, val, idx); 1485 break; 1486 case T_INT: 1487 pinsrd(dst, val, idx); 1488 break; 1489 case T_LONG: 1490 pinsrq(dst, val, idx); 1491 break; 1492 default: 1493 assert(false,"Should not reach here."); 1494 break; 1495 } 1496 } 1497 1498 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) { 1499 switch(typ) { 1500 case T_BYTE: 1501 vpinsrb(dst, src, val, idx); 1502 break; 1503 case T_SHORT: 1504 vpinsrw(dst, src, val, idx); 1505 break; 1506 case T_INT: 1507 vpinsrd(dst, src, val, idx); 1508 break; 1509 case T_LONG: 1510 vpinsrq(dst, src, val, idx); 1511 break; 1512 default: 1513 assert(false,"Should not reach here."); 1514 break; 1515 } 1516 } 1517 1518 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) { 1519 switch(typ) { 1520 case T_INT: 1521 vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len); 1522 break; 1523 case T_FLOAT: 1524 vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len); 1525 break; 1526 case T_LONG: 1527 vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len); 1528 break; 1529 case T_DOUBLE: 1530 vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len); 1531 break; 1532 default: 1533 assert(false,"Should not reach here."); 1534 break; 1535 } 1536 } 1537 1538 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) { 1539 switch(typ) { 1540 case T_INT: 1541 evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len); 1542 break; 1543 case T_FLOAT: 1544 evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len); 1545 break; 1546 case T_LONG: 1547 evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len); 1548 break; 1549 case T_DOUBLE: 1550 evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len); 1551 break; 1552 default: 1553 assert(false,"Should not reach here."); 1554 break; 1555 } 1556 } 1557 1558 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) { 1559 switch(typ) { 1560 case T_INT: 1561 evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len); 1562 break; 1563 case T_FLOAT: 1564 evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len); 1565 break; 1566 case T_LONG: 1567 evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len); 1568 break; 1569 case T_DOUBLE: 1570 evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len); 1571 break; 1572 default: 1573 assert(false,"Should not reach here."); 1574 break; 1575 } 1576 } 1577 1578 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) { 1579 if (vlen_in_bytes <= 16) { 1580 pxor (dst, dst); 1581 psubb(dst, src); 1582 switch (elem_bt) { 1583 case T_BYTE: /* nothing to do */ break; 1584 case T_SHORT: pmovsxbw(dst, dst); break; 1585 case T_INT: pmovsxbd(dst, dst); break; 1586 case T_FLOAT: pmovsxbd(dst, dst); break; 1587 case T_LONG: pmovsxbq(dst, dst); break; 1588 case T_DOUBLE: pmovsxbq(dst, dst); break; 1589 1590 default: assert(false, "%s", type2name(elem_bt)); 1591 } 1592 } else { 1593 assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, ""); 1594 int vlen_enc = vector_length_encoding(vlen_in_bytes); 1595 1596 
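// Note: assuming src holds one boolean (0/1) byte per lane, the 0 - src computed
// below turns each 1 into 0xFF; the sign extension that follows then widens that
// byte into an all-zeros/all-ones element of the requested type.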
vpxor (dst, dst, dst, vlen_enc); 1597 vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc); 1598 1599 switch (elem_bt) { 1600 case T_BYTE: /* nothing to do */ break; 1601 case T_SHORT: vpmovsxbw(dst, dst, vlen_enc); break; 1602 case T_INT: vpmovsxbd(dst, dst, vlen_enc); break; 1603 case T_FLOAT: vpmovsxbd(dst, dst, vlen_enc); break; 1604 case T_LONG: vpmovsxbq(dst, dst, vlen_enc); break; 1605 case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break; 1606 1607 default: assert(false, "%s", type2name(elem_bt)); 1608 } 1609 } 1610 } 1611 1612 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) { 1613 if (novlbwdq) { 1614 vpmovsxbd(xtmp, src, vlen_enc); 1615 evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()), 1616 Assembler::eq, true, vlen_enc, noreg); 1617 } else { 1618 vpxor(xtmp, xtmp, xtmp, vlen_enc); 1619 vpsubb(xtmp, xtmp, src, vlen_enc); 1620 evpmovb2m(dst, xtmp, vlen_enc); 1621 } 1622 } 1623 1624 void C2_MacroAssembler::load_vector(XMMRegister dst, Address src, int vlen_in_bytes) { 1625 switch (vlen_in_bytes) { 1626 case 4: movdl(dst, src); break; 1627 case 8: movq(dst, src); break; 1628 case 16: movdqu(dst, src); break; 1629 case 32: vmovdqu(dst, src); break; 1630 case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break; 1631 default: ShouldNotReachHere(); 1632 } 1633 } 1634 1635 void C2_MacroAssembler::load_vector(XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) { 1636 assert(rscratch != noreg || always_reachable(src), "missing"); 1637 1638 if (reachable(src)) { 1639 load_vector(dst, as_Address(src), vlen_in_bytes); 1640 } else { 1641 lea(rscratch, src); 1642 load_vector(dst, Address(rscratch, 0), vlen_in_bytes); 1643 } 1644 } 1645 1646 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) { 1647 int vlen_enc = vector_length_encoding(vlen); 1648 if (VM_Version::supports_avx()) { 1649 if (bt == T_LONG) { 1650 if (VM_Version::supports_avx2()) { 1651 vpbroadcastq(dst, src, vlen_enc); 1652 } else { 1653 vmovddup(dst, src, vlen_enc); 1654 } 1655 } else if (bt == T_DOUBLE) { 1656 if (vlen_enc != Assembler::AVX_128bit) { 1657 vbroadcastsd(dst, src, vlen_enc, noreg); 1658 } else { 1659 vmovddup(dst, src, vlen_enc); 1660 } 1661 } else { 1662 if (VM_Version::supports_avx2() && is_integral_type(bt)) { 1663 vpbroadcastd(dst, src, vlen_enc); 1664 } else { 1665 vbroadcastss(dst, src, vlen_enc); 1666 } 1667 } 1668 } else if (VM_Version::supports_sse3()) { 1669 movddup(dst, src); 1670 } else { 1671 movq(dst, src); 1672 if (vlen == 16) { 1673 punpcklqdq(dst, dst); 1674 } 1675 } 1676 } 1677 1678 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) { 1679 // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64. 1680 int offset = exact_log2(type2aelembytes(bt)) << 6; 1681 if (is_floating_point_type(bt)) { 1682 offset += 128; 1683 } 1684 ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset); 1685 load_vector(dst, addr, vlen_in_bytes); 1686 } 1687 1688 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles. 
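//
// In outline: the integer reduce* helpers below fold the upper half of the vector
// into the lower half with reduce_operation_128/256 until a single element is left,
// and only then combine it with the scalar input src1, so dst ends up as
// src1 OP src2[0] OP ... OP src2[n-1] evaluated in a tree order. The float and
// double variants (reduce2F/reduce2D and friends) instead accumulate strictly lane
// by lane with scalar addss/mulss, presumably because FP addition/multiplication
// is not associative and the reduction order is observable.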
1689 1690 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) { 1691 int vector_len = Assembler::AVX_128bit; 1692 1693 switch (opcode) { 1694 case Op_AndReductionV: pand(dst, src); break; 1695 case Op_OrReductionV: por (dst, src); break; 1696 case Op_XorReductionV: pxor(dst, src); break; 1697 case Op_MinReductionV: 1698 switch (typ) { 1699 case T_BYTE: pminsb(dst, src); break; 1700 case T_SHORT: pminsw(dst, src); break; 1701 case T_INT: pminsd(dst, src); break; 1702 case T_LONG: assert(UseAVX > 2, "required"); 1703 vpminsq(dst, dst, src, Assembler::AVX_128bit); break; 1704 default: assert(false, "wrong type"); 1705 } 1706 break; 1707 case Op_MaxReductionV: 1708 switch (typ) { 1709 case T_BYTE: pmaxsb(dst, src); break; 1710 case T_SHORT: pmaxsw(dst, src); break; 1711 case T_INT: pmaxsd(dst, src); break; 1712 case T_LONG: assert(UseAVX > 2, "required"); 1713 vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break; 1714 default: assert(false, "wrong type"); 1715 } 1716 break; 1717 case Op_AddReductionVF: addss(dst, src); break; 1718 case Op_AddReductionVD: addsd(dst, src); break; 1719 case Op_AddReductionVI: 1720 switch (typ) { 1721 case T_BYTE: paddb(dst, src); break; 1722 case T_SHORT: paddw(dst, src); break; 1723 case T_INT: paddd(dst, src); break; 1724 default: assert(false, "wrong type"); 1725 } 1726 break; 1727 case Op_AddReductionVL: paddq(dst, src); break; 1728 case Op_MulReductionVF: mulss(dst, src); break; 1729 case Op_MulReductionVD: mulsd(dst, src); break; 1730 case Op_MulReductionVI: 1731 switch (typ) { 1732 case T_SHORT: pmullw(dst, src); break; 1733 case T_INT: pmulld(dst, src); break; 1734 default: assert(false, "wrong type"); 1735 } 1736 break; 1737 case Op_MulReductionVL: assert(UseAVX > 2, "required"); 1738 evpmullq(dst, dst, src, vector_len); break; 1739 default: assert(false, "wrong opcode"); 1740 } 1741 } 1742 1743 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 1744 int vector_len = Assembler::AVX_256bit; 1745 1746 switch (opcode) { 1747 case Op_AndReductionV: vpand(dst, src1, src2, vector_len); break; 1748 case Op_OrReductionV: vpor (dst, src1, src2, vector_len); break; 1749 case Op_XorReductionV: vpxor(dst, src1, src2, vector_len); break; 1750 case Op_MinReductionV: 1751 switch (typ) { 1752 case T_BYTE: vpminsb(dst, src1, src2, vector_len); break; 1753 case T_SHORT: vpminsw(dst, src1, src2, vector_len); break; 1754 case T_INT: vpminsd(dst, src1, src2, vector_len); break; 1755 case T_LONG: assert(UseAVX > 2, "required"); 1756 vpminsq(dst, src1, src2, vector_len); break; 1757 default: assert(false, "wrong type"); 1758 } 1759 break; 1760 case Op_MaxReductionV: 1761 switch (typ) { 1762 case T_BYTE: vpmaxsb(dst, src1, src2, vector_len); break; 1763 case T_SHORT: vpmaxsw(dst, src1, src2, vector_len); break; 1764 case T_INT: vpmaxsd(dst, src1, src2, vector_len); break; 1765 case T_LONG: assert(UseAVX > 2, "required"); 1766 vpmaxsq(dst, src1, src2, vector_len); break; 1767 default: assert(false, "wrong type"); 1768 } 1769 break; 1770 case Op_AddReductionVI: 1771 switch (typ) { 1772 case T_BYTE: vpaddb(dst, src1, src2, vector_len); break; 1773 case T_SHORT: vpaddw(dst, src1, src2, vector_len); break; 1774 case T_INT: vpaddd(dst, src1, src2, vector_len); break; 1775 default: assert(false, "wrong type"); 1776 } 1777 break; 1778 case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break; 1779 case Op_MulReductionVI: 1780 switch (typ) { 1781 
case T_SHORT: vpmullw(dst, src1, src2, vector_len); break; 1782 case T_INT: vpmulld(dst, src1, src2, vector_len); break; 1783 default: assert(false, "wrong type"); 1784 } 1785 break; 1786 case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break; 1787 default: assert(false, "wrong opcode"); 1788 } 1789 } 1790 1791 void C2_MacroAssembler::reduce_fp(int opcode, int vlen, 1792 XMMRegister dst, XMMRegister src, 1793 XMMRegister vtmp1, XMMRegister vtmp2) { 1794 switch (opcode) { 1795 case Op_AddReductionVF: 1796 case Op_MulReductionVF: 1797 reduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 1798 break; 1799 1800 case Op_AddReductionVD: 1801 case Op_MulReductionVD: 1802 reduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 1803 break; 1804 1805 default: assert(false, "wrong opcode"); 1806 } 1807 } 1808 1809 void C2_MacroAssembler::reduceB(int opcode, int vlen, 1810 Register dst, Register src1, XMMRegister src2, 1811 XMMRegister vtmp1, XMMRegister vtmp2) { 1812 switch (vlen) { 1813 case 8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1814 case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1815 case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1816 case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1817 1818 default: assert(false, "wrong vector length"); 1819 } 1820 } 1821 1822 void C2_MacroAssembler::mulreduceB(int opcode, int vlen, 1823 Register dst, Register src1, XMMRegister src2, 1824 XMMRegister vtmp1, XMMRegister vtmp2) { 1825 switch (vlen) { 1826 case 8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1827 case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1828 case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1829 case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1830 1831 default: assert(false, "wrong vector length"); 1832 } 1833 } 1834 1835 void C2_MacroAssembler::reduceS(int opcode, int vlen, 1836 Register dst, Register src1, XMMRegister src2, 1837 XMMRegister vtmp1, XMMRegister vtmp2) { 1838 switch (vlen) { 1839 case 4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1840 case 8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1841 case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1842 case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1843 1844 default: assert(false, "wrong vector length"); 1845 } 1846 } 1847 1848 void C2_MacroAssembler::reduceI(int opcode, int vlen, 1849 Register dst, Register src1, XMMRegister src2, 1850 XMMRegister vtmp1, XMMRegister vtmp2) { 1851 switch (vlen) { 1852 case 2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1853 case 4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1854 case 8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1855 case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1856 1857 default: assert(false, "wrong vector length"); 1858 } 1859 } 1860 1861 #ifdef _LP64 1862 void C2_MacroAssembler::reduceL(int opcode, int vlen, 1863 Register dst, Register src1, XMMRegister src2, 1864 XMMRegister vtmp1, XMMRegister vtmp2) { 1865 switch (vlen) { 1866 case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1867 case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1868 case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1869 1870 default: assert(false, "wrong vector length"); 1871 } 1872 } 1873 #endif // _LP64 1874 1875 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister 
src, XMMRegister vtmp1, XMMRegister vtmp2) { 1876 switch (vlen) { 1877 case 2: 1878 assert(vtmp2 == xnoreg, ""); 1879 reduce2F(opcode, dst, src, vtmp1); 1880 break; 1881 case 4: 1882 assert(vtmp2 == xnoreg, ""); 1883 reduce4F(opcode, dst, src, vtmp1); 1884 break; 1885 case 8: 1886 reduce8F(opcode, dst, src, vtmp1, vtmp2); 1887 break; 1888 case 16: 1889 reduce16F(opcode, dst, src, vtmp1, vtmp2); 1890 break; 1891 default: assert(false, "wrong vector length"); 1892 } 1893 } 1894 1895 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 1896 switch (vlen) { 1897 case 2: 1898 assert(vtmp2 == xnoreg, ""); 1899 reduce2D(opcode, dst, src, vtmp1); 1900 break; 1901 case 4: 1902 reduce4D(opcode, dst, src, vtmp1, vtmp2); 1903 break; 1904 case 8: 1905 reduce8D(opcode, dst, src, vtmp1, vtmp2); 1906 break; 1907 default: assert(false, "wrong vector length"); 1908 } 1909 } 1910 1911 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1912 if (opcode == Op_AddReductionVI) { 1913 if (vtmp1 != src2) { 1914 movdqu(vtmp1, src2); 1915 } 1916 phaddd(vtmp1, vtmp1); 1917 } else { 1918 pshufd(vtmp1, src2, 0x1); 1919 reduce_operation_128(T_INT, opcode, vtmp1, src2); 1920 } 1921 movdl(vtmp2, src1); 1922 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 1923 movdl(dst, vtmp1); 1924 } 1925 1926 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1927 if (opcode == Op_AddReductionVI) { 1928 if (vtmp1 != src2) { 1929 movdqu(vtmp1, src2); 1930 } 1931 phaddd(vtmp1, src2); 1932 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1933 } else { 1934 pshufd(vtmp2, src2, 0xE); 1935 reduce_operation_128(T_INT, opcode, vtmp2, src2); 1936 reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1937 } 1938 } 1939 1940 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1941 if (opcode == Op_AddReductionVI) { 1942 vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit); 1943 vextracti128_high(vtmp2, vtmp1); 1944 vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit); 1945 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1946 } else { 1947 vextracti128_high(vtmp1, src2); 1948 reduce_operation_128(T_INT, opcode, vtmp1, src2); 1949 reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1950 } 1951 } 1952 1953 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1954 vextracti64x4_high(vtmp2, src2); 1955 reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2); 1956 reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1957 } 1958 1959 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1960 pshufd(vtmp2, src2, 0x1); 1961 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 1962 movdqu(vtmp1, vtmp2); 1963 psrldq(vtmp1, 2); 1964 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 1965 movdqu(vtmp2, vtmp1); 1966 psrldq(vtmp2, 1); 1967 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 1968 movdl(vtmp2, src1); 1969 pmovsxbd(vtmp1, vtmp1); 1970 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 1971 pextrb(dst, vtmp1, 0x0); 1972 movsbl(dst, dst); 1973 } 1974 1975 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1976 
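// Bring the upper 8 bytes down into the low half (pshufd 0xE selects the high
// quadword), combine them with the lower 8 bytes, then finish with the 8-byte
// reduction.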
pshufd(vtmp1, src2, 0xE); 1977 reduce_operation_128(T_BYTE, opcode, vtmp1, src2); 1978 reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1979 } 1980 1981 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1982 vextracti128_high(vtmp2, src2); 1983 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 1984 reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1985 } 1986 1987 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1988 vextracti64x4_high(vtmp1, src2); 1989 reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2); 1990 reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1991 } 1992 1993 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1994 pmovsxbw(vtmp2, src2); 1995 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1996 } 1997 1998 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1999 if (UseAVX > 1) { 2000 int vector_len = Assembler::AVX_256bit; 2001 vpmovsxbw(vtmp1, src2, vector_len); 2002 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2003 } else { 2004 pmovsxbw(vtmp2, src2); 2005 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2006 pshufd(vtmp2, src2, 0x1); 2007 pmovsxbw(vtmp2, src2); 2008 reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2009 } 2010 } 2011 2012 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2013 if (UseAVX > 2 && VM_Version::supports_avx512bw()) { 2014 int vector_len = Assembler::AVX_512bit; 2015 vpmovsxbw(vtmp1, src2, vector_len); 2016 reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2017 } else { 2018 assert(UseAVX >= 2,"Should not reach here."); 2019 mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); 2020 vextracti128_high(vtmp2, src2); 2021 mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2022 } 2023 } 2024 2025 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2026 mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); 2027 vextracti64x4_high(vtmp2, src2); 2028 mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2029 } 2030 2031 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2032 if (opcode == Op_AddReductionVI) { 2033 if (vtmp1 != src2) { 2034 movdqu(vtmp1, src2); 2035 } 2036 phaddw(vtmp1, vtmp1); 2037 phaddw(vtmp1, vtmp1); 2038 } else { 2039 pshufd(vtmp2, src2, 0x1); 2040 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2041 movdqu(vtmp1, vtmp2); 2042 psrldq(vtmp1, 2); 2043 reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2); 2044 } 2045 movdl(vtmp2, src1); 2046 pmovsxwd(vtmp1, vtmp1); 2047 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2048 pextrw(dst, vtmp1, 0x0); 2049 movswl(dst, dst); 2050 } 2051 2052 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2053 if (opcode == Op_AddReductionVI) { 2054 if (vtmp1 != src2) { 2055 movdqu(vtmp1, src2); 2056 } 2057 phaddw(vtmp1, src2); 2058 } else { 2059 pshufd(vtmp1, src2, 0xE); 2060 reduce_operation_128(T_SHORT, opcode, vtmp1, src2); 2061 } 2062 reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2063 } 2064 2065 void 
C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2066 if (opcode == Op_AddReductionVI) { 2067 int vector_len = Assembler::AVX_256bit; 2068 vphaddw(vtmp2, src2, src2, vector_len); 2069 vpermq(vtmp2, vtmp2, 0xD8, vector_len); 2070 } else { 2071 vextracti128_high(vtmp2, src2); 2072 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2073 } 2074 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2075 } 2076 2077 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2078 int vector_len = Assembler::AVX_256bit; 2079 vextracti64x4_high(vtmp1, src2); 2080 reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2); 2081 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2082 } 2083 2084 #ifdef _LP64 2085 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2086 pshufd(vtmp2, src2, 0xE); 2087 reduce_operation_128(T_LONG, opcode, vtmp2, src2); 2088 movdq(vtmp1, src1); 2089 reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2); 2090 movdq(dst, vtmp1); 2091 } 2092 2093 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2094 vextracti128_high(vtmp1, src2); 2095 reduce_operation_128(T_LONG, opcode, vtmp1, src2); 2096 reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2097 } 2098 2099 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2100 vextracti64x4_high(vtmp2, src2); 2101 reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2); 2102 reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2103 } 2104 2105 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) { 2106 mov64(temp, -1L); 2107 bzhiq(temp, temp, len); 2108 kmovql(dst, temp); 2109 } 2110 #endif // _LP64 2111 2112 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2113 reduce_operation_128(T_FLOAT, opcode, dst, src); 2114 pshufd(vtmp, src, 0x1); 2115 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2116 } 2117 2118 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2119 reduce2F(opcode, dst, src, vtmp); 2120 pshufd(vtmp, src, 0x2); 2121 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2122 pshufd(vtmp, src, 0x3); 2123 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2124 } 2125 2126 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2127 reduce4F(opcode, dst, src, vtmp2); 2128 vextractf128_high(vtmp2, src); 2129 reduce4F(opcode, dst, vtmp2, vtmp1); 2130 } 2131 2132 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2133 reduce8F(opcode, dst, src, vtmp1, vtmp2); 2134 vextracti64x4_high(vtmp1, src); 2135 reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2); 2136 } 2137 2138 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2139 reduce_operation_128(T_DOUBLE, opcode, dst, src); 2140 pshufd(vtmp, src, 0xE); 2141 reduce_operation_128(T_DOUBLE, opcode, dst, vtmp); 2142 } 2143 2144 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2145 reduce2D(opcode, dst, src, vtmp2); 2146 vextractf128_high(vtmp2, src); 2147 
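// Fold the upper pair of doubles (now in vtmp2) into the result accumulated from
// the lower pair.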
reduce2D(opcode, dst, vtmp2, vtmp1); 2148 } 2149 2150 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2151 reduce4D(opcode, dst, src, vtmp1, vtmp2); 2152 vextracti64x4_high(vtmp1, src); 2153 reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2); 2154 } 2155 2156 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) { 2157 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2158 } 2159 2160 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) { 2161 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2162 } 2163 2164 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask, 2165 int vec_enc) { 2166 switch(elem_bt) { 2167 case T_INT: 2168 case T_FLOAT: 2169 vmaskmovps(dst, src, mask, vec_enc); 2170 break; 2171 case T_LONG: 2172 case T_DOUBLE: 2173 vmaskmovpd(dst, src, mask, vec_enc); 2174 break; 2175 default: 2176 fatal("Unsupported type %s", type2name(elem_bt)); 2177 break; 2178 } 2179 } 2180 2181 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask, 2182 int vec_enc) { 2183 switch(elem_bt) { 2184 case T_INT: 2185 case T_FLOAT: 2186 vmaskmovps(dst, src, mask, vec_enc); 2187 break; 2188 case T_LONG: 2189 case T_DOUBLE: 2190 vmaskmovpd(dst, src, mask, vec_enc); 2191 break; 2192 default: 2193 fatal("Unsupported type %s", type2name(elem_bt)); 2194 break; 2195 } 2196 } 2197 2198 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid, 2199 XMMRegister dst, XMMRegister src, 2200 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2201 XMMRegister xmm_0, XMMRegister xmm_1) { 2202 const int permconst[] = {1, 14}; 2203 XMMRegister wsrc = src; 2204 XMMRegister wdst = xmm_0; 2205 XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1; 2206 2207 int vlen_enc = Assembler::AVX_128bit; 2208 if (vlen == 16) { 2209 vlen_enc = Assembler::AVX_256bit; 2210 } 2211 2212 for (int i = log2(vlen) - 1; i >=0; i--) { 2213 if (i == 0 && !is_dst_valid) { 2214 wdst = dst; 2215 } 2216 if (i == 3) { 2217 vextracti64x4_high(wtmp, wsrc); 2218 } else if (i == 2) { 2219 vextracti128_high(wtmp, wsrc); 2220 } else { // i = [0,1] 2221 vpermilps(wtmp, wsrc, permconst[i], vlen_enc); 2222 } 2223 vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2224 wsrc = wdst; 2225 vlen_enc = Assembler::AVX_128bit; 2226 } 2227 if (is_dst_valid) { 2228 vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2229 } 2230 } 2231 2232 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src, 2233 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2234 XMMRegister xmm_0, XMMRegister xmm_1) { 2235 XMMRegister wsrc = src; 2236 XMMRegister wdst = xmm_0; 2237 XMMRegister wtmp = (xmm_1 == xnoreg) ? 
xmm_0: xmm_1; 2238 int vlen_enc = Assembler::AVX_128bit; 2239 if (vlen == 8) { 2240 vlen_enc = Assembler::AVX_256bit; 2241 } 2242 for (int i = log2(vlen) - 1; i >=0; i--) { 2243 if (i == 0 && !is_dst_valid) { 2244 wdst = dst; 2245 } 2246 if (i == 1) { 2247 vextracti128_high(wtmp, wsrc); 2248 } else if (i == 2) { 2249 vextracti64x4_high(wtmp, wsrc); 2250 } else { 2251 assert(i == 0, "%d", i); 2252 vpermilpd(wtmp, wsrc, 1, vlen_enc); 2253 } 2254 vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2255 wsrc = wdst; 2256 vlen_enc = Assembler::AVX_128bit; 2257 } 2258 if (is_dst_valid) { 2259 vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2260 } 2261 } 2262 2263 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) { 2264 switch (bt) { 2265 case T_BYTE: pextrb(dst, src, idx); break; 2266 case T_SHORT: pextrw(dst, src, idx); break; 2267 case T_INT: pextrd(dst, src, idx); break; 2268 case T_LONG: pextrq(dst, src, idx); break; 2269 2270 default: 2271 assert(false,"Should not reach here."); 2272 break; 2273 } 2274 } 2275 2276 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) { 2277 int esize = type2aelembytes(typ); 2278 int elem_per_lane = 16/esize; 2279 int lane = elemindex / elem_per_lane; 2280 int eindex = elemindex % elem_per_lane; 2281 2282 if (lane >= 2) { 2283 assert(UseAVX > 2, "required"); 2284 vextractf32x4(dst, src, lane & 3); 2285 return dst; 2286 } else if (lane > 0) { 2287 assert(UseAVX > 0, "required"); 2288 vextractf128(dst, src, lane); 2289 return dst; 2290 } else { 2291 return src; 2292 } 2293 } 2294 2295 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) { 2296 if (typ == T_BYTE) { 2297 movsbl(dst, dst); 2298 } else if (typ == T_SHORT) { 2299 movswl(dst, dst); 2300 } 2301 } 2302 2303 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) { 2304 int esize = type2aelembytes(typ); 2305 int elem_per_lane = 16/esize; 2306 int eindex = elemindex % elem_per_lane; 2307 assert(is_integral_type(typ),"required"); 2308 2309 if (eindex == 0) { 2310 if (typ == T_LONG) { 2311 movq(dst, src); 2312 } else { 2313 movdl(dst, src); 2314 movsxl(typ, dst); 2315 } 2316 } else { 2317 extract(typ, dst, src, eindex); 2318 movsxl(typ, dst); 2319 } 2320 } 2321 2322 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) { 2323 int esize = type2aelembytes(typ); 2324 int elem_per_lane = 16/esize; 2325 int eindex = elemindex % elem_per_lane; 2326 assert((typ == T_FLOAT || typ == T_DOUBLE),"required"); 2327 2328 if (eindex == 0) { 2329 movq(dst, src); 2330 } else { 2331 if (typ == T_FLOAT) { 2332 if (UseAVX == 0) { 2333 movdqu(dst, src); 2334 shufps(dst, dst, eindex); 2335 } else { 2336 vshufps(dst, src, src, eindex, Assembler::AVX_128bit); 2337 } 2338 } else { 2339 if (UseAVX == 0) { 2340 movdqu(dst, src); 2341 psrldq(dst, eindex*esize); 2342 } else { 2343 vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit); 2344 } 2345 movq(dst, dst); 2346 } 2347 } 2348 // Zero upper bits 2349 if (typ == T_FLOAT) { 2350 if (UseAVX == 0) { 2351 assert(vtmp != xnoreg, "required."); 2352 movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg); 2353 pand(dst, vtmp); 2354 } else { 2355 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg); 2356 } 2357 } 2358 } 2359 2360 void 
C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) { 2361 switch(typ) { 2362 case T_BYTE: 2363 case T_BOOLEAN: 2364 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2365 break; 2366 case T_SHORT: 2367 case T_CHAR: 2368 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2369 break; 2370 case T_INT: 2371 case T_FLOAT: 2372 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2373 break; 2374 case T_LONG: 2375 case T_DOUBLE: 2376 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2377 break; 2378 default: 2379 assert(false,"Should not reach here."); 2380 break; 2381 } 2382 } 2383 2384 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) { 2385 assert(rscratch != noreg || always_reachable(src2), "missing"); 2386 2387 switch(typ) { 2388 case T_BOOLEAN: 2389 case T_BYTE: 2390 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2391 break; 2392 case T_CHAR: 2393 case T_SHORT: 2394 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2395 break; 2396 case T_INT: 2397 case T_FLOAT: 2398 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2399 break; 2400 case T_LONG: 2401 case T_DOUBLE: 2402 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2403 break; 2404 default: 2405 assert(false,"Should not reach here."); 2406 break; 2407 } 2408 } 2409 2410 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) { 2411 switch(typ) { 2412 case T_BYTE: 2413 evpblendmb(dst, kmask, src1, src2, merge, vector_len); 2414 break; 2415 case T_SHORT: 2416 evpblendmw(dst, kmask, src1, src2, merge, vector_len); 2417 break; 2418 case T_INT: 2419 case T_FLOAT: 2420 evpblendmd(dst, kmask, src1, src2, merge, vector_len); 2421 break; 2422 case T_LONG: 2423 case T_DOUBLE: 2424 evpblendmq(dst, kmask, src1, src2, merge, vector_len); 2425 break; 2426 default: 2427 assert(false,"Should not reach here."); 2428 break; 2429 } 2430 } 2431 2432 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) { 2433 assert(vlen_in_bytes <= 32, ""); 2434 int esize = type2aelembytes(bt); 2435 if (vlen_in_bytes == 32) { 2436 assert(vtmp == xnoreg, "required."); 2437 if (esize >= 4) { 2438 vtestps(src1, src2, AVX_256bit); 2439 } else { 2440 vptest(src1, src2, AVX_256bit); 2441 } 2442 return; 2443 } 2444 if (vlen_in_bytes < 16) { 2445 // Duplicate the lower part to fill the whole register, 2446 // Don't need to do so for src2 2447 assert(vtmp != xnoreg, "required"); 2448 int shuffle_imm = (vlen_in_bytes == 4) ? 
0x00 : 0x04; 2449 pshufd(vtmp, src1, shuffle_imm); 2450 } else { 2451 assert(vtmp == xnoreg, "required"); 2452 vtmp = src1; 2453 } 2454 if (esize >= 4 && VM_Version::supports_avx()) { 2455 vtestps(vtmp, src2, AVX_128bit); 2456 } else { 2457 ptest(vtmp, src2); 2458 } 2459 } 2460 2461 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) { 2462 assert(UseAVX >= 2, "required"); 2463 #ifdef ASSERT 2464 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2465 bool is_bw_supported = VM_Version::supports_avx512bw(); 2466 if (is_bw && !is_bw_supported) { 2467 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2468 assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16), 2469 "XMM register should be 0-15"); 2470 } 2471 #endif // ASSERT 2472 switch (elem_bt) { 2473 case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return; 2474 case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return; 2475 case T_INT: vpaddd(dst, src1, src2, vlen_enc); return; 2476 case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return; 2477 case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return; 2478 case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return; 2479 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2480 } 2481 } 2482 2483 #ifdef _LP64 2484 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) { 2485 assert(UseAVX >= 2, "required"); 2486 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2487 bool is_vl = vlen_enc != Assembler::AVX_512bit; 2488 if ((UseAVX > 2) && 2489 (!is_bw || VM_Version::supports_avx512bw()) && 2490 (!is_vl || VM_Version::supports_avx512vl())) { 2491 switch (elem_bt) { 2492 case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return; 2493 case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return; 2494 case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return; 2495 case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return; 2496 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2497 } 2498 } else { 2499 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2500 assert((dst->encoding() < 16),"XMM register should be 0-15"); 2501 switch (elem_bt) { 2502 case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return; 2503 case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return; 2504 case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return; 2505 case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return; 2506 case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return; 2507 case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return; 2508 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2509 } 2510 } 2511 } 2512 #endif 2513 2514 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) { 2515 switch (to_elem_bt) { 2516 case T_SHORT: 2517 vpmovsxbw(dst, src, vlen_enc); 2518 break; 2519 case T_INT: 2520 vpmovsxbd(dst, src, vlen_enc); 2521 break; 2522 case T_FLOAT: 2523 vpmovsxbd(dst, src, vlen_enc); 2524 vcvtdq2ps(dst, dst, vlen_enc); 2525 break; 2526 case T_LONG: 2527 vpmovsxbq(dst, src, vlen_enc); 2528 break; 2529 case T_DOUBLE: { 2530 int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? 
Assembler::AVX_256bit : Assembler::AVX_128bit; 2531 vpmovsxbd(dst, src, mid_vlen_enc); 2532 vcvtdq2pd(dst, dst, vlen_enc); 2533 break; 2534 } 2535 default: 2536 fatal("Unsupported type %s", type2name(to_elem_bt)); 2537 break; 2538 } 2539 } 2540 2541 //------------------------------------------------------------------------------------------- 2542 2543 // IndexOf for constant substrings with size >= 8 chars 2544 // which don't need to be loaded through stack. 2545 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2, 2546 Register cnt1, Register cnt2, 2547 int int_cnt2, Register result, 2548 XMMRegister vec, Register tmp, 2549 int ae) { 2550 ShortBranchVerifier sbv(this); 2551 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2552 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2553 2554 // This method uses the pcmpestri instruction with bound registers 2555 // inputs: 2556 // xmm - substring 2557 // rax - substring length (elements count) 2558 // mem - scanned string 2559 // rdx - string length (elements count) 2560 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2561 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2562 // outputs: 2563 // rcx - matched index in string 2564 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2565 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2566 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2567 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2568 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1; 2569 2570 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, 2571 RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR, 2572 MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE; 2573 2574 // Note, inline_string_indexOf() generates checks: 2575 // if (substr.count > string.count) return -1; 2576 // if (substr.count == 0) return 0; 2577 assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars"); 2578 2579 // Load substring. 2580 if (ae == StrIntrinsicNode::UL) { 2581 pmovzxbw(vec, Address(str2, 0)); 2582 } else { 2583 movdqu(vec, Address(str2, 0)); 2584 } 2585 movl(cnt2, int_cnt2); 2586 movptr(result, str1); // string addr 2587 2588 if (int_cnt2 > stride) { 2589 jmpb(SCAN_TO_SUBSTR); 2590 2591 // Reload substr for rescan, this code 2592 // is executed only for large substrings (> 8 chars) 2593 bind(RELOAD_SUBSTR); 2594 if (ae == StrIntrinsicNode::UL) { 2595 pmovzxbw(vec, Address(str2, 0)); 2596 } else { 2597 movdqu(vec, Address(str2, 0)); 2598 } 2599 negptr(cnt2); // Jumped here with negative cnt2, convert to positive 2600 2601 bind(RELOAD_STR); 2602 // We came here after the beginning of the substring was 2603 // matched but the rest of it was not so we need to search 2604 // again. Start from the next element after the previous match. 2605 2606 // cnt2 is number of substring remaining elements and 2607 // cnt1 is number of string remaining elements when cmp failed.
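// Since int_cnt2 - cnt2 is how far the failed attempt advanced into the string,
// adding it back to cnt1 measures the remaining length from the start of the
// failed candidate again: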
2608 // Restored cnt1 = cnt1 - cnt2 + int_cnt2 2609 subl(cnt1, cnt2); 2610 addl(cnt1, int_cnt2); 2611 movl(cnt2, int_cnt2); // Now restore cnt2 2612 2613 decrementl(cnt1); // Shift to next element 2614 cmpl(cnt1, cnt2); 2615 jcc(Assembler::negative, RET_NOT_FOUND); // Left less than substring 2616 2617 addptr(result, (1<<scale1)); 2618 2619 } // (int_cnt2 > 8) 2620 2621 // Scan string for start of substr in 16-byte vectors 2622 bind(SCAN_TO_SUBSTR); 2623 pcmpestri(vec, Address(result, 0), mode); 2624 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 2625 subl(cnt1, stride); 2626 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 2627 cmpl(cnt1, cnt2); 2628 jccb(Assembler::negative, RET_NOT_FOUND); // Left less than substring 2629 addptr(result, 16); 2630 jmpb(SCAN_TO_SUBSTR); 2631 2632 // Found a potential substr 2633 bind(FOUND_CANDIDATE); 2634 // Matched whole vector if first element matched (tmp(rcx) == 0). 2635 if (int_cnt2 == stride) { 2636 jccb(Assembler::overflow, RET_FOUND); // OF == 1 2637 } else { // int_cnt2 > 8 2638 jccb(Assembler::overflow, FOUND_SUBSTR); 2639 } 2640 // After pcmpestri tmp(rcx) contains matched element index 2641 // Compute start addr of substr 2642 lea(result, Address(result, tmp, scale1)); 2643 2644 // Make sure string is still long enough 2645 subl(cnt1, tmp); 2646 cmpl(cnt1, cnt2); 2647 if (int_cnt2 == stride) { 2648 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 2649 } else { // int_cnt2 > 8 2650 jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD); 2651 } 2652 // Left less than substring. 2653 2654 bind(RET_NOT_FOUND); 2655 movl(result, -1); 2656 jmp(EXIT); 2657 2658 if (int_cnt2 > stride) { 2659 // This code is optimized for the case when whole substring 2660 // is matched if its head is matched. 2661 bind(MATCH_SUBSTR_HEAD); 2662 pcmpestri(vec, Address(result, 0), mode); 2663 // Reload only the string if it does not match 2664 jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0 2665 2666 Label CONT_SCAN_SUBSTR; 2667 // Compare the rest of substring (> 8 chars). 2668 bind(FOUND_SUBSTR); 2669 // First 8 chars are already matched.
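// From here cnt2 is kept as a negative, stride-biased index: it addresses the
// not-yet-compared tail of the substring relative to the substring's end and
// counts up toward zero; once it becomes non-negative the whole substring has
// been compared.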
2670 negptr(cnt2); 2671 addptr(cnt2, stride); 2672 2673 bind(SCAN_SUBSTR); 2674 subl(cnt1, stride); 2675 cmpl(cnt2, -stride); // Do not read beyond substring 2676 jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR); 2677 // Back-up strings to avoid reading beyond substring: 2678 // cnt1 = cnt1 - cnt2 + 8 2679 addl(cnt1, cnt2); // cnt2 is negative 2680 addl(cnt1, stride); 2681 movl(cnt2, stride); negptr(cnt2); 2682 bind(CONT_SCAN_SUBSTR); 2683 if (int_cnt2 < (int)G) { 2684 int tail_off1 = int_cnt2<<scale1; 2685 int tail_off2 = int_cnt2<<scale2; 2686 if (ae == StrIntrinsicNode::UL) { 2687 pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2)); 2688 } else { 2689 movdqu(vec, Address(str2, cnt2, scale2, tail_off2)); 2690 } 2691 pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode); 2692 } else { 2693 // calculate index in register to avoid integer overflow (int_cnt2*2) 2694 movl(tmp, int_cnt2); 2695 addptr(tmp, cnt2); 2696 if (ae == StrIntrinsicNode::UL) { 2697 pmovzxbw(vec, Address(str2, tmp, scale2, 0)); 2698 } else { 2699 movdqu(vec, Address(str2, tmp, scale2, 0)); 2700 } 2701 pcmpestri(vec, Address(result, tmp, scale1, 0), mode); 2702 } 2703 // Need to reload strings pointers if not matched whole vector 2704 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 2705 addptr(cnt2, stride); 2706 jcc(Assembler::negative, SCAN_SUBSTR); 2707 // Fall through if found full substring 2708 2709 } // (int_cnt2 > 8) 2710 2711 bind(RET_FOUND); 2712 // Found result if we matched full small substring. 2713 // Compute substr offset 2714 subptr(result, str1); 2715 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 2716 shrl(result, 1); // index 2717 } 2718 bind(EXIT); 2719 2720 } // string_indexofC8 2721 2722 // Small strings are loaded through stack if they cross page boundary. 2723 void C2_MacroAssembler::string_indexof(Register str1, Register str2, 2724 Register cnt1, Register cnt2, 2725 int int_cnt2, Register result, 2726 XMMRegister vec, Register tmp, 2727 int ae) { 2728 ShortBranchVerifier sbv(this); 2729 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2730 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2731 2732 // 2733 // int_cnt2 is length of small (< 8 chars) constant substring 2734 // or (-1) for non constant substring in which case its length 2735 // is in cnt2 register. 2736 // 2737 // Note, inline_string_indexOf() generates checks: 2738 // if (substr.count > string.count) return -1; 2739 // if (substr.count == 0) return 0; 2740 // 2741 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2742 assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0"); 2743 // This method uses the pcmpestri instruction with bound registers 2744 // inputs: 2745 // xmm - substring 2746 // rax - substring length (elements count) 2747 // mem - scanned string 2748 // rdx - string length (elements count) 2749 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2750 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2751 // outputs: 2752 // rcx - matched index in string 2753 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2754 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2755 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2756 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? 
Address::times_1 : scale1; 2757 2758 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR, 2759 RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR, 2760 FOUND_CANDIDATE; 2761 2762 { //======================================================== 2763 // We don't know where these strings are located 2764 // and we can't read beyond them. Load them through stack. 2765 Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR; 2766 2767 movptr(tmp, rsp); // save old SP 2768 2769 if (int_cnt2 > 0) { // small (< 8 chars) constant substring 2770 if (int_cnt2 == (1>>scale2)) { // One byte 2771 assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding"); 2772 load_unsigned_byte(result, Address(str2, 0)); 2773 movdl(vec, result); // move 32 bits 2774 } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) { // Three bytes 2775 // Not enough header space in 32-bit VM: 12+3 = 15. 2776 movl(result, Address(str2, -1)); 2777 shrl(result, 8); 2778 movdl(vec, result); // move 32 bits 2779 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) { // One char 2780 load_unsigned_short(result, Address(str2, 0)); 2781 movdl(vec, result); // move 32 bits 2782 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars 2783 movdl(vec, Address(str2, 0)); // move 32 bits 2784 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars 2785 movq(vec, Address(str2, 0)); // move 64 bits 2786 } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7}) 2787 // Array header size is 12 bytes in 32-bit VM 2788 // + 6 bytes for 3 chars == 18 bytes, 2789 // enough space to load vec and shift. 2790 assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity"); 2791 if (ae == StrIntrinsicNode::UL) { 2792 int tail_off = int_cnt2-8; 2793 pmovzxbw(vec, Address(str2, tail_off)); 2794 psrldq(vec, -2*tail_off); 2795 } 2796 else { 2797 int tail_off = int_cnt2*(1<<scale2); 2798 movdqu(vec, Address(str2, tail_off-16)); 2799 psrldq(vec, 16-tail_off); 2800 } 2801 } 2802 } else { // not constant substring 2803 cmpl(cnt2, stride); 2804 jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough 2805 2806 // We can read beyond string if str+16 does not cross page boundary 2807 // since heaps are aligned and mapped by pages. 2808 assert(os::vm_page_size() < (int)G, "default page should be small"); 2809 movl(result, str2); // We need only low 32 bits 2810 andl(result, ((int)os::vm_page_size()-1)); 2811 cmpl(result, ((int)os::vm_page_size()-16)); 2812 jccb(Assembler::belowEqual, CHECK_STR); 2813 2814 // Move small strings to stack to allow loading 16 bytes into vec. 2815 subptr(rsp, 16); 2816 int stk_offset = wordSize-(1<<scale2); 2817 push(cnt2); 2818 2819 bind(COPY_SUBSTR); 2820 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) { 2821 load_unsigned_byte(result, Address(str2, cnt2, scale2, -1)); 2822 movb(Address(rsp, cnt2, scale2, stk_offset), result); 2823 } else if (ae == StrIntrinsicNode::UU) { 2824 load_unsigned_short(result, Address(str2, cnt2, scale2, -2)); 2825 movw(Address(rsp, cnt2, scale2, stk_offset), result); 2826 } 2827 decrement(cnt2); 2828 jccb(Assembler::notZero, COPY_SUBSTR); 2829 2830 pop(cnt2); 2831 movptr(str2, rsp); // New substring address 2832 } // non constant 2833 2834 bind(CHECK_STR); 2835 cmpl(cnt1, stride); 2836 jccb(Assembler::aboveEqual, BIG_STRINGS); 2837 2838 // Check cross page boundary.
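// str1 holds fewer than 'stride' elements at this point, so a single 16-byte load
// covers it; that load is safe as long as the 16-byte window starting at str1 stays
// within the current page, i.e. (str1 & (page_size-1)) <= page_size-16.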
2839 movl(result, str1); // We need only low 32 bits 2840 andl(result, ((int)os::vm_page_size()-1)); 2841 cmpl(result, ((int)os::vm_page_size()-16)); 2842 jccb(Assembler::belowEqual, BIG_STRINGS); 2843 2844 subptr(rsp, 16); 2845 int stk_offset = -(1<<scale1); 2846 if (int_cnt2 < 0) { // not constant 2847 push(cnt2); 2848 stk_offset += wordSize; 2849 } 2850 movl(cnt2, cnt1); 2851 2852 bind(COPY_STR); 2853 if (ae == StrIntrinsicNode::LL) { 2854 load_unsigned_byte(result, Address(str1, cnt2, scale1, -1)); 2855 movb(Address(rsp, cnt2, scale1, stk_offset), result); 2856 } else { 2857 load_unsigned_short(result, Address(str1, cnt2, scale1, -2)); 2858 movw(Address(rsp, cnt2, scale1, stk_offset), result); 2859 } 2860 decrement(cnt2); 2861 jccb(Assembler::notZero, COPY_STR); 2862 2863 if (int_cnt2 < 0) { // not constant 2864 pop(cnt2); 2865 } 2866 movptr(str1, rsp); // New string address 2867 2868 bind(BIG_STRINGS); 2869 // Load substring. 2870 if (int_cnt2 < 0) { // -1 2871 if (ae == StrIntrinsicNode::UL) { 2872 pmovzxbw(vec, Address(str2, 0)); 2873 } else { 2874 movdqu(vec, Address(str2, 0)); 2875 } 2876 push(cnt2); // substr count 2877 push(str2); // substr addr 2878 push(str1); // string addr 2879 } else { 2880 // Small (< 8 chars) constant substrings are loaded already. 2881 movl(cnt2, int_cnt2); 2882 } 2883 push(tmp); // original SP 2884 2885 } // Finished loading 2886 2887 //======================================================== 2888 // Start search 2889 // 2890 2891 movptr(result, str1); // string addr 2892 2893 if (int_cnt2 < 0) { // Only for non constant substring 2894 jmpb(SCAN_TO_SUBSTR); 2895 2896 // SP saved at sp+0 2897 // String saved at sp+1*wordSize 2898 // Substr saved at sp+2*wordSize 2899 // Substr count saved at sp+3*wordSize 2900 2901 // Reload substr for rescan, this code 2902 // is executed only for large substrings (> 8 chars) 2903 bind(RELOAD_SUBSTR); 2904 movptr(str2, Address(rsp, 2*wordSize)); 2905 movl(cnt2, Address(rsp, 3*wordSize)); 2906 if (ae == StrIntrinsicNode::UL) { 2907 pmovzxbw(vec, Address(str2, 0)); 2908 } else { 2909 movdqu(vec, Address(str2, 0)); 2910 } 2911 // We came here after the beginning of the substring was 2912 // matched but the rest of it was not so we need to search 2913 // again. Start from the next element after the previous match. 2914 subptr(str1, result); // Restore counter 2915 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 2916 shrl(str1, 1); 2917 } 2918 addl(cnt1, str1); 2919 decrementl(cnt1); // Shift to next element 2920 cmpl(cnt1, cnt2); 2921 jcc(Assembler::negative, RET_NOT_FOUND); // Left less than substring 2922 2923 addptr(result, (1<<scale1)); 2924 } // non constant 2925 2926 // Scan string for start of substr in 16-byte vectors 2927 bind(SCAN_TO_SUBSTR); 2928 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2929 pcmpestri(vec, Address(result, 0), mode); 2930 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 2931 subl(cnt1, stride); 2932 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 2933 cmpl(cnt1, cnt2); 2934 jccb(Assembler::negative, RET_NOT_FOUND); // Left less than substring 2935 addptr(result, 16); 2936 2937 bind(ADJUST_STR); 2938 cmpl(cnt1, stride); // Do not read beyond string 2939 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 2940 // Back-up string to avoid reading beyond string.
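// Re-position result so that the final 16-byte window ends exactly at the end of
// the string (result += cnt1*element_size - 16) and pretend a full stride remains.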
2941 lea(result, Address(result, cnt1, scale1, -16)); 2942 movl(cnt1, stride); 2943 jmpb(SCAN_TO_SUBSTR); 2944 2945 // Found a potential substr 2946 bind(FOUND_CANDIDATE); 2947 // After pcmpestri tmp(rcx) contains matched element index 2948 2949 // Make sure string is still long enough 2950 subl(cnt1, tmp); 2951 cmpl(cnt1, cnt2); 2952 jccb(Assembler::greaterEqual, FOUND_SUBSTR); 2953 // Left less than substring. 2954 2955 bind(RET_NOT_FOUND); 2956 movl(result, -1); 2957 jmp(CLEANUP); 2958 2959 bind(FOUND_SUBSTR); 2960 // Compute start addr of substr 2961 lea(result, Address(result, tmp, scale1)); 2962 if (int_cnt2 > 0) { // Constant substring 2963 // Repeat search for small substring (< 8 chars) 2964 // from new point without reloading substring. 2965 // Have to check that we don't read beyond string. 2966 cmpl(tmp, stride-int_cnt2); 2967 jccb(Assembler::greater, ADJUST_STR); 2968 // Fall through if matched whole substring. 2969 } else { // non constant 2970 assert(int_cnt2 == -1, "should be != 0"); 2971 2972 addl(tmp, cnt2); 2973 // Found result if we matched whole substring. 2974 cmpl(tmp, stride); 2975 jcc(Assembler::lessEqual, RET_FOUND); 2976 2977 // Repeat search for small substring (<= 8 chars) 2978 // from new point 'str1' without reloading substring. 2979 cmpl(cnt2, stride); 2980 // Have to check that we don't read beyond string. 2981 jccb(Assembler::lessEqual, ADJUST_STR); 2982 2983 Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG; 2984 // Compare the rest of substring (> 8 chars). 2985 movptr(str1, result); 2986 2987 cmpl(tmp, cnt2); 2988 // First 8 chars are already matched. 2989 jccb(Assembler::equal, CHECK_NEXT); 2990 2991 bind(SCAN_SUBSTR); 2992 pcmpestri(vec, Address(str1, 0), mode); 2993 // Need to reload strings pointers if not matched whole vector 2994 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 2995 2996 bind(CHECK_NEXT); 2997 subl(cnt2, stride); 2998 jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring 2999 addptr(str1, 16); 3000 if (ae == StrIntrinsicNode::UL) { 3001 addptr(str2, 8); 3002 } else { 3003 addptr(str2, 16); 3004 } 3005 subl(cnt1, stride); 3006 cmpl(cnt2, stride); // Do not read beyond substring 3007 jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR); 3008 // Back-up strings to avoid reading beyond substring.
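// Move both pointers back so the last compare window (8 substring bytes for UL,
// 16 bytes otherwise) ends exactly at the end of the substring, and rebias the
// counters to match: cnt1 = cnt1 - cnt2 + stride, cnt2 = stride.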
3009 3010 if (ae == StrIntrinsicNode::UL) { 3011 lea(str2, Address(str2, cnt2, scale2, -8)); 3012 lea(str1, Address(str1, cnt2, scale1, -16)); 3013 } else { 3014 lea(str2, Address(str2, cnt2, scale2, -16)); 3015 lea(str1, Address(str1, cnt2, scale1, -16)); 3016 } 3017 subl(cnt1, cnt2); 3018 movl(cnt2, stride); 3019 addl(cnt1, stride); 3020 bind(CONT_SCAN_SUBSTR); 3021 if (ae == StrIntrinsicNode::UL) { 3022 pmovzxbw(vec, Address(str2, 0)); 3023 } else { 3024 movdqu(vec, Address(str2, 0)); 3025 } 3026 jmp(SCAN_SUBSTR); 3027 3028 bind(RET_FOUND_LONG); 3029 movptr(str1, Address(rsp, wordSize)); 3030 } // non constant 3031 3032 bind(RET_FOUND); 3033 // Compute substr offset 3034 subptr(result, str1); 3035 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3036 shrl(result, 1); // index 3037 } 3038 bind(CLEANUP); 3039 pop(rsp); // restore SP 3040 3041 } // string_indexof 3042 3043 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3044 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3045 ShortBranchVerifier sbv(this); 3046 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3047 3048 int stride = 8; 3049 3050 Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP, 3051 SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP, 3052 RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT, 3053 FOUND_SEQ_CHAR, DONE_LABEL; 3054 3055 movptr(result, str1); 3056 if (UseAVX >= 2) { 3057 cmpl(cnt1, stride); 3058 jcc(Assembler::less, SCAN_TO_CHAR); 3059 cmpl(cnt1, 2*stride); 3060 jcc(Assembler::less, SCAN_TO_8_CHAR_INIT); 3061 movdl(vec1, ch); 3062 vpbroadcastw(vec1, vec1, Assembler::AVX_256bit); 3063 vpxor(vec2, vec2); 3064 movl(tmp, cnt1); 3065 andl(tmp, 0xFFFFFFF0); //vector count (in chars) 3066 andl(cnt1,0x0000000F); //tail count (in chars) 3067 3068 bind(SCAN_TO_16_CHAR_LOOP); 3069 vmovdqu(vec3, Address(result, 0)); 3070 vpcmpeqw(vec3, vec3, vec1, 1); 3071 vptest(vec2, vec3); 3072 jcc(Assembler::carryClear, FOUND_CHAR); 3073 addptr(result, 32); 3074 subl(tmp, 2*stride); 3075 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); 3076 jmp(SCAN_TO_8_CHAR); 3077 bind(SCAN_TO_8_CHAR_INIT); 3078 movdl(vec1, ch); 3079 pshuflw(vec1, vec1, 0x00); 3080 pshufd(vec1, vec1, 0); 3081 pxor(vec2, vec2); 3082 } 3083 bind(SCAN_TO_8_CHAR); 3084 cmpl(cnt1, stride); 3085 jcc(Assembler::less, SCAN_TO_CHAR); 3086 if (UseAVX < 2) { 3087 movdl(vec1, ch); 3088 pshuflw(vec1, vec1, 0x00); 3089 pshufd(vec1, vec1, 0); 3090 pxor(vec2, vec2); 3091 } 3092 movl(tmp, cnt1); 3093 andl(tmp, 0xFFFFFFF8); //vector count (in chars) 3094 andl(cnt1,0x00000007); //tail count (in chars) 3095 3096 bind(SCAN_TO_8_CHAR_LOOP); 3097 movdqu(vec3, Address(result, 0)); 3098 pcmpeqw(vec3, vec1); 3099 ptest(vec2, vec3); 3100 jcc(Assembler::carryClear, FOUND_CHAR); 3101 addptr(result, 16); 3102 subl(tmp, stride); 3103 jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP); 3104 bind(SCAN_TO_CHAR); 3105 testl(cnt1, cnt1); 3106 jcc(Assembler::zero, RET_NOT_FOUND); 3107 bind(SCAN_TO_CHAR_LOOP); 3108 load_unsigned_short(tmp, Address(result, 0)); 3109 cmpl(ch, tmp); 3110 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3111 addptr(result, 2); 3112 subl(cnt1, 1); 3113 jccb(Assembler::zero, RET_NOT_FOUND); 3114 jmp(SCAN_TO_CHAR_LOOP); 3115 3116 bind(RET_NOT_FOUND); 3117 movl(result, -1); 3118 jmpb(DONE_LABEL); 3119 3120 bind(FOUND_CHAR); 3121 if (UseAVX >= 2) { 3122 vpmovmskb(tmp, vec3); 3123 } else { 3124 pmovmskb(tmp, vec3); 3125 } 3126 bsfl(ch, tmp); 3127 addptr(result, ch); 3128 3129 bind(FOUND_SEQ_CHAR); 3130 
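// result currently holds the address of the matching char; turn it into a
// zero-based char index (byte offset from str1, divided by 2).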
subptr(result, str1); 3131 shrl(result, 1); 3132 3133 bind(DONE_LABEL); 3134 } // string_indexof_char 3135 3136 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3137 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3138 ShortBranchVerifier sbv(this); 3139 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3140 3141 int stride = 16; 3142 3143 Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP, 3144 SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP, 3145 RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT, 3146 FOUND_SEQ_CHAR, DONE_LABEL; 3147 3148 movptr(result, str1); 3149 if (UseAVX >= 2) { 3150 cmpl(cnt1, stride); 3151 jcc(Assembler::less, SCAN_TO_CHAR_INIT); 3152 cmpl(cnt1, stride*2); 3153 jcc(Assembler::less, SCAN_TO_16_CHAR_INIT); 3154 movdl(vec1, ch); 3155 vpbroadcastb(vec1, vec1, Assembler::AVX_256bit); 3156 vpxor(vec2, vec2); 3157 movl(tmp, cnt1); 3158 andl(tmp, 0xFFFFFFE0); //vector count (in chars) 3159 andl(cnt1,0x0000001F); //tail count (in chars) 3160 3161 bind(SCAN_TO_32_CHAR_LOOP); 3162 vmovdqu(vec3, Address(result, 0)); 3163 vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit); 3164 vptest(vec2, vec3); 3165 jcc(Assembler::carryClear, FOUND_CHAR); 3166 addptr(result, 32); 3167 subl(tmp, stride*2); 3168 jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP); 3169 jmp(SCAN_TO_16_CHAR); 3170 3171 bind(SCAN_TO_16_CHAR_INIT); 3172 movdl(vec1, ch); 3173 pxor(vec2, vec2); 3174 pshufb(vec1, vec2); 3175 } 3176 3177 bind(SCAN_TO_16_CHAR); 3178 cmpl(cnt1, stride); 3179 jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left 3180 if (UseAVX < 2) { 3181 movdl(vec1, ch); 3182 pxor(vec2, vec2); 3183 pshufb(vec1, vec2); 3184 } 3185 movl(tmp, cnt1); 3186 andl(tmp, 0xFFFFFFF0); //vector count (in bytes) 3187 andl(cnt1,0x0000000F); //tail count (in bytes) 3188 3189 bind(SCAN_TO_16_CHAR_LOOP); 3190 movdqu(vec3, Address(result, 0)); 3191 pcmpeqb(vec3, vec1); 3192 ptest(vec2, vec3); 3193 jcc(Assembler::carryClear, FOUND_CHAR); 3194 addptr(result, 16); 3195 subl(tmp, stride); 3196 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items... 
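// Fewer than 16 bytes remain at this point (the tail count is in cnt1);
// fall through to the scalar byte-by-byte scan below.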
3197 3198 bind(SCAN_TO_CHAR_INIT); 3199 testl(cnt1, cnt1); 3200 jcc(Assembler::zero, RET_NOT_FOUND); 3201 bind(SCAN_TO_CHAR_LOOP); 3202 load_unsigned_byte(tmp, Address(result, 0)); 3203 cmpl(ch, tmp); 3204 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3205 addptr(result, 1); 3206 subl(cnt1, 1); 3207 jccb(Assembler::zero, RET_NOT_FOUND); 3208 jmp(SCAN_TO_CHAR_LOOP); 3209 3210 bind(RET_NOT_FOUND); 3211 movl(result, -1); 3212 jmpb(DONE_LABEL); 3213 3214 bind(FOUND_CHAR); 3215 if (UseAVX >= 2) { 3216 vpmovmskb(tmp, vec3); 3217 } else { 3218 pmovmskb(tmp, vec3); 3219 } 3220 bsfl(ch, tmp); 3221 addptr(result, ch); 3222 3223 bind(FOUND_SEQ_CHAR); 3224 subptr(result, str1); 3225 3226 bind(DONE_LABEL); 3227 } // stringL_indexof_char 3228 3229 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) { 3230 switch (eltype) { 3231 case T_BOOLEAN: return sizeof(jboolean); 3232 case T_BYTE: return sizeof(jbyte); 3233 case T_SHORT: return sizeof(jshort); 3234 case T_CHAR: return sizeof(jchar); 3235 case T_INT: return sizeof(jint); 3236 default: 3237 ShouldNotReachHere(); 3238 return -1; 3239 } 3240 } 3241 3242 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) { 3243 switch (eltype) { 3244 // T_BOOLEAN used as surrogate for unsigned byte 3245 case T_BOOLEAN: movzbl(dst, src); break; 3246 case T_BYTE: movsbl(dst, src); break; 3247 case T_SHORT: movswl(dst, src); break; 3248 case T_CHAR: movzwl(dst, src); break; 3249 case T_INT: movl(dst, src); break; 3250 default: 3251 ShouldNotReachHere(); 3252 } 3253 } 3254 3255 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) { 3256 load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8); 3257 } 3258 3259 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) { 3260 load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8); 3261 } 3262 3263 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) { 3264 const int vlen = Assembler::AVX_256bit; 3265 switch (eltype) { 3266 case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3267 case T_BYTE: vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3268 case T_SHORT: vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3269 case T_CHAR: vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3270 case T_INT: 3271 // do nothing 3272 break; 3273 default: 3274 ShouldNotReachHere(); 3275 } 3276 } 3277 3278 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result, 3279 Register index, Register tmp2, Register tmp3, XMMRegister vnext, 3280 XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3, 3281 XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3, 3282 XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3, 3283 BasicType eltype) { 3284 ShortBranchVerifier sbv(this); 3285 assert(UseAVX >= 2, "AVX2 intrinsics are required"); 3286 assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3); 3287 assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3); 3288 3289 Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN, 3290 SHORT_UNROLLED_LOOP_EXIT, 3291 UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME, 3292 UNROLLED_VECTOR_LOOP_BEGIN, 3293 END; 3294 switch (eltype) { 3295 case T_BOOLEAN: 
BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break; 3296 case T_CHAR: BLOCK_COMMENT("arrays_hashcode(char) {"); break; 3297 case T_BYTE: BLOCK_COMMENT("arrays_hashcode(byte) {"); break; 3298 case T_SHORT: BLOCK_COMMENT("arrays_hashcode(short) {"); break; 3299 case T_INT: BLOCK_COMMENT("arrays_hashcode(int) {"); break; 3300 default: BLOCK_COMMENT("arrays_hashcode {"); break; 3301 } 3302 3303 // For "renaming" for readibility of the code 3304 const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 }, 3305 vresult[] = { vresult0, vresult1, vresult2, vresult3 }, 3306 vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 }; 3307 3308 const int elsize = arrays_hashcode_elsize(eltype); 3309 3310 /* 3311 if (cnt1 >= 2) { 3312 if (cnt1 >= 32) { 3313 UNROLLED VECTOR LOOP 3314 } 3315 UNROLLED SCALAR LOOP 3316 } 3317 SINGLE SCALAR 3318 */ 3319 3320 cmpl(cnt1, 32); 3321 jcc(Assembler::less, SHORT_UNROLLED_BEGIN); 3322 3323 // cnt1 >= 32 && generate_vectorized_loop 3324 xorl(index, index); 3325 3326 // vresult = IntVector.zero(I256); 3327 for (int idx = 0; idx < 4; idx++) { 3328 vpxor(vresult[idx], vresult[idx]); 3329 } 3330 // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]); 3331 Register bound = tmp2; 3332 Register next = tmp3; 3333 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint)))); 3334 movl(next, Address(tmp2, 0)); 3335 movdl(vnext, next); 3336 vpbroadcastd(vnext, vnext, Assembler::AVX_256bit); 3337 3338 // index = 0; 3339 // bound = cnt1 & ~(32 - 1); 3340 movl(bound, cnt1); 3341 andl(bound, ~(32 - 1)); 3342 // for (; index < bound; index += 32) { 3343 bind(UNROLLED_VECTOR_LOOP_BEGIN); 3344 // result *= next; 3345 imull(result, next); 3346 // loop fission to upfront the cost of fetching from memory, OOO execution 3347 // can then hopefully do a better job of prefetching 3348 for (int idx = 0; idx < 4; idx++) { 3349 arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype); 3350 } 3351 // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7]; 3352 for (int idx = 0; idx < 4; idx++) { 3353 vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit); 3354 arrays_hashcode_elvcast(vtmp[idx], eltype); 3355 vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit); 3356 } 3357 // index += 32; 3358 addl(index, 32); 3359 // index < bound; 3360 cmpl(index, bound); 3361 jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN); 3362 // } 3363 3364 lea(ary1, Address(ary1, bound, Address::times(elsize))); 3365 subl(cnt1, bound); 3366 // release bound 3367 3368 // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1); 3369 for (int idx = 0; idx < 4; idx++) { 3370 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint)))); 3371 arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT); 3372 vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit); 3373 } 3374 // result += vresult.reduceLanes(ADD); 3375 for (int idx = 0; idx < 4; idx++) { 3376 reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]); 3377 } 3378 3379 // } else if (cnt1 < 32) { 3380 3381 bind(SHORT_UNROLLED_BEGIN); 3382 // int i = 1; 3383 movl(index, 1); 3384 cmpl(index, cnt1); 3385 jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT); 3386 3387 // for (; i < cnt1 ; i += 2) { 3388 bind(SHORT_UNROLLED_LOOP_BEGIN); 3389 movl(tmp3, 961); 3390 imull(result, tmp3); 3391 
arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3392 movl(tmp3, tmp2); 3393 shll(tmp3, 5); 3394 subl(tmp3, tmp2); 3395 addl(result, tmp3); 3396 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype); 3397 addl(result, tmp3); 3398 addl(index, 2); 3399 cmpl(index, cnt1); 3400 jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN); 3401 3402 // } 3403 // if (i >= cnt1) { 3404 bind(SHORT_UNROLLED_LOOP_EXIT); 3405 jccb(Assembler::greater, END); 3406 movl(tmp2, result); 3407 shll(result, 5); 3408 subl(result, tmp2); 3409 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3410 addl(result, tmp3); 3411 // } 3412 bind(END); 3413 3414 BLOCK_COMMENT("} // arrays_hashcode"); 3415 3416 } // arrays_hashcode 3417 3418 // helper function for string_compare 3419 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2, 3420 Address::ScaleFactor scale, Address::ScaleFactor scale1, 3421 Address::ScaleFactor scale2, Register index, int ae) { 3422 if (ae == StrIntrinsicNode::LL) { 3423 load_unsigned_byte(elem1, Address(str1, index, scale, 0)); 3424 load_unsigned_byte(elem2, Address(str2, index, scale, 0)); 3425 } else if (ae == StrIntrinsicNode::UU) { 3426 load_unsigned_short(elem1, Address(str1, index, scale, 0)); 3427 load_unsigned_short(elem2, Address(str2, index, scale, 0)); 3428 } else { 3429 load_unsigned_byte(elem1, Address(str1, index, scale1, 0)); 3430 load_unsigned_short(elem2, Address(str2, index, scale2, 0)); 3431 } 3432 } 3433 3434 // Compare strings, used for char[] and byte[]. 3435 void C2_MacroAssembler::string_compare(Register str1, Register str2, 3436 Register cnt1, Register cnt2, Register result, 3437 XMMRegister vec1, int ae, KRegister mask) { 3438 ShortBranchVerifier sbv(this); 3439 Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL; 3440 Label COMPARE_WIDE_VECTORS_LOOP_FAILED; // used only _LP64 && AVX3 3441 int stride, stride2, adr_stride, adr_stride1, adr_stride2; 3442 int stride2x2 = 0x40; 3443 Address::ScaleFactor scale = Address::no_scale; 3444 Address::ScaleFactor scale1 = Address::no_scale; 3445 Address::ScaleFactor scale2 = Address::no_scale; 3446 3447 if (ae != StrIntrinsicNode::LL) { 3448 stride2x2 = 0x20; 3449 } 3450 3451 if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) { 3452 shrl(cnt2, 1); 3453 } 3454 // Compute the minimum of the string lengths and the 3455 // difference of the string lengths (stack). 3456 // Do the conditional move stuff 3457 movl(result, cnt1); 3458 subl(cnt1, cnt2); 3459 push(cnt1); 3460 cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2) 3461 3462 // Is the minimum length zero? 
3463 testl(cnt2, cnt2); 3464 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3465 if (ae == StrIntrinsicNode::LL) { 3466 // Load first bytes 3467 load_unsigned_byte(result, Address(str1, 0)); // result = str1[0] 3468 load_unsigned_byte(cnt1, Address(str2, 0)); // cnt1 = str2[0] 3469 } else if (ae == StrIntrinsicNode::UU) { 3470 // Load first characters 3471 load_unsigned_short(result, Address(str1, 0)); 3472 load_unsigned_short(cnt1, Address(str2, 0)); 3473 } else { 3474 load_unsigned_byte(result, Address(str1, 0)); 3475 load_unsigned_short(cnt1, Address(str2, 0)); 3476 } 3477 subl(result, cnt1); 3478 jcc(Assembler::notZero, POP_LABEL); 3479 3480 if (ae == StrIntrinsicNode::UU) { 3481 // Divide length by 2 to get number of chars 3482 shrl(cnt2, 1); 3483 } 3484 cmpl(cnt2, 1); 3485 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3486 3487 // Check if the strings start at the same location and setup scale and stride 3488 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3489 cmpptr(str1, str2); 3490 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3491 if (ae == StrIntrinsicNode::LL) { 3492 scale = Address::times_1; 3493 stride = 16; 3494 } else { 3495 scale = Address::times_2; 3496 stride = 8; 3497 } 3498 } else { 3499 scale1 = Address::times_1; 3500 scale2 = Address::times_2; 3501 // scale not used 3502 stride = 8; 3503 } 3504 3505 if (UseAVX >= 2 && UseSSE42Intrinsics) { 3506 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR; 3507 Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR; 3508 Label COMPARE_WIDE_VECTORS_LOOP_AVX2; 3509 Label COMPARE_TAIL_LONG; 3510 Label COMPARE_WIDE_VECTORS_LOOP_AVX3; // used only _LP64 && AVX3 3511 3512 int pcmpmask = 0x19; 3513 if (ae == StrIntrinsicNode::LL) { 3514 pcmpmask &= ~0x01; 3515 } 3516 3517 // Setup to compare 16-chars (32-bytes) vectors, 3518 // start from first character again because it has aligned address. 3519 if (ae == StrIntrinsicNode::LL) { 3520 stride2 = 32; 3521 } else { 3522 stride2 = 16; 3523 } 3524 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3525 adr_stride = stride << scale; 3526 } else { 3527 adr_stride1 = 8; //stride << scale1; 3528 adr_stride2 = 16; //stride << scale2; 3529 } 3530 3531 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3532 // rax and rdx are used by pcmpestri as elements counters 3533 movl(result, cnt2); 3534 andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count 3535 jcc(Assembler::zero, COMPARE_TAIL_LONG); 3536 3537 // fast path : compare first 2 8-char vectors. 
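// pcmpestri with the mask prepared above (equal-each, negated result) sets CF
// when a mismatch exists and leaves the index of the first mismatching element
// in rcx (cnt1); the jcc(below, ...) branches below rely on exactly that.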
3538 bind(COMPARE_16_CHARS); 3539 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3540 movdqu(vec1, Address(str1, 0)); 3541 } else { 3542 pmovzxbw(vec1, Address(str1, 0)); 3543 } 3544 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3545 jccb(Assembler::below, COMPARE_INDEX_CHAR); 3546 3547 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3548 movdqu(vec1, Address(str1, adr_stride)); 3549 pcmpestri(vec1, Address(str2, adr_stride), pcmpmask); 3550 } else { 3551 pmovzxbw(vec1, Address(str1, adr_stride1)); 3552 pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask); 3553 } 3554 jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS); 3555 addl(cnt1, stride); 3556 3557 // Compare the characters at index in cnt1 3558 bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character 3559 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 3560 subl(result, cnt2); 3561 jmp(POP_LABEL); 3562 3563 // Setup the registers to start vector comparison loop 3564 bind(COMPARE_WIDE_VECTORS); 3565 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3566 lea(str1, Address(str1, result, scale)); 3567 lea(str2, Address(str2, result, scale)); 3568 } else { 3569 lea(str1, Address(str1, result, scale1)); 3570 lea(str2, Address(str2, result, scale2)); 3571 } 3572 subl(result, stride2); 3573 subl(cnt2, stride2); 3574 jcc(Assembler::zero, COMPARE_WIDE_TAIL); 3575 negptr(result); 3576 3577 // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest) 3578 bind(COMPARE_WIDE_VECTORS_LOOP); 3579 3580 #ifdef _LP64 3581 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 3582 cmpl(cnt2, stride2x2); 3583 jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2); 3584 testl(cnt2, stride2x2-1); // cnt2 holds the vector count 3585 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2); // means we cannot subtract by 0x40 3586 3587 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 3588 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3589 evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit); 3590 evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3591 } else { 3592 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit); 3593 evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3594 } 3595 kortestql(mask, mask); 3596 jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED); // miscompare 3597 addptr(result, stride2x2); // update since we already compared at this addr 3598 subl(cnt2, stride2x2); // and sub the size too 3599 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3); 3600 3601 vpxor(vec1, vec1); 3602 jmpb(COMPARE_WIDE_TAIL); 3603 }//if (VM_Version::supports_avx512vlbw()) 3604 #endif // _LP64 3605 3606 3607 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3608 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3609 vmovdqu(vec1, Address(str1, result, scale)); 3610 vpxor(vec1, Address(str2, result, scale)); 3611 } else { 3612 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit); 3613 vpxor(vec1, Address(str2, result, scale2)); 3614 } 3615 vptest(vec1, vec1); 3616 jcc(Assembler::notZero, VECTOR_NOT_EQUAL); 3617 addptr(result, stride2); 3618 subl(cnt2, stride2); 3619 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP); 3620 // clean upper bits of YMM registers 
3621 vpxor(vec1, vec1); 3622 3623 // compare wide vectors tail 3624 bind(COMPARE_WIDE_TAIL); 3625 testptr(result, result); 3626 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3627 3628 movl(result, stride2); 3629 movl(cnt2, result); 3630 negptr(result); 3631 jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3632 3633 // Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors. 3634 bind(VECTOR_NOT_EQUAL); 3635 // clean upper bits of YMM registers 3636 vpxor(vec1, vec1); 3637 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3638 lea(str1, Address(str1, result, scale)); 3639 lea(str2, Address(str2, result, scale)); 3640 } else { 3641 lea(str1, Address(str1, result, scale1)); 3642 lea(str2, Address(str2, result, scale2)); 3643 } 3644 jmp(COMPARE_16_CHARS); 3645 3646 // Compare tail chars, length between 1 to 15 chars 3647 bind(COMPARE_TAIL_LONG); 3648 movl(cnt2, result); 3649 cmpl(cnt2, stride); 3650 jcc(Assembler::less, COMPARE_SMALL_STR); 3651 3652 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3653 movdqu(vec1, Address(str1, 0)); 3654 } else { 3655 pmovzxbw(vec1, Address(str1, 0)); 3656 } 3657 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3658 jcc(Assembler::below, COMPARE_INDEX_CHAR); 3659 subptr(cnt2, stride); 3660 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3661 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3662 lea(str1, Address(str1, result, scale)); 3663 lea(str2, Address(str2, result, scale)); 3664 } else { 3665 lea(str1, Address(str1, result, scale1)); 3666 lea(str2, Address(str2, result, scale2)); 3667 } 3668 negptr(cnt2); 3669 jmpb(WHILE_HEAD_LABEL); 3670 3671 bind(COMPARE_SMALL_STR); 3672 } else if (UseSSE42Intrinsics) { 3673 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL; 3674 int pcmpmask = 0x19; 3675 // Setup to compare 8-char (16-byte) vectors, 3676 // start from first character again because it has aligned address. 
3677 movl(result, cnt2); 3678 andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count 3679 if (ae == StrIntrinsicNode::LL) { 3680 pcmpmask &= ~0x01; 3681 } 3682 jcc(Assembler::zero, COMPARE_TAIL); 3683 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3684 lea(str1, Address(str1, result, scale)); 3685 lea(str2, Address(str2, result, scale)); 3686 } else { 3687 lea(str1, Address(str1, result, scale1)); 3688 lea(str2, Address(str2, result, scale2)); 3689 } 3690 negptr(result); 3691 3692 // pcmpestri 3693 // inputs: 3694 // vec1- substring 3695 // rax - negative string length (elements count) 3696 // mem - scanned string 3697 // rdx - string length (elements count) 3698 // pcmpmask - cmp mode: 11000 (string compare with negated result) 3699 // + 00 (unsigned bytes) or + 01 (unsigned shorts) 3700 // outputs: 3701 // rcx - first mismatched element index 3702 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3703 3704 bind(COMPARE_WIDE_VECTORS); 3705 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3706 movdqu(vec1, Address(str1, result, scale)); 3707 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 3708 } else { 3709 pmovzxbw(vec1, Address(str1, result, scale1)); 3710 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 3711 } 3712 // After pcmpestri cnt1(rcx) contains mismatched element index 3713 3714 jccb(Assembler::below, VECTOR_NOT_EQUAL); // CF==1 3715 addptr(result, stride); 3716 subptr(cnt2, stride); 3717 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 3718 3719 // compare wide vectors tail 3720 testptr(result, result); 3721 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3722 3723 movl(cnt2, stride); 3724 movl(result, stride); 3725 negptr(result); 3726 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3727 movdqu(vec1, Address(str1, result, scale)); 3728 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 3729 } else { 3730 pmovzxbw(vec1, Address(str1, result, scale1)); 3731 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 3732 } 3733 jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL); 3734 3735 // Mismatched characters in the vectors 3736 bind(VECTOR_NOT_EQUAL); 3737 addptr(cnt1, result); 3738 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 3739 subl(result, cnt2); 3740 jmpb(POP_LABEL); 3741 3742 bind(COMPARE_TAIL); // limit is zero 3743 movl(cnt2, result); 3744 // Fallthru to tail compare 3745 } 3746 // Shift str2 and str1 to the end of the arrays, negate min 3747 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3748 lea(str1, Address(str1, cnt2, scale)); 3749 lea(str2, Address(str2, cnt2, scale)); 3750 } else { 3751 lea(str1, Address(str1, cnt2, scale1)); 3752 lea(str2, Address(str2, cnt2, scale2)); 3753 } 3754 decrementl(cnt2); // first character was compared already 3755 negptr(cnt2); 3756 3757 // Compare the rest of the elements 3758 bind(WHILE_HEAD_LABEL); 3759 load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae); 3760 subl(result, cnt1); 3761 jccb(Assembler::notZero, POP_LABEL); 3762 increment(cnt2); 3763 jccb(Assembler::notZero, WHILE_HEAD_LABEL); 3764 3765 // Strings are equal up to min length. Return the length difference. 
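// (The signed difference cnt1 - cnt2 pushed at method entry is popped below as
// the result; for UU it is shifted right once so the caller gets a difference
// in chars rather than bytes.) In Java terms the overall contract is roughly
// the following sketch, with made-up names (illustrative only, not the actual
// library code):
//
//   static int compare(char[] a, char[] b) {
//     int min = Math.min(a.length, b.length);
//     for (int k = 0; k < min; k++) {
//       if (a[k] != b[k]) return a[k] - b[k];
//     }
//     return a.length - b.length;
//   }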
3766 bind(LENGTH_DIFF_LABEL); 3767 pop(result); 3768 if (ae == StrIntrinsicNode::UU) { 3769 // Divide diff by 2 to get number of chars 3770 sarl(result, 1); 3771 } 3772 jmpb(DONE_LABEL); 3773 3774 #ifdef _LP64 3775 if (VM_Version::supports_avx512vlbw()) { 3776 3777 bind(COMPARE_WIDE_VECTORS_LOOP_FAILED); 3778 3779 kmovql(cnt1, mask); 3780 notq(cnt1); 3781 bsfq(cnt2, cnt1); 3782 if (ae != StrIntrinsicNode::LL) { 3783 // Divide diff by 2 to get number of chars 3784 sarl(cnt2, 1); 3785 } 3786 addq(result, cnt2); 3787 if (ae == StrIntrinsicNode::LL) { 3788 load_unsigned_byte(cnt1, Address(str2, result)); 3789 load_unsigned_byte(result, Address(str1, result)); 3790 } else if (ae == StrIntrinsicNode::UU) { 3791 load_unsigned_short(cnt1, Address(str2, result, scale)); 3792 load_unsigned_short(result, Address(str1, result, scale)); 3793 } else { 3794 load_unsigned_short(cnt1, Address(str2, result, scale2)); 3795 load_unsigned_byte(result, Address(str1, result, scale1)); 3796 } 3797 subl(result, cnt1); 3798 jmpb(POP_LABEL); 3799 }//if (VM_Version::supports_avx512vlbw()) 3800 #endif // _LP64 3801 3802 // Discard the stored length difference 3803 bind(POP_LABEL); 3804 pop(cnt1); 3805 3806 // That's it 3807 bind(DONE_LABEL); 3808 if(ae == StrIntrinsicNode::UL) { 3809 negl(result); 3810 } 3811 3812 } 3813 3814 // Search for Non-ASCII character (Negative byte value) in a byte array, 3815 // return the index of the first such character, otherwise the length 3816 // of the array segment searched. 3817 // ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java 3818 // @IntrinsicCandidate 3819 // public static int countPositives(byte[] ba, int off, int len) { 3820 // for (int i = off; i < off + len; i++) { 3821 // if (ba[i] < 0) { 3822 // return i - off; 3823 // } 3824 // } 3825 // return len; 3826 // } 3827 void C2_MacroAssembler::count_positives(Register ary1, Register len, 3828 Register result, Register tmp1, 3829 XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) { 3830 // rsi: byte array 3831 // rcx: len 3832 // rax: result 3833 ShortBranchVerifier sbv(this); 3834 assert_different_registers(ary1, len, result, tmp1); 3835 assert_different_registers(vec1, vec2); 3836 Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE; 3837 3838 movl(result, len); // copy 3839 // len == 0 3840 testl(len, len); 3841 jcc(Assembler::zero, DONE); 3842 3843 if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512 3844 VM_Version::supports_avx512vlbw() && 3845 VM_Version::supports_bmi2()) { 3846 3847 Label test_64_loop, test_tail, BREAK_LOOP; 3848 Register tmp3_aliased = len; 3849 3850 movl(tmp1, len); 3851 vpxor(vec2, vec2, vec2, Assembler::AVX_512bit); 3852 3853 andl(tmp1, 64 - 1); // tail count (in chars) 0x3F 3854 andl(len, ~(64 - 1)); // vector count (in chars) 3855 jccb(Assembler::zero, test_tail); 3856 3857 lea(ary1, Address(ary1, len, Address::times_1)); 3858 negptr(len); 3859 3860 bind(test_64_loop); 3861 // Check whether our 64 elements of size byte contain negatives 3862 evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit); 3863 kortestql(mask1, mask1); 3864 jcc(Assembler::notZero, BREAK_LOOP); 3865 3866 addptr(len, 64); 3867 jccb(Assembler::notZero, test_64_loop); 3868 3869 bind(test_tail); 3870 // bail out when there is nothing to be done 3871 testl(tmp1, -1); 3872 jcc(Assembler::zero, DONE); 3873 3874 // ~(~0 << len) applied up to two times (for 32-bit scenario) 3875 #ifdef _LP64 3876 mov64(tmp3_aliased, 
0xFFFFFFFFFFFFFFFF); 3877 shlxq(tmp3_aliased, tmp3_aliased, tmp1); 3878 notq(tmp3_aliased); 3879 kmovql(mask2, tmp3_aliased); 3880 #else 3881 Label k_init; 3882 jmp(k_init); 3883 3884 // We cannot build the 64-bit mask in a general purpose register here, so we move 3885 // the data required to compose the 64 1's into the instruction stream. 3886 // We emit a 64 byte wide series of elements from 0..63 which later on is 3887 // used as compare targets against the tail count contained in the tmp1 register. 3888 // The result is a k register having tmp1 consecutive 1's, 3889 // counting from the least significant bit. 3890 address tmp = pc(); 3891 emit_int64(0x0706050403020100); 3892 emit_int64(0x0F0E0D0C0B0A0908); 3893 emit_int64(0x1716151413121110); 3894 emit_int64(0x1F1E1D1C1B1A1918); 3895 emit_int64(0x2726252423222120); 3896 emit_int64(0x2F2E2D2C2B2A2928); 3897 emit_int64(0x3736353433323130); 3898 emit_int64(0x3F3E3D3C3B3A3938); 3899 3900 bind(k_init); 3901 lea(len, InternalAddress(tmp)); 3902 // create mask to test for negative byte inside a vector 3903 evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit); 3904 evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit); 3905 3906 #endif 3907 evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit); 3908 ktestq(mask1, mask2); 3909 jcc(Assembler::zero, DONE); 3910 3911 bind(BREAK_LOOP); 3912 // At least one byte in the last 64 bytes is negative. 3913 // Set up to look at the last 64 bytes as if they were a tail 3914 lea(ary1, Address(ary1, len, Address::times_1)); 3915 addptr(result, len); 3916 // Ignore the very last byte: if all others are positive, 3917 // it must be negative, so we can skip right to the 2+1 byte 3918 // end comparison at this point 3919 orl(result, 63); 3920 movl(len, 63); 3921 // Fallthru to tail compare 3922 } else { 3923 3924 if (UseAVX >= 2 && UseSSE >= 2) { 3925 // With AVX2, use 32-byte vector compare 3926 Label COMPARE_WIDE_VECTORS, BREAK_LOOP; 3927 3928 // Compare 32-byte vectors 3929 testl(len, 0xffffffe0); // vector count (in bytes) 3930 jccb(Assembler::zero, TAIL_START); 3931 3932 andl(len, 0xffffffe0); 3933 lea(ary1, Address(ary1, len, Address::times_1)); 3934 negptr(len); 3935 3936 movl(tmp1, 0x80808080); // create mask to test for negative bytes in the vector 3937 movdl(vec2, tmp1); 3938 vpbroadcastd(vec2, vec2, Assembler::AVX_256bit); 3939 3940 bind(COMPARE_WIDE_VECTORS); 3941 vmovdqu(vec1, Address(ary1, len, Address::times_1)); 3942 vptest(vec1, vec2); 3943 jccb(Assembler::notZero, BREAK_LOOP); 3944 addptr(len, 32); 3945 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 3946 3947 testl(result, 0x0000001f); // any bytes remaining? 3948 jcc(Assembler::zero, DONE); 3949 3950 // Quick test using the already prepared vector mask 3951 movl(len, result); 3952 andl(len, 0x0000001f); 3953 vmovdqu(vec1, Address(ary1, len, Address::times_1, -32)); 3954 vptest(vec1, vec2); 3955 jcc(Assembler::zero, DONE); 3956 // There are negative bytes, jump to the tail to determine exactly where 3957 jmpb(TAIL_START); 3958 3959 bind(BREAK_LOOP); 3960 // At least one byte in the last 32-byte vector is negative.
3961 // Set up to look at the last 32 bytes as if they were a tail 3962 lea(ary1, Address(ary1, len, Address::times_1)); 3963 addptr(result, len); 3964 // Ignore the very last byte: if all others are positive, 3965 // it must be negative, so we can skip right to the 2+1 byte 3966 // end comparison at this point 3967 orl(result, 31); 3968 movl(len, 31); 3969 // Fallthru to tail compare 3970 } else if (UseSSE42Intrinsics) { 3971 // With SSE4.2, use double quad vector compare 3972 Label COMPARE_WIDE_VECTORS, BREAK_LOOP; 3973 3974 // Compare 16-byte vectors 3975 testl(len, 0xfffffff0); // vector count (in bytes) 3976 jcc(Assembler::zero, TAIL_START); 3977 3978 andl(len, 0xfffffff0); 3979 lea(ary1, Address(ary1, len, Address::times_1)); 3980 negptr(len); 3981 3982 movl(tmp1, 0x80808080); 3983 movdl(vec2, tmp1); 3984 pshufd(vec2, vec2, 0); 3985 3986 bind(COMPARE_WIDE_VECTORS); 3987 movdqu(vec1, Address(ary1, len, Address::times_1)); 3988 ptest(vec1, vec2); 3989 jccb(Assembler::notZero, BREAK_LOOP); 3990 addptr(len, 16); 3991 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 3992 3993 testl(result, 0x0000000f); // len is zero, any bytes remaining? 3994 jcc(Assembler::zero, DONE); 3995 3996 // Quick test using the already prepared vector mask 3997 movl(len, result); 3998 andl(len, 0x0000000f); // tail count (in bytes) 3999 movdqu(vec1, Address(ary1, len, Address::times_1, -16)); 4000 ptest(vec1, vec2); 4001 jcc(Assembler::zero, DONE); 4002 jmpb(TAIL_START); 4003 4004 bind(BREAK_LOOP); 4005 // At least one byte in the last 16-byte vector is negative. 4006 // Set up and look at the last 16 bytes as if they were a tail 4007 lea(ary1, Address(ary1, len, Address::times_1)); 4008 addptr(result, len); 4009 // Ignore the very last byte: if all others are positive, 4010 // it must be negative, so we can skip right to the 2+1 byte 4011 // end comparison at this point 4012 orl(result, 15); 4013 movl(len, 15); 4014 // Fallthru to tail compare 4015 } 4016 } 4017 4018 bind(TAIL_START); 4019 // Compare 4-byte vectors 4020 andl(len, 0xfffffffc); // vector count (in bytes) 4021 jccb(Assembler::zero, COMPARE_CHAR); 4022 4023 lea(ary1, Address(ary1, len, Address::times_1)); 4024 negptr(len); 4025 4026 bind(COMPARE_VECTORS); 4027 movl(tmp1, Address(ary1, len, Address::times_1)); 4028 andl(tmp1, 0x80808080); 4029 jccb(Assembler::notZero, TAIL_ADJUST); 4030 addptr(len, 4); 4031 jccb(Assembler::notZero, COMPARE_VECTORS); 4032 4033 // Compare trailing char (final 2-3 bytes), if any 4034 bind(COMPARE_CHAR); 4035 4036 testl(result, 0x2); // tail char 4037 jccb(Assembler::zero, COMPARE_BYTE); 4038 load_unsigned_short(tmp1, Address(ary1, 0)); 4039 andl(tmp1, 0x00008080); 4040 jccb(Assembler::notZero, CHAR_ADJUST); 4041 lea(ary1, Address(ary1, 2)); 4042 4043 bind(COMPARE_BYTE); 4044 testl(result, 0x1); // tail byte 4045 jccb(Assembler::zero, DONE); 4046 load_unsigned_byte(tmp1, Address(ary1, 0)); 4047 testl(tmp1, 0x00000080); 4048 jccb(Assembler::zero, DONE); 4049 subptr(result, 1); 4050 jmpb(DONE); 4051 4052 bind(TAIL_ADJUST); 4053 // there are negative bits in the last 4 byte block. 4054 // Adjust result and check the next three bytes 4055 addptr(result, len); 4056 orl(result, 3); 4057 lea(ary1, Address(ary1, len, Address::times_1)); 4058 jmpb(COMPARE_CHAR); 4059 4060 bind(CHAR_ADJUST); 4061 // We are looking at a char + optional byte tail, and found that one 4062 // of the bytes in the char is negative. Adjust the result, check the 4063 // first byte and readjust if needed. 
4064 andl(result, 0xfffffffc); 4065 testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first 4066 jccb(Assembler::notZero, DONE); 4067 addptr(result, 1); 4068 4069 // That's it 4070 bind(DONE); 4071 if (UseAVX >= 2 && UseSSE >= 2) { 4072 // clean upper bits of YMM registers 4073 vpxor(vec1, vec1); 4074 vpxor(vec2, vec2); 4075 } 4076 } 4077 4078 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings. 4079 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2, 4080 Register limit, Register result, Register chr, 4081 XMMRegister vec1, XMMRegister vec2, bool is_char, KRegister mask) { 4082 ShortBranchVerifier sbv(this); 4083 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE; 4084 4085 int length_offset = arrayOopDesc::length_offset_in_bytes(); 4086 int base_offset = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE); 4087 4088 if (is_array_equ) { 4089 // Check the input args 4090 cmpoop(ary1, ary2); 4091 jcc(Assembler::equal, TRUE_LABEL); 4092 4093 // Need additional checks for arrays_equals. 4094 testptr(ary1, ary1); 4095 jcc(Assembler::zero, FALSE_LABEL); 4096 testptr(ary2, ary2); 4097 jcc(Assembler::zero, FALSE_LABEL); 4098 4099 // Check the lengths 4100 movl(limit, Address(ary1, length_offset)); 4101 cmpl(limit, Address(ary2, length_offset)); 4102 jcc(Assembler::notEqual, FALSE_LABEL); 4103 } 4104 4105 // count == 0 4106 testl(limit, limit); 4107 jcc(Assembler::zero, TRUE_LABEL); 4108 4109 if (is_array_equ) { 4110 // Load array address 4111 lea(ary1, Address(ary1, base_offset)); 4112 lea(ary2, Address(ary2, base_offset)); 4113 } 4114 4115 if (is_array_equ && is_char) { 4116 // arrays_equals when used for char[]. 4117 shll(limit, 1); // byte count != 0 4118 } 4119 movl(result, limit); // copy 4120 4121 if (UseAVX >= 2) { 4122 // With AVX2, use 32-byte vector compare 4123 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 4124 4125 // Compare 32-byte vectors 4126 andl(result, 0x0000001f); // tail count (in bytes) 4127 andl(limit, 0xffffffe0); // vector count (in bytes) 4128 jcc(Assembler::zero, COMPARE_TAIL); 4129 4130 lea(ary1, Address(ary1, limit, Address::times_1)); 4131 lea(ary2, Address(ary2, limit, Address::times_1)); 4132 negptr(limit); 4133 4134 #ifdef _LP64 4135 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 4136 Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3; 4137 4138 cmpl(limit, -64); 4139 jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2); 4140 4141 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 4142 4143 evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit); 4144 evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit); 4145 kortestql(mask, mask); 4146 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4147 addptr(limit, 64); // update since we already compared at this addr 4148 cmpl(limit, -64); 4149 jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3); 4150 4151 // At this point we may still need to compare -limit+result bytes. 4152 // We could execute the next two instruction and just continue via non-wide path: 4153 // cmpl(limit, 0); 4154 // jcc(Assembler::equal, COMPARE_TAIL); // true 4155 // But since we stopped at the points ary{1,2}+limit which are 4156 // not farther than 64 bytes from the ends of arrays ary{1,2}+result 4157 // (|limit| <= 32 and result < 32), 4158 // we may just compare the last 64 bytes. 
4159 // 4160 addptr(result, -64); // it is safe, bc we just came from this area 4161 evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit); 4162 evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit); 4163 kortestql(mask, mask); 4164 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4165 4166 jmp(TRUE_LABEL); 4167 4168 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 4169 4170 }//if (VM_Version::supports_avx512vlbw()) 4171 #endif //_LP64 4172 bind(COMPARE_WIDE_VECTORS); 4173 vmovdqu(vec1, Address(ary1, limit, Address::times_1)); 4174 vmovdqu(vec2, Address(ary2, limit, Address::times_1)); 4175 vpxor(vec1, vec2); 4176 4177 vptest(vec1, vec1); 4178 jcc(Assembler::notZero, FALSE_LABEL); 4179 addptr(limit, 32); 4180 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4181 4182 testl(result, result); 4183 jcc(Assembler::zero, TRUE_LABEL); 4184 4185 vmovdqu(vec1, Address(ary1, result, Address::times_1, -32)); 4186 vmovdqu(vec2, Address(ary2, result, Address::times_1, -32)); 4187 vpxor(vec1, vec2); 4188 4189 vptest(vec1, vec1); 4190 jccb(Assembler::notZero, FALSE_LABEL); 4191 jmpb(TRUE_LABEL); 4192 4193 bind(COMPARE_TAIL); // limit is zero 4194 movl(limit, result); 4195 // Fallthru to tail compare 4196 } else if (UseSSE42Intrinsics) { 4197 // With SSE4.2, use double quad vector compare 4198 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 4199 4200 // Compare 16-byte vectors 4201 andl(result, 0x0000000f); // tail count (in bytes) 4202 andl(limit, 0xfffffff0); // vector count (in bytes) 4203 jcc(Assembler::zero, COMPARE_TAIL); 4204 4205 lea(ary1, Address(ary1, limit, Address::times_1)); 4206 lea(ary2, Address(ary2, limit, Address::times_1)); 4207 negptr(limit); 4208 4209 bind(COMPARE_WIDE_VECTORS); 4210 movdqu(vec1, Address(ary1, limit, Address::times_1)); 4211 movdqu(vec2, Address(ary2, limit, Address::times_1)); 4212 pxor(vec1, vec2); 4213 4214 ptest(vec1, vec1); 4215 jcc(Assembler::notZero, FALSE_LABEL); 4216 addptr(limit, 16); 4217 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4218 4219 testl(result, result); 4220 jcc(Assembler::zero, TRUE_LABEL); 4221 4222 movdqu(vec1, Address(ary1, result, Address::times_1, -16)); 4223 movdqu(vec2, Address(ary2, result, Address::times_1, -16)); 4224 pxor(vec1, vec2); 4225 4226 ptest(vec1, vec1); 4227 jccb(Assembler::notZero, FALSE_LABEL); 4228 jmpb(TRUE_LABEL); 4229 4230 bind(COMPARE_TAIL); // limit is zero 4231 movl(limit, result); 4232 // Fallthru to tail compare 4233 } 4234 4235 // Compare 4-byte vectors 4236 andl(limit, 0xfffffffc); // vector count (in bytes) 4237 jccb(Assembler::zero, COMPARE_CHAR); 4238 4239 lea(ary1, Address(ary1, limit, Address::times_1)); 4240 lea(ary2, Address(ary2, limit, Address::times_1)); 4241 negptr(limit); 4242 4243 bind(COMPARE_VECTORS); 4244 movl(chr, Address(ary1, limit, Address::times_1)); 4245 cmpl(chr, Address(ary2, limit, Address::times_1)); 4246 jccb(Assembler::notEqual, FALSE_LABEL); 4247 addptr(limit, 4); 4248 jcc(Assembler::notZero, COMPARE_VECTORS); 4249 4250 // Compare trailing char (final 2 bytes), if any 4251 bind(COMPARE_CHAR); 4252 testl(result, 0x2); // tail char 4253 jccb(Assembler::zero, COMPARE_BYTE); 4254 load_unsigned_short(chr, Address(ary1, 0)); 4255 load_unsigned_short(limit, Address(ary2, 0)); 4256 cmpl(chr, limit); 4257 jccb(Assembler::notEqual, FALSE_LABEL); 4258 4259 if (is_array_equ && is_char) { 4260 bind(COMPARE_BYTE); 4261 } else { 4262 lea(ary1, Address(ary1, 2)); 4263 lea(ary2, Address(ary2, 2)); 4264 4265 bind(COMPARE_BYTE); 4266 testl(result, 0x1); 
// tail byte 4267 jccb(Assembler::zero, TRUE_LABEL); 4268 load_unsigned_byte(chr, Address(ary1, 0)); 4269 load_unsigned_byte(limit, Address(ary2, 0)); 4270 cmpl(chr, limit); 4271 jccb(Assembler::notEqual, FALSE_LABEL); 4272 } 4273 bind(TRUE_LABEL); 4274 movl(result, 1); // return true 4275 jmpb(DONE); 4276 4277 bind(FALSE_LABEL); 4278 xorl(result, result); // return false 4279 4280 // That's it 4281 bind(DONE); 4282 if (UseAVX >= 2) { 4283 // clean upper bits of YMM registers 4284 vpxor(vec1, vec1); 4285 vpxor(vec2, vec2); 4286 } 4287 } 4288 4289 #ifdef _LP64 4290 4291 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) { 4292 #define __ masm. 4293 Register dst = stub.data<0>(); 4294 XMMRegister src = stub.data<1>(); 4295 address target = stub.data<2>(); 4296 __ bind(stub.entry()); 4297 __ subptr(rsp, 8); 4298 __ movdbl(Address(rsp), src); 4299 __ call(RuntimeAddress(target)); 4300 __ pop(dst); 4301 __ jmp(stub.continuation()); 4302 #undef __ 4303 } 4304 4305 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) { 4306 assert(dst_bt == T_INT || dst_bt == T_LONG, ""); 4307 assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, ""); 4308 4309 address slowpath_target; 4310 if (dst_bt == T_INT) { 4311 if (src_bt == T_FLOAT) { 4312 cvttss2sil(dst, src); 4313 cmpl(dst, 0x80000000); 4314 slowpath_target = StubRoutines::x86::f2i_fixup(); 4315 } else { 4316 cvttsd2sil(dst, src); 4317 cmpl(dst, 0x80000000); 4318 slowpath_target = StubRoutines::x86::d2i_fixup(); 4319 } 4320 } else { 4321 if (src_bt == T_FLOAT) { 4322 cvttss2siq(dst, src); 4323 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip())); 4324 slowpath_target = StubRoutines::x86::f2l_fixup(); 4325 } else { 4326 cvttsd2siq(dst, src); 4327 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip())); 4328 slowpath_target = StubRoutines::x86::d2l_fixup(); 4329 } 4330 } 4331 4332 auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, 23, convertF2I_slowpath); 4333 jcc(Assembler::equal, stub->entry()); 4334 bind(stub->continuation()); 4335 } 4336 4337 #endif // _LP64 4338 4339 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4340 XMMRegister src1, int imm8, bool merge, int vlen_enc) { 4341 switch(ideal_opc) { 4342 case Op_LShiftVS: 4343 Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break; 4344 case Op_LShiftVI: 4345 Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break; 4346 case Op_LShiftVL: 4347 Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break; 4348 case Op_RShiftVS: 4349 Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break; 4350 case Op_RShiftVI: 4351 Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break; 4352 case Op_RShiftVL: 4353 Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break; 4354 case Op_URShiftVS: 4355 Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break; 4356 case Op_URShiftVI: 4357 Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break; 4358 case Op_URShiftVL: 4359 Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break; 4360 case Op_RotateRightV: 4361 evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4362 case Op_RotateLeftV: 4363 evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4364 default: 4365 fatal("Unsupported masked operation"); break; 4366 } 4367 } 4368 4369 void 
C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4370 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc, 4371 bool is_varshift) { 4372 switch (ideal_opc) { 4373 case Op_AddVB: 4374 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4375 case Op_AddVS: 4376 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4377 case Op_AddVI: 4378 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4379 case Op_AddVL: 4380 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4381 case Op_AddVF: 4382 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4383 case Op_AddVD: 4384 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4385 case Op_SubVB: 4386 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4387 case Op_SubVS: 4388 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4389 case Op_SubVI: 4390 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4391 case Op_SubVL: 4392 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4393 case Op_SubVF: 4394 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4395 case Op_SubVD: 4396 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4397 case Op_MulVS: 4398 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4399 case Op_MulVI: 4400 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4401 case Op_MulVL: 4402 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4403 case Op_MulVF: 4404 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4405 case Op_MulVD: 4406 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4407 case Op_DivVF: 4408 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4409 case Op_DivVD: 4410 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4411 case Op_SqrtVF: 4412 evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break; 4413 case Op_SqrtVD: 4414 evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break; 4415 case Op_AbsVB: 4416 evpabsb(dst, mask, src2, merge, vlen_enc); break; 4417 case Op_AbsVS: 4418 evpabsw(dst, mask, src2, merge, vlen_enc); break; 4419 case Op_AbsVI: 4420 evpabsd(dst, mask, src2, merge, vlen_enc); break; 4421 case Op_AbsVL: 4422 evpabsq(dst, mask, src2, merge, vlen_enc); break; 4423 case Op_FmaVF: 4424 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4425 case Op_FmaVD: 4426 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4427 case Op_VectorRearrange: 4428 evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break; 4429 case Op_LShiftVS: 4430 evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4431 case Op_LShiftVI: 4432 evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4433 case Op_LShiftVL: 4434 evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4435 case Op_RShiftVS: 4436 evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4437 case Op_RShiftVI: 4438 evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4439 case Op_RShiftVL: 4440 evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4441 case Op_URShiftVS: 4442 evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4443 case Op_URShiftVI: 4444 evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4445 case Op_URShiftVL: 4446 evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4447 case Op_RotateLeftV: 4448 evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4449 case Op_RotateRightV: 4450 evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break; 
4451 case Op_MaxV: 4452 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4453 case Op_MinV: 4454 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4455 case Op_XorV: 4456 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4457 case Op_OrV: 4458 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4459 case Op_AndV: 4460 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4461 default: 4462 fatal("Unsupported masked operation"); break; 4463 } 4464 } 4465 4466 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4467 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4468 switch (ideal_opc) { 4469 case Op_AddVB: 4470 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4471 case Op_AddVS: 4472 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4473 case Op_AddVI: 4474 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4475 case Op_AddVL: 4476 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4477 case Op_AddVF: 4478 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4479 case Op_AddVD: 4480 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4481 case Op_SubVB: 4482 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4483 case Op_SubVS: 4484 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4485 case Op_SubVI: 4486 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4487 case Op_SubVL: 4488 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4489 case Op_SubVF: 4490 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4491 case Op_SubVD: 4492 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4493 case Op_MulVS: 4494 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4495 case Op_MulVI: 4496 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4497 case Op_MulVL: 4498 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4499 case Op_MulVF: 4500 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4501 case Op_MulVD: 4502 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4503 case Op_DivVF: 4504 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4505 case Op_DivVD: 4506 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4507 case Op_FmaVF: 4508 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4509 case Op_FmaVD: 4510 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4511 case Op_MaxV: 4512 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4513 case Op_MinV: 4514 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4515 case Op_XorV: 4516 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4517 case Op_OrV: 4518 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4519 case Op_AndV: 4520 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4521 default: 4522 fatal("Unsupported masked operation"); break; 4523 } 4524 } 4525 4526 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst, 4527 KRegister src1, KRegister src2) { 4528 BasicType etype = T_ILLEGAL; 4529 switch(mask_len) { 4530 case 2: 4531 case 4: 4532 case 8: etype = T_BYTE; break; 4533 case 16: etype = T_SHORT; break; 4534 case 32: etype = T_INT; break; 4535 case 64: etype = T_LONG; break; 4536 default: fatal("Unsupported type"); break; 4537 } 4538 assert(etype != T_ILLEGAL, ""); 4539 switch(ideal_opc) { 4540 case Op_AndVMask: 4541 kand(etype, dst, src1, src2); break; 4542 case Op_OrVMask: 4543 kor(etype, dst, src1, src2); break; 4544 case Op_XorVMask: 
4545 kxor(etype, dst, src1, src2); break; 4546 default: 4547 fatal("Unsupported masked operation"); break; 4548 } 4549 } 4550 4551 /* 4552 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 4553 * If src is NaN, the result is 0. 4554 * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE, 4555 * the result is equal to the value of Integer.MIN_VALUE. 4556 * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE, 4557 * the result is equal to the value of Integer.MAX_VALUE. 4558 */ 4559 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4560 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 4561 Register rscratch, AddressLiteral float_sign_flip, 4562 int vec_enc) { 4563 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4564 Label done; 4565 vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch); 4566 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 4567 vptest(xtmp2, xtmp2, vec_enc); 4568 jccb(Assembler::equal, done); 4569 4570 vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc); 4571 vpxor(xtmp1, xtmp1, xtmp4, vec_enc); 4572 4573 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 4574 vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc); 4575 vblendvps(dst, dst, xtmp4, xtmp3, vec_enc); 4576 4577 // Recompute the mask for remaining special value. 4578 vpxor(xtmp2, xtmp2, xtmp3, vec_enc); 4579 // Extract SRC values corresponding to TRUE mask lanes. 4580 vpand(xtmp4, xtmp2, src, vec_enc); 4581 // Flip mask bits so that MSB bit of MASK lanes corresponding to +ve special 4582 // values are set. 4583 vpxor(xtmp3, xtmp2, xtmp4, vec_enc); 4584 4585 vblendvps(dst, dst, xtmp1, xtmp3, vec_enc); 4586 bind(done); 4587 } 4588 4589 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4590 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4591 Register rscratch, AddressLiteral float_sign_flip, 4592 int vec_enc) { 4593 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4594 Label done; 4595 evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 4596 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 4597 kortestwl(ktmp1, ktmp1); 4598 jccb(Assembler::equal, done); 4599 4600 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4601 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4602 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 4603 4604 kxorwl(ktmp1, ktmp1, ktmp2); 4605 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4606 vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4607 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 4608 bind(done); 4609 } 4610 4611 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4612 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4613 Register rscratch, AddressLiteral double_sign_flip, 4614 int vec_enc) { 4615 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing"); 4616 4617 Label done; 4618 evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch); 4619 Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc); 4620 kortestwl(ktmp1, ktmp1); 4621 jccb(Assembler::equal, done); 4622 4623 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4624 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4625 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc); 4626 4627 kxorwl(ktmp1, ktmp1, ktmp2); 
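// ktmp1 now selects the special lanes that are not NaN (i.e. +/-Inf or
// out-of-range inputs); the compare against zero below keeps only the
// non-negative ones so that they can be overwritten with the maximum value.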
4628 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4629 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4630 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc); 4631 bind(done); 4632 } 4633 4634 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4635 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4636 Register rscratch, AddressLiteral float_sign_flip, 4637 int vec_enc) { 4638 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4639 Label done; 4640 evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 4641 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 4642 kortestwl(ktmp1, ktmp1); 4643 jccb(Assembler::equal, done); 4644 4645 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4646 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4647 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 4648 4649 kxorwl(ktmp1, ktmp1, ktmp2); 4650 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4651 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4652 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 4653 bind(done); 4654 } 4655 4656 /* 4657 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 4658 * If src is NaN, the result is 0. 4659 * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE, 4660 * the result is equal to the value of Long.MIN_VALUE. 4661 * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE, 4662 * the result is equal to the value of Long.MAX_VALUE. 4663 */ 4664 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4665 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4666 Register rscratch, AddressLiteral double_sign_flip, 4667 int vec_enc) { 4668 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing"); 4669 4670 Label done; 4671 evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch); 4672 evpcmpeqq(ktmp1, xtmp1, dst, vec_enc); 4673 kortestwl(ktmp1, ktmp1); 4674 jccb(Assembler::equal, done); 4675 4676 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4677 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4678 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc); 4679 4680 kxorwl(ktmp1, ktmp1, ktmp2); 4681 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4682 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4683 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc); 4684 bind(done); 4685 } 4686 4687 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero, 4688 XMMRegister xtmp, int index, int vec_enc) { 4689 assert(vec_enc < Assembler::AVX_512bit, ""); 4690 if (vec_enc == Assembler::AVX_256bit) { 4691 vextractf128_high(xtmp, src); 4692 vshufps(dst, src, xtmp, index, vec_enc); 4693 } else { 4694 vshufps(dst, src, zero, index, vec_enc); 4695 } 4696 } 4697 4698 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 4699 XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch, 4700 AddressLiteral float_sign_flip, int src_vec_enc) { 4701 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4702 4703 Label done; 4704 // Compare the destination lanes with float_sign_flip 4705 // value to get mask for all special values. 
4706 movdqu(xtmp1, float_sign_flip, rscratch); 4707 vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit); 4708 ptest(xtmp2, xtmp2); 4709 jccb(Assembler::equal, done); 4710 4711 // Flip float_sign_flip to get max integer value. 4712 vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit); 4713 pxor(xtmp1, xtmp4); 4714 4715 // Set destination lanes corresponding to unordered source lanes as zero. 4716 vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc); 4717 vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc); 4718 4719 // Shuffle mask vector and pack lower double words from each quadword lane. 4720 vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc); 4721 vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit); 4722 4723 // Recompute the mask for remaining special value. 4724 pxor(xtmp2, xtmp3); 4725 // Extract mask corresponding to non-negative source lanes. 4726 vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc); 4727 4728 // Shuffle mask vector and pack lower double words from each quadword lane. 4729 vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc); 4730 pand(xtmp3, xtmp2); 4731 4732 // Replace destination lanes holding special value (0x80000000) with max int 4733 // if corresponding source lane holds a +ve value. 4734 vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit); 4735 bind(done); 4736 } 4737 4738 4739 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero, 4740 XMMRegister xtmp, Register rscratch, int vec_enc) { 4741 switch(to_elem_bt) { 4742 case T_SHORT: 4743 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing"); 4744 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch); 4745 vpackusdw(dst, dst, zero, vec_enc); 4746 if (vec_enc == Assembler::AVX_256bit) { 4747 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc); 4748 } 4749 break; 4750 case T_BYTE: 4751 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing"); 4752 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch); 4753 vpackusdw(dst, dst, zero, vec_enc); 4754 if (vec_enc == Assembler::AVX_256bit) { 4755 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc); 4756 } 4757 vpackuswb(dst, dst, zero, vec_enc); 4758 break; 4759 default: assert(false, "%s", type2name(to_elem_bt)); 4760 } 4761 } 4762 4763 /* 4764 * Algorithm for vector D2L and F2I conversions:- 4765 * a) Perform vector D2L/F2I cast. 4766 * b) Choose fast path if none of the result vector lanes contains the 0x80000000 value. 4767 * It signifies that the source value could be any of the special floating point 4768 * values (NaN,-Inf,Inf,Max,-Min). 4769 * c) Set destination to zero if source is NaN value. 4770 * d) Replace 0x80000000 with MaxInt if source lane contains a +ve value.
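 *
 * For example, for F2I the Java cast semantics spelled out above require:
 *   (int) Float.NaN               == 0
 *   (int) Float.POSITIVE_INFINITY == Integer.MAX_VALUE
 *   (int) -1.0e20f                == Integer.MIN_VALUE
 * while the raw cvttps2dq result for all three lanes is 0x80000000, which is
 * exactly the value step b) tests for before entering the fix-up path.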
4771 */ 4772 4773 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4774 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 4775 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 4776 int to_elem_sz = type2aelembytes(to_elem_bt); 4777 assert(to_elem_sz <= 4, ""); 4778 vcvttps2dq(dst, src, vec_enc); 4779 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc); 4780 if (to_elem_sz < 4) { 4781 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 4782 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc); 4783 } 4784 } 4785 4786 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4787 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip, 4788 Register rscratch, int vec_enc) { 4789 int to_elem_sz = type2aelembytes(to_elem_bt); 4790 assert(to_elem_sz <= 4, ""); 4791 vcvttps2dq(dst, src, vec_enc); 4792 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc); 4793 switch(to_elem_bt) { 4794 case T_INT: 4795 break; 4796 case T_SHORT: 4797 evpmovdw(dst, dst, vec_enc); 4798 break; 4799 case T_BYTE: 4800 evpmovdb(dst, dst, vec_enc); 4801 break; 4802 default: assert(false, "%s", type2name(to_elem_bt)); 4803 } 4804 } 4805 4806 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 4807 KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip, 4808 Register rscratch, int vec_enc) { 4809 evcvttps2qq(dst, src, vec_enc); 4810 vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc); 4811 } 4812 4813 // Handling for downcasting from double to integer or sub-word types on AVX2. 4814 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4815 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, 4816 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 4817 int to_elem_sz = type2aelembytes(to_elem_bt); 4818 assert(to_elem_sz < 8, ""); 4819 vcvttpd2dq(dst, src, vec_enc); 4820 vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch, 4821 float_sign_flip, vec_enc); 4822 if (to_elem_sz < 4) { 4823 // xtmp4 holds all zero lanes. 
4824 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit); 4825 } 4826 } 4827 4828 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, 4829 XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, 4830 KRegister ktmp2, AddressLiteral sign_flip, 4831 Register rscratch, int vec_enc) { 4832 if (VM_Version::supports_avx512dq()) { 4833 evcvttpd2qq(dst, src, vec_enc); 4834 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 4835 switch(to_elem_bt) { 4836 case T_LONG: 4837 break; 4838 case T_INT: 4839 evpmovsqd(dst, dst, vec_enc); 4840 break; 4841 case T_SHORT: 4842 evpmovsqd(dst, dst, vec_enc); 4843 evpmovdw(dst, dst, vec_enc); 4844 break; 4845 case T_BYTE: 4846 evpmovsqd(dst, dst, vec_enc); 4847 evpmovdb(dst, dst, vec_enc); 4848 break; 4849 default: assert(false, "%s", type2name(to_elem_bt)); 4850 } 4851 } else { 4852 assert(type2aelembytes(to_elem_bt) <= 4, ""); 4853 vcvttpd2dq(dst, src, vec_enc); 4854 vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 4855 switch(to_elem_bt) { 4856 case T_INT: 4857 break; 4858 case T_SHORT: 4859 evpmovdw(dst, dst, vec_enc); 4860 break; 4861 case T_BYTE: 4862 evpmovdb(dst, dst, vec_enc); 4863 break; 4864 default: assert(false, "%s", type2name(to_elem_bt)); 4865 } 4866 } 4867 } 4868 4869 #ifdef _LP64 4870 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src, 4871 AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 4872 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 4873 // Perform the floor(val+0.5) operation under the influence of MXCSR.RC mode rounding towards -inf, 4874 // and re-instantiate the original MXCSR.RC mode after that. 4875 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 4876 4877 mov64(tmp, julong_cast(0.5L)); 4878 evpbroadcastq(xtmp1, tmp, vec_enc); 4879 vaddpd(xtmp1, src, xtmp1, vec_enc); 4880 evcvtpd2qq(dst, xtmp1, vec_enc); 4881 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 4882 double_sign_flip, vec_enc); 4883 4884 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 4885 } 4886 4887 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src, 4888 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 4889 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 4890 // Perform the floor(val+0.5) operation under the influence of MXCSR.RC mode rounding towards -inf, 4891 // and re-instantiate the original MXCSR.RC mode after that.
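// Worked example: Math.round(2.5f) == 3 and Math.round(-2.5f) == -2; with RC = round towards -inf
// the sum 2.5 + 0.5 = 3.0 converts to 3 and -2.5 + 0.5 = -2.0 converts to -2, i.e. the
// conversion itself performs the floor.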
4892 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 4893 4894 movl(tmp, jint_cast(0.5)); 4895 movq(xtmp1, tmp); 4896 vbroadcastss(xtmp1, xtmp1, vec_enc); 4897 vaddps(xtmp1, src , xtmp1, vec_enc); 4898 vcvtps2dq(dst, xtmp1, vec_enc); 4899 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 4900 float_sign_flip, vec_enc); 4901 4902 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 4903 } 4904 4905 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src, 4906 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 4907 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) { 4908 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 4909 // and re-instantiate original MXCSR.RC mode after that. 4910 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 4911 4912 movl(tmp, jint_cast(0.5)); 4913 movq(xtmp1, tmp); 4914 vbroadcastss(xtmp1, xtmp1, vec_enc); 4915 vaddps(xtmp1, src , xtmp1, vec_enc); 4916 vcvtps2dq(dst, xtmp1, vec_enc); 4917 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc); 4918 4919 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 4920 } 4921 #endif // _LP64 4922 4923 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 4924 BasicType from_elem_bt, BasicType to_elem_bt) { 4925 switch (from_elem_bt) { 4926 case T_BYTE: 4927 switch (to_elem_bt) { 4928 case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break; 4929 case T_INT: vpmovzxbd(dst, src, vlen_enc); break; 4930 case T_LONG: vpmovzxbq(dst, src, vlen_enc); break; 4931 default: ShouldNotReachHere(); 4932 } 4933 break; 4934 case T_SHORT: 4935 switch (to_elem_bt) { 4936 case T_INT: vpmovzxwd(dst, src, vlen_enc); break; 4937 case T_LONG: vpmovzxwq(dst, src, vlen_enc); break; 4938 default: ShouldNotReachHere(); 4939 } 4940 break; 4941 case T_INT: 4942 assert(to_elem_bt == T_LONG, ""); 4943 vpmovzxdq(dst, src, vlen_enc); 4944 break; 4945 default: 4946 ShouldNotReachHere(); 4947 } 4948 } 4949 4950 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 4951 BasicType from_elem_bt, BasicType to_elem_bt) { 4952 switch (from_elem_bt) { 4953 case T_BYTE: 4954 switch (to_elem_bt) { 4955 case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break; 4956 case T_INT: vpmovsxbd(dst, src, vlen_enc); break; 4957 case T_LONG: vpmovsxbq(dst, src, vlen_enc); break; 4958 default: ShouldNotReachHere(); 4959 } 4960 break; 4961 case T_SHORT: 4962 switch (to_elem_bt) { 4963 case T_INT: vpmovsxwd(dst, src, vlen_enc); break; 4964 case T_LONG: vpmovsxwq(dst, src, vlen_enc); break; 4965 default: ShouldNotReachHere(); 4966 } 4967 break; 4968 case T_INT: 4969 assert(to_elem_bt == T_LONG, ""); 4970 vpmovsxdq(dst, src, vlen_enc); 4971 break; 4972 default: 4973 ShouldNotReachHere(); 4974 } 4975 } 4976 4977 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src, 4978 BasicType dst_bt, BasicType src_bt, int vlen) { 4979 int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen); 4980 assert(vlen_enc != AVX_512bit, ""); 4981 4982 int dst_bt_size = type2aelembytes(dst_bt); 4983 int src_bt_size = type2aelembytes(src_bt); 4984 if (dst_bt_size > src_bt_size) { 4985 switch (dst_bt_size / src_bt_size) { 4986 case 2: vpmovsxbw(dst, src, vlen_enc); break; 4987 case 4: vpmovsxbd(dst, src, 
vlen_enc); break; 4988 case 8: vpmovsxbq(dst, src, vlen_enc); break; 4989 default: ShouldNotReachHere(); 4990 } 4991 } else { 4992 assert(dst_bt_size < src_bt_size, ""); 4993 switch (src_bt_size / dst_bt_size) { 4994 case 2: { 4995 if (vlen_enc == AVX_128bit) { 4996 vpacksswb(dst, src, src, vlen_enc); 4997 } else { 4998 vpacksswb(dst, src, src, vlen_enc); 4999 vpermq(dst, dst, 0x08, vlen_enc); 5000 } 5001 break; 5002 } 5003 case 4: { 5004 if (vlen_enc == AVX_128bit) { 5005 vpackssdw(dst, src, src, vlen_enc); 5006 vpacksswb(dst, dst, dst, vlen_enc); 5007 } else { 5008 vpackssdw(dst, src, src, vlen_enc); 5009 vpermq(dst, dst, 0x08, vlen_enc); 5010 vpacksswb(dst, dst, dst, AVX_128bit); 5011 } 5012 break; 5013 } 5014 case 8: { 5015 if (vlen_enc == AVX_128bit) { 5016 vpshufd(dst, src, 0x08, vlen_enc); 5017 vpackssdw(dst, dst, dst, vlen_enc); 5018 vpacksswb(dst, dst, dst, vlen_enc); 5019 } else { 5020 vpshufd(dst, src, 0x08, vlen_enc); 5021 vpermq(dst, dst, 0x08, vlen_enc); 5022 vpackssdw(dst, dst, dst, AVX_128bit); 5023 vpacksswb(dst, dst, dst, AVX_128bit); 5024 } 5025 break; 5026 } 5027 default: ShouldNotReachHere(); 5028 } 5029 } 5030 } 5031 5032 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3, 5033 bool merge, BasicType bt, int vlen_enc) { 5034 if (bt == T_INT) { 5035 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 5036 } else { 5037 assert(bt == T_LONG, ""); 5038 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5039 } 5040 } 5041 5042 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3, 5043 bool merge, BasicType bt, int vlen_enc) { 5044 if (bt == T_INT) { 5045 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 5046 } else { 5047 assert(bt == T_LONG, ""); 5048 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5049 } 5050 } 5051 5052 #ifdef _LP64 5053 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1, 5054 Register rtmp2, XMMRegister xtmp, int mask_len, 5055 int vec_enc) { 5056 int index = 0; 5057 int vindex = 0; 5058 mov64(rtmp1, 0x0101010101010101L); 5059 pdepq(rtmp1, src, rtmp1); 5060 if (mask_len > 8) { 5061 movq(rtmp2, src); 5062 vpxor(xtmp, xtmp, xtmp, vec_enc); 5063 movq(xtmp, rtmp1); 5064 } 5065 movq(dst, rtmp1); 5066 5067 mask_len -= 8; 5068 while (mask_len > 0) { 5069 assert ((mask_len & 0x7) == 0, "mask must be multiple of 8"); 5070 index++; 5071 if ((index % 2) == 0) { 5072 pxor(xtmp, xtmp); 5073 } 5074 mov64(rtmp1, 0x0101010101010101L); 5075 shrq(rtmp2, 8); 5076 pdepq(rtmp1, rtmp2, rtmp1); 5077 pinsrq(xtmp, rtmp1, index % 2); 5078 vindex = index / 2; 5079 if (vindex) { 5080 // Write entire 16 byte vector when both 64 bit 5081 // lanes are update to save redundant instructions. 
5082 if (index % 2) { 5083 vinsertf128(dst, dst, xtmp, vindex); 5084 } 5085 } else { 5086 vmovdqu(dst, xtmp); 5087 } 5088 mask_len -= 8; 5089 } 5090 } 5091 5092 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) { 5093 switch(opc) { 5094 case Op_VectorMaskTrueCount: 5095 popcntq(dst, tmp); 5096 break; 5097 case Op_VectorMaskLastTrue: 5098 if (VM_Version::supports_lzcnt()) { 5099 lzcntq(tmp, tmp); 5100 movl(dst, 63); 5101 subl(dst, tmp); 5102 } else { 5103 movl(dst, -1); 5104 bsrq(tmp, tmp); 5105 cmov32(Assembler::notZero, dst, tmp); 5106 } 5107 break; 5108 case Op_VectorMaskFirstTrue: 5109 if (VM_Version::supports_bmi1()) { 5110 if (masklen < 32) { 5111 orl(tmp, 1 << masklen); 5112 tzcntl(dst, tmp); 5113 } else if (masklen == 32) { 5114 tzcntl(dst, tmp); 5115 } else { 5116 assert(masklen == 64, ""); 5117 tzcntq(dst, tmp); 5118 } 5119 } else { 5120 if (masklen < 32) { 5121 orl(tmp, 1 << masklen); 5122 bsfl(dst, tmp); 5123 } else { 5124 assert(masklen == 32 || masklen == 64, ""); 5125 movl(dst, masklen); 5126 if (masklen == 32) { 5127 bsfl(tmp, tmp); 5128 } else { 5129 bsfq(tmp, tmp); 5130 } 5131 cmov32(Assembler::notZero, dst, tmp); 5132 } 5133 } 5134 break; 5135 case Op_VectorMaskToLong: 5136 assert(dst == tmp, "Dst and tmp should be the same for toLong operations"); 5137 break; 5138 default: assert(false, "Unhandled mask operation"); 5139 } 5140 } 5141 5142 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp, 5143 int masklen, int masksize, int vec_enc) { 5144 assert(VM_Version::supports_popcnt(), ""); 5145 5146 if(VM_Version::supports_avx512bw()) { 5147 kmovql(tmp, mask); 5148 } else { 5149 assert(masklen <= 16, ""); 5150 kmovwl(tmp, mask); 5151 } 5152 5153 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5154 // operations needs to be clipped. 5155 if (masksize < 16 && opc != Op_VectorMaskFirstTrue) { 5156 andq(tmp, (1 << masklen) - 1); 5157 } 5158 5159 vector_mask_operation_helper(opc, dst, tmp, masklen); 5160 } 5161 5162 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp, 5163 Register tmp, int masklen, BasicType bt, int vec_enc) { 5164 assert(vec_enc == AVX_128bit && VM_Version::supports_avx() || 5165 vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4), ""); 5166 assert(VM_Version::supports_popcnt(), ""); 5167 5168 bool need_clip = false; 5169 switch(bt) { 5170 case T_BOOLEAN: 5171 // While masks of other types contain 0, -1; boolean masks contain lane values of 0, 1 5172 vpxor(xtmp, xtmp, xtmp, vec_enc); 5173 vpsubb(xtmp, xtmp, mask, vec_enc); 5174 vpmovmskb(tmp, xtmp, vec_enc); 5175 need_clip = masklen < 16; 5176 break; 5177 case T_BYTE: 5178 vpmovmskb(tmp, mask, vec_enc); 5179 need_clip = masklen < 16; 5180 break; 5181 case T_SHORT: 5182 vpacksswb(xtmp, mask, mask, vec_enc); 5183 if (masklen >= 16) { 5184 vpermpd(xtmp, xtmp, 8, vec_enc); 5185 } 5186 vpmovmskb(tmp, xtmp, Assembler::AVX_128bit); 5187 need_clip = masklen < 16; 5188 break; 5189 case T_INT: 5190 case T_FLOAT: 5191 vmovmskps(tmp, mask, vec_enc); 5192 need_clip = masklen < 4; 5193 break; 5194 case T_LONG: 5195 case T_DOUBLE: 5196 vmovmskpd(tmp, mask, vec_enc); 5197 need_clip = masklen < 2; 5198 break; 5199 default: assert(false, "Unhandled type, %s", type2name(bt)); 5200 } 5201 5202 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5203 // operations needs to be clipped. 
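// e.g. an 8-lane byte mask read with vpmovmskb from a 128-bit register materializes 16 bits, of which
// only the low 8 are meaningful, so the andq below keeps just the low masklen bits.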
5204 if (need_clip && opc != Op_VectorMaskFirstTrue) { 5205 // need_clip implies masklen < 32 5206 andq(tmp, (1 << masklen) - 1); 5207 } 5208 5209 vector_mask_operation_helper(opc, dst, tmp, masklen); 5210 } 5211 5212 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1, 5213 Register rtmp2, int mask_len) { 5214 kmov(rtmp1, src); 5215 andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len))); 5216 mov64(rtmp2, -1L); 5217 pextq(rtmp2, rtmp2, rtmp1); 5218 kmov(dst, rtmp2); 5219 } 5220 5221 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask, 5222 bool merge, BasicType bt, int vec_enc) { 5223 if (opcode == Op_CompressV) { 5224 switch(bt) { 5225 case T_BYTE: 5226 evpcompressb(dst, mask, src, merge, vec_enc); 5227 break; 5228 case T_CHAR: 5229 case T_SHORT: 5230 evpcompressw(dst, mask, src, merge, vec_enc); 5231 break; 5232 case T_INT: 5233 evpcompressd(dst, mask, src, merge, vec_enc); 5234 break; 5235 case T_FLOAT: 5236 evcompressps(dst, mask, src, merge, vec_enc); 5237 break; 5238 case T_LONG: 5239 evpcompressq(dst, mask, src, merge, vec_enc); 5240 break; 5241 case T_DOUBLE: 5242 evcompresspd(dst, mask, src, merge, vec_enc); 5243 break; 5244 default: 5245 fatal("Unsupported type %s", type2name(bt)); 5246 break; 5247 } 5248 } else { 5249 assert(opcode == Op_ExpandV, ""); 5250 switch(bt) { 5251 case T_BYTE: 5252 evpexpandb(dst, mask, src, merge, vec_enc); 5253 break; 5254 case T_CHAR: 5255 case T_SHORT: 5256 evpexpandw(dst, mask, src, merge, vec_enc); 5257 break; 5258 case T_INT: 5259 evpexpandd(dst, mask, src, merge, vec_enc); 5260 break; 5261 case T_FLOAT: 5262 evexpandps(dst, mask, src, merge, vec_enc); 5263 break; 5264 case T_LONG: 5265 evpexpandq(dst, mask, src, merge, vec_enc); 5266 break; 5267 case T_DOUBLE: 5268 evexpandpd(dst, mask, src, merge, vec_enc); 5269 break; 5270 default: 5271 fatal("Unsupported type %s", type2name(bt)); 5272 break; 5273 } 5274 } 5275 } 5276 #endif 5277 5278 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5279 KRegister ktmp1, int vec_enc) { 5280 if (opcode == Op_SignumVD) { 5281 vsubpd(dst, zero, one, vec_enc); 5282 // if src < 0 ? -1 : 1 5283 evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5284 evblendmpd(dst, ktmp1, one, dst, true, vec_enc); 5285 // if src == NaN, -0.0 or 0.0 return src. 5286 evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5287 evblendmpd(dst, ktmp1, dst, src, true, vec_enc); 5288 } else { 5289 assert(opcode == Op_SignumVF, ""); 5290 vsubps(dst, zero, one, vec_enc); 5291 // if src < 0 ? -1 : 1 5292 evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5293 evblendmps(dst, ktmp1, one, dst, true, vec_enc); 5294 // if src == NaN, -0.0 or 0.0 return src. 5295 evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5296 evblendmps(dst, ktmp1, dst, src, true, vec_enc); 5297 } 5298 } 5299 5300 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5301 XMMRegister xtmp1, int vec_enc) { 5302 if (opcode == Op_SignumVD) { 5303 vsubpd(dst, zero, one, vec_enc); 5304 // if src < 0 ? -1 : 1 5305 vblendvpd(dst, one, dst, src, vec_enc); 5306 // if src == NaN, -0.0 or 0.0 return src. 5307 vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc); 5308 vblendvpd(dst, dst, src, xtmp1, vec_enc); 5309 } else { 5310 assert(opcode == Op_SignumVF, ""); 5311 vsubps(dst, zero, one, vec_enc); 5312 // if src < 0 ? 
-1 : 1 5313 vblendvps(dst, one, dst, src, vec_enc); 5314 // if src == NaN, -0.0 or 0.0 return src. 5315 vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc); 5316 vblendvps(dst, dst, src, xtmp1, vec_enc); 5317 } 5318 } 5319 5320 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) { 5321 if (VM_Version::supports_avx512bw()) { 5322 if (mask_len > 32) { 5323 kmovql(dst, src); 5324 } else { 5325 kmovdl(dst, src); 5326 if (mask_len != 32) { 5327 kshiftrdl(dst, dst, 32 - mask_len); 5328 } 5329 } 5330 } else { 5331 assert(mask_len <= 16, ""); 5332 kmovwl(dst, src); 5333 if (mask_len != 16) { 5334 kshiftrwl(dst, dst, 16 - mask_len); 5335 } 5336 } 5337 } 5338 5339 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) { 5340 int lane_size = type2aelembytes(bt); 5341 bool is_LP64 = LP64_ONLY(true) NOT_LP64(false); 5342 if ((is_LP64 || lane_size < 8) && 5343 ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) || 5344 (is_subword_type(bt) && VM_Version::supports_avx512vlbw()))) { 5345 movptr(rtmp, imm32); 5346 switch(lane_size) { 5347 case 1 : evpbroadcastb(dst, rtmp, vec_enc); break; 5348 case 2 : evpbroadcastw(dst, rtmp, vec_enc); break; 5349 case 4 : evpbroadcastd(dst, rtmp, vec_enc); break; 5350 case 8 : evpbroadcastq(dst, rtmp, vec_enc); break; 5351 default : fatal("Unsupported lane size %d", lane_size); 5352 break; 5353 } 5354 } else { 5355 movptr(rtmp, imm32); 5356 LP64_ONLY(movq(dst, rtmp)) NOT_LP64(movdl(dst, rtmp)); 5357 switch(lane_size) { 5358 case 1 : vpbroadcastb(dst, dst, vec_enc); break; 5359 case 2 : vpbroadcastw(dst, dst, vec_enc); break; 5360 case 4 : vpbroadcastd(dst, dst, vec_enc); break; 5361 case 8 : vpbroadcastq(dst, dst, vec_enc); break; 5362 default : fatal("Unsupported lane size %d", lane_size); 5363 break; 5364 } 5365 } 5366 } 5367 5368 // 5369 // Following is a lookup table based popcount computation algorithm:- 5370 // Index Bit set count 5371 // [ 0000 -> 0, 5372 // 0001 -> 1, 5373 // 0010 -> 1, 5374 // 0011 -> 2, 5375 // 0100 -> 1, 5376 // 0101 -> 2, 5377 // 0110 -> 2, 5378 // 0111 -> 3, 5379 // 1000 -> 1, 5380 // 1001 -> 2, 5381 // 1010 -> 2, 5382 // 1011 -> 3, 5383 // 1100 -> 2, 5384 // 1101 -> 3, 5385 // 1110 -> 3, 1111 -> 4 ] 5386 // a. Count the number of 1s in the 4 LSB bits of each byte. These bits are used as 5387 // shuffle indices for lookup table access. 5388 // b. Right shift each byte of vector lane by 4 positions. 5389 // c. Count the number of 1s in the 4 MSB bits of each byte. These bits are used as 5390 // shuffle indices for lookup table access. 5391 // d. Add the bitset count of upper and lower 4 bits of each byte. 5392 // e. Unpack double words to quad words and compute sum of absolute difference of bitset 5393 // count of all the bytes of a quadword. 5394 // f. Perform step e. for upper 128bit vector lane. 5395 // g. Pack the bitset count of quadwords back to double word. 5396 // h. Unpacking and packing operations are not needed for 64bit vector lane.
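//
// A rough scalar sketch of steps a-d (illustration only; the vector code below
// performs the same per-byte computation lane-wise with vpshufb):
//
//   static inline uint8_t popcount_byte_lut(uint8_t b) {
//     static const uint8_t lut[16] = { 0, 1, 1, 2, 1, 2, 2, 3,
//                                      1, 2, 2, 3, 2, 3, 3, 4 };
//     return lut[b & 0x0F] + lut[b >> 4];   // steps a-d: low-nibble count + high-nibble count
//   }
//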
5397 5398 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5399 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5400 assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), ""); 5401 vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc); 5402 vpsrlw(dst, src, 4, vec_enc); 5403 vpand(dst, dst, xtmp1, vec_enc); 5404 vpand(xtmp1, src, xtmp1, vec_enc); 5405 vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg); 5406 vpshufb(xtmp1, xtmp2, xtmp1, vec_enc); 5407 vpshufb(dst, xtmp2, dst, vec_enc); 5408 vpaddb(dst, dst, xtmp1, vec_enc); 5409 } 5410 5411 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5412 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5413 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5414 // Following code is as per steps e,f,g and h of above algorithm. 5415 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5416 vpunpckhdq(dst, xtmp1, xtmp2, vec_enc); 5417 vpsadbw(dst, dst, xtmp2, vec_enc); 5418 vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc); 5419 vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc); 5420 vpackuswb(dst, xtmp1, dst, vec_enc); 5421 } 5422 5423 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5424 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5425 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5426 // Add the popcount of upper and lower bytes of word. 5427 vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc); 5428 vpsrlw(dst, xtmp1, 8, vec_enc); 5429 vpand(xtmp1, xtmp1, xtmp2, vec_enc); 5430 vpaddw(dst, dst, xtmp1, vec_enc); 5431 } 5432 5433 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5434 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5435 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5436 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5437 vpsadbw(dst, xtmp1, xtmp2, vec_enc); 5438 } 5439 5440 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5441 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5442 switch(bt) { 5443 case T_LONG: 5444 vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5445 break; 5446 case T_INT: 5447 vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5448 break; 5449 case T_CHAR: 5450 case T_SHORT: 5451 vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5452 break; 5453 case T_BYTE: 5454 case T_BOOLEAN: 5455 vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5456 break; 5457 default: 5458 fatal("Unsupported type %s", type2name(bt)); 5459 break; 5460 } 5461 } 5462 5463 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src, 5464 KRegister mask, bool merge, int vec_enc) { 5465 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 5466 switch(bt) { 5467 case T_LONG: 5468 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 5469 evpopcntq(dst, mask, src, merge, vec_enc); 5470 break; 5471 case T_INT: 5472 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 5473 evpopcntd(dst, mask, src, merge, vec_enc); 5474 break; 5475 case T_CHAR: 5476 case T_SHORT: 5477 assert(VM_Version::supports_avx512_bitalg(), ""); 5478 evpopcntw(dst, mask, src, merge, vec_enc); 5479 break; 5480 case T_BYTE: 5481 case T_BOOLEAN: 5482 assert(VM_Version::supports_avx512_bitalg(), ""); 5483 evpopcntb(dst, mask, 
src, merge, vec_enc); 5484 break; 5485 default: 5486 fatal("Unsupported type %s", type2name(bt)); 5487 break; 5488 } 5489 } 5490 5491 #ifndef _LP64 5492 void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) { 5493 assert(VM_Version::supports_avx512bw(), ""); 5494 kmovdl(tmp, src); 5495 kunpckdql(dst, tmp, tmp); 5496 } 5497 #endif 5498 5499 // Bit reversal algorithm first reverses the bits of each byte followed by 5500 // a byte level reversal for multi-byte primitive types (short/int/long). 5501 // Algorithm performs a lookup table access to get reverse bit sequence 5502 // corresponding to a 4 bit value. Thus a reverse bit sequence for a byte 5503 // is obtained by swapping the reverse bit sequences of upper and lower 5504 // nibble of a byte. 5505 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5506 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5507 if (VM_Version::supports_avx512vlbw()) { 5508 5509 // Get the reverse bit sequence of lower nibble of each byte. 5510 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg); 5511 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 5512 evpandq(dst, xtmp2, src, vec_enc); 5513 vpshufb(dst, xtmp1, dst, vec_enc); 5514 vpsllq(dst, dst, 4, vec_enc); 5515 5516 // Get the reverse bit sequence of upper nibble of each byte. 5517 vpandn(xtmp2, xtmp2, src, vec_enc); 5518 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 5519 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 5520 5521 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 5522 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 5523 evporq(xtmp2, dst, xtmp2, vec_enc); 5524 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 5525 5526 } else if(vec_enc == Assembler::AVX_512bit) { 5527 // Shift based bit reversal. 5528 assert(bt == T_LONG || bt == T_INT, ""); 5529 5530 // Swap lower and upper nibble of each byte. 5531 vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc); 5532 5533 // Swap two least and most significant bits of each nibble. 5534 vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc); 5535 5536 // Swap adjacent pair of bits. 5537 evmovdqul(xtmp1, k0, dst, true, vec_enc); 5538 vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc); 5539 5540 evmovdqul(xtmp1, k0, dst, true, vec_enc); 5541 vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc); 5542 } else { 5543 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp); 5544 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 5545 5546 // Get the reverse bit sequence of lower nibble of each byte. 5547 vpand(dst, xtmp2, src, vec_enc); 5548 vpshufb(dst, xtmp1, dst, vec_enc); 5549 vpsllq(dst, dst, 4, vec_enc); 5550 5551 // Get the reverse bit sequence of upper nibble of each byte. 5552 vpandn(xtmp2, xtmp2, src, vec_enc); 5553 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 5554 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 5555 5556 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 5557 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 
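// e.g. for the byte 0x1E (0001 1110): reverse(low nibble 0xE) == 0x7, shifted left by 4 gives 0x70;
// reverse(high nibble 0x1) == 0x8; OR-ing the two yields 0x78, the bit-reversed byte.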
5558 vpor(xtmp2, dst, xtmp2, vec_enc); 5559 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 5560 } 5561 } 5562 5563 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc, 5564 XMMRegister xtmp, Register rscratch) { 5565 assert(VM_Version::supports_gfni(), ""); 5566 assert(rscratch != noreg || always_reachable(mask), "missing"); 5567 5568 // Galois field instruction based bit reversal based on following algorithm. 5569 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 5570 vpbroadcastq(xtmp, mask, vec_enc, rscratch); 5571 vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc); 5572 vector_reverse_byte(bt, dst, xtmp, vec_enc); 5573 } 5574 5575 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src, 5576 XMMRegister xtmp1, Register rtmp, int vec_enc) { 5577 vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc); 5578 evpandq(dst, xtmp1, src, vec_enc); 5579 vpsllq(dst, dst, nbits, vec_enc); 5580 vpandn(xtmp1, xtmp1, src, vec_enc); 5581 vpsrlq(xtmp1, xtmp1, nbits, vec_enc); 5582 evporq(dst, dst, xtmp1, vec_enc); 5583 } 5584 5585 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5586 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5587 // Shift based bit reversal. 5588 assert(VM_Version::supports_evex(), ""); 5589 switch(bt) { 5590 case T_LONG: 5591 // Swap upper and lower double word of each quad word. 5592 evprorq(xtmp1, k0, src, 32, true, vec_enc); 5593 evprord(xtmp1, k0, xtmp1, 16, true, vec_enc); 5594 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 5595 break; 5596 case T_INT: 5597 // Swap upper and lower word of each double word. 5598 evprord(xtmp1, k0, src, 16, true, vec_enc); 5599 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 5600 break; 5601 case T_CHAR: 5602 case T_SHORT: 5603 // Swap upper and lower byte of each word. 5604 vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc); 5605 break; 5606 case T_BYTE: 5607 evmovdquq(dst, k0, src, true, vec_enc); 5608 break; 5609 default: 5610 fatal("Unsupported type %s", type2name(bt)); 5611 break; 5612 } 5613 } 5614 5615 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) { 5616 if (bt == T_BYTE) { 5617 if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) { 5618 evmovdquq(dst, k0, src, true, vec_enc); 5619 } else { 5620 vmovdqu(dst, src); 5621 } 5622 return; 5623 } 5624 // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using 5625 // pre-computed shuffle indices. 
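// e.g. for T_INT the permutation constant maps each 4-byte group (b0, b1, b2, b3) to (b3, b2, b1, b0),
// so a single vpshufb reverses the byte order of every int lane.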
5626 switch(bt) { 5627 case T_LONG: 5628 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg); 5629 break; 5630 case T_INT: 5631 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg); 5632 break; 5633 case T_CHAR: 5634 case T_SHORT: 5635 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg); 5636 break; 5637 default: 5638 fatal("Unsupported type %s", type2name(bt)); 5639 break; 5640 } 5641 vpshufb(dst, src, dst, vec_enc); 5642 } 5643 5644 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src, 5645 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 5646 KRegister ktmp, Register rtmp, bool merge, int vec_enc) { 5647 assert(is_integral_type(bt), ""); 5648 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 5649 assert(VM_Version::supports_avx512cd(), ""); 5650 switch(bt) { 5651 case T_LONG: 5652 evplzcntq(dst, ktmp, src, merge, vec_enc); 5653 break; 5654 case T_INT: 5655 evplzcntd(dst, ktmp, src, merge, vec_enc); 5656 break; 5657 case T_SHORT: 5658 vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc); 5659 vpunpcklwd(xtmp2, xtmp1, src, vec_enc); 5660 evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc); 5661 vpunpckhwd(dst, xtmp1, src, vec_enc); 5662 evplzcntd(dst, ktmp, dst, merge, vec_enc); 5663 vpackusdw(dst, xtmp2, dst, vec_enc); 5664 break; 5665 case T_BYTE: 5666 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 5667 // accessing the lookup table. 5668 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 5669 // accessing the lookup table. 5670 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 5671 assert(VM_Version::supports_avx512bw(), ""); 5672 evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp); 5673 vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc); 5674 vpand(xtmp2, dst, src, vec_enc); 5675 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 5676 vpsrlw(xtmp3, src, 4, vec_enc); 5677 vpand(xtmp3, dst, xtmp3, vec_enc); 5678 vpshufb(dst, xtmp1, xtmp3, vec_enc); 5679 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 5680 evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc); 5681 evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc); 5682 break; 5683 default: 5684 fatal("Unsupported type %s", type2name(bt)); 5685 break; 5686 } 5687 } 5688 5689 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5690 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 5691 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp); 5692 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 5693 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 5694 // accessing the lookup table. 5695 vpand(dst, xtmp2, src, vec_enc); 5696 vpshufb(dst, xtmp1, dst, vec_enc); 5697 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 5698 // accessing the lookup table. 5699 vpsrlw(xtmp3, src, 4, vec_enc); 5700 vpand(xtmp3, xtmp2, xtmp3, vec_enc); 5701 vpshufb(xtmp2, xtmp1, xtmp3, vec_enc); 5702 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 
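// e.g. for the byte 0x05 the high nibble is zero, so its lookup value 4 is added to the low nibble's
// count of 1, giving a leading zero count of 5; for a byte with a non-zero high nibble only the high
// nibble's count is kept.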
5703 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 5704 vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc); 5705 vpaddb(dst, dst, xtmp2, vec_enc); 5706 vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc); 5707 } 5708 5709 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5710 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 5711 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 5712 // Add zero counts of lower byte and upper byte of a word if 5713 // upper byte holds a zero value. 5714 vpsrlw(xtmp3, src, 8, vec_enc); 5715 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx. 5716 vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc); 5717 vpsllw(xtmp2, dst, 8, vec_enc); 5718 vpaddw(xtmp2, xtmp2, dst, vec_enc); 5719 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 5720 vpsrlw(dst, dst, 8, vec_enc); 5721 } 5722 5723 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5724 XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) { 5725 // Since IEEE 754 floating point format represents mantissa in 1.0 format 5726 // hence biased exponent can be used to compute leading zero count as per 5727 // following formula:- 5728 // LZCNT = 32 - (biased_exp - 127) 5729 // Special handling has been introduced for Zero, Max_Int and -ve source values. 5730 5731 // Broadcast 0xFF 5732 vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc); 5733 vpsrld(xtmp1, xtmp1, 24, vec_enc); 5734 5735 // Extract biased exponent. 5736 vcvtdq2ps(dst, src, vec_enc); 5737 vpsrld(dst, dst, 23, vec_enc); 5738 vpand(dst, dst, xtmp1, vec_enc); 5739 5740 // Broadcast 127. 5741 vpsrld(xtmp1, xtmp1, 1, vec_enc); 5742 // Exponent = biased_exp - 127 5743 vpsubd(dst, dst, xtmp1, vec_enc); 5744 5745 // Exponent = Exponent + 1 5746 vpsrld(xtmp3, xtmp1, 6, vec_enc); 5747 vpaddd(dst, dst, xtmp3, vec_enc); 5748 5749 // Replace -ve exponent with zero, exponent is -ve when src 5750 // lane contains a zero value. 5751 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5752 vblendvps(dst, dst, xtmp2, dst, vec_enc); 5753 5754 // Rematerialize broadcast 32. 5755 vpslld(xtmp1, xtmp3, 5, vec_enc); 5756 // Exponent is 32 if corresponding source lane contains max_int value. 5757 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 5758 // LZCNT = 32 - exponent 5759 vpsubd(dst, xtmp1, dst, vec_enc); 5760 5761 // Replace LZCNT with a value 1 if corresponding source lane 5762 // contains max_int value. 5763 vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc); 5764 5765 // Replace biased_exp with 0 if source lane value is less than zero. 5766 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5767 vblendvps(dst, dst, xtmp2, src, vec_enc); 5768 } 5769 5770 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5771 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 5772 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 5773 // Add zero counts of lower word and upper word of a double word if 5774 // upper word holds a zero value. 5775 vpsrld(xtmp3, src, 16, vec_enc); 5776 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx. 5777 vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc); 5778 vpslld(xtmp2, dst, 16, vec_enc); 5779 vpaddd(xtmp2, xtmp2, dst, vec_enc); 5780 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 5781 vpsrld(dst, dst, 16, vec_enc); 5782 // Add zero counts of lower doubleword and upper doubleword of a 5783 // quadword if upper doubleword holds a zero value. 
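// e.g. for src == 0x0000000000000F00 the upper doubleword is zero (count 32) and the lower doubleword
// contributes 20, so the quadword leading zero count becomes 52.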
5784 vpsrlq(xtmp3, src, 32, vec_enc); 5785 vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc); 5786 vpsllq(xtmp2, dst, 32, vec_enc); 5787 vpaddq(xtmp2, xtmp2, dst, vec_enc); 5788 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 5789 vpsrlq(dst, dst, 32, vec_enc); 5790 } 5791 5792 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, 5793 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 5794 Register rtmp, int vec_enc) { 5795 assert(is_integral_type(bt), "unexpected type"); 5796 assert(vec_enc < Assembler::AVX_512bit, ""); 5797 switch(bt) { 5798 case T_LONG: 5799 vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 5800 break; 5801 case T_INT: 5802 vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc); 5803 break; 5804 case T_SHORT: 5805 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 5806 break; 5807 case T_BYTE: 5808 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 5809 break; 5810 default: 5811 fatal("Unsupported type %s", type2name(bt)); 5812 break; 5813 } 5814 } 5815 5816 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) { 5817 switch(bt) { 5818 case T_BYTE: 5819 vpsubb(dst, src1, src2, vec_enc); 5820 break; 5821 case T_SHORT: 5822 vpsubw(dst, src1, src2, vec_enc); 5823 break; 5824 case T_INT: 5825 vpsubd(dst, src1, src2, vec_enc); 5826 break; 5827 case T_LONG: 5828 vpsubq(dst, src1, src2, vec_enc); 5829 break; 5830 default: 5831 fatal("Unsupported type %s", type2name(bt)); 5832 break; 5833 } 5834 } 5835 5836 // Trailing zero count computation is based on leading zero count operation as per 5837 // following equation. All AVX3 targets support AVX512CD feature which offers 5838 // direct vector instruction to compute leading zero count. 
5839 // CTZ = PRIM_TYPE_WIDHT - CLZ((x - 1) & ~x) 5840 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src, 5841 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 5842 XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) { 5843 assert(is_integral_type(bt), ""); 5844 // xtmp = -1 5845 vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc); 5846 // xtmp = xtmp + src 5847 vpadd(bt, xtmp4, xtmp4, src, vec_enc); 5848 // xtmp = xtmp & ~src 5849 vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc); 5850 vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc); 5851 vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc); 5852 vpsub(bt, dst, xtmp4, dst, vec_enc); 5853 } 5854 5855 // Trailing zero count computation for AVX2 targets is based on popcount operation as per following equation 5856 // CTZ = PRIM_TYPE_WIDHT - POPC(x | -x) 5857 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5858 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 5859 assert(is_integral_type(bt), ""); 5860 // xtmp = 0 5861 vpxor(xtmp3 , xtmp3, xtmp3, vec_enc); 5862 // xtmp = 0 - src 5863 vpsub(bt, xtmp3, xtmp3, src, vec_enc); 5864 // xtmp = xtmp | src 5865 vpor(xtmp3, xtmp3, src, vec_enc); 5866 vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc); 5867 vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc); 5868 vpsub(bt, dst, xtmp1, dst, vec_enc); 5869 } 5870 5871 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) { 5872 Label done; 5873 Label neg_divisor_fastpath; 5874 cmpl(divisor, 0); 5875 jccb(Assembler::less, neg_divisor_fastpath); 5876 xorl(rdx, rdx); 5877 divl(divisor); 5878 jmpb(done); 5879 bind(neg_divisor_fastpath); 5880 // Fastpath for divisor < 0: 5881 // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1) 5882 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned() 5883 movl(rdx, rax); 5884 subl(rdx, divisor); 5885 if (VM_Version::supports_bmi1()) { 5886 andnl(rax, rdx, rax); 5887 } else { 5888 notl(rdx); 5889 andl(rax, rdx); 5890 } 5891 shrl(rax, 31); 5892 bind(done); 5893 } 5894 5895 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) { 5896 Label done; 5897 Label neg_divisor_fastpath; 5898 cmpl(divisor, 0); 5899 jccb(Assembler::less, neg_divisor_fastpath); 5900 xorl(rdx, rdx); 5901 divl(divisor); 5902 jmpb(done); 5903 bind(neg_divisor_fastpath); 5904 // Fastpath when divisor < 0: 5905 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor) 5906 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned() 5907 movl(rdx, rax); 5908 subl(rax, divisor); 5909 if (VM_Version::supports_bmi1()) { 5910 andnl(rax, rax, rdx); 5911 } else { 5912 notl(rax); 5913 andl(rax, rdx); 5914 } 5915 sarl(rax, 31); 5916 andl(rax, divisor); 5917 subl(rdx, rax); 5918 bind(done); 5919 } 5920 5921 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) { 5922 Label done; 5923 Label neg_divisor_fastpath; 5924 5925 cmpl(divisor, 0); 5926 jccb(Assembler::less, neg_divisor_fastpath); 5927 xorl(rdx, rdx); 5928 divl(divisor); 5929 jmpb(done); 5930 bind(neg_divisor_fastpath); 5931 // Fastpath for divisor < 0: 5932 // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1) 5933 // 
remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor) 5934 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in 5935 // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned() 5936 movl(rdx, rax); 5937 subl(rax, divisor); 5938 if (VM_Version::supports_bmi1()) { 5939 andnl(rax, rax, rdx); 5940 } else { 5941 notl(rax); 5942 andl(rax, rdx); 5943 } 5944 movl(tmp, rax); 5945 shrl(rax, 31); // quotient 5946 sarl(tmp, 31); 5947 andl(tmp, divisor); 5948 subl(rdx, tmp); // remainder 5949 bind(done); 5950 } 5951 5952 #ifdef _LP64 5953 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1, 5954 XMMRegister xtmp2, Register rtmp) { 5955 if(VM_Version::supports_gfni()) { 5956 // Galois field instruction based bit reversal based on following algorithm. 5957 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 5958 mov64(rtmp, 0x8040201008040201L); 5959 movq(xtmp1, src); 5960 movq(xtmp2, rtmp); 5961 gf2p8affineqb(xtmp1, xtmp2, 0); 5962 movq(dst, xtmp1); 5963 } else { 5964 // Swap even and odd numbered bits. 5965 movl(rtmp, src); 5966 andl(rtmp, 0x55555555); 5967 shll(rtmp, 1); 5968 movl(dst, src); 5969 andl(dst, 0xAAAAAAAA); 5970 shrl(dst, 1); 5971 orl(dst, rtmp); 5972 5973 // Swap LSB and MSB 2 bits of each nibble. 5974 movl(rtmp, dst); 5975 andl(rtmp, 0x33333333); 5976 shll(rtmp, 2); 5977 andl(dst, 0xCCCCCCCC); 5978 shrl(dst, 2); 5979 orl(dst, rtmp); 5980 5981 // Swap LSB and MSB 4 bits of each byte. 5982 movl(rtmp, dst); 5983 andl(rtmp, 0x0F0F0F0F); 5984 shll(rtmp, 4); 5985 andl(dst, 0xF0F0F0F0); 5986 shrl(dst, 4); 5987 orl(dst, rtmp); 5988 } 5989 bswapl(dst); 5990 } 5991 5992 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1, 5993 XMMRegister xtmp2, Register rtmp1, Register rtmp2) { 5994 if(VM_Version::supports_gfni()) { 5995 // Galois field instruction based bit reversal based on following algorithm. 5996 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 5997 mov64(rtmp1, 0x8040201008040201L); 5998 movq(xtmp1, src); 5999 movq(xtmp2, rtmp1); 6000 gf2p8affineqb(xtmp1, xtmp2, 0); 6001 movq(dst, xtmp1); 6002 } else { 6003 // Swap even and odd numbered bits. 6004 movq(rtmp1, src); 6005 mov64(rtmp2, 0x5555555555555555L); 6006 andq(rtmp1, rtmp2); 6007 shlq(rtmp1, 1); 6008 movq(dst, src); 6009 notq(rtmp2); 6010 andq(dst, rtmp2); 6011 shrq(dst, 1); 6012 orq(dst, rtmp1); 6013 6014 // Swap LSB and MSB 2 bits of each nibble. 6015 movq(rtmp1, dst); 6016 mov64(rtmp2, 0x3333333333333333L); 6017 andq(rtmp1, rtmp2); 6018 shlq(rtmp1, 2); 6019 notq(rtmp2); 6020 andq(dst, rtmp2); 6021 shrq(dst, 2); 6022 orq(dst, rtmp1); 6023 6024 // Swap LSB and MSB 4 bits of each byte. 
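// After this last swap each byte is fully bit-reversed (e.g. 0xB4 becomes 0x2D); the final bswapq below
// then reverses the byte order, completing the 64-bit bit reversal.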
6025 movq(rtmp1, dst); 6026 mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL); 6027 andq(rtmp1, rtmp2); 6028 shlq(rtmp1, 4); 6029 notq(rtmp2); 6030 andq(dst, rtmp2); 6031 shrq(dst, 4); 6032 orq(dst, rtmp1); 6033 } 6034 bswapq(dst); 6035 } 6036 6037 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) { 6038 Label done; 6039 Label neg_divisor_fastpath; 6040 cmpq(divisor, 0); 6041 jccb(Assembler::less, neg_divisor_fastpath); 6042 xorl(rdx, rdx); 6043 divq(divisor); 6044 jmpb(done); 6045 bind(neg_divisor_fastpath); 6046 // Fastpath for divisor < 0: 6047 // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1) 6048 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned() 6049 movq(rdx, rax); 6050 subq(rdx, divisor); 6051 if (VM_Version::supports_bmi1()) { 6052 andnq(rax, rdx, rax); 6053 } else { 6054 notq(rdx); 6055 andq(rax, rdx); 6056 } 6057 shrq(rax, 63); 6058 bind(done); 6059 } 6060 6061 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) { 6062 Label done; 6063 Label neg_divisor_fastpath; 6064 cmpq(divisor, 0); 6065 jccb(Assembler::less, neg_divisor_fastpath); 6066 xorq(rdx, rdx); 6067 divq(divisor); 6068 jmp(done); 6069 bind(neg_divisor_fastpath); 6070 // Fastpath when divisor < 0: 6071 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor) 6072 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned() 6073 movq(rdx, rax); 6074 subq(rax, divisor); 6075 if (VM_Version::supports_bmi1()) { 6076 andnq(rax, rax, rdx); 6077 } else { 6078 notq(rax); 6079 andq(rax, rdx); 6080 } 6081 sarq(rax, 63); 6082 andq(rax, divisor); 6083 subq(rdx, rax); 6084 bind(done); 6085 } 6086 6087 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) { 6088 Label done; 6089 Label neg_divisor_fastpath; 6090 cmpq(divisor, 0); 6091 jccb(Assembler::less, neg_divisor_fastpath); 6092 xorq(rdx, rdx); 6093 divq(divisor); 6094 jmp(done); 6095 bind(neg_divisor_fastpath); 6096 // Fastpath for divisor < 0: 6097 // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1) 6098 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor) 6099 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in 6100 // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned() 6101 movq(rdx, rax); 6102 subq(rax, divisor); 6103 if (VM_Version::supports_bmi1()) { 6104 andnq(rax, rax, rdx); 6105 } else { 6106 notq(rax); 6107 andq(rax, rdx); 6108 } 6109 movq(tmp, rax); 6110 shrq(rax, 63); // quotient 6111 sarq(tmp, 63); 6112 andq(tmp, divisor); 6113 subq(rdx, tmp); // remainder 6114 bind(done); 6115 } 6116 #endif 6117 6118 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1, 6119 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp, 6120 int vlen_enc) { 6121 assert(VM_Version::supports_avx512bw(), ""); 6122 // Byte shuffles are inlane operations and indices are determined using 6123 // lower 4 bit of each shuffle lane, thus all shuffle indices are 6124 // normalized to index range 0-15. This makes sure that all the multiples 6125 // of an index value are placed at same relative position in 128 bit 6126 // lane i.e. elements corresponding to shuffle indices 16, 32 and 64 6127 // will be 16th element in their respective 128 bit lanes. 
6128 movl(rtmp, 16); 6129 evpbroadcastb(xtmp1, rtmp, vlen_enc); 6130 6131 // Compute a mask for shuffle vector by comparing indices with expression INDEX < 16. 6132 // Broadcast first 128 bit lane across entire vector, shuffle the vector lanes using 6133 // original shuffle indices and move the shuffled lanes corresponding to true 6134 // mask to destination vector. 6135 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc); 6136 evshufi64x2(xtmp2, src, src, 0x0, vlen_enc); 6137 evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc); 6138 6139 // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32 6140 // and broadcasting second 128 bit lane. 6141 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc); 6142 vpsllq(xtmp2, xtmp1, 0x1, vlen_enc); 6143 evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc); 6144 evshufi64x2(xtmp3, src, src, 0x55, vlen_enc); 6145 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc); 6146 6147 // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48 6148 // and broadcasting third 128 bit lane. 6149 evpcmpb(ktmp, k0, shuffle, xtmp2, Assembler::nlt, true, vlen_enc); 6150 vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc); 6151 evpcmpb(ktmp, ktmp, shuffle, xtmp1, Assembler::lt, true, vlen_enc); 6152 evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc); 6153 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc); 6154 6155 // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64 6156 // and broadcasting fourth 128 bit lane. 6157 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc); 6158 vpsllq(xtmp2, xtmp2, 0x1, vlen_enc); 6159 evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc); 6160 evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc); 6161 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc); 6162 } 6163 6164 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst, 6165 XMMRegister shuffle, XMMRegister src, int vlen_enc) { 6166 if (vlen_enc == AVX_128bit) { 6167 vpermilps(dst, src, shuffle, vlen_enc); 6168 } else if (bt == T_INT) { 6169 vpermd(dst, shuffle, src, vlen_enc); 6170 } else { 6171 assert(bt == T_FLOAT, ""); 6172 vpermps(dst, shuffle, src, vlen_enc); 6173 } 6174 }