/*
 * Copyright (c) 2020, 2023, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "oops/methodData.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/output.hpp"
#include "opto/opcodes.hpp"
#include "opto/subnode.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/stubRoutines.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

// C2 compiled method's prolog code.
void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) {

  // WARNING: Initial instruction MUST be 5 bytes or longer so that
  // NativeJump::patch_verified_entry will be able to patch out the entry
  // code safely. The push to verify stack depth is ok at 5 bytes,
  // the frame allocation can be either 3 or 6 bytes. So if we don't do
  // stack bang then we must use the 6 byte frame allocation even if
  // we have no frame. :-(
  assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");

  assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  // Remove word for return addr
  framesize -= wordSize;
  stack_bang_size -= wordSize;

  // Calls to C2R adapters often do not accept exceptional returns.
  // We require that their callers must bang for them. But be careful, because
  // some VM calls (such as call site linkage) can use several kilobytes of
  // stack. But the stack safety zone should account for that.
  // See bugs 4446381, 4468289, 4497237.
  if (stack_bang_size > 0) {
    generate_stack_overflow_check(stack_bang_size);

    // We always push rbp, so that on return to interpreter rbp, will be
    // restored correctly and we can correct the stack.
    push(rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      mov(rbp, rsp);
    }
    // Remove word for ebp
    framesize -= wordSize;

    // Create frame
    if (framesize) {
      subptr(rsp, framesize);
    }
  } else {
    // Create frame (force generation of a 4 byte immediate value)
    subptr_imm32(rsp, framesize);

    // Save RBP register now.
    framesize -= wordSize;
    movptr(Address(rsp, framesize), rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      movptr(rbp, rsp);
      if (framesize > 0) {
        addptr(rbp, framesize);
      }
    }
  }

  if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
    framesize -= wordSize;
    movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
  }

#ifndef _LP64
  // If method sets FPU control word do it now
  if (fp_mode_24b) {
    fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24()));
  }
  if (UseSSE >= 2 && VerifyFPU) {
    verify_FPU(0, "FPU stack must be clean on entry");
  }
#endif

#ifdef ASSERT
  if (VerifyStackAtCalls) {
    Label L;
    push(rax);
    mov(rax, rsp);
    andptr(rax, StackAlignmentInBytes-1);
    cmpptr(rax, StackAlignmentInBytes-wordSize);
    pop(rax);
    jcc(Assembler::equal, L);
    STOP("Stack is not properly aligned!");
    bind(L);
  }
#endif

  if (!is_stub) {
    BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
#ifdef _LP64
    if (BarrierSet::barrier_set()->barrier_set_nmethod() != nullptr) {
      // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
      Label dummy_slow_path;
      Label dummy_continuation;
      Label* slow_path = &dummy_slow_path;
      Label* continuation = &dummy_continuation;
      if (!Compile::current()->output()->in_scratch_emit_size()) {
        // Use real labels from actual stub when not emitting code for the purpose of measuring its size
        C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
        Compile::current()->output()->add_stub(stub);
        slow_path = &stub->entry();
        continuation = &stub->continuation();
      }
      bs->nmethod_entry_barrier(this, slow_path, continuation);
    }
#else
    // Don't bother with out-of-line nmethod entry barrier stub for x86_32.
    bs->nmethod_entry_barrier(this, nullptr /* slow_path */, nullptr /* continuation */);
#endif
  }
}

inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
  switch (vlen_in_bytes) {
    case  4: // fall-through
    case  8: // fall-through
    case 16: return Assembler::AVX_128bit;
    case 32: return Assembler::AVX_256bit;
    case 64: return Assembler::AVX_512bit;

    default: {
      ShouldNotReachHere();
      return Assembler::AVX_NoVec;
    }
  }
}

#if INCLUDE_RTM_OPT

// Update rtm_counters based on abort status
// input: abort_status
//        rtm_counters (RTMLockingCounters*)
// flags are killed
void C2_MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {

  atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
  if (PrintPreciseRTMLockingStatistics) {
    for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
      Label check_abort;
      testl(abort_status, (1<<i));
      jccb(Assembler::equal, check_abort);
      atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
      bind(check_abort);
    }
  }
}

// Branch if (random & (count-1) != 0), count is 2^n
// tmp, scr and flags are killed
void C2_MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
  assert(tmp == rax, "");
  assert(scr == rdx, "");
  rdtsc(); // modifies EDX:EAX
  andptr(tmp, count-1);
  jccb(Assembler::notZero, brLabel);
}

// Perform abort ratio calculation, set no_rtm bit if high ratio
// input:  rtm_counters_Reg (RTMLockingCounters* address)
// tmpReg, rtm_counters_Reg and flags are killed
void C2_MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
                                                    Register rtm_counters_Reg,
                                                    RTMLockingCounters* rtm_counters,
                                                    Metadata* method_data) {
  Label L_done, L_check_always_rtm1, L_check_always_rtm2;

  if (RTMLockingCalculationDelay > 0) {
    // Delay calculation
    movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()));
    testptr(tmpReg, tmpReg);
    jccb(Assembler::equal, L_done);
  }
  // Abort ratio calculation only if abort_count > RTMAbortThreshold
  //   Aborted transactions = abort_count * 100
  //   All transactions = total_count *  RTMTotalCountIncrRate
  //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)

  movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
  cmpptr(tmpReg, RTMAbortThreshold);
  jccb(Assembler::below, L_check_always_rtm2);
  imulptr(tmpReg, tmpReg, 100);

  Register scrReg = rtm_counters_Reg;
  movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
  imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
  imulptr(scrReg, scrReg, RTMAbortRatio);
  cmpptr(tmpReg, scrReg);
  jccb(Assembler::below, L_check_always_rtm1);
  if (method_data != nullptr) {
    // set rtm_state to "no rtm" in MDO
    mov_metadata(tmpReg, method_data);
    lock();
    orl(Address(tmpReg, MethodData::rtm_state_offset()), NoRTM);
  }
  jmpb(L_done);
  bind(L_check_always_rtm1);
  // Reload RTMLockingCounters* address
  lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
  bind(L_check_always_rtm2);
  movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
  cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
  jccb(Assembler::below, L_done);
  if (method_data != nullptr) {
    // set rtm_state to "always rtm" in MDO
    mov_metadata(tmpReg, method_data);
    lock();
    orl(Address(tmpReg, MethodData::rtm_state_offset()), UseRTM);
  }
  bind(L_done);
}

// Update counters and perform abort ratio calculation
// input:  abort_status_Reg
// rtm_counters_Reg, flags are killed
void C2_MacroAssembler::rtm_profiling(Register abort_status_Reg,
                                      Register rtm_counters_Reg,
                                      RTMLockingCounters* rtm_counters,
                                      Metadata* method_data,
                                      bool profile_rtm) {

  assert(rtm_counters != nullptr, "should not be null when profiling RTM");
  // update rtm counters based on rax value at abort
  // reads abort_status_Reg, updates flags
  lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
  rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
  if (profile_rtm) {
    // Save abort status because abort_status_Reg is used by following code.
    if (RTMRetryCount > 0) {
      push(abort_status_Reg);
    }
    assert(rtm_counters != nullptr, "should not be null when profiling RTM");
    rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
    // restore abort status
    if (RTMRetryCount > 0) {
      pop(abort_status_Reg);
    }
  }
}

// Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
// inputs: retry_count_Reg
//       : abort_status_Reg
// output: retry_count_Reg decremented by 1
// flags are killed
void C2_MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
  Label doneRetry;
  assert(abort_status_Reg == rax, "");
  // The abort reason bits are in eax (see all states in rtmLocking.hpp)
  // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
  // if reason is in 0x6 and retry count != 0 then retry
  andptr(abort_status_Reg, 0x6);
  jccb(Assembler::zero, doneRetry);
  testl(retry_count_Reg, retry_count_Reg);
  jccb(Assembler::zero, doneRetry);
  pause();
  decrementl(retry_count_Reg);
  jmp(retryLabel);
  bind(doneRetry);
}

// Spin and retry if lock is busy,
// inputs: box_Reg (monitor address)
//       : retry_count_Reg
// output: retry_count_Reg decremented by 1
//       : clear z flag if retry count exceeded
// tmp_Reg, scr_Reg, flags are killed
void C2_MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
                                               Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
  Label SpinLoop, SpinExit, doneRetry;
  int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);

  testl(retry_count_Reg, retry_count_Reg);
  jccb(Assembler::zero, doneRetry);
  decrementl(retry_count_Reg);
  movptr(scr_Reg, RTMSpinLoopCount);

  bind(SpinLoop);
  pause();
  decrementl(scr_Reg);
  jccb(Assembler::lessEqual, SpinExit);
  movptr(tmp_Reg, Address(box_Reg, owner_offset));
  testptr(tmp_Reg, tmp_Reg);
  jccb(Assembler::notZero, SpinLoop);

  bind(SpinExit);
  jmp(retryLabel);
  bind(doneRetry);
  incrementl(retry_count_Reg); // clear z flag
}

// Use RTM for normal stack locks
// Input: objReg (object to lock)
void C2_MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
                                          Register retry_on_abort_count_Reg,
                                          RTMLockingCounters* stack_rtm_counters,
                                          Metadata* method_data, bool profile_rtm,
                                          Label& DONE_LABEL, Label& IsInflated) {
  assert(UseRTMForStackLocks, "why call this otherwise?");
  assert(tmpReg == rax, "");
  assert(scrReg == rdx, "");
  Label L_rtm_retry, L_decrement_retry, L_on_abort;

  if (RTMRetryCount > 0) {
    movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
    bind(L_rtm_retry);
  }
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
  testptr(tmpReg, markWord::monitor_value);  // inflated vs stack-locked|neutral
  jcc(Assembler::notZero, IsInflated);

  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    Label L_noincrement;
    if (RTMTotalCountIncrRate > 1) {
      // tmpReg, scrReg and flags are killed
      branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
    }
    assert(stack_rtm_counters != nullptr, "should not be null when profiling RTM");
    atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
    bind(L_noincrement);
  }
  xbegin(L_on_abort);
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));  // fetch markword
  andptr(tmpReg, markWord::lock_mask_in_place);                      // look at 2 lock bits
  cmpptr(tmpReg, markWord::unlocked_value);                          // bits = 01 unlocked
  jcc(Assembler::equal, DONE_LABEL);                                 // all done if unlocked

  Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
  if (UseRTMXendForLockBusy) {
    xend();
    movptr(abort_status_Reg, 0x2);  // Set the abort status to 2 (so we can retry)
    jmp(L_decrement_retry);
  }
  else {
    xabort(0);
  }
  bind(L_on_abort);
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
  }
  bind(L_decrement_retry);
  if (RTMRetryCount > 0) {
    // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
  }
}

// Use RTM for inflating locks
//  inputs: objReg (object to lock)
//          boxReg (on-stack box address (displaced header location) - KILLED)
//          tmpReg (ObjectMonitor address + markWord::monitor_value)
void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
                                             Register scrReg, Register retry_on_busy_count_Reg,
                                             Register retry_on_abort_count_Reg,
                                             RTMLockingCounters* rtm_counters,
                                             Metadata* method_data, bool profile_rtm,
                                             Label& DONE_LABEL) {
  assert(UseRTMLocking, "why call this otherwise?");
  assert(tmpReg == rax, "");
  assert(scrReg == rdx, "");
  Label L_rtm_retry, L_decrement_retry, L_on_abort;
  int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);

  movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
  movptr(boxReg, tmpReg); // Save ObjectMonitor address

  if (RTMRetryCount > 0) {
    movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
    movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
    bind(L_rtm_retry);
  }
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    Label L_noincrement;
    if (RTMTotalCountIncrRate > 1) {
      // tmpReg, scrReg and flags are killed
      branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
    }
    assert(rtm_counters != nullptr, "should not be null when profiling RTM");
    atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
    bind(L_noincrement);
  }
  xbegin(L_on_abort);
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
  movptr(tmpReg, Address(tmpReg, owner_offset));
  testptr(tmpReg, tmpReg);
  jcc(Assembler::zero, DONE_LABEL);
  if (UseRTMXendForLockBusy) {
    xend();
    jmp(L_decrement_retry);
  }
  else {
    xabort(0);
  }
  bind(L_on_abort);
  Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
  }
  if (RTMRetryCount > 0) {
    // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
  }

  movptr(tmpReg, Address(boxReg, owner_offset));
  testptr(tmpReg, tmpReg);
  jccb(Assembler::notZero, L_decrement_retry);

  // Appears unlocked - try to swing _owner from null to non-null.
  // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
#ifdef _LP64
  Register threadReg = r15_thread;
#else
  get_thread(scrReg);
  Register threadReg = scrReg;
#endif
  lock();
  cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg

  if (RTMRetryCount > 0) {
    // success done else retry
    jccb(Assembler::equal, DONE_LABEL);
    bind(L_decrement_retry);
    // Spin and retry if lock is busy.
    rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
  }
  else {
    bind(L_decrement_retry);
  }
}

#endif // INCLUDE_RTM_OPT

// fast_lock and fast_unlock used by C2

// Because the transitions from emitted code to the runtime
// monitorenter/exit helper stubs are so slow it's critical that
// we inline both the stack-locking fast path and the inflated fast path.
//
// See also: cmpFastLock and cmpFastUnlock.
//
// What follows is a specialized inline transliteration of the code
// in enter() and exit(). If we're concerned about I$ bloat another
// option would be to emit TrySlowEnter and TrySlowExit methods
// at startup-time. These methods would accept arguments as
// (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
// indications in the icc.ZFlag. fast_lock and fast_unlock would simply
// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
// In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
//
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods. That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
// to park() or unpark() threads. We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) explicit barriers or fence operations.
//
// TODO:
//
// * Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
//   This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
//   Given TLAB allocation, Self is usually manifested in a register, so passing it into
//   the lock operators would typically be faster than reifying Self.
//
// * Ideally I'd define the primitives as:
//     fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
//     fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
//   Unfortunately ADLC bugs prevent us from expressing the ideal form.
//   Instead, we're stuck with the rather awkward and brittle register assignments below.
//   Furthermore the register assignments are overconstrained, possibly resulting in
//   sub-optimal code near the synchronization site.
//
// * Eliminate the sp-proximity tests and just use "== Self" tests instead.
//   Alternately, use a better sp-proximity test.
//
// * Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
//   Either one is sufficient to uniquely identify a thread.
//   TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
//
// * Intrinsify notify() and notifyAll() for the common cases where the
//   object is locked by the calling thread but the waitlist is empty.
//   Avoid the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
//
// * Use jccb and jmpb instead of jcc and jmp to improve code density.
//   But beware of excessive branch density on AMD Opterons.
//
// * Both fast_lock and fast_unlock set the ICC.ZF to indicate success
//   or failure of the fast path. If the fast path fails then we pass
//   control to the slow path, typically in C. In fast_lock and
//   fast_unlock we often branch to DONE_LABEL, just to find that C2
//   will emit a conditional branch immediately after the node.
//   So we have branches to branches and lots of ICC.ZF games.
//   Instead, it might be better to have C2 pass a "FailureLabel"
//   into fast_lock and fast_unlock. In the case of success, control
//   will drop through the node. ICC.ZF is undefined at exit.
//   In the case of failure, the node will branch directly to the
//   FailureLabel.


// obj: object to lock
// box: on-stack box address (displaced header location) - KILLED
// rax: tmp -- KILLED
// scr: tmp -- KILLED
void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
                                  Register scrReg, Register cx1Reg, Register cx2Reg, Register thread,
                                  RTMLockingCounters* rtm_counters,
                                  RTMLockingCounters* stack_rtm_counters,
                                  Metadata* method_data,
                                  bool use_rtm, bool profile_rtm) {
  // Ensure the register assignments are disjoint
  assert(tmpReg == rax, "");

  if (use_rtm) {
    assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
  } else {
    assert(cx1Reg == noreg, "");
    assert(cx2Reg == noreg, "");
    assert_different_registers(objReg, boxReg, tmpReg, scrReg);
  }

  // Possible cases that we'll encounter in fast_lock
  // ------------------------------------------------
  // * Inflated
  //    -- unlocked
  //    -- Locked
  //       = by self
  //       = by other
  // * neutral
  // * stack-locked
  //    -- by self
  //       = sp-proximity test hits
  //       = sp-proximity test generates false-negative
  //    -- by other
  //

  Label IsInflated, DONE_LABEL, NO_COUNT, COUNT;

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(tmpReg, objReg, scrReg);
    movl(tmpReg, Address(tmpReg, Klass::access_flags_offset()));
    testl(tmpReg, JVM_ACC_IS_VALUE_BASED_CLASS);
    jcc(Assembler::notZero, DONE_LABEL);
  }

#if INCLUDE_RTM_OPT
  if (UseRTMForStackLocks && use_rtm) {
    assert(LockingMode != LM_MONITOR, "LockingMode == 0 (LM_MONITOR) and +UseRTMForStackLocks are mutually exclusive");
    rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
                      stack_rtm_counters, method_data, profile_rtm,
                      DONE_LABEL, IsInflated);
  }
#endif // INCLUDE_RTM_OPT

  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));  // [FETCH]
  testptr(tmpReg, markWord::monitor_value);  // inflated vs stack-locked|neutral
  jcc(Assembler::notZero, IsInflated);

  if (LockingMode == LM_MONITOR) {
    // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
    testptr(objReg, objReg);
  } else if (LockingMode == LM_LEGACY) {
    // Attempt stack-locking ...
    orptr (tmpReg, markWord::unlocked_value);
    movptr(Address(boxReg, 0), tmpReg);  // Anticipate successful CAS
    lock();
    cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));  // Updates tmpReg
    jcc(Assembler::equal, COUNT);        // Success

    // Recursive locking.
    // The object is stack-locked: markword contains stack pointer to BasicLock.
    // Locked by current thread if difference with current SP is less than one page.
    subptr(tmpReg, rsp);
    // The next instruction sets ZFlag == 1 (Success) if the difference is less than one page.
    andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - (int)os::vm_page_size())) );
    movptr(Address(boxReg, 0), tmpReg);
  } else {
    assert(LockingMode == LM_LIGHTWEIGHT, "");
    lightweight_lock(objReg, tmpReg, thread, scrReg, NO_COUNT);
    jmp(COUNT);
  }
  jmp(DONE_LABEL);

  bind(IsInflated);
  // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value

#if INCLUDE_RTM_OPT
  // Use the same RTM locking code in 32- and 64-bit VM.
  if (use_rtm) {
    rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
                         rtm_counters, method_data, profile_rtm, DONE_LABEL);
  } else {
#endif // INCLUDE_RTM_OPT

#ifndef _LP64
  // The object is inflated.

  // boxReg refers to the on-stack BasicLock in the current frame.
  // We'd like to write:
  //   set box->_displaced_header = markWord::unused_mark(). Any non-0 value suffices.
  // This is convenient but results in a ST-before-CAS penalty. The following CAS suffers
  // additional latency as we have another ST in the store buffer that must drain.

  // avoid ST-before-CAS
  // register juggle because we need tmpReg for cmpxchgptr below
  movptr(scrReg, boxReg);
  movptr(boxReg, tmpReg);  // consider: LEA box, [tmp-2]

  // Optimistic form: consider XORL tmpReg,tmpReg
  movptr(tmpReg, NULL_WORD);

  // Appears unlocked - try to swing _owner from null to non-null.
  // Ideally, I'd manifest "Self" with get_thread and then attempt
  // to CAS the register containing Self into m->Owner.
  // But we don't have enough registers, so instead we can either try to CAS
  // rsp or the address of the box (in scr) into &m->owner. If the CAS succeeds
  // we later store "Self" into m->Owner. Transiently storing a stack address
  // (rsp or the address of the box) into m->owner is harmless.
  // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
  lock();
  cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  movptr(Address(scrReg, 0), 3);  // box->_displaced_header = 3
  // If we weren't able to swing _owner from null to the BasicLock
  // then take the slow path.
  jccb  (Assembler::notZero, NO_COUNT);
  // update _owner from BasicLock to thread
  get_thread (scrReg);            // beware: clobbers ICCs
  movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
  xorptr(boxReg, boxReg);         // set icc.ZFlag = 1 to indicate success

  // If the CAS fails we can either retry or pass control to the slow path.
  // We use the latter tactic.
  // Pass the CAS result in the icc.ZFlag into DONE_LABEL
  // If the CAS was successful ...
  //   Self has acquired the lock
  //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
  // Intentional fall-through into DONE_LABEL ...
#else // _LP64
  // It's inflated and we use scrReg for ObjectMonitor* in this section.
  movq(scrReg, tmpReg);
  xorq(tmpReg, tmpReg);
  lock();
  cmpxchgptr(thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // Unconditionally set box->_displaced_header = markWord::unused_mark().
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
  // Propagate ICC.ZF from CAS above into DONE_LABEL.
  jccb(Assembler::equal, COUNT);        // CAS above succeeded; propagate ZF = 1 (success)

  cmpptr(thread, rax);                  // Check if we are already the owner (recursive lock)
  jccb(Assembler::notEqual, NO_COUNT);  // If not recursive, ZF = 0 at this point (fail)
  incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  xorq(rax, rax);                       // Set ZF = 1 (success) for recursive lock, denoting locking success
#endif // _LP64
#if INCLUDE_RTM_OPT
  } // use_rtm()
#endif
  bind(DONE_LABEL);

  // ZFlag == 1 count in fast path
  // ZFlag == 0 count in slow path
  jccb(Assembler::notZero, NO_COUNT); // jump if ZFlag == 0

  bind(COUNT);
  // Count monitors in fast path
  increment(Address(thread, JavaThread::held_monitor_count_offset()));

  xorl(tmpReg, tmpReg); // Set ZF == 1

  bind(NO_COUNT);

  // At NO_COUNT the icc ZFlag is set as follows ...
  // fast_unlock uses the same protocol.
  // ZFlag == 1 -> Success
  // ZFlag == 0 -> Failure - force control through the slow path
}

// obj: object to unlock
// box: box address (displaced header location), killed. Must be EAX.
// tmp: killed, cannot be obj nor box.
//
// Some commentary on balanced locking:
//
// fast_lock and fast_unlock are emitted only for provably balanced lock sites.
// Methods that don't have provably balanced locking are forced to run in the
// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
// The interpreter provides two properties:
// I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired by the current activation (frame). Recall that the
//      interpreter maintains an on-stack list of locks currently held by
//      a frame.
// I2:  If a method attempts to unlock an object that is not held by the
//      frame the interpreter throws IMSX.
//
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
// B() doesn't have provably balanced locking so it runs in the interpreter.
// Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O
// is still locked by A().
//
// The only other source of unbalanced locking would be JNI. The "Java Native Interface:
// Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
// should not be unlocked by "normal" java-level locking and vice-versa. The specification
// doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably given that the spec legislates the JNI case as undefined our implementation
// could reasonably *avoid* checking owner in fast_unlock().
// In the interest of performance we elide m->Owner==Self check in unlock.
// A perfectly viable alternative is to elide the owner check except when
// Xcheck:jni is enabled.
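//
// Illustrative sketch only (an assumption about how the surrounding pieces fit,
// not emitted code): the cmpFastLock/cmpFastUnlock nodes that wrap these helpers
// conceptually consume the ZF protocol described above, roughly as
//   fast_unlock(obj, rax /* box */, tmp);  // leaves ZF == 1 on success, ZF == 0 on failure
//   jne  slow_path_runtime_call;           // ZF == 0 -> fall back to the runtime exit stub
//   ...                                    // ZF == 1 -> unlocked inline, keep going
// so nothing may clobber the condition codes between the node and the branch
// that tests them.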

void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
  assert(boxReg == rax, "");
  assert_different_registers(objReg, boxReg, tmpReg);

  Label DONE_LABEL, Stacked, COUNT, NO_COUNT;

#if INCLUDE_RTM_OPT
  if (UseRTMForStackLocks && use_rtm) {
    assert(LockingMode != LM_MONITOR, "LockingMode == 0 (LM_MONITOR) and +UseRTMForStackLocks are mutually exclusive");
    Label L_regular_unlock;
    movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));  // fetch markword
    andptr(tmpReg, markWord::lock_mask_in_place);                      // look at 2 lock bits
    cmpptr(tmpReg, markWord::unlocked_value);                          // bits = 01 unlocked
    jccb(Assembler::notEqual, L_regular_unlock);                       // if !HLE RegularLock
    xend();                                                            // otherwise end...
    jmp(DONE_LABEL);                                                   // ... and we're done
    bind(L_regular_unlock);
  }
#endif

  if (LockingMode == LM_LEGACY) {
    cmpptr(Address(boxReg, 0), NULL_WORD);  // Examine the displaced header
    jcc   (Assembler::zero, COUNT);         // 0 indicates recursive stack-lock
  }
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));  // Examine the object's markword
  if (LockingMode != LM_MONITOR) {
    testptr(tmpReg, markWord::monitor_value);                        // Inflated?
    jcc(Assembler::zero, Stacked);
  }

  // It's inflated.
  if (LockingMode == LM_LIGHTWEIGHT) {
    // If the owner is ANONYMOUS, we need to fix it - in an outline stub.
    testb(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t) ObjectMonitor::ANONYMOUS_OWNER);
#ifdef _LP64
    if (!Compile::current()->output()->in_scratch_emit_size()) {
      C2HandleAnonOMOwnerStub* stub = new (Compile::current()->comp_arena()) C2HandleAnonOMOwnerStub(tmpReg, boxReg);
      Compile::current()->output()->add_stub(stub);
      jcc(Assembler::notEqual, stub->entry());
      bind(stub->continuation());
    } else
#endif
    {
      // We can't easily implement this optimization on 32 bit because we don't have a thread register.
      // Call the slow-path instead.
      jcc(Assembler::notEqual, NO_COUNT);
    }
  }

#if INCLUDE_RTM_OPT
  if (use_rtm) {
    Label L_regular_inflated_unlock;
    int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
    movptr(boxReg, Address(tmpReg, owner_offset));
    testptr(boxReg, boxReg);
    jccb(Assembler::notZero, L_regular_inflated_unlock);
    xend();
    jmp(DONE_LABEL);
    bind(L_regular_inflated_unlock);
  }
#endif

  // Despite our balanced locking property we still check that m->_owner == Self
  // as java routines or native JNI code called by this thread might
  // have released the lock.
  // Refer to the comments in synchronizer.cpp for how we might encode extra
  // state in _succ so we can avoid fetching EntryList|cxq.
  //
  // If there's no contention try a 1-0 exit. That is, exit without
  // a costly MEMBAR or CAS. See synchronizer.cpp for details on how
  // we detect and recover from the race that the 1-0 exit admits.
  //
  // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
  // before it STs null into _owner, releasing the lock. Updates
  // to data protected by the critical section must be visible before
  // we drop the lock (and thus before any other thread could acquire
  // the lock and observe the fields protected by the lock).
  // IA32's memory-model is SPO, so STs are ordered with respect to
  // each other and there's no need for an explicit barrier (fence).
  // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
#ifndef _LP64
  // Note that we could employ various encoding schemes to reduce
  // the number of loads below (currently 4) to just 2 or 3.
  // Refer to the comments in synchronizer.cpp.
  // In practice the chain of fetches doesn't seem to impact performance, however.
  xorptr(boxReg, boxReg);
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jccb  (Assembler::notZero, DONE_LABEL);
  movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
  jccb  (Assembler::notZero, DONE_LABEL);
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
  jmpb  (DONE_LABEL);
#else // _LP64
  // It's inflated
  Label CheckSucc, LNotRecursive, LSuccess, LGoSlowPath;

  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
  jccb(Assembler::equal, LNotRecursive);

  // Recursive inflated unlock
  decq(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jmpb(LSuccess);

  bind(LNotRecursive);
  movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
  jccb  (Assembler::notZero, CheckSucc);
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
  jmpb  (DONE_LABEL);

  // Try to avoid passing control into the slow_path ...
  bind  (CheckSucc);

  // The following optional optimization can be elided if necessary
  // Effectively: if (succ == null) goto slow path
  // The code reduces the window for a race, however,
  // and thus benefits performance.
  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
  jccb  (Assembler::zero, LGoSlowPath);

  xorptr(boxReg, boxReg);
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);

  // Memory barrier/fence
  // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
  // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
  // This is faster on Nehalem and AMD Shanghai/Barcelona.
  // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
  // We might also restructure (ST Owner=0;barrier;LD _Succ) to
  // (mov box,0; xchgq box, &m->Owner; LD _succ) .
  lock(); addl(Address(rsp, 0), 0);

  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
  jccb  (Assembler::notZero, LSuccess);

  // Rare inopportune interleaving - race.
  // The successor vanished in the small window above.
  // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
  // We need to ensure progress and succession.
  // Try to reacquire the lock.
  // If that fails then the new owner is responsible for succession and this
  // thread needs to take no further action and can exit via the fast path (success).
  // If the re-acquire succeeds then pass control into the slow path.
  // As implemented, this latter mode is horrible because we generate more
  // coherence traffic on the lock *and* artificially extend the critical section
  // length by virtue of passing control into the slow path.

  // box is really RAX -- the following CMPXCHG depends on that binding
  // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
  lock();
  cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // There's no successor so we tried to regrab the lock.
  // If that didn't work, then another thread grabbed the
  // lock so we're done (and exit was a success).
  jccb  (Assembler::notEqual, LSuccess);
  // Intentional fall-through into slow path

  bind  (LGoSlowPath);
  orl   (boxReg, 1);   // set ICC.ZF=0 to indicate failure
  jmpb  (DONE_LABEL);

  bind  (LSuccess);
  testl (boxReg, 0);   // set ICC.ZF=1 to indicate success
  jmpb  (DONE_LABEL);

#endif
  if (LockingMode != LM_MONITOR) {
    bind  (Stacked);
    if (LockingMode == LM_LIGHTWEIGHT) {
      mov(boxReg, tmpReg);
      lightweight_unlock(objReg, boxReg, tmpReg, NO_COUNT);
      jmp(COUNT);
    } else if (LockingMode == LM_LEGACY) {
      movptr(tmpReg, Address (boxReg, 0));  // re-fetch
      lock();
      cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));  // Uses RAX which is box
    }
    // Intentional fall-thru into DONE_LABEL
  }
  bind(DONE_LABEL);

  // ZFlag == 1 count in fast path
  // ZFlag == 0 count in slow path
  jccb(Assembler::notZero, NO_COUNT);

  bind(COUNT);
  // Count monitors in fast path
#ifndef _LP64
  get_thread(tmpReg);
  decrementl(Address(tmpReg, JavaThread::held_monitor_count_offset()));
#else // _LP64
  decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
#endif

  xorl(tmpReg, tmpReg); // Set ZF == 1

  bind(NO_COUNT);
}

//-------------------------------------------------------------------------------------------
// Generic instructions support for use in .ad files C2 code generation

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVD) {
    andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
  } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegD");
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
  }
}

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
  if (opcode == Op_AbsVD) {
    vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
  } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegD");
    vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVF) {
    andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
  } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegF");
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
  if (opcode == Op_AbsVF) {
    vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
  } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegF");
    vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
  }
}

void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
  assert(tmp == xnoreg || elem_bt == T_LONG, "unused");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      pminsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pminsw(dst, src);
    } else if (elem_bt == T_INT) {
      pminsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      movdqu(xmm0, dst);
      pcmpgtq(xmm0, src);
      blendvpd(dst, src);  // xmm0 as mask
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      pmaxsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pmaxsw(dst, src);
    } else if (elem_bt == T_INT) {
      pmaxsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      movdqu(xmm0, src);
      pcmpgtq(xmm0, dst);
      blendvpd(dst, src);  // xmm0 as mask
    }
  }
}

void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
                                 XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                 int vlen_enc) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      vpminsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpminsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpminsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpminsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src1, src2, dst, vlen_enc);
      }
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      vpmaxsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpmaxsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpmaxsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpmaxsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src2, src1, dst, vlen_enc);
      }
    }
  }
}

// Float/Double min max

void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
                                   XMMRegister dst, XMMRegister a, XMMRegister b,
                                   XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                   int vlen_enc) {
  assert(UseAVX > 0, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(a, b, tmp, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);

  if (!is_double_word && is_min) {
    vblendvps(atmp, a, b, a, vlen_enc);
    vblendvps(btmp, b, a, a, vlen_enc);
    vminps(tmp, atmp, btmp, vlen_enc);
    vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvps(dst, tmp, atmp, btmp, vlen_enc);
  } else if (!is_double_word && !is_min) {
    vblendvps(btmp, b, a, b, vlen_enc);
    vblendvps(atmp, a, b, b, vlen_enc);
    vmaxps(tmp, atmp, btmp, vlen_enc);
    vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvps(dst, tmp, atmp, btmp, vlen_enc);
  } else if (is_double_word && is_min) {
    vblendvpd(atmp, a, b, a, vlen_enc);
    vblendvpd(btmp, b, a, a, vlen_enc);
    vminpd(tmp, atmp, btmp, vlen_enc);
    vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
  } else {
    assert(is_double_word && !is_min, "sanity");
    vblendvpd(btmp, b, a, b, vlen_enc);
    vblendvpd(atmp, a, b, b, vlen_enc);
    vmaxpd(tmp, atmp, btmp, vlen_enc);
    vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
  }
}

void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
                                    XMMRegister dst, XMMRegister a, XMMRegister b,
                                    KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
                                    int vlen_enc) {
  assert(UseAVX > 2, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(dst, a, b, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);
  bool merge = true;

  if (!is_double_word && is_min) {
    evpmovd2m(ktmp, a, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vminps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (!is_double_word && !is_min) {
    evpmovd2m(ktmp, b, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (is_double_word && is_min) {
    evpmovq2m(ktmp, a, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vminpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  } else {
    assert(is_double_word && !is_min, "sanity");
    evpmovq2m(ktmp, b, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  }
}

// Float/Double signum
void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
  assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");

  Label DONE_LABEL;

  if (opcode == Op_SignumF) {
    assert(UseSSE > 0, "required");
    ucomiss(dst, zero);
    jcc(Assembler::equal, DONE_LABEL);   // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
    jcc(Assembler::parity, DONE_LABEL);  // handle special case NaN, if argument NaN, return NaN
    movflt(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
  } else if (opcode == Op_SignumD) {
    assert(UseSSE > 1, "required");
    ucomisd(dst, zero);
    jcc(Assembler::equal, DONE_LABEL);   // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
    jcc(Assembler::parity, DONE_LABEL);  // handle special case NaN, if argument NaN, return NaN
    movdbl(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
  }

  bind(DONE_LABEL);
}

void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
  if (sign) {
    pmovsxbw(dst, src);
  } else {
    pmovzxbw(dst, src);
  }
}

void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbw(dst, src, vector_len);
  } else {
    vpmovzxbw(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbd(dst, src, vector_len);
  } else {
    vpmovzxbd(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxwd(dst, src, vector_len);
  } else {
    vpmovzxwd(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
                                     int shift, int vector_len) {
  if (opcode == Op_RotateLeftV) {
    if (etype == T_INT) {
      evprold(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprolq(dst, src, shift, vector_len);
    }
  } else {
    assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
    if (etype == T_INT) {
      evprord(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprorq(dst, src, shift, vector_len);
    }
  }
}

void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
                                     XMMRegister shift, int vector_len) {
  if (opcode == Op_RotateLeftV) {
    if (etype == T_INT) {
      evprolvd(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprolvq(dst, src, shift, vector_len);
    }
  } else {
    assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
    if (etype == T_INT) {
      evprorvd(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprorvq(dst, src, shift, vector_len);
    }
  }
}

void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
  if (opcode == Op_RShiftVI) {
    psrad(dst, shift);
  } else if (opcode == Op_LShiftVI) {
    pslld(dst, shift);
  } else {
    assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
    psrld(dst, shift);
  }
}

void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVI:  psrad(dst, shift); break;
    case Op_LShiftVI:  pslld(dst, shift); break;
    case Op_URShiftVI: psrld(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
  if (opcode == Op_RShiftVI) {
    vpsrad(dst, nds, shift, vector_len);
  } else if (opcode == Op_LShiftVI) {
    vpslld(dst, nds, shift, vector_len);
  } else {
    assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
    vpsrld(dst, nds, shift, vector_len);
  }
}

void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
    case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
    case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  psraw(dst, shift); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  psllw(dst, shift); break;

    case Op_URShiftVS: // fall-through
    case Op_URShiftVB: psrlw(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;

    case Op_URShiftVS: // fall-through
    case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
    case Op_LShiftVL:  psllq(dst, shift); break;
    case Op_URShiftVL: psrlq(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
  if (opcode == Op_RShiftVL) {
    psrlq(dst, shift);  // using srl to implement sra on pre-avx512 systems
  } else if (opcode == Op_LShiftVL) {
    psllq(dst, shift);
  } else {
    assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
    psrlq(dst, shift);
  }
}

void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVL:  evpsraq(dst, src, shift, vlen_enc); break;
    case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
    case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
  if (opcode == Op_RShiftVL) {
    evpsraq(dst, nds, shift, vector_len);
  } else if (opcode == Op_LShiftVL) {
    vpsllq(dst, nds, shift, vector_len);
  } else {
    assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
    vpsrlq(dst, nds, shift, vector_len);
  }
}

void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  // fall-through
    case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  // fall-through
    case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;

    case Op_URShiftVB: // fall-through
    case Op_URShiftVS: // fall-through
    case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;

    case Op_URShiftVB: // fall-through
    case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
  assert(UseAVX >= 2, "required");
  switch (opcode) {
    case Op_RShiftVL: {
      if (UseAVX > 2) {
        assert(tmp == xnoreg, "not used");
        if (!VM_Version::supports_avx512vl()) {
          vlen_enc = Assembler::AVX_512bit;
        }
        evpsravq(dst, src, shift, vlen_enc);
      } else {
        vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
        vpsrlvq(dst, src, shift, vlen_enc);
        vpsrlvq(tmp, tmp, shift, vlen_enc);
        vpxor(dst, dst, tmp, vlen_enc);
        vpsubq(dst, dst, tmp, vlen_enc);
      }
      break;
    }
    case Op_LShiftVL: {
      assert(tmp == xnoreg, "not used");
      vpsllvq(dst, src, shift, vlen_enc);
      break;
    }
    case Op_URShiftVL: {
      assert(tmp == xnoreg, "not used");
      vpsrlvq(dst, src, shift, vlen_enc);
      break;
    }
    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

// Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst
void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
  assert(opcode == Op_LShiftVB ||
         opcode == Op_RShiftVB ||
         opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
  bool sign = (opcode != Op_URShiftVB);
  assert(vector_len == 0, "required");
  vextendbd(sign, dst, src, 1);
  vpmovzxbd(vtmp, shift, 1);
  varshiftd(opcode, dst, dst, vtmp, 1);
  vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg);
  vextracti128_high(vtmp, dst);
  vpackusdw(dst, dst, vtmp, 0);
}

// Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst
void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) {
  assert(opcode == Op_LShiftVB ||
         opcode == Op_RShiftVB ||
         opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
  bool sign = (opcode != Op_URShiftVB);
  int ext_vector_len = vector_len + 1;
  vextendbw(sign, dst, src, ext_vector_len);
  vpmovzxbw(vtmp, shift, ext_vector_len);
  varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
  vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg);
noreg); 1475 if (vector_len == 0) { 1476 vextracti128_high(vtmp, dst); 1477 vpackuswb(dst, dst, vtmp, vector_len); 1478 } else { 1479 vextracti64x4_high(vtmp, dst); 1480 vpackuswb(dst, dst, vtmp, vector_len); 1481 vpermq(dst, dst, 0xD8, vector_len); 1482 } 1483 } 1484 1485 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) { 1486 switch(typ) { 1487 case T_BYTE: 1488 pinsrb(dst, val, idx); 1489 break; 1490 case T_SHORT: 1491 pinsrw(dst, val, idx); 1492 break; 1493 case T_INT: 1494 pinsrd(dst, val, idx); 1495 break; 1496 case T_LONG: 1497 pinsrq(dst, val, idx); 1498 break; 1499 default: 1500 assert(false,"Should not reach here."); 1501 break; 1502 } 1503 } 1504 1505 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) { 1506 switch(typ) { 1507 case T_BYTE: 1508 vpinsrb(dst, src, val, idx); 1509 break; 1510 case T_SHORT: 1511 vpinsrw(dst, src, val, idx); 1512 break; 1513 case T_INT: 1514 vpinsrd(dst, src, val, idx); 1515 break; 1516 case T_LONG: 1517 vpinsrq(dst, src, val, idx); 1518 break; 1519 default: 1520 assert(false,"Should not reach here."); 1521 break; 1522 } 1523 } 1524 1525 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) { 1526 switch(typ) { 1527 case T_INT: 1528 vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len); 1529 break; 1530 case T_FLOAT: 1531 vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len); 1532 break; 1533 case T_LONG: 1534 vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len); 1535 break; 1536 case T_DOUBLE: 1537 vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len); 1538 break; 1539 default: 1540 assert(false,"Should not reach here."); 1541 break; 1542 } 1543 } 1544 1545 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) { 1546 switch(typ) { 1547 case T_INT: 1548 evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len); 1549 break; 1550 case T_FLOAT: 1551 evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len); 1552 break; 1553 case T_LONG: 1554 evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len); 1555 break; 1556 case T_DOUBLE: 1557 evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len); 1558 break; 1559 default: 1560 assert(false,"Should not reach here."); 1561 break; 1562 } 1563 } 1564 1565 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) { 1566 switch(typ) { 1567 case T_INT: 1568 evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len); 1569 break; 1570 case T_FLOAT: 1571 evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len); 1572 break; 1573 case T_LONG: 1574 evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len); 1575 break; 1576 case T_DOUBLE: 1577 evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len); 1578 break; 1579 default: 1580 assert(false,"Should not reach here."); 1581 break; 1582 } 1583 } 1584 1585 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) { 1586 if (vlen_in_bytes <= 16) { 1587 pxor (dst, dst); 1588 psubb(dst, src); 1589 switch (elem_bt) { 1590 case T_BYTE: /* nothing to do */ break; 1591 case T_SHORT: pmovsxbw(dst, dst); 
break; 1592 case T_INT: pmovsxbd(dst, dst); break; 1593 case T_FLOAT: pmovsxbd(dst, dst); break; 1594 case T_LONG: pmovsxbq(dst, dst); break; 1595 case T_DOUBLE: pmovsxbq(dst, dst); break; 1596 1597 default: assert(false, "%s", type2name(elem_bt)); 1598 } 1599 } else { 1600 assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, ""); 1601 int vlen_enc = vector_length_encoding(vlen_in_bytes); 1602 1603 vpxor (dst, dst, dst, vlen_enc); 1604 vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc); 1605 1606 switch (elem_bt) { 1607 case T_BYTE: /* nothing to do */ break; 1608 case T_SHORT: vpmovsxbw(dst, dst, vlen_enc); break; 1609 case T_INT: vpmovsxbd(dst, dst, vlen_enc); break; 1610 case T_FLOAT: vpmovsxbd(dst, dst, vlen_enc); break; 1611 case T_LONG: vpmovsxbq(dst, dst, vlen_enc); break; 1612 case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break; 1613 1614 default: assert(false, "%s", type2name(elem_bt)); 1615 } 1616 } 1617 } 1618 1619 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) { 1620 if (novlbwdq) { 1621 vpmovsxbd(xtmp, src, vlen_enc); 1622 evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()), 1623 Assembler::eq, true, vlen_enc, noreg); 1624 } else { 1625 vpxor(xtmp, xtmp, xtmp, vlen_enc); 1626 vpsubb(xtmp, xtmp, src, vlen_enc); 1627 evpmovb2m(dst, xtmp, vlen_enc); 1628 } 1629 } 1630 1631 void C2_MacroAssembler::load_vector(XMMRegister dst, Address src, int vlen_in_bytes) { 1632 switch (vlen_in_bytes) { 1633 case 4: movdl(dst, src); break; 1634 case 8: movq(dst, src); break; 1635 case 16: movdqu(dst, src); break; 1636 case 32: vmovdqu(dst, src); break; 1637 case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break; 1638 default: ShouldNotReachHere(); 1639 } 1640 } 1641 1642 void C2_MacroAssembler::load_vector(XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) { 1643 assert(rscratch != noreg || always_reachable(src), "missing"); 1644 1645 if (reachable(src)) { 1646 load_vector(dst, as_Address(src), vlen_in_bytes); 1647 } else { 1648 lea(rscratch, src); 1649 load_vector(dst, Address(rscratch, 0), vlen_in_bytes); 1650 } 1651 } 1652 1653 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) { 1654 int vlen_enc = vector_length_encoding(vlen); 1655 if (VM_Version::supports_avx()) { 1656 if (bt == T_LONG) { 1657 if (VM_Version::supports_avx2()) { 1658 vpbroadcastq(dst, src, vlen_enc); 1659 } else { 1660 vmovddup(dst, src, vlen_enc); 1661 } 1662 } else if (bt == T_DOUBLE) { 1663 if (vlen_enc != Assembler::AVX_128bit) { 1664 vbroadcastsd(dst, src, vlen_enc, noreg); 1665 } else { 1666 vmovddup(dst, src, vlen_enc); 1667 } 1668 } else { 1669 if (VM_Version::supports_avx2() && is_integral_type(bt)) { 1670 vpbroadcastd(dst, src, vlen_enc); 1671 } else { 1672 vbroadcastss(dst, src, vlen_enc); 1673 } 1674 } 1675 } else if (VM_Version::supports_sse3()) { 1676 movddup(dst, src); 1677 } else { 1678 movq(dst, src); 1679 if (vlen == 16) { 1680 punpcklqdq(dst, dst); 1681 } 1682 } 1683 } 1684 1685 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) { 1686 // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64. 
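  // Concretely (derived from the offset computation below):
  //   T_BYTE -> 0, T_SHORT -> 64, T_INT -> 128, T_LONG -> 192,
  //   T_FLOAT -> 128 + 128 = 256, T_DOUBLE -> 192 + 128 = 320,
  // i.e. exact_log2(element size) selects the integral slot and the extra
  // 128 bytes steps from an integral slot to the equally sized floating-point one.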
1687 int offset = exact_log2(type2aelembytes(bt)) << 6; 1688 if (is_floating_point_type(bt)) { 1689 offset += 128; 1690 } 1691 ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset); 1692 load_vector(dst, addr, vlen_in_bytes); 1693 } 1694 1695 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles. 1696 1697 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) { 1698 int vector_len = Assembler::AVX_128bit; 1699 1700 switch (opcode) { 1701 case Op_AndReductionV: pand(dst, src); break; 1702 case Op_OrReductionV: por (dst, src); break; 1703 case Op_XorReductionV: pxor(dst, src); break; 1704 case Op_MinReductionV: 1705 switch (typ) { 1706 case T_BYTE: pminsb(dst, src); break; 1707 case T_SHORT: pminsw(dst, src); break; 1708 case T_INT: pminsd(dst, src); break; 1709 case T_LONG: assert(UseAVX > 2, "required"); 1710 vpminsq(dst, dst, src, Assembler::AVX_128bit); break; 1711 default: assert(false, "wrong type"); 1712 } 1713 break; 1714 case Op_MaxReductionV: 1715 switch (typ) { 1716 case T_BYTE: pmaxsb(dst, src); break; 1717 case T_SHORT: pmaxsw(dst, src); break; 1718 case T_INT: pmaxsd(dst, src); break; 1719 case T_LONG: assert(UseAVX > 2, "required"); 1720 vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break; 1721 default: assert(false, "wrong type"); 1722 } 1723 break; 1724 case Op_AddReductionVF: addss(dst, src); break; 1725 case Op_AddReductionVD: addsd(dst, src); break; 1726 case Op_AddReductionVI: 1727 switch (typ) { 1728 case T_BYTE: paddb(dst, src); break; 1729 case T_SHORT: paddw(dst, src); break; 1730 case T_INT: paddd(dst, src); break; 1731 default: assert(false, "wrong type"); 1732 } 1733 break; 1734 case Op_AddReductionVL: paddq(dst, src); break; 1735 case Op_MulReductionVF: mulss(dst, src); break; 1736 case Op_MulReductionVD: mulsd(dst, src); break; 1737 case Op_MulReductionVI: 1738 switch (typ) { 1739 case T_SHORT: pmullw(dst, src); break; 1740 case T_INT: pmulld(dst, src); break; 1741 default: assert(false, "wrong type"); 1742 } 1743 break; 1744 case Op_MulReductionVL: assert(UseAVX > 2, "required"); 1745 evpmullq(dst, dst, src, vector_len); break; 1746 default: assert(false, "wrong opcode"); 1747 } 1748 } 1749 1750 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 1751 int vector_len = Assembler::AVX_256bit; 1752 1753 switch (opcode) { 1754 case Op_AndReductionV: vpand(dst, src1, src2, vector_len); break; 1755 case Op_OrReductionV: vpor (dst, src1, src2, vector_len); break; 1756 case Op_XorReductionV: vpxor(dst, src1, src2, vector_len); break; 1757 case Op_MinReductionV: 1758 switch (typ) { 1759 case T_BYTE: vpminsb(dst, src1, src2, vector_len); break; 1760 case T_SHORT: vpminsw(dst, src1, src2, vector_len); break; 1761 case T_INT: vpminsd(dst, src1, src2, vector_len); break; 1762 case T_LONG: assert(UseAVX > 2, "required"); 1763 vpminsq(dst, src1, src2, vector_len); break; 1764 default: assert(false, "wrong type"); 1765 } 1766 break; 1767 case Op_MaxReductionV: 1768 switch (typ) { 1769 case T_BYTE: vpmaxsb(dst, src1, src2, vector_len); break; 1770 case T_SHORT: vpmaxsw(dst, src1, src2, vector_len); break; 1771 case T_INT: vpmaxsd(dst, src1, src2, vector_len); break; 1772 case T_LONG: assert(UseAVX > 2, "required"); 1773 vpmaxsq(dst, src1, src2, vector_len); break; 1774 default: assert(false, "wrong type"); 1775 } 1776 break; 1777 case Op_AddReductionVI: 1778 switch (typ) { 1779 case T_BYTE: vpaddb(dst, src1, 
src2, vector_len); break; 1780 case T_SHORT: vpaddw(dst, src1, src2, vector_len); break; 1781 case T_INT: vpaddd(dst, src1, src2, vector_len); break; 1782 default: assert(false, "wrong type"); 1783 } 1784 break; 1785 case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break; 1786 case Op_MulReductionVI: 1787 switch (typ) { 1788 case T_SHORT: vpmullw(dst, src1, src2, vector_len); break; 1789 case T_INT: vpmulld(dst, src1, src2, vector_len); break; 1790 default: assert(false, "wrong type"); 1791 } 1792 break; 1793 case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break; 1794 default: assert(false, "wrong opcode"); 1795 } 1796 } 1797 1798 void C2_MacroAssembler::reduce_fp(int opcode, int vlen, 1799 XMMRegister dst, XMMRegister src, 1800 XMMRegister vtmp1, XMMRegister vtmp2) { 1801 switch (opcode) { 1802 case Op_AddReductionVF: 1803 case Op_MulReductionVF: 1804 reduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 1805 break; 1806 1807 case Op_AddReductionVD: 1808 case Op_MulReductionVD: 1809 reduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 1810 break; 1811 1812 default: assert(false, "wrong opcode"); 1813 } 1814 } 1815 1816 void C2_MacroAssembler::reduceB(int opcode, int vlen, 1817 Register dst, Register src1, XMMRegister src2, 1818 XMMRegister vtmp1, XMMRegister vtmp2) { 1819 switch (vlen) { 1820 case 8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1821 case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1822 case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1823 case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1824 1825 default: assert(false, "wrong vector length"); 1826 } 1827 } 1828 1829 void C2_MacroAssembler::mulreduceB(int opcode, int vlen, 1830 Register dst, Register src1, XMMRegister src2, 1831 XMMRegister vtmp1, XMMRegister vtmp2) { 1832 switch (vlen) { 1833 case 8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1834 case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1835 case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1836 case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1837 1838 default: assert(false, "wrong vector length"); 1839 } 1840 } 1841 1842 void C2_MacroAssembler::reduceS(int opcode, int vlen, 1843 Register dst, Register src1, XMMRegister src2, 1844 XMMRegister vtmp1, XMMRegister vtmp2) { 1845 switch (vlen) { 1846 case 4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1847 case 8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1848 case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1849 case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1850 1851 default: assert(false, "wrong vector length"); 1852 } 1853 } 1854 1855 void C2_MacroAssembler::reduceI(int opcode, int vlen, 1856 Register dst, Register src1, XMMRegister src2, 1857 XMMRegister vtmp1, XMMRegister vtmp2) { 1858 switch (vlen) { 1859 case 2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1860 case 4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1861 case 8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1862 case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1863 1864 default: assert(false, "wrong vector length"); 1865 } 1866 } 1867 1868 #ifdef _LP64 1869 void C2_MacroAssembler::reduceL(int opcode, int vlen, 1870 Register dst, Register src1, XMMRegister src2, 1871 XMMRegister vtmp1, XMMRegister vtmp2) { 1872 switch (vlen) { 1873 case 2: reduce2L(opcode, dst, src1, src2, vtmp1, 
vtmp2); break; 1874 case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1875 case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1876 1877 default: assert(false, "wrong vector length"); 1878 } 1879 } 1880 #endif // _LP64 1881 1882 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 1883 switch (vlen) { 1884 case 2: 1885 assert(vtmp2 == xnoreg, ""); 1886 reduce2F(opcode, dst, src, vtmp1); 1887 break; 1888 case 4: 1889 assert(vtmp2 == xnoreg, ""); 1890 reduce4F(opcode, dst, src, vtmp1); 1891 break; 1892 case 8: 1893 reduce8F(opcode, dst, src, vtmp1, vtmp2); 1894 break; 1895 case 16: 1896 reduce16F(opcode, dst, src, vtmp1, vtmp2); 1897 break; 1898 default: assert(false, "wrong vector length"); 1899 } 1900 } 1901 1902 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 1903 switch (vlen) { 1904 case 2: 1905 assert(vtmp2 == xnoreg, ""); 1906 reduce2D(opcode, dst, src, vtmp1); 1907 break; 1908 case 4: 1909 reduce4D(opcode, dst, src, vtmp1, vtmp2); 1910 break; 1911 case 8: 1912 reduce8D(opcode, dst, src, vtmp1, vtmp2); 1913 break; 1914 default: assert(false, "wrong vector length"); 1915 } 1916 } 1917 1918 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1919 if (opcode == Op_AddReductionVI) { 1920 if (vtmp1 != src2) { 1921 movdqu(vtmp1, src2); 1922 } 1923 phaddd(vtmp1, vtmp1); 1924 } else { 1925 pshufd(vtmp1, src2, 0x1); 1926 reduce_operation_128(T_INT, opcode, vtmp1, src2); 1927 } 1928 movdl(vtmp2, src1); 1929 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 1930 movdl(dst, vtmp1); 1931 } 1932 1933 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1934 if (opcode == Op_AddReductionVI) { 1935 if (vtmp1 != src2) { 1936 movdqu(vtmp1, src2); 1937 } 1938 phaddd(vtmp1, src2); 1939 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1940 } else { 1941 pshufd(vtmp2, src2, 0xE); 1942 reduce_operation_128(T_INT, opcode, vtmp2, src2); 1943 reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1944 } 1945 } 1946 1947 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1948 if (opcode == Op_AddReductionVI) { 1949 vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit); 1950 vextracti128_high(vtmp2, vtmp1); 1951 vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit); 1952 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1953 } else { 1954 vextracti128_high(vtmp1, src2); 1955 reduce_operation_128(T_INT, opcode, vtmp1, src2); 1956 reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1957 } 1958 } 1959 1960 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1961 vextracti64x4_high(vtmp2, src2); 1962 reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2); 1963 reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1964 } 1965 1966 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1967 pshufd(vtmp2, src2, 0x1); 1968 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 1969 movdqu(vtmp1, vtmp2); 1970 psrldq(vtmp1, 2); 1971 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 1972 movdqu(vtmp2, vtmp1); 1973 psrldq(vtmp2, 1); 1974 reduce_operation_128(T_BYTE, 
opcode, vtmp1, vtmp2); 1975 movdl(vtmp2, src1); 1976 pmovsxbd(vtmp1, vtmp1); 1977 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 1978 pextrb(dst, vtmp1, 0x0); 1979 movsbl(dst, dst); 1980 } 1981 1982 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1983 pshufd(vtmp1, src2, 0xE); 1984 reduce_operation_128(T_BYTE, opcode, vtmp1, src2); 1985 reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1986 } 1987 1988 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1989 vextracti128_high(vtmp2, src2); 1990 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 1991 reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1992 } 1993 1994 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1995 vextracti64x4_high(vtmp1, src2); 1996 reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2); 1997 reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1998 } 1999 2000 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2001 pmovsxbw(vtmp2, src2); 2002 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2003 } 2004 2005 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2006 if (UseAVX > 1) { 2007 int vector_len = Assembler::AVX_256bit; 2008 vpmovsxbw(vtmp1, src2, vector_len); 2009 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2010 } else { 2011 pmovsxbw(vtmp2, src2); 2012 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2013 pshufd(vtmp2, src2, 0x1); 2014 pmovsxbw(vtmp2, src2); 2015 reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2016 } 2017 } 2018 2019 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2020 if (UseAVX > 2 && VM_Version::supports_avx512bw()) { 2021 int vector_len = Assembler::AVX_512bit; 2022 vpmovsxbw(vtmp1, src2, vector_len); 2023 reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2024 } else { 2025 assert(UseAVX >= 2,"Should not reach here."); 2026 mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); 2027 vextracti128_high(vtmp2, src2); 2028 mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2029 } 2030 } 2031 2032 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2033 mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); 2034 vextracti64x4_high(vtmp2, src2); 2035 mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2036 } 2037 2038 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2039 if (opcode == Op_AddReductionVI) { 2040 if (vtmp1 != src2) { 2041 movdqu(vtmp1, src2); 2042 } 2043 phaddw(vtmp1, vtmp1); 2044 phaddw(vtmp1, vtmp1); 2045 } else { 2046 pshufd(vtmp2, src2, 0x1); 2047 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2048 movdqu(vtmp1, vtmp2); 2049 psrldq(vtmp1, 2); 2050 reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2); 2051 } 2052 movdl(vtmp2, src1); 2053 pmovsxwd(vtmp1, vtmp1); 2054 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2055 pextrw(dst, vtmp1, 0x0); 2056 movswl(dst, dst); 2057 } 2058 2059 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, 
XMMRegister vtmp2) { 2060 if (opcode == Op_AddReductionVI) { 2061 if (vtmp1 != src2) { 2062 movdqu(vtmp1, src2); 2063 } 2064 phaddw(vtmp1, src2); 2065 } else { 2066 pshufd(vtmp1, src2, 0xE); 2067 reduce_operation_128(T_SHORT, opcode, vtmp1, src2); 2068 } 2069 reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2070 } 2071 2072 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2073 if (opcode == Op_AddReductionVI) { 2074 int vector_len = Assembler::AVX_256bit; 2075 vphaddw(vtmp2, src2, src2, vector_len); 2076 vpermq(vtmp2, vtmp2, 0xD8, vector_len); 2077 } else { 2078 vextracti128_high(vtmp2, src2); 2079 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2080 } 2081 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2082 } 2083 2084 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2085 int vector_len = Assembler::AVX_256bit; 2086 vextracti64x4_high(vtmp1, src2); 2087 reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2); 2088 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2089 } 2090 2091 #ifdef _LP64 2092 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2093 pshufd(vtmp2, src2, 0xE); 2094 reduce_operation_128(T_LONG, opcode, vtmp2, src2); 2095 movdq(vtmp1, src1); 2096 reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2); 2097 movdq(dst, vtmp1); 2098 } 2099 2100 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2101 vextracti128_high(vtmp1, src2); 2102 reduce_operation_128(T_LONG, opcode, vtmp1, src2); 2103 reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2104 } 2105 2106 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2107 vextracti64x4_high(vtmp2, src2); 2108 reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2); 2109 reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2110 } 2111 2112 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) { 2113 mov64(temp, -1L); 2114 bzhiq(temp, temp, len); 2115 kmovql(dst, temp); 2116 } 2117 #endif // _LP64 2118 2119 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2120 reduce_operation_128(T_FLOAT, opcode, dst, src); 2121 pshufd(vtmp, src, 0x1); 2122 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2123 } 2124 2125 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2126 reduce2F(opcode, dst, src, vtmp); 2127 pshufd(vtmp, src, 0x2); 2128 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2129 pshufd(vtmp, src, 0x3); 2130 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2131 } 2132 2133 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2134 reduce4F(opcode, dst, src, vtmp2); 2135 vextractf128_high(vtmp2, src); 2136 reduce4F(opcode, dst, vtmp2, vtmp1); 2137 } 2138 2139 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2140 reduce8F(opcode, dst, src, vtmp1, vtmp2); 2141 vextracti64x4_high(vtmp1, src); 2142 reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2); 2143 } 2144 2145 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2146 
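  // Fold both doubles of src into dst: a scalar combine (addsd/mulsd) with
  // lane 0, then pshufd(0xE) brings lane 1 down to the low half of vtmp for
  // a second scalar combine.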
reduce_operation_128(T_DOUBLE, opcode, dst, src); 2147 pshufd(vtmp, src, 0xE); 2148 reduce_operation_128(T_DOUBLE, opcode, dst, vtmp); 2149 } 2150 2151 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2152 reduce2D(opcode, dst, src, vtmp2); 2153 vextractf128_high(vtmp2, src); 2154 reduce2D(opcode, dst, vtmp2, vtmp1); 2155 } 2156 2157 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2158 reduce4D(opcode, dst, src, vtmp1, vtmp2); 2159 vextracti64x4_high(vtmp1, src); 2160 reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2); 2161 } 2162 2163 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) { 2164 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2165 } 2166 2167 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) { 2168 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2169 } 2170 2171 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask, 2172 int vec_enc) { 2173 switch(elem_bt) { 2174 case T_INT: 2175 case T_FLOAT: 2176 vmaskmovps(dst, src, mask, vec_enc); 2177 break; 2178 case T_LONG: 2179 case T_DOUBLE: 2180 vmaskmovpd(dst, src, mask, vec_enc); 2181 break; 2182 default: 2183 fatal("Unsupported type %s", type2name(elem_bt)); 2184 break; 2185 } 2186 } 2187 2188 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask, 2189 int vec_enc) { 2190 switch(elem_bt) { 2191 case T_INT: 2192 case T_FLOAT: 2193 vmaskmovps(dst, src, mask, vec_enc); 2194 break; 2195 case T_LONG: 2196 case T_DOUBLE: 2197 vmaskmovpd(dst, src, mask, vec_enc); 2198 break; 2199 default: 2200 fatal("Unsupported type %s", type2name(elem_bt)); 2201 break; 2202 } 2203 } 2204 2205 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid, 2206 XMMRegister dst, XMMRegister src, 2207 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2208 XMMRegister xmm_0, XMMRegister xmm_1) { 2209 const int permconst[] = {1, 14}; 2210 XMMRegister wsrc = src; 2211 XMMRegister wdst = xmm_0; 2212 XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1; 2213 2214 int vlen_enc = Assembler::AVX_128bit; 2215 if (vlen == 16) { 2216 vlen_enc = Assembler::AVX_256bit; 2217 } 2218 2219 for (int i = log2(vlen) - 1; i >=0; i--) { 2220 if (i == 0 && !is_dst_valid) { 2221 wdst = dst; 2222 } 2223 if (i == 3) { 2224 vextracti64x4_high(wtmp, wsrc); 2225 } else if (i == 2) { 2226 vextracti128_high(wtmp, wsrc); 2227 } else { // i = [0,1] 2228 vpermilps(wtmp, wsrc, permconst[i], vlen_enc); 2229 } 2230 vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2231 wsrc = wdst; 2232 vlen_enc = Assembler::AVX_128bit; 2233 } 2234 if (is_dst_valid) { 2235 vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2236 } 2237 } 2238 2239 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src, 2240 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2241 XMMRegister xmm_0, XMMRegister xmm_1) { 2242 XMMRegister wsrc = src; 2243 XMMRegister wdst = xmm_0; 2244 XMMRegister wtmp = (xmm_1 == xnoreg) ? 
xmm_0: xmm_1; 2245 int vlen_enc = Assembler::AVX_128bit; 2246 if (vlen == 8) { 2247 vlen_enc = Assembler::AVX_256bit; 2248 } 2249 for (int i = log2(vlen) - 1; i >=0; i--) { 2250 if (i == 0 && !is_dst_valid) { 2251 wdst = dst; 2252 } 2253 if (i == 1) { 2254 vextracti128_high(wtmp, wsrc); 2255 } else if (i == 2) { 2256 vextracti64x4_high(wtmp, wsrc); 2257 } else { 2258 assert(i == 0, "%d", i); 2259 vpermilpd(wtmp, wsrc, 1, vlen_enc); 2260 } 2261 vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2262 wsrc = wdst; 2263 vlen_enc = Assembler::AVX_128bit; 2264 } 2265 if (is_dst_valid) { 2266 vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2267 } 2268 } 2269 2270 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) { 2271 switch (bt) { 2272 case T_BYTE: pextrb(dst, src, idx); break; 2273 case T_SHORT: pextrw(dst, src, idx); break; 2274 case T_INT: pextrd(dst, src, idx); break; 2275 case T_LONG: pextrq(dst, src, idx); break; 2276 2277 default: 2278 assert(false,"Should not reach here."); 2279 break; 2280 } 2281 } 2282 2283 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) { 2284 int esize = type2aelembytes(typ); 2285 int elem_per_lane = 16/esize; 2286 int lane = elemindex / elem_per_lane; 2287 int eindex = elemindex % elem_per_lane; 2288 2289 if (lane >= 2) { 2290 assert(UseAVX > 2, "required"); 2291 vextractf32x4(dst, src, lane & 3); 2292 return dst; 2293 } else if (lane > 0) { 2294 assert(UseAVX > 0, "required"); 2295 vextractf128(dst, src, lane); 2296 return dst; 2297 } else { 2298 return src; 2299 } 2300 } 2301 2302 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) { 2303 if (typ == T_BYTE) { 2304 movsbl(dst, dst); 2305 } else if (typ == T_SHORT) { 2306 movswl(dst, dst); 2307 } 2308 } 2309 2310 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) { 2311 int esize = type2aelembytes(typ); 2312 int elem_per_lane = 16/esize; 2313 int eindex = elemindex % elem_per_lane; 2314 assert(is_integral_type(typ),"required"); 2315 2316 if (eindex == 0) { 2317 if (typ == T_LONG) { 2318 movq(dst, src); 2319 } else { 2320 movdl(dst, src); 2321 movsxl(typ, dst); 2322 } 2323 } else { 2324 extract(typ, dst, src, eindex); 2325 movsxl(typ, dst); 2326 } 2327 } 2328 2329 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) { 2330 int esize = type2aelembytes(typ); 2331 int elem_per_lane = 16/esize; 2332 int eindex = elemindex % elem_per_lane; 2333 assert((typ == T_FLOAT || typ == T_DOUBLE),"required"); 2334 2335 if (eindex == 0) { 2336 movq(dst, src); 2337 } else { 2338 if (typ == T_FLOAT) { 2339 if (UseAVX == 0) { 2340 movdqu(dst, src); 2341 shufps(dst, dst, eindex); 2342 } else { 2343 vshufps(dst, src, src, eindex, Assembler::AVX_128bit); 2344 } 2345 } else { 2346 if (UseAVX == 0) { 2347 movdqu(dst, src); 2348 psrldq(dst, eindex*esize); 2349 } else { 2350 vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit); 2351 } 2352 movq(dst, dst); 2353 } 2354 } 2355 // Zero upper bits 2356 if (typ == T_FLOAT) { 2357 if (UseAVX == 0) { 2358 assert(vtmp != xnoreg, "required."); 2359 movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg); 2360 pand(dst, vtmp); 2361 } else { 2362 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg); 2363 } 2364 } 2365 } 2366 2367 void 
C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) { 2368 switch(typ) { 2369 case T_BYTE: 2370 case T_BOOLEAN: 2371 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2372 break; 2373 case T_SHORT: 2374 case T_CHAR: 2375 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2376 break; 2377 case T_INT: 2378 case T_FLOAT: 2379 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2380 break; 2381 case T_LONG: 2382 case T_DOUBLE: 2383 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2384 break; 2385 default: 2386 assert(false,"Should not reach here."); 2387 break; 2388 } 2389 } 2390 2391 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) { 2392 assert(rscratch != noreg || always_reachable(src2), "missing"); 2393 2394 switch(typ) { 2395 case T_BOOLEAN: 2396 case T_BYTE: 2397 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2398 break; 2399 case T_CHAR: 2400 case T_SHORT: 2401 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2402 break; 2403 case T_INT: 2404 case T_FLOAT: 2405 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2406 break; 2407 case T_LONG: 2408 case T_DOUBLE: 2409 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2410 break; 2411 default: 2412 assert(false,"Should not reach here."); 2413 break; 2414 } 2415 } 2416 2417 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) { 2418 switch(typ) { 2419 case T_BYTE: 2420 evpblendmb(dst, kmask, src1, src2, merge, vector_len); 2421 break; 2422 case T_SHORT: 2423 evpblendmw(dst, kmask, src1, src2, merge, vector_len); 2424 break; 2425 case T_INT: 2426 case T_FLOAT: 2427 evpblendmd(dst, kmask, src1, src2, merge, vector_len); 2428 break; 2429 case T_LONG: 2430 case T_DOUBLE: 2431 evpblendmq(dst, kmask, src1, src2, merge, vector_len); 2432 break; 2433 default: 2434 assert(false,"Should not reach here."); 2435 break; 2436 } 2437 } 2438 2439 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) { 2440 assert(vlen_in_bytes <= 32, ""); 2441 int esize = type2aelembytes(bt); 2442 if (vlen_in_bytes == 32) { 2443 assert(vtmp == xnoreg, "required."); 2444 if (esize >= 4) { 2445 vtestps(src1, src2, AVX_256bit); 2446 } else { 2447 vptest(src1, src2, AVX_256bit); 2448 } 2449 return; 2450 } 2451 if (vlen_in_bytes < 16) { 2452 // Duplicate the lower part to fill the whole register, 2453 // Don't need to do so for src2 2454 assert(vtmp != xnoreg, "required"); 2455 int shuffle_imm = (vlen_in_bytes == 4) ? 
0x00 : 0x04; 2456 pshufd(vtmp, src1, shuffle_imm); 2457 } else { 2458 assert(vtmp == xnoreg, "required"); 2459 vtmp = src1; 2460 } 2461 if (esize >= 4 && VM_Version::supports_avx()) { 2462 vtestps(vtmp, src2, AVX_128bit); 2463 } else { 2464 ptest(vtmp, src2); 2465 } 2466 } 2467 2468 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) { 2469 assert(UseAVX >= 2, "required"); 2470 #ifdef ASSERT 2471 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2472 bool is_bw_supported = VM_Version::supports_avx512bw(); 2473 if (is_bw && !is_bw_supported) { 2474 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2475 assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16), 2476 "XMM register should be 0-15"); 2477 } 2478 #endif // ASSERT 2479 switch (elem_bt) { 2480 case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return; 2481 case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return; 2482 case T_INT: vpaddd(dst, src1, src2, vlen_enc); return; 2483 case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return; 2484 case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return; 2485 case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return; 2486 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2487 } 2488 } 2489 2490 #ifdef _LP64 2491 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) { 2492 assert(UseAVX >= 2, "required"); 2493 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2494 bool is_vl = vlen_enc != Assembler::AVX_512bit; 2495 if ((UseAVX > 2) && 2496 (!is_bw || VM_Version::supports_avx512bw()) && 2497 (!is_vl || VM_Version::supports_avx512vl())) { 2498 switch (elem_bt) { 2499 case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return; 2500 case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return; 2501 case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return; 2502 case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return; 2503 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2504 } 2505 } else { 2506 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2507 assert((dst->encoding() < 16),"XMM register should be 0-15"); 2508 switch (elem_bt) { 2509 case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return; 2510 case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return; 2511 case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return; 2512 case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return; 2513 case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return; 2514 case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return; 2515 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2516 } 2517 } 2518 } 2519 #endif 2520 2521 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) { 2522 switch (to_elem_bt) { 2523 case T_SHORT: 2524 vpmovsxbw(dst, src, vlen_enc); 2525 break; 2526 case T_INT: 2527 vpmovsxbd(dst, src, vlen_enc); 2528 break; 2529 case T_FLOAT: 2530 vpmovsxbd(dst, src, vlen_enc); 2531 vcvtdq2ps(dst, dst, vlen_enc); 2532 break; 2533 case T_LONG: 2534 vpmovsxbq(dst, src, vlen_enc); 2535 break; 2536 case T_DOUBLE: { 2537 int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? 
Assembler::AVX_256bit : Assembler::AVX_128bit;
      vpmovsxbd(dst, src, mid_vlen_enc);
      vcvtdq2pd(dst, dst, vlen_enc);
      break;
    }
    default:
      fatal("Unsupported type %s", type2name(to_elem_bt));
      break;
  }
}

//-------------------------------------------------------------------------------------------

// IndexOf for constant substrings with size >= 8 chars
// which don't need to be loaded through stack.
void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
                                         Register cnt1, Register cnt2,
                                         int int_cnt2, Register result,
                                         XMMRegister vec, Register tmp,
                                         int ae) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");

  // This method uses the pcmpestri instruction with bound registers
  //   inputs:
  //     xmm - substring
  //     rax - substring length (elements count)
  //     mem - scanned string
  //     rdx - string length (elements count)
  //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
  //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
  //   outputs:
  //     rcx - matched index in string
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
  int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
  int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
  Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
  Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;

  Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
        RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
        MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;
  assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");

  // Load substring.
  if (ae == StrIntrinsicNode::UL) {
    pmovzxbw(vec, Address(str2, 0));
  } else {
    movdqu(vec, Address(str2, 0));
  }
  movl(cnt2, int_cnt2);
  movptr(result, str1); // string addr

  if (int_cnt2 > stride) {
    jmpb(SCAN_TO_SUBSTR);

    // Reload substr for rescan, this code
    // is executed only for large substrings (> 8 chars)
    bind(RELOAD_SUBSTR);
    if (ae == StrIntrinsicNode::UL) {
      pmovzxbw(vec, Address(str2, 0));
    } else {
      movdqu(vec, Address(str2, 0));
    }
    negptr(cnt2); // Jumped here with negative cnt2, convert to positive

    bind(RELOAD_STR);
    // We came here after the beginning of the substring was
    // matched but the rest of it was not so we need to search
    // again. Start from the next element after the previous match.

    // cnt2 is number of substring remaining elements and
    // cnt1 is number of string remaining elements when cmp failed.
    // Restored cnt1 = cnt1 - cnt2 + int_cnt2
    subl(cnt1, cnt2);
    addl(cnt1, int_cnt2);
    movl(cnt2, int_cnt2); // Now restore cnt2

    decrementl(cnt1);     // Shift to next element
    cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring

    addptr(result, (1<<scale1));
  } // (int_cnt2 > 8)

  // Scan string for start of substr in 16-byte vectors
  bind(SCAN_TO_SUBSTR);
  pcmpestri(vec, Address(result, 0), mode);
  jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
  subl(cnt1, stride);
  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
  cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
  addptr(result, 16);
  jmpb(SCAN_TO_SUBSTR);

  // Found a potential substr
  bind(FOUND_CANDIDATE);
  // Matched whole vector if first element matched (tmp(rcx) == 0).
  if (int_cnt2 == stride) {
    jccb(Assembler::overflow, RET_FOUND);    // OF == 1
  } else { // int_cnt2 > 8
    jccb(Assembler::overflow, FOUND_SUBSTR);
  }
  // After pcmpestri tmp(rcx) contains matched element index
  // Compute start addr of substr
  lea(result, Address(result, tmp, scale1));

  // Make sure string is still long enough
  subl(cnt1, tmp);
  cmpl(cnt1, cnt2);
  if (int_cnt2 == stride) {
    jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
  } else { // int_cnt2 > 8
    jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
  }
  // Left less than substring.

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmp(EXIT);

  if (int_cnt2 > stride) {
    // This code is optimized for the case when whole substring
    // is matched if its head is matched.
    bind(MATCH_SUBSTR_HEAD);
    pcmpestri(vec, Address(result, 0), mode);
    // Reload only string if it does not match
    jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0

    Label CONT_SCAN_SUBSTR;
    // Compare the rest of substring (> 8 chars).
    bind(FOUND_SUBSTR);
    // First 8 chars are already matched.
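    // From here on cnt2 is kept negative: the next two instructions turn it
    // into (stride - substring_length), it is used as an offset back from the
    // substring tail, and the SCAN_SUBSTR loop advances it by 'stride' until
    // it becomes non-negative (i.e. the whole substring has been compared).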
2677 negptr(cnt2); 2678 addptr(cnt2, stride); 2679 2680 bind(SCAN_SUBSTR); 2681 subl(cnt1, stride); 2682 cmpl(cnt2, -stride); // Do not read beyond substring 2683 jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR); 2684 // Back-up strings to avoid reading beyond substring: 2685 // cnt1 = cnt1 - cnt2 + 8 2686 addl(cnt1, cnt2); // cnt2 is negative 2687 addl(cnt1, stride); 2688 movl(cnt2, stride); negptr(cnt2); 2689 bind(CONT_SCAN_SUBSTR); 2690 if (int_cnt2 < (int)G) { 2691 int tail_off1 = int_cnt2<<scale1; 2692 int tail_off2 = int_cnt2<<scale2; 2693 if (ae == StrIntrinsicNode::UL) { 2694 pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2)); 2695 } else { 2696 movdqu(vec, Address(str2, cnt2, scale2, tail_off2)); 2697 } 2698 pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode); 2699 } else { 2700 // calculate index in register to avoid integer overflow (int_cnt2*2) 2701 movl(tmp, int_cnt2); 2702 addptr(tmp, cnt2); 2703 if (ae == StrIntrinsicNode::UL) { 2704 pmovzxbw(vec, Address(str2, tmp, scale2, 0)); 2705 } else { 2706 movdqu(vec, Address(str2, tmp, scale2, 0)); 2707 } 2708 pcmpestri(vec, Address(result, tmp, scale1, 0), mode); 2709 } 2710 // Need to reload strings pointers if not matched whole vector 2711 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 2712 addptr(cnt2, stride); 2713 jcc(Assembler::negative, SCAN_SUBSTR); 2714 // Fall through if found full substring 2715 2716 } // (int_cnt2 > 8) 2717 2718 bind(RET_FOUND); 2719 // Found result if we matched full small substring. 2720 // Compute substr offset 2721 subptr(result, str1); 2722 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 2723 shrl(result, 1); // index 2724 } 2725 bind(EXIT); 2726 2727 } // string_indexofC8 2728 2729 // Small strings are loaded through stack if they cross page boundary. 2730 void C2_MacroAssembler::string_indexof(Register str1, Register str2, 2731 Register cnt1, Register cnt2, 2732 int int_cnt2, Register result, 2733 XMMRegister vec, Register tmp, 2734 int ae) { 2735 ShortBranchVerifier sbv(this); 2736 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2737 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2738 2739 // 2740 // int_cnt2 is length of small (< 8 chars) constant substring 2741 // or (-1) for non constant substring in which case its length 2742 // is in cnt2 register. 2743 // 2744 // Note, inline_string_indexOf() generates checks: 2745 // if (substr.count > string.count) return -1; 2746 // if (substr.count == 0) return 0; 2747 // 2748 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2749 assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0"); 2750 // This method uses the pcmpestri instruction with bound registers 2751 // inputs: 2752 // xmm - substring 2753 // rax - substring length (elements count) 2754 // mem - scanned string 2755 // rdx - string length (elements count) 2756 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2757 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2758 // outputs: 2759 // rcx - matched index in string 2760 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2761 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2762 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2763 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? 
Address::times_1 : scale1; 2764 2765 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR, 2766 RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR, 2767 FOUND_CANDIDATE; 2768 2769 { //======================================================== 2770 // We don't know where these strings are located 2771 // and we can't read beyond them. Load them through stack. 2772 Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR; 2773 2774 movptr(tmp, rsp); // save old SP 2775 2776 if (int_cnt2 > 0) { // small (< 8 chars) constant substring 2777 if (int_cnt2 == (1>>scale2)) { // One byte 2778 assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding"); 2779 load_unsigned_byte(result, Address(str2, 0)); 2780 movdl(vec, result); // move 32 bits 2781 } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) { // Three bytes 2782 // Not enough header space in 32-bit VM: 12+3 = 15. 2783 movl(result, Address(str2, -1)); 2784 shrl(result, 8); 2785 movdl(vec, result); // move 32 bits 2786 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) { // One char 2787 load_unsigned_short(result, Address(str2, 0)); 2788 movdl(vec, result); // move 32 bits 2789 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars 2790 movdl(vec, Address(str2, 0)); // move 32 bits 2791 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars 2792 movq(vec, Address(str2, 0)); // move 64 bits 2793 } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7}) 2794 // Array header size is 12 bytes in 32-bit VM 2795 // + 6 bytes for 3 chars == 18 bytes, 2796 // enough space to load vec and shift. 2797 assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity"); 2798 if (ae == StrIntrinsicNode::UL) { 2799 int tail_off = int_cnt2-8; 2800 pmovzxbw(vec, Address(str2, tail_off)); 2801 psrldq(vec, -2*tail_off); 2802 } 2803 else { 2804 int tail_off = int_cnt2*(1<<scale2); 2805 movdqu(vec, Address(str2, tail_off-16)); 2806 psrldq(vec, 16-tail_off); 2807 } 2808 } 2809 } else { // not constant substring 2810 cmpl(cnt2, stride); 2811 jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough 2812 2813 // We can read beyond string if srt+16 does not cross page boundary 2814 // since heaps are aligned and mapped by pages. 2815 assert(os::vm_page_size() < (int)G, "default page should be small"); 2816 movl(result, str2); // We need only low 32 bits 2817 andl(result, ((int)os::vm_page_size()-1)); 2818 cmpl(result, ((int)os::vm_page_size()-16)); 2819 jccb(Assembler::belowEqual, CHECK_STR); 2820 2821 // Move small strings to stack to allow load 16 bytes into vec. 2822 subptr(rsp, 16); 2823 int stk_offset = wordSize-(1<<scale2); 2824 push(cnt2); 2825 2826 bind(COPY_SUBSTR); 2827 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) { 2828 load_unsigned_byte(result, Address(str2, cnt2, scale2, -1)); 2829 movb(Address(rsp, cnt2, scale2, stk_offset), result); 2830 } else if (ae == StrIntrinsicNode::UU) { 2831 load_unsigned_short(result, Address(str2, cnt2, scale2, -2)); 2832 movw(Address(rsp, cnt2, scale2, stk_offset), result); 2833 } 2834 decrement(cnt2); 2835 jccb(Assembler::notZero, COPY_SUBSTR); 2836 2837 pop(cnt2); 2838 movptr(str2, rsp); // New substring address 2839 } // non constant 2840 2841 bind(CHECK_STR); 2842 cmpl(cnt1, stride); 2843 jccb(Assembler::aboveEqual, BIG_STRINGS); 2844 2845 // Check cross page boundary. 
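  // The branch below is taken when (str1 & (page_size - 1)) <= page_size - 16,
  // i.e. a 16-byte load at str1 cannot cross into the next page; e.g. with 4K
  // pages any in-page offset up to 0xFF0 (4080) is safe to read directly.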
2846 movl(result, str1); // We need only low 32 bits 2847 andl(result, ((int)os::vm_page_size()-1)); 2848 cmpl(result, ((int)os::vm_page_size()-16)); 2849 jccb(Assembler::belowEqual, BIG_STRINGS); 2850 2851 subptr(rsp, 16); 2852 int stk_offset = -(1<<scale1); 2853 if (int_cnt2 < 0) { // not constant 2854 push(cnt2); 2855 stk_offset += wordSize; 2856 } 2857 movl(cnt2, cnt1); 2858 2859 bind(COPY_STR); 2860 if (ae == StrIntrinsicNode::LL) { 2861 load_unsigned_byte(result, Address(str1, cnt2, scale1, -1)); 2862 movb(Address(rsp, cnt2, scale1, stk_offset), result); 2863 } else { 2864 load_unsigned_short(result, Address(str1, cnt2, scale1, -2)); 2865 movw(Address(rsp, cnt2, scale1, stk_offset), result); 2866 } 2867 decrement(cnt2); 2868 jccb(Assembler::notZero, COPY_STR); 2869 2870 if (int_cnt2 < 0) { // not constant 2871 pop(cnt2); 2872 } 2873 movptr(str1, rsp); // New string address 2874 2875 bind(BIG_STRINGS); 2876 // Load substring. 2877 if (int_cnt2 < 0) { // -1 2878 if (ae == StrIntrinsicNode::UL) { 2879 pmovzxbw(vec, Address(str2, 0)); 2880 } else { 2881 movdqu(vec, Address(str2, 0)); 2882 } 2883 push(cnt2); // substr count 2884 push(str2); // substr addr 2885 push(str1); // string addr 2886 } else { 2887 // Small (< 8 chars) constant substrings are loaded already. 2888 movl(cnt2, int_cnt2); 2889 } 2890 push(tmp); // original SP 2891 2892 } // Finished loading 2893 2894 //======================================================== 2895 // Start search 2896 // 2897 2898 movptr(result, str1); // string addr 2899 2900 if (int_cnt2 < 0) { // Only for non constant substring 2901 jmpb(SCAN_TO_SUBSTR); 2902 2903 // SP saved at sp+0 2904 // String saved at sp+1*wordSize 2905 // Substr saved at sp+2*wordSize 2906 // Substr count saved at sp+3*wordSize 2907 2908 // Reload substr for rescan, this code 2909 // is executed only for large substrings (> 8 chars) 2910 bind(RELOAD_SUBSTR); 2911 movptr(str2, Address(rsp, 2*wordSize)); 2912 movl(cnt2, Address(rsp, 3*wordSize)); 2913 if (ae == StrIntrinsicNode::UL) { 2914 pmovzxbw(vec, Address(str2, 0)); 2915 } else { 2916 movdqu(vec, Address(str2, 0)); 2917 } 2918 // We came here after the beginning of the substring was 2919 // matched but the rest of it was not so we need to search 2920 // again. Start from the next element after the previous match. 2921 subptr(str1, result); // Restore counter 2922 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 2923 shrl(str1, 1); 2924 } 2925 addl(cnt1, str1); 2926 decrementl(cnt1); // Shift to next element 2927 cmpl(cnt1, cnt2); 2928 jcc(Assembler::negative, RET_NOT_FOUND); // Left less then substring 2929 2930 addptr(result, (1<<scale1)); 2931 } // non constant 2932 2933 // Scan string for start of substr in 16-byte vectors 2934 bind(SCAN_TO_SUBSTR); 2935 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2936 pcmpestri(vec, Address(result, 0), mode); 2937 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 2938 subl(cnt1, stride); 2939 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 2940 cmpl(cnt1, cnt2); 2941 jccb(Assembler::negative, RET_NOT_FOUND); // Left less then substring 2942 addptr(result, 16); 2943 2944 bind(ADJUST_STR); 2945 cmpl(cnt1, stride); // Do not read beyond string 2946 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 2947 // Back-up string to avoid reading beyond string. 
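  // i.e. move result back so that the final 16-byte read ends exactly at the
  // end of the string (result += cnt1*element_size - 16) and treat the
  // remaining length as one full stride.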
2948 lea(result, Address(result, cnt1, scale1, -16)); 2949 movl(cnt1, stride); 2950 jmpb(SCAN_TO_SUBSTR); 2951 2952 // Found a potential substr 2953 bind(FOUND_CANDIDATE); 2954 // After pcmpestri tmp(rcx) contains matched element index 2955 2956 // Make sure string is still long enough 2957 subl(cnt1, tmp); 2958 cmpl(cnt1, cnt2); 2959 jccb(Assembler::greaterEqual, FOUND_SUBSTR); 2960 // Left less then substring. 2961 2962 bind(RET_NOT_FOUND); 2963 movl(result, -1); 2964 jmp(CLEANUP); 2965 2966 bind(FOUND_SUBSTR); 2967 // Compute start addr of substr 2968 lea(result, Address(result, tmp, scale1)); 2969 if (int_cnt2 > 0) { // Constant substring 2970 // Repeat search for small substring (< 8 chars) 2971 // from new point without reloading substring. 2972 // Have to check that we don't read beyond string. 2973 cmpl(tmp, stride-int_cnt2); 2974 jccb(Assembler::greater, ADJUST_STR); 2975 // Fall through if matched whole substring. 2976 } else { // non constant 2977 assert(int_cnt2 == -1, "should be != 0"); 2978 2979 addl(tmp, cnt2); 2980 // Found result if we matched whole substring. 2981 cmpl(tmp, stride); 2982 jcc(Assembler::lessEqual, RET_FOUND); 2983 2984 // Repeat search for small substring (<= 8 chars) 2985 // from new point 'str1' without reloading substring. 2986 cmpl(cnt2, stride); 2987 // Have to check that we don't read beyond string. 2988 jccb(Assembler::lessEqual, ADJUST_STR); 2989 2990 Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG; 2991 // Compare the rest of substring (> 8 chars). 2992 movptr(str1, result); 2993 2994 cmpl(tmp, cnt2); 2995 // First 8 chars are already matched. 2996 jccb(Assembler::equal, CHECK_NEXT); 2997 2998 bind(SCAN_SUBSTR); 2999 pcmpestri(vec, Address(str1, 0), mode); 3000 // Need to reload strings pointers if not matched whole vector 3001 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 3002 3003 bind(CHECK_NEXT); 3004 subl(cnt2, stride); 3005 jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring 3006 addptr(str1, 16); 3007 if (ae == StrIntrinsicNode::UL) { 3008 addptr(str2, 8); 3009 } else { 3010 addptr(str2, 16); 3011 } 3012 subl(cnt1, stride); 3013 cmpl(cnt2, stride); // Do not read beyond substring 3014 jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR); 3015 // Back-up strings to avoid reading beyond substring. 
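  // Re-position both pointers so the next load ends at the substring tail
  // (8 bytes of latin1 for UL, 16 bytes otherwise), then fix up the counters:
  // cnt1 = cnt1 - cnt2 + stride, cnt2 = stride.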
3016 3017 if (ae == StrIntrinsicNode::UL) { 3018 lea(str2, Address(str2, cnt2, scale2, -8)); 3019 lea(str1, Address(str1, cnt2, scale1, -16)); 3020 } else { 3021 lea(str2, Address(str2, cnt2, scale2, -16)); 3022 lea(str1, Address(str1, cnt2, scale1, -16)); 3023 } 3024 subl(cnt1, cnt2); 3025 movl(cnt2, stride); 3026 addl(cnt1, stride); 3027 bind(CONT_SCAN_SUBSTR); 3028 if (ae == StrIntrinsicNode::UL) { 3029 pmovzxbw(vec, Address(str2, 0)); 3030 } else { 3031 movdqu(vec, Address(str2, 0)); 3032 } 3033 jmp(SCAN_SUBSTR); 3034 3035 bind(RET_FOUND_LONG); 3036 movptr(str1, Address(rsp, wordSize)); 3037 } // non constant 3038 3039 bind(RET_FOUND); 3040 // Compute substr offset 3041 subptr(result, str1); 3042 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3043 shrl(result, 1); // index 3044 } 3045 bind(CLEANUP); 3046 pop(rsp); // restore SP 3047 3048 } // string_indexof 3049 3050 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3051 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3052 ShortBranchVerifier sbv(this); 3053 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3054 3055 int stride = 8; 3056 3057 Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP, 3058 SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP, 3059 RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT, 3060 FOUND_SEQ_CHAR, DONE_LABEL; 3061 3062 movptr(result, str1); 3063 if (UseAVX >= 2) { 3064 cmpl(cnt1, stride); 3065 jcc(Assembler::less, SCAN_TO_CHAR); 3066 cmpl(cnt1, 2*stride); 3067 jcc(Assembler::less, SCAN_TO_8_CHAR_INIT); 3068 movdl(vec1, ch); 3069 vpbroadcastw(vec1, vec1, Assembler::AVX_256bit); 3070 vpxor(vec2, vec2); 3071 movl(tmp, cnt1); 3072 andl(tmp, 0xFFFFFFF0); //vector count (in chars) 3073 andl(cnt1,0x0000000F); //tail count (in chars) 3074 3075 bind(SCAN_TO_16_CHAR_LOOP); 3076 vmovdqu(vec3, Address(result, 0)); 3077 vpcmpeqw(vec3, vec3, vec1, 1); 3078 vptest(vec2, vec3); 3079 jcc(Assembler::carryClear, FOUND_CHAR); 3080 addptr(result, 32); 3081 subl(tmp, 2*stride); 3082 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); 3083 jmp(SCAN_TO_8_CHAR); 3084 bind(SCAN_TO_8_CHAR_INIT); 3085 movdl(vec1, ch); 3086 pshuflw(vec1, vec1, 0x00); 3087 pshufd(vec1, vec1, 0); 3088 pxor(vec2, vec2); 3089 } 3090 bind(SCAN_TO_8_CHAR); 3091 cmpl(cnt1, stride); 3092 jcc(Assembler::less, SCAN_TO_CHAR); 3093 if (UseAVX < 2) { 3094 movdl(vec1, ch); 3095 pshuflw(vec1, vec1, 0x00); 3096 pshufd(vec1, vec1, 0); 3097 pxor(vec2, vec2); 3098 } 3099 movl(tmp, cnt1); 3100 andl(tmp, 0xFFFFFFF8); //vector count (in chars) 3101 andl(cnt1,0x00000007); //tail count (in chars) 3102 3103 bind(SCAN_TO_8_CHAR_LOOP); 3104 movdqu(vec3, Address(result, 0)); 3105 pcmpeqw(vec3, vec1); 3106 ptest(vec2, vec3); 3107 jcc(Assembler::carryClear, FOUND_CHAR); 3108 addptr(result, 16); 3109 subl(tmp, stride); 3110 jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP); 3111 bind(SCAN_TO_CHAR); 3112 testl(cnt1, cnt1); 3113 jcc(Assembler::zero, RET_NOT_FOUND); 3114 bind(SCAN_TO_CHAR_LOOP); 3115 load_unsigned_short(tmp, Address(result, 0)); 3116 cmpl(ch, tmp); 3117 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3118 addptr(result, 2); 3119 subl(cnt1, 1); 3120 jccb(Assembler::zero, RET_NOT_FOUND); 3121 jmp(SCAN_TO_CHAR_LOOP); 3122 3123 bind(RET_NOT_FOUND); 3124 movl(result, -1); 3125 jmpb(DONE_LABEL); 3126 3127 bind(FOUND_CHAR); 3128 if (UseAVX >= 2) { 3129 vpmovmskb(tmp, vec3); 3130 } else { 3131 pmovmskb(tmp, vec3); 3132 } 3133 bsfl(ch, tmp); 3134 addptr(result, ch); 3135 3136 bind(FOUND_SEQ_CHAR); 3137 
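  // Turn the match address into a char index: byte offset from the start of
  // the string, divided by two (UTF-16 elements).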
subptr(result, str1); 3138 shrl(result, 1); 3139 3140 bind(DONE_LABEL); 3141 } // string_indexof_char 3142 3143 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3144 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3145 ShortBranchVerifier sbv(this); 3146 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3147 3148 int stride = 16; 3149 3150 Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP, 3151 SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP, 3152 RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT, 3153 FOUND_SEQ_CHAR, DONE_LABEL; 3154 3155 movptr(result, str1); 3156 if (UseAVX >= 2) { 3157 cmpl(cnt1, stride); 3158 jcc(Assembler::less, SCAN_TO_CHAR_INIT); 3159 cmpl(cnt1, stride*2); 3160 jcc(Assembler::less, SCAN_TO_16_CHAR_INIT); 3161 movdl(vec1, ch); 3162 vpbroadcastb(vec1, vec1, Assembler::AVX_256bit); 3163 vpxor(vec2, vec2); 3164 movl(tmp, cnt1); 3165 andl(tmp, 0xFFFFFFE0); //vector count (in chars) 3166 andl(cnt1,0x0000001F); //tail count (in chars) 3167 3168 bind(SCAN_TO_32_CHAR_LOOP); 3169 vmovdqu(vec3, Address(result, 0)); 3170 vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit); 3171 vptest(vec2, vec3); 3172 jcc(Assembler::carryClear, FOUND_CHAR); 3173 addptr(result, 32); 3174 subl(tmp, stride*2); 3175 jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP); 3176 jmp(SCAN_TO_16_CHAR); 3177 3178 bind(SCAN_TO_16_CHAR_INIT); 3179 movdl(vec1, ch); 3180 pxor(vec2, vec2); 3181 pshufb(vec1, vec2); 3182 } 3183 3184 bind(SCAN_TO_16_CHAR); 3185 cmpl(cnt1, stride); 3186 jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left 3187 if (UseAVX < 2) { 3188 movdl(vec1, ch); 3189 pxor(vec2, vec2); 3190 pshufb(vec1, vec2); 3191 } 3192 movl(tmp, cnt1); 3193 andl(tmp, 0xFFFFFFF0); //vector count (in bytes) 3194 andl(cnt1,0x0000000F); //tail count (in bytes) 3195 3196 bind(SCAN_TO_16_CHAR_LOOP); 3197 movdqu(vec3, Address(result, 0)); 3198 pcmpeqb(vec3, vec1); 3199 ptest(vec2, vec3); 3200 jcc(Assembler::carryClear, FOUND_CHAR); 3201 addptr(result, 16); 3202 subl(tmp, stride); 3203 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items... 
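// Fewer than 16 bytes remain at this point (tail count in cnt1); finish with the
// scalar byte-at-a-time scan below.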
3204 3205 bind(SCAN_TO_CHAR_INIT); 3206 testl(cnt1, cnt1); 3207 jcc(Assembler::zero, RET_NOT_FOUND); 3208 bind(SCAN_TO_CHAR_LOOP); 3209 load_unsigned_byte(tmp, Address(result, 0)); 3210 cmpl(ch, tmp); 3211 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3212 addptr(result, 1); 3213 subl(cnt1, 1); 3214 jccb(Assembler::zero, RET_NOT_FOUND); 3215 jmp(SCAN_TO_CHAR_LOOP); 3216 3217 bind(RET_NOT_FOUND); 3218 movl(result, -1); 3219 jmpb(DONE_LABEL); 3220 3221 bind(FOUND_CHAR); 3222 if (UseAVX >= 2) { 3223 vpmovmskb(tmp, vec3); 3224 } else { 3225 pmovmskb(tmp, vec3); 3226 } 3227 bsfl(ch, tmp); 3228 addptr(result, ch); 3229 3230 bind(FOUND_SEQ_CHAR); 3231 subptr(result, str1); 3232 3233 bind(DONE_LABEL); 3234 } // stringL_indexof_char 3235 3236 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) { 3237 switch (eltype) { 3238 case T_BOOLEAN: return sizeof(jboolean); 3239 case T_BYTE: return sizeof(jbyte); 3240 case T_SHORT: return sizeof(jshort); 3241 case T_CHAR: return sizeof(jchar); 3242 case T_INT: return sizeof(jint); 3243 default: 3244 ShouldNotReachHere(); 3245 return -1; 3246 } 3247 } 3248 3249 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) { 3250 switch (eltype) { 3251 // T_BOOLEAN used as surrogate for unsigned byte 3252 case T_BOOLEAN: movzbl(dst, src); break; 3253 case T_BYTE: movsbl(dst, src); break; 3254 case T_SHORT: movswl(dst, src); break; 3255 case T_CHAR: movzwl(dst, src); break; 3256 case T_INT: movl(dst, src); break; 3257 default: 3258 ShouldNotReachHere(); 3259 } 3260 } 3261 3262 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) { 3263 load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8); 3264 } 3265 3266 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) { 3267 load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8); 3268 } 3269 3270 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) { 3271 const int vlen = Assembler::AVX_256bit; 3272 switch (eltype) { 3273 case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3274 case T_BYTE: vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3275 case T_SHORT: vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3276 case T_CHAR: vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3277 case T_INT: 3278 // do nothing 3279 break; 3280 default: 3281 ShouldNotReachHere(); 3282 } 3283 } 3284 3285 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result, 3286 Register index, Register tmp2, Register tmp3, XMMRegister vnext, 3287 XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3, 3288 XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3, 3289 XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3, 3290 BasicType eltype) { 3291 ShortBranchVerifier sbv(this); 3292 assert(UseAVX >= 2, "AVX2 intrinsics are required"); 3293 assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3); 3294 assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3); 3295 3296 Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN, 3297 SHORT_UNROLLED_LOOP_EXIT, 3298 UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME, 3299 UNROLLED_VECTOR_LOOP_BEGIN, 3300 END; 3301 switch (eltype) { 3302 case T_BOOLEAN: 
BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break; 3303 case T_CHAR: BLOCK_COMMENT("arrays_hashcode(char) {"); break; 3304 case T_BYTE: BLOCK_COMMENT("arrays_hashcode(byte) {"); break; 3305 case T_SHORT: BLOCK_COMMENT("arrays_hashcode(short) {"); break; 3306 case T_INT: BLOCK_COMMENT("arrays_hashcode(int) {"); break; 3307 default: BLOCK_COMMENT("arrays_hashcode {"); break; 3308 } 3309 3310 // For "renaming" for readibility of the code 3311 const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 }, 3312 vresult[] = { vresult0, vresult1, vresult2, vresult3 }, 3313 vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 }; 3314 3315 const int elsize = arrays_hashcode_elsize(eltype); 3316 3317 /* 3318 if (cnt1 >= 2) { 3319 if (cnt1 >= 32) { 3320 UNROLLED VECTOR LOOP 3321 } 3322 UNROLLED SCALAR LOOP 3323 } 3324 SINGLE SCALAR 3325 */ 3326 3327 cmpl(cnt1, 32); 3328 jcc(Assembler::less, SHORT_UNROLLED_BEGIN); 3329 3330 // cnt1 >= 32 && generate_vectorized_loop 3331 xorl(index, index); 3332 3333 // vresult = IntVector.zero(I256); 3334 for (int idx = 0; idx < 4; idx++) { 3335 vpxor(vresult[idx], vresult[idx]); 3336 } 3337 // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]); 3338 Register bound = tmp2; 3339 Register next = tmp3; 3340 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint)))); 3341 movl(next, Address(tmp2, 0)); 3342 movdl(vnext, next); 3343 vpbroadcastd(vnext, vnext, Assembler::AVX_256bit); 3344 3345 // index = 0; 3346 // bound = cnt1 & ~(32 - 1); 3347 movl(bound, cnt1); 3348 andl(bound, ~(32 - 1)); 3349 // for (; index < bound; index += 32) { 3350 bind(UNROLLED_VECTOR_LOOP_BEGIN); 3351 // result *= next; 3352 imull(result, next); 3353 // loop fission to upfront the cost of fetching from memory, OOO execution 3354 // can then hopefully do a better job of prefetching 3355 for (int idx = 0; idx < 4; idx++) { 3356 arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype); 3357 } 3358 // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7]; 3359 for (int idx = 0; idx < 4; idx++) { 3360 vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit); 3361 arrays_hashcode_elvcast(vtmp[idx], eltype); 3362 vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit); 3363 } 3364 // index += 32; 3365 addl(index, 32); 3366 // index < bound; 3367 cmpl(index, bound); 3368 jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN); 3369 // } 3370 3371 lea(ary1, Address(ary1, bound, Address::times(elsize))); 3372 subl(cnt1, bound); 3373 // release bound 3374 3375 // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1); 3376 for (int idx = 0; idx < 4; idx++) { 3377 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint)))); 3378 arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT); 3379 vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit); 3380 } 3381 // result += vresult.reduceLanes(ADD); 3382 for (int idx = 0; idx < 4; idx++) { 3383 reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]); 3384 } 3385 3386 // } else if (cnt1 < 32) { 3387 3388 bind(SHORT_UNROLLED_BEGIN); 3389 // int i = 1; 3390 movl(index, 1); 3391 cmpl(index, cnt1); 3392 jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT); 3393 3394 // for (; i < cnt1 ; i += 2) { 3395 bind(SHORT_UNROLLED_LOOP_BEGIN); 3396 movl(tmp3, 961); 3397 imull(result, tmp3); 3398 
arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3399 movl(tmp3, tmp2); 3400 shll(tmp3, 5); 3401 subl(tmp3, tmp2); 3402 addl(result, tmp3); 3403 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype); 3404 addl(result, tmp3); 3405 addl(index, 2); 3406 cmpl(index, cnt1); 3407 jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN); 3408 3409 // } 3410 // if (i >= cnt1) { 3411 bind(SHORT_UNROLLED_LOOP_EXIT); 3412 jccb(Assembler::greater, END); 3413 movl(tmp2, result); 3414 shll(result, 5); 3415 subl(result, tmp2); 3416 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3417 addl(result, tmp3); 3418 // } 3419 bind(END); 3420 3421 BLOCK_COMMENT("} // arrays_hashcode"); 3422 3423 } // arrays_hashcode 3424 3425 // helper function for string_compare 3426 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2, 3427 Address::ScaleFactor scale, Address::ScaleFactor scale1, 3428 Address::ScaleFactor scale2, Register index, int ae) { 3429 if (ae == StrIntrinsicNode::LL) { 3430 load_unsigned_byte(elem1, Address(str1, index, scale, 0)); 3431 load_unsigned_byte(elem2, Address(str2, index, scale, 0)); 3432 } else if (ae == StrIntrinsicNode::UU) { 3433 load_unsigned_short(elem1, Address(str1, index, scale, 0)); 3434 load_unsigned_short(elem2, Address(str2, index, scale, 0)); 3435 } else { 3436 load_unsigned_byte(elem1, Address(str1, index, scale1, 0)); 3437 load_unsigned_short(elem2, Address(str2, index, scale2, 0)); 3438 } 3439 } 3440 3441 // Compare strings, used for char[] and byte[]. 3442 void C2_MacroAssembler::string_compare(Register str1, Register str2, 3443 Register cnt1, Register cnt2, Register result, 3444 XMMRegister vec1, int ae, KRegister mask) { 3445 ShortBranchVerifier sbv(this); 3446 Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL; 3447 Label COMPARE_WIDE_VECTORS_LOOP_FAILED; // used only _LP64 && AVX3 3448 int stride, stride2, adr_stride, adr_stride1, adr_stride2; 3449 int stride2x2 = 0x40; 3450 Address::ScaleFactor scale = Address::no_scale; 3451 Address::ScaleFactor scale1 = Address::no_scale; 3452 Address::ScaleFactor scale2 = Address::no_scale; 3453 3454 if (ae != StrIntrinsicNode::LL) { 3455 stride2x2 = 0x20; 3456 } 3457 3458 if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) { 3459 shrl(cnt2, 1); 3460 } 3461 // Compute the minimum of the string lengths and the 3462 // difference of the string lengths (stack). 3463 // Do the conditional move stuff 3464 movl(result, cnt1); 3465 subl(cnt1, cnt2); 3466 push(cnt1); 3467 cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2) 3468 3469 // Is the minimum length zero? 
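// (cnt2 now holds the minimum length; the length difference pushed above is
// popped back into result at LENGTH_DIFF_LABEL.)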
3470 testl(cnt2, cnt2); 3471 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3472 if (ae == StrIntrinsicNode::LL) { 3473 // Load first bytes 3474 load_unsigned_byte(result, Address(str1, 0)); // result = str1[0] 3475 load_unsigned_byte(cnt1, Address(str2, 0)); // cnt1 = str2[0] 3476 } else if (ae == StrIntrinsicNode::UU) { 3477 // Load first characters 3478 load_unsigned_short(result, Address(str1, 0)); 3479 load_unsigned_short(cnt1, Address(str2, 0)); 3480 } else { 3481 load_unsigned_byte(result, Address(str1, 0)); 3482 load_unsigned_short(cnt1, Address(str2, 0)); 3483 } 3484 subl(result, cnt1); 3485 jcc(Assembler::notZero, POP_LABEL); 3486 3487 if (ae == StrIntrinsicNode::UU) { 3488 // Divide length by 2 to get number of chars 3489 shrl(cnt2, 1); 3490 } 3491 cmpl(cnt2, 1); 3492 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3493 3494 // Check if the strings start at the same location and setup scale and stride 3495 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3496 cmpptr(str1, str2); 3497 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3498 if (ae == StrIntrinsicNode::LL) { 3499 scale = Address::times_1; 3500 stride = 16; 3501 } else { 3502 scale = Address::times_2; 3503 stride = 8; 3504 } 3505 } else { 3506 scale1 = Address::times_1; 3507 scale2 = Address::times_2; 3508 // scale not used 3509 stride = 8; 3510 } 3511 3512 if (UseAVX >= 2 && UseSSE42Intrinsics) { 3513 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR; 3514 Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR; 3515 Label COMPARE_WIDE_VECTORS_LOOP_AVX2; 3516 Label COMPARE_TAIL_LONG; 3517 Label COMPARE_WIDE_VECTORS_LOOP_AVX3; // used only _LP64 && AVX3 3518 3519 int pcmpmask = 0x19; 3520 if (ae == StrIntrinsicNode::LL) { 3521 pcmpmask &= ~0x01; 3522 } 3523 3524 // Setup to compare 16-chars (32-bytes) vectors, 3525 // start from first character again because it has aligned address. 3526 if (ae == StrIntrinsicNode::LL) { 3527 stride2 = 32; 3528 } else { 3529 stride2 = 16; 3530 } 3531 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3532 adr_stride = stride << scale; 3533 } else { 3534 adr_stride1 = 8; //stride << scale1; 3535 adr_stride2 = 16; //stride << scale2; 3536 } 3537 3538 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3539 // rax and rdx are used by pcmpestri as elements counters 3540 movl(result, cnt2); 3541 andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count 3542 jcc(Assembler::zero, COMPARE_TAIL_LONG); 3543 3544 // fast path : compare first 2 8-char vectors. 
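// pcmpmask 0x19 selects equal-each aggregation with negated polarity on unsigned
// shorts (the low bit is cleared above for LL to select unsigned bytes); pcmpestri
// then sets CF when a mismatch exists and leaves the first mismatching element
// index in rcx (cnt1).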
3545 bind(COMPARE_16_CHARS); 3546 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3547 movdqu(vec1, Address(str1, 0)); 3548 } else { 3549 pmovzxbw(vec1, Address(str1, 0)); 3550 } 3551 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3552 jccb(Assembler::below, COMPARE_INDEX_CHAR); 3553 3554 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3555 movdqu(vec1, Address(str1, adr_stride)); 3556 pcmpestri(vec1, Address(str2, adr_stride), pcmpmask); 3557 } else { 3558 pmovzxbw(vec1, Address(str1, adr_stride1)); 3559 pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask); 3560 } 3561 jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS); 3562 addl(cnt1, stride); 3563 3564 // Compare the characters at index in cnt1 3565 bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character 3566 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 3567 subl(result, cnt2); 3568 jmp(POP_LABEL); 3569 3570 // Setup the registers to start vector comparison loop 3571 bind(COMPARE_WIDE_VECTORS); 3572 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3573 lea(str1, Address(str1, result, scale)); 3574 lea(str2, Address(str2, result, scale)); 3575 } else { 3576 lea(str1, Address(str1, result, scale1)); 3577 lea(str2, Address(str2, result, scale2)); 3578 } 3579 subl(result, stride2); 3580 subl(cnt2, stride2); 3581 jcc(Assembler::zero, COMPARE_WIDE_TAIL); 3582 negptr(result); 3583 3584 // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest) 3585 bind(COMPARE_WIDE_VECTORS_LOOP); 3586 3587 #ifdef _LP64 3588 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 3589 cmpl(cnt2, stride2x2); 3590 jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2); 3591 testl(cnt2, stride2x2-1); // cnt2 holds the vector count 3592 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2); // means we cannot subtract by 0x40 3593 3594 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 3595 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3596 evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit); 3597 evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3598 } else { 3599 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit); 3600 evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3601 } 3602 kortestql(mask, mask); 3603 jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED); // miscompare 3604 addptr(result, stride2x2); // update since we already compared at this addr 3605 subl(cnt2, stride2x2); // and sub the size too 3606 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3); 3607 3608 vpxor(vec1, vec1); 3609 jmpb(COMPARE_WIDE_TAIL); 3610 }//if (VM_Version::supports_avx512vlbw()) 3611 #endif // _LP64 3612 3613 3614 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3615 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3616 vmovdqu(vec1, Address(str1, result, scale)); 3617 vpxor(vec1, Address(str2, result, scale)); 3618 } else { 3619 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit); 3620 vpxor(vec1, Address(str2, result, scale2)); 3621 } 3622 vptest(vec1, vec1); 3623 jcc(Assembler::notZero, VECTOR_NOT_EQUAL); 3624 addptr(result, stride2); 3625 subl(cnt2, stride2); 3626 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP); 3627 // clean upper bits of YMM registers 
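// (zeroing vec1 also avoids AVX-to-SSE transition penalties in any legacy-SSE
// code that follows)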
3628 vpxor(vec1, vec1); 3629 3630 // compare wide vectors tail 3631 bind(COMPARE_WIDE_TAIL); 3632 testptr(result, result); 3633 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3634 3635 movl(result, stride2); 3636 movl(cnt2, result); 3637 negptr(result); 3638 jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3639 3640 // Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors. 3641 bind(VECTOR_NOT_EQUAL); 3642 // clean upper bits of YMM registers 3643 vpxor(vec1, vec1); 3644 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3645 lea(str1, Address(str1, result, scale)); 3646 lea(str2, Address(str2, result, scale)); 3647 } else { 3648 lea(str1, Address(str1, result, scale1)); 3649 lea(str2, Address(str2, result, scale2)); 3650 } 3651 jmp(COMPARE_16_CHARS); 3652 3653 // Compare tail chars, length between 1 to 15 chars 3654 bind(COMPARE_TAIL_LONG); 3655 movl(cnt2, result); 3656 cmpl(cnt2, stride); 3657 jcc(Assembler::less, COMPARE_SMALL_STR); 3658 3659 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3660 movdqu(vec1, Address(str1, 0)); 3661 } else { 3662 pmovzxbw(vec1, Address(str1, 0)); 3663 } 3664 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3665 jcc(Assembler::below, COMPARE_INDEX_CHAR); 3666 subptr(cnt2, stride); 3667 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3668 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3669 lea(str1, Address(str1, result, scale)); 3670 lea(str2, Address(str2, result, scale)); 3671 } else { 3672 lea(str1, Address(str1, result, scale1)); 3673 lea(str2, Address(str2, result, scale2)); 3674 } 3675 negptr(cnt2); 3676 jmpb(WHILE_HEAD_LABEL); 3677 3678 bind(COMPARE_SMALL_STR); 3679 } else if (UseSSE42Intrinsics) { 3680 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL; 3681 int pcmpmask = 0x19; 3682 // Setup to compare 8-char (16-byte) vectors, 3683 // start from first character again because it has aligned address. 
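// result keeps the original remaining count; cnt2 is rounded down to a whole
// number of stride-sized chunks for the vector loop.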
3684 movl(result, cnt2); 3685 andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count 3686 if (ae == StrIntrinsicNode::LL) { 3687 pcmpmask &= ~0x01; 3688 } 3689 jcc(Assembler::zero, COMPARE_TAIL); 3690 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3691 lea(str1, Address(str1, result, scale)); 3692 lea(str2, Address(str2, result, scale)); 3693 } else { 3694 lea(str1, Address(str1, result, scale1)); 3695 lea(str2, Address(str2, result, scale2)); 3696 } 3697 negptr(result); 3698 3699 // pcmpestri 3700 // inputs: 3701 // vec1- substring 3702 // rax - negative string length (elements count) 3703 // mem - scanned string 3704 // rdx - string length (elements count) 3705 // pcmpmask - cmp mode: 11000 (string compare with negated result) 3706 // + 00 (unsigned bytes) or + 01 (unsigned shorts) 3707 // outputs: 3708 // rcx - first mismatched element index 3709 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3710 3711 bind(COMPARE_WIDE_VECTORS); 3712 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3713 movdqu(vec1, Address(str1, result, scale)); 3714 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 3715 } else { 3716 pmovzxbw(vec1, Address(str1, result, scale1)); 3717 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 3718 } 3719 // After pcmpestri cnt1(rcx) contains mismatched element index 3720 3721 jccb(Assembler::below, VECTOR_NOT_EQUAL); // CF==1 3722 addptr(result, stride); 3723 subptr(cnt2, stride); 3724 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 3725 3726 // compare wide vectors tail 3727 testptr(result, result); 3728 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3729 3730 movl(cnt2, stride); 3731 movl(result, stride); 3732 negptr(result); 3733 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3734 movdqu(vec1, Address(str1, result, scale)); 3735 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 3736 } else { 3737 pmovzxbw(vec1, Address(str1, result, scale1)); 3738 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 3739 } 3740 jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL); 3741 3742 // Mismatched characters in the vectors 3743 bind(VECTOR_NOT_EQUAL); 3744 addptr(cnt1, result); 3745 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 3746 subl(result, cnt2); 3747 jmpb(POP_LABEL); 3748 3749 bind(COMPARE_TAIL); // limit is zero 3750 movl(cnt2, result); 3751 // Fallthru to tail compare 3752 } 3753 // Shift str2 and str1 to the end of the arrays, negate min 3754 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3755 lea(str1, Address(str1, cnt2, scale)); 3756 lea(str2, Address(str2, cnt2, scale)); 3757 } else { 3758 lea(str1, Address(str1, cnt2, scale1)); 3759 lea(str2, Address(str2, cnt2, scale2)); 3760 } 3761 decrementl(cnt2); // first character was compared already 3762 negptr(cnt2); 3763 3764 // Compare the rest of the elements 3765 bind(WHILE_HEAD_LABEL); 3766 load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae); 3767 subl(result, cnt1); 3768 jccb(Assembler::notZero, POP_LABEL); 3769 increment(cnt2); 3770 jccb(Assembler::notZero, WHILE_HEAD_LABEL); 3771 3772 // Strings are equal up to min length. Return the length difference. 
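// (result is popped here from the length difference pushed in the prologue)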
3773 bind(LENGTH_DIFF_LABEL); 3774 pop(result); 3775 if (ae == StrIntrinsicNode::UU) { 3776 // Divide diff by 2 to get number of chars 3777 sarl(result, 1); 3778 } 3779 jmpb(DONE_LABEL); 3780 3781 #ifdef _LP64 3782 if (VM_Version::supports_avx512vlbw()) { 3783 3784 bind(COMPARE_WIDE_VECTORS_LOOP_FAILED); 3785 3786 kmovql(cnt1, mask); 3787 notq(cnt1); 3788 bsfq(cnt2, cnt1); 3789 if (ae != StrIntrinsicNode::LL) { 3790 // Divide diff by 2 to get number of chars 3791 sarl(cnt2, 1); 3792 } 3793 addq(result, cnt2); 3794 if (ae == StrIntrinsicNode::LL) { 3795 load_unsigned_byte(cnt1, Address(str2, result)); 3796 load_unsigned_byte(result, Address(str1, result)); 3797 } else if (ae == StrIntrinsicNode::UU) { 3798 load_unsigned_short(cnt1, Address(str2, result, scale)); 3799 load_unsigned_short(result, Address(str1, result, scale)); 3800 } else { 3801 load_unsigned_short(cnt1, Address(str2, result, scale2)); 3802 load_unsigned_byte(result, Address(str1, result, scale1)); 3803 } 3804 subl(result, cnt1); 3805 jmpb(POP_LABEL); 3806 }//if (VM_Version::supports_avx512vlbw()) 3807 #endif // _LP64 3808 3809 // Discard the stored length difference 3810 bind(POP_LABEL); 3811 pop(cnt1); 3812 3813 // That's it 3814 bind(DONE_LABEL); 3815 if(ae == StrIntrinsicNode::UL) { 3816 negl(result); 3817 } 3818 3819 } 3820 3821 // Search for Non-ASCII character (Negative byte value) in a byte array, 3822 // return the index of the first such character, otherwise the length 3823 // of the array segment searched. 3824 // ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java 3825 // @IntrinsicCandidate 3826 // public static int countPositives(byte[] ba, int off, int len) { 3827 // for (int i = off; i < off + len; i++) { 3828 // if (ba[i] < 0) { 3829 // return i - off; 3830 // } 3831 // } 3832 // return len; 3833 // } 3834 void C2_MacroAssembler::count_positives(Register ary1, Register len, 3835 Register result, Register tmp1, 3836 XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) { 3837 // rsi: byte array 3838 // rcx: len 3839 // rax: result 3840 ShortBranchVerifier sbv(this); 3841 assert_different_registers(ary1, len, result, tmp1); 3842 assert_different_registers(vec1, vec2); 3843 Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE; 3844 3845 movl(result, len); // copy 3846 // len == 0 3847 testl(len, len); 3848 jcc(Assembler::zero, DONE); 3849 3850 if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512 3851 VM_Version::supports_avx512vlbw() && 3852 VM_Version::supports_bmi2()) { 3853 3854 Label test_64_loop, test_tail, BREAK_LOOP; 3855 Register tmp3_aliased = len; 3856 3857 movl(tmp1, len); 3858 vpxor(vec2, vec2, vec2, Assembler::AVX_512bit); 3859 3860 andl(tmp1, 64 - 1); // tail count (in chars) 0x3F 3861 andl(len, ~(64 - 1)); // vector count (in chars) 3862 jccb(Assembler::zero, test_tail); 3863 3864 lea(ary1, Address(ary1, len, Address::times_1)); 3865 negptr(len); 3866 3867 bind(test_64_loop); 3868 // Check whether our 64 elements of size byte contain negatives 3869 evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit); 3870 kortestql(mask1, mask1); 3871 jcc(Assembler::notZero, BREAK_LOOP); 3872 3873 addptr(len, 64); 3874 jccb(Assembler::notZero, test_64_loop); 3875 3876 bind(test_tail); 3877 // bail out when there is nothing to be done 3878 testl(tmp1, -1); 3879 jcc(Assembler::zero, DONE); 3880 3881 // ~(~0 << len) applied up to two times (for 32-bit scenario) 3882 #ifdef _LP64 3883 mov64(tmp3_aliased, 
0xFFFFFFFFFFFFFFFF); 3884 shlxq(tmp3_aliased, tmp3_aliased, tmp1); 3885 notq(tmp3_aliased); 3886 kmovql(mask2, tmp3_aliased); 3887 #else 3888 Label k_init; 3889 jmp(k_init); 3890 3891 // We could not read 64-bits from a general purpose register thus we move 3892 // data required to compose 64 1's to the instruction stream 3893 // We emit 64 byte wide series of elements from 0..63 which later on would 3894 // be used as a compare targets with tail count contained in tmp1 register. 3895 // Result would be a k register having tmp1 consecutive number or 1 3896 // counting from least significant bit. 3897 address tmp = pc(); 3898 emit_int64(0x0706050403020100); 3899 emit_int64(0x0F0E0D0C0B0A0908); 3900 emit_int64(0x1716151413121110); 3901 emit_int64(0x1F1E1D1C1B1A1918); 3902 emit_int64(0x2726252423222120); 3903 emit_int64(0x2F2E2D2C2B2A2928); 3904 emit_int64(0x3736353433323130); 3905 emit_int64(0x3F3E3D3C3B3A3938); 3906 3907 bind(k_init); 3908 lea(len, InternalAddress(tmp)); 3909 // create mask to test for negative byte inside a vector 3910 evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit); 3911 evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit); 3912 3913 #endif 3914 evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit); 3915 ktestq(mask1, mask2); 3916 jcc(Assembler::zero, DONE); 3917 3918 bind(BREAK_LOOP); 3919 // At least one byte in the last 64 bytes is negative. 3920 // Set up to look at the last 64 bytes as if they were a tail 3921 lea(ary1, Address(ary1, len, Address::times_1)); 3922 addptr(result, len); 3923 // Ignore the very last byte: if all others are positive, 3924 // it must be negative, so we can skip right to the 2+1 byte 3925 // end comparison at this point 3926 orl(result, 63); 3927 movl(len, 63); 3928 // Fallthru to tail compare 3929 } else { 3930 3931 if (UseAVX >= 2 && UseSSE >= 2) { 3932 // With AVX2, use 32-byte vector compare 3933 Label COMPARE_WIDE_VECTORS, BREAK_LOOP; 3934 3935 // Compare 32-byte vectors 3936 testl(len, 0xffffffe0); // vector count (in bytes) 3937 jccb(Assembler::zero, TAIL_START); 3938 3939 andl(len, 0xffffffe0); 3940 lea(ary1, Address(ary1, len, Address::times_1)); 3941 negptr(len); 3942 3943 movl(tmp1, 0x80808080); // create mask to test for Unicode chars in vector 3944 movdl(vec2, tmp1); 3945 vpbroadcastd(vec2, vec2, Assembler::AVX_256bit); 3946 3947 bind(COMPARE_WIDE_VECTORS); 3948 vmovdqu(vec1, Address(ary1, len, Address::times_1)); 3949 vptest(vec1, vec2); 3950 jccb(Assembler::notZero, BREAK_LOOP); 3951 addptr(len, 32); 3952 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 3953 3954 testl(result, 0x0000001f); // any bytes remaining? 3955 jcc(Assembler::zero, DONE); 3956 3957 // Quick test using the already prepared vector mask 3958 movl(len, result); 3959 andl(len, 0x0000001f); 3960 vmovdqu(vec1, Address(ary1, len, Address::times_1, -32)); 3961 vptest(vec1, vec2); 3962 jcc(Assembler::zero, DONE); 3963 // There are zeros, jump to the tail to determine exactly where 3964 jmpb(TAIL_START); 3965 3966 bind(BREAK_LOOP); 3967 // At least one byte in the last 32-byte vector is negative. 
3968 // Set up to look at the last 32 bytes as if they were a tail 3969 lea(ary1, Address(ary1, len, Address::times_1)); 3970 addptr(result, len); 3971 // Ignore the very last byte: if all others are positive, 3972 // it must be negative, so we can skip right to the 2+1 byte 3973 // end comparison at this point 3974 orl(result, 31); 3975 movl(len, 31); 3976 // Fallthru to tail compare 3977 } else if (UseSSE42Intrinsics) { 3978 // With SSE4.2, use double quad vector compare 3979 Label COMPARE_WIDE_VECTORS, BREAK_LOOP; 3980 3981 // Compare 16-byte vectors 3982 testl(len, 0xfffffff0); // vector count (in bytes) 3983 jcc(Assembler::zero, TAIL_START); 3984 3985 andl(len, 0xfffffff0); 3986 lea(ary1, Address(ary1, len, Address::times_1)); 3987 negptr(len); 3988 3989 movl(tmp1, 0x80808080); 3990 movdl(vec2, tmp1); 3991 pshufd(vec2, vec2, 0); 3992 3993 bind(COMPARE_WIDE_VECTORS); 3994 movdqu(vec1, Address(ary1, len, Address::times_1)); 3995 ptest(vec1, vec2); 3996 jccb(Assembler::notZero, BREAK_LOOP); 3997 addptr(len, 16); 3998 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 3999 4000 testl(result, 0x0000000f); // len is zero, any bytes remaining? 4001 jcc(Assembler::zero, DONE); 4002 4003 // Quick test using the already prepared vector mask 4004 movl(len, result); 4005 andl(len, 0x0000000f); // tail count (in bytes) 4006 movdqu(vec1, Address(ary1, len, Address::times_1, -16)); 4007 ptest(vec1, vec2); 4008 jcc(Assembler::zero, DONE); 4009 jmpb(TAIL_START); 4010 4011 bind(BREAK_LOOP); 4012 // At least one byte in the last 16-byte vector is negative. 4013 // Set up and look at the last 16 bytes as if they were a tail 4014 lea(ary1, Address(ary1, len, Address::times_1)); 4015 addptr(result, len); 4016 // Ignore the very last byte: if all others are positive, 4017 // it must be negative, so we can skip right to the 2+1 byte 4018 // end comparison at this point 4019 orl(result, 15); 4020 movl(len, 15); 4021 // Fallthru to tail compare 4022 } 4023 } 4024 4025 bind(TAIL_START); 4026 // Compare 4-byte vectors 4027 andl(len, 0xfffffffc); // vector count (in bytes) 4028 jccb(Assembler::zero, COMPARE_CHAR); 4029 4030 lea(ary1, Address(ary1, len, Address::times_1)); 4031 negptr(len); 4032 4033 bind(COMPARE_VECTORS); 4034 movl(tmp1, Address(ary1, len, Address::times_1)); 4035 andl(tmp1, 0x80808080); 4036 jccb(Assembler::notZero, TAIL_ADJUST); 4037 addptr(len, 4); 4038 jccb(Assembler::notZero, COMPARE_VECTORS); 4039 4040 // Compare trailing char (final 2-3 bytes), if any 4041 bind(COMPARE_CHAR); 4042 4043 testl(result, 0x2); // tail char 4044 jccb(Assembler::zero, COMPARE_BYTE); 4045 load_unsigned_short(tmp1, Address(ary1, 0)); 4046 andl(tmp1, 0x00008080); 4047 jccb(Assembler::notZero, CHAR_ADJUST); 4048 lea(ary1, Address(ary1, 2)); 4049 4050 bind(COMPARE_BYTE); 4051 testl(result, 0x1); // tail byte 4052 jccb(Assembler::zero, DONE); 4053 load_unsigned_byte(tmp1, Address(ary1, 0)); 4054 testl(tmp1, 0x00000080); 4055 jccb(Assembler::zero, DONE); 4056 subptr(result, 1); 4057 jmpb(DONE); 4058 4059 bind(TAIL_ADJUST); 4060 // there are negative bits in the last 4 byte block. 4061 // Adjust result and check the next three bytes 4062 addptr(result, len); 4063 orl(result, 3); 4064 lea(ary1, Address(ary1, len, Address::times_1)); 4065 jmpb(COMPARE_CHAR); 4066 4067 bind(CHAR_ADJUST); 4068 // We are looking at a char + optional byte tail, and found that one 4069 // of the bytes in the char is negative. Adjust the result, check the 4070 // first byte and readjust if needed. 
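// tmp1 still holds the two sign bits (0x00008080 mask) of the char loaded at
// COMPARE_CHAR; if the low (first) byte is the negative one, result needs no
// further adjustment, otherwise it is advanced by one below.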
4071 andl(result, 0xfffffffc); 4072 testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first 4073 jccb(Assembler::notZero, DONE); 4074 addptr(result, 1); 4075 4076 // That's it 4077 bind(DONE); 4078 if (UseAVX >= 2 && UseSSE >= 2) { 4079 // clean upper bits of YMM registers 4080 vpxor(vec1, vec1); 4081 vpxor(vec2, vec2); 4082 } 4083 } 4084 4085 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings. 4086 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2, 4087 Register limit, Register result, Register chr, 4088 XMMRegister vec1, XMMRegister vec2, bool is_char, KRegister mask) { 4089 ShortBranchVerifier sbv(this); 4090 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE; 4091 4092 int length_offset = arrayOopDesc::length_offset_in_bytes(); 4093 int base_offset = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE); 4094 4095 if (is_array_equ) { 4096 // Check the input args 4097 cmpoop(ary1, ary2); 4098 jcc(Assembler::equal, TRUE_LABEL); 4099 4100 // Need additional checks for arrays_equals. 4101 testptr(ary1, ary1); 4102 jcc(Assembler::zero, FALSE_LABEL); 4103 testptr(ary2, ary2); 4104 jcc(Assembler::zero, FALSE_LABEL); 4105 4106 // Check the lengths 4107 movl(limit, Address(ary1, length_offset)); 4108 cmpl(limit, Address(ary2, length_offset)); 4109 jcc(Assembler::notEqual, FALSE_LABEL); 4110 } 4111 4112 // count == 0 4113 testl(limit, limit); 4114 jcc(Assembler::zero, TRUE_LABEL); 4115 4116 if (is_array_equ) { 4117 // Load array address 4118 lea(ary1, Address(ary1, base_offset)); 4119 lea(ary2, Address(ary2, base_offset)); 4120 } 4121 4122 if (is_array_equ && is_char) { 4123 // arrays_equals when used for char[]. 4124 shll(limit, 1); // byte count != 0 4125 } 4126 movl(result, limit); // copy 4127 4128 if (UseAVX >= 2) { 4129 // With AVX2, use 32-byte vector compare 4130 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 4131 4132 // Compare 32-byte vectors 4133 andl(result, 0x0000001f); // tail count (in bytes) 4134 andl(limit, 0xffffffe0); // vector count (in bytes) 4135 jcc(Assembler::zero, COMPARE_TAIL); 4136 4137 lea(ary1, Address(ary1, limit, Address::times_1)); 4138 lea(ary2, Address(ary2, limit, Address::times_1)); 4139 negptr(limit); 4140 4141 #ifdef _LP64 4142 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 4143 Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3; 4144 4145 cmpl(limit, -64); 4146 jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2); 4147 4148 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 4149 4150 evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit); 4151 evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit); 4152 kortestql(mask, mask); 4153 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4154 addptr(limit, 64); // update since we already compared at this addr 4155 cmpl(limit, -64); 4156 jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3); 4157 4158 // At this point we may still need to compare -limit+result bytes. 4159 // We could execute the next two instruction and just continue via non-wide path: 4160 // cmpl(limit, 0); 4161 // jcc(Assembler::equal, COMPARE_TAIL); // true 4162 // But since we stopped at the points ary{1,2}+limit which are 4163 // not farther than 64 bytes from the ends of arrays ary{1,2}+result 4164 // (|limit| <= 32 and result < 32), 4165 // we may just compare the last 64 bytes. 
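// (re-reading bytes the loop already compared is harmless for an equality check,
// so overlapping the final 64-byte window with the loop's last iteration is safe)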
4166 // 4167 addptr(result, -64); // it is safe, bc we just came from this area 4168 evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit); 4169 evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit); 4170 kortestql(mask, mask); 4171 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4172 4173 jmp(TRUE_LABEL); 4174 4175 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 4176 4177 }//if (VM_Version::supports_avx512vlbw()) 4178 #endif //_LP64 4179 bind(COMPARE_WIDE_VECTORS); 4180 vmovdqu(vec1, Address(ary1, limit, Address::times_1)); 4181 vmovdqu(vec2, Address(ary2, limit, Address::times_1)); 4182 vpxor(vec1, vec2); 4183 4184 vptest(vec1, vec1); 4185 jcc(Assembler::notZero, FALSE_LABEL); 4186 addptr(limit, 32); 4187 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4188 4189 testl(result, result); 4190 jcc(Assembler::zero, TRUE_LABEL); 4191 4192 vmovdqu(vec1, Address(ary1, result, Address::times_1, -32)); 4193 vmovdqu(vec2, Address(ary2, result, Address::times_1, -32)); 4194 vpxor(vec1, vec2); 4195 4196 vptest(vec1, vec1); 4197 jccb(Assembler::notZero, FALSE_LABEL); 4198 jmpb(TRUE_LABEL); 4199 4200 bind(COMPARE_TAIL); // limit is zero 4201 movl(limit, result); 4202 // Fallthru to tail compare 4203 } else if (UseSSE42Intrinsics) { 4204 // With SSE4.2, use double quad vector compare 4205 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 4206 4207 // Compare 16-byte vectors 4208 andl(result, 0x0000000f); // tail count (in bytes) 4209 andl(limit, 0xfffffff0); // vector count (in bytes) 4210 jcc(Assembler::zero, COMPARE_TAIL); 4211 4212 lea(ary1, Address(ary1, limit, Address::times_1)); 4213 lea(ary2, Address(ary2, limit, Address::times_1)); 4214 negptr(limit); 4215 4216 bind(COMPARE_WIDE_VECTORS); 4217 movdqu(vec1, Address(ary1, limit, Address::times_1)); 4218 movdqu(vec2, Address(ary2, limit, Address::times_1)); 4219 pxor(vec1, vec2); 4220 4221 ptest(vec1, vec1); 4222 jcc(Assembler::notZero, FALSE_LABEL); 4223 addptr(limit, 16); 4224 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4225 4226 testl(result, result); 4227 jcc(Assembler::zero, TRUE_LABEL); 4228 4229 movdqu(vec1, Address(ary1, result, Address::times_1, -16)); 4230 movdqu(vec2, Address(ary2, result, Address::times_1, -16)); 4231 pxor(vec1, vec2); 4232 4233 ptest(vec1, vec1); 4234 jccb(Assembler::notZero, FALSE_LABEL); 4235 jmpb(TRUE_LABEL); 4236 4237 bind(COMPARE_TAIL); // limit is zero 4238 movl(limit, result); 4239 // Fallthru to tail compare 4240 } 4241 4242 // Compare 4-byte vectors 4243 andl(limit, 0xfffffffc); // vector count (in bytes) 4244 jccb(Assembler::zero, COMPARE_CHAR); 4245 4246 lea(ary1, Address(ary1, limit, Address::times_1)); 4247 lea(ary2, Address(ary2, limit, Address::times_1)); 4248 negptr(limit); 4249 4250 bind(COMPARE_VECTORS); 4251 movl(chr, Address(ary1, limit, Address::times_1)); 4252 cmpl(chr, Address(ary2, limit, Address::times_1)); 4253 jccb(Assembler::notEqual, FALSE_LABEL); 4254 addptr(limit, 4); 4255 jcc(Assembler::notZero, COMPARE_VECTORS); 4256 4257 // Compare trailing char (final 2 bytes), if any 4258 bind(COMPARE_CHAR); 4259 testl(result, 0x2); // tail char 4260 jccb(Assembler::zero, COMPARE_BYTE); 4261 load_unsigned_short(chr, Address(ary1, 0)); 4262 load_unsigned_short(limit, Address(ary2, 0)); 4263 cmpl(chr, limit); 4264 jccb(Assembler::notEqual, FALSE_LABEL); 4265 4266 if (is_array_equ && is_char) { 4267 bind(COMPARE_BYTE); 4268 } else { 4269 lea(ary1, Address(ary1, 2)); 4270 lea(ary2, Address(ary2, 2)); 4271 4272 bind(COMPARE_BYTE); 4273 testl(result, 0x1); 
// tail byte 4274 jccb(Assembler::zero, TRUE_LABEL); 4275 load_unsigned_byte(chr, Address(ary1, 0)); 4276 load_unsigned_byte(limit, Address(ary2, 0)); 4277 cmpl(chr, limit); 4278 jccb(Assembler::notEqual, FALSE_LABEL); 4279 } 4280 bind(TRUE_LABEL); 4281 movl(result, 1); // return true 4282 jmpb(DONE); 4283 4284 bind(FALSE_LABEL); 4285 xorl(result, result); // return false 4286 4287 // That's it 4288 bind(DONE); 4289 if (UseAVX >= 2) { 4290 // clean upper bits of YMM registers 4291 vpxor(vec1, vec1); 4292 vpxor(vec2, vec2); 4293 } 4294 } 4295 4296 #ifdef _LP64 4297 4298 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) { 4299 #define __ masm. 4300 Register dst = stub.data<0>(); 4301 XMMRegister src = stub.data<1>(); 4302 address target = stub.data<2>(); 4303 __ bind(stub.entry()); 4304 __ subptr(rsp, 8); 4305 __ movdbl(Address(rsp), src); 4306 __ call(RuntimeAddress(target)); 4307 __ pop(dst); 4308 __ jmp(stub.continuation()); 4309 #undef __ 4310 } 4311 4312 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) { 4313 assert(dst_bt == T_INT || dst_bt == T_LONG, ""); 4314 assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, ""); 4315 4316 address slowpath_target; 4317 if (dst_bt == T_INT) { 4318 if (src_bt == T_FLOAT) { 4319 cvttss2sil(dst, src); 4320 cmpl(dst, 0x80000000); 4321 slowpath_target = StubRoutines::x86::f2i_fixup(); 4322 } else { 4323 cvttsd2sil(dst, src); 4324 cmpl(dst, 0x80000000); 4325 slowpath_target = StubRoutines::x86::d2i_fixup(); 4326 } 4327 } else { 4328 if (src_bt == T_FLOAT) { 4329 cvttss2siq(dst, src); 4330 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip())); 4331 slowpath_target = StubRoutines::x86::f2l_fixup(); 4332 } else { 4333 cvttsd2siq(dst, src); 4334 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip())); 4335 slowpath_target = StubRoutines::x86::d2l_fixup(); 4336 } 4337 } 4338 4339 auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, 23, convertF2I_slowpath); 4340 jcc(Assembler::equal, stub->entry()); 4341 bind(stub->continuation()); 4342 } 4343 4344 #endif // _LP64 4345 4346 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4347 XMMRegister src1, int imm8, bool merge, int vlen_enc) { 4348 switch(ideal_opc) { 4349 case Op_LShiftVS: 4350 Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break; 4351 case Op_LShiftVI: 4352 Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break; 4353 case Op_LShiftVL: 4354 Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break; 4355 case Op_RShiftVS: 4356 Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break; 4357 case Op_RShiftVI: 4358 Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break; 4359 case Op_RShiftVL: 4360 Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break; 4361 case Op_URShiftVS: 4362 Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break; 4363 case Op_URShiftVI: 4364 Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break; 4365 case Op_URShiftVL: 4366 Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break; 4367 case Op_RotateRightV: 4368 evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4369 case Op_RotateLeftV: 4370 evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4371 default: 4372 fatal("Unsupported masked operation"); break; 4373 } 4374 } 4375 4376 void 
C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4377 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc, 4378 bool is_varshift) { 4379 switch (ideal_opc) { 4380 case Op_AddVB: 4381 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4382 case Op_AddVS: 4383 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4384 case Op_AddVI: 4385 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4386 case Op_AddVL: 4387 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4388 case Op_AddVF: 4389 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4390 case Op_AddVD: 4391 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4392 case Op_SubVB: 4393 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4394 case Op_SubVS: 4395 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4396 case Op_SubVI: 4397 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4398 case Op_SubVL: 4399 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4400 case Op_SubVF: 4401 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4402 case Op_SubVD: 4403 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4404 case Op_MulVS: 4405 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4406 case Op_MulVI: 4407 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4408 case Op_MulVL: 4409 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4410 case Op_MulVF: 4411 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4412 case Op_MulVD: 4413 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4414 case Op_DivVF: 4415 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4416 case Op_DivVD: 4417 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4418 case Op_SqrtVF: 4419 evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break; 4420 case Op_SqrtVD: 4421 evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break; 4422 case Op_AbsVB: 4423 evpabsb(dst, mask, src2, merge, vlen_enc); break; 4424 case Op_AbsVS: 4425 evpabsw(dst, mask, src2, merge, vlen_enc); break; 4426 case Op_AbsVI: 4427 evpabsd(dst, mask, src2, merge, vlen_enc); break; 4428 case Op_AbsVL: 4429 evpabsq(dst, mask, src2, merge, vlen_enc); break; 4430 case Op_FmaVF: 4431 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4432 case Op_FmaVD: 4433 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4434 case Op_VectorRearrange: 4435 evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break; 4436 case Op_LShiftVS: 4437 evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4438 case Op_LShiftVI: 4439 evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4440 case Op_LShiftVL: 4441 evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4442 case Op_RShiftVS: 4443 evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4444 case Op_RShiftVI: 4445 evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4446 case Op_RShiftVL: 4447 evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4448 case Op_URShiftVS: 4449 evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4450 case Op_URShiftVI: 4451 evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4452 case Op_URShiftVL: 4453 evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4454 case Op_RotateLeftV: 4455 evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4456 case Op_RotateRightV: 4457 evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break; 
4458 case Op_MaxV: 4459 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4460 case Op_MinV: 4461 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4462 case Op_XorV: 4463 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4464 case Op_OrV: 4465 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4466 case Op_AndV: 4467 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4468 default: 4469 fatal("Unsupported masked operation"); break; 4470 } 4471 } 4472 4473 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4474 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4475 switch (ideal_opc) { 4476 case Op_AddVB: 4477 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4478 case Op_AddVS: 4479 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4480 case Op_AddVI: 4481 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4482 case Op_AddVL: 4483 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4484 case Op_AddVF: 4485 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4486 case Op_AddVD: 4487 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4488 case Op_SubVB: 4489 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4490 case Op_SubVS: 4491 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4492 case Op_SubVI: 4493 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4494 case Op_SubVL: 4495 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4496 case Op_SubVF: 4497 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4498 case Op_SubVD: 4499 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4500 case Op_MulVS: 4501 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4502 case Op_MulVI: 4503 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4504 case Op_MulVL: 4505 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4506 case Op_MulVF: 4507 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4508 case Op_MulVD: 4509 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4510 case Op_DivVF: 4511 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4512 case Op_DivVD: 4513 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4514 case Op_FmaVF: 4515 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4516 case Op_FmaVD: 4517 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4518 case Op_MaxV: 4519 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4520 case Op_MinV: 4521 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4522 case Op_XorV: 4523 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4524 case Op_OrV: 4525 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4526 case Op_AndV: 4527 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4528 default: 4529 fatal("Unsupported masked operation"); break; 4530 } 4531 } 4532 4533 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst, 4534 KRegister src1, KRegister src2) { 4535 BasicType etype = T_ILLEGAL; 4536 switch(mask_len) { 4537 case 2: 4538 case 4: 4539 case 8: etype = T_BYTE; break; 4540 case 16: etype = T_SHORT; break; 4541 case 32: etype = T_INT; break; 4542 case 64: etype = T_LONG; break; 4543 default: fatal("Unsupported type"); break; 4544 } 4545 assert(etype != T_ILLEGAL, ""); 4546 switch(ideal_opc) { 4547 case Op_AndVMask: 4548 kand(etype, dst, src1, src2); break; 4549 case Op_OrVMask: 4550 kor(etype, dst, src1, src2); break; 4551 case Op_XorVMask: 
4552 kxor(etype, dst, src1, src2); break; 4553 default: 4554 fatal("Unsupported masked operation"); break; 4555 } 4556 } 4557 4558 /* 4559 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 4560 * If src is NaN, the result is 0. 4561 * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE, 4562 * the result is equal to the value of Integer.MIN_VALUE. 4563 * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE, 4564 * the result is equal to the value of Integer.MAX_VALUE. 4565 */ 4566 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4567 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 4568 Register rscratch, AddressLiteral float_sign_flip, 4569 int vec_enc) { 4570 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4571 Label done; 4572 vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch); 4573 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 4574 vptest(xtmp2, xtmp2, vec_enc); 4575 jccb(Assembler::equal, done); 4576 4577 vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc); 4578 vpxor(xtmp1, xtmp1, xtmp4, vec_enc); 4579 4580 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 4581 vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc); 4582 vblendvps(dst, dst, xtmp4, xtmp3, vec_enc); 4583 4584 // Recompute the mask for remaining special value. 4585 vpxor(xtmp2, xtmp2, xtmp3, vec_enc); 4586 // Extract SRC values corresponding to TRUE mask lanes. 4587 vpand(xtmp4, xtmp2, src, vec_enc); 4588 // Flip mask bits so that MSB bit of MASK lanes corresponding to +ve special 4589 // values are set. 4590 vpxor(xtmp3, xtmp2, xtmp4, vec_enc); 4591 4592 vblendvps(dst, dst, xtmp1, xtmp3, vec_enc); 4593 bind(done); 4594 } 4595 4596 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4597 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4598 Register rscratch, AddressLiteral float_sign_flip, 4599 int vec_enc) { 4600 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4601 Label done; 4602 evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 4603 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 4604 kortestwl(ktmp1, ktmp1); 4605 jccb(Assembler::equal, done); 4606 4607 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4608 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4609 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 4610 4611 kxorwl(ktmp1, ktmp1, ktmp2); 4612 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4613 vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4614 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 4615 bind(done); 4616 } 4617 4618 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4619 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4620 Register rscratch, AddressLiteral double_sign_flip, 4621 int vec_enc) { 4622 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing"); 4623 4624 Label done; 4625 evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch); 4626 Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc); 4627 kortestwl(ktmp1, ktmp1); 4628 jccb(Assembler::equal, done); 4629 4630 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4631 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4632 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc); 4633 4634 kxorwl(ktmp1, ktmp1, ktmp2); 
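// ktmp1 now selects the special lanes that are not NaN; the masked NLT_UQ compare
// below keeps only those with a non-negative source, and the vpternlogq (imm 0x11,
// i.e. ~xtmp1) turns the sign-flip constant into Long.MAX_VALUE before it is
// merged into those lanes.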
4635 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4636 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4637 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc); 4638 bind(done); 4639 } 4640 4641 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4642 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4643 Register rscratch, AddressLiteral float_sign_flip, 4644 int vec_enc) { 4645 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4646 Label done; 4647 evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 4648 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 4649 kortestwl(ktmp1, ktmp1); 4650 jccb(Assembler::equal, done); 4651 4652 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4653 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4654 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 4655 4656 kxorwl(ktmp1, ktmp1, ktmp2); 4657 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4658 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4659 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 4660 bind(done); 4661 } 4662 4663 /* 4664 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 4665 * If src is NaN, the result is 0. 4666 * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE, 4667 * the result is equal to the value of Long.MIN_VALUE. 4668 * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE, 4669 * the result is equal to the value of Long.MAX_VALUE. 4670 */ 4671 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4672 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4673 Register rscratch, AddressLiteral double_sign_flip, 4674 int vec_enc) { 4675 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing"); 4676 4677 Label done; 4678 evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch); 4679 evpcmpeqq(ktmp1, xtmp1, dst, vec_enc); 4680 kortestwl(ktmp1, ktmp1); 4681 jccb(Assembler::equal, done); 4682 4683 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4684 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4685 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc); 4686 4687 kxorwl(ktmp1, ktmp1, ktmp2); 4688 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4689 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4690 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc); 4691 bind(done); 4692 } 4693 4694 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero, 4695 XMMRegister xtmp, int index, int vec_enc) { 4696 assert(vec_enc < Assembler::AVX_512bit, ""); 4697 if (vec_enc == Assembler::AVX_256bit) { 4698 vextractf128_high(xtmp, src); 4699 vshufps(dst, src, xtmp, index, vec_enc); 4700 } else { 4701 vshufps(dst, src, zero, index, vec_enc); 4702 } 4703 } 4704 4705 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 4706 XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch, 4707 AddressLiteral float_sign_flip, int src_vec_enc) { 4708 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4709 4710 Label done; 4711 // Compare the destination lanes with float_sign_flip 4712 // value to get mask for all special values. 
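// (vcvttpd2dq writes 0x80000000 into every NaN/out-of-range lane, so only lanes
// equal to float_sign_flip can need fixing; if none match we exit early below)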
4713 movdqu(xtmp1, float_sign_flip, rscratch); 4714 vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit); 4715 ptest(xtmp2, xtmp2); 4716 jccb(Assembler::equal, done); 4717 4718 // Flip float_sign_flip to get max integer value. 4719 vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit); 4720 pxor(xtmp1, xtmp4); 4721 4722 // Set destination lanes corresponding to unordered source lanes to zero. 4723 vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc); 4724 vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc); 4725 4726 // Shuffle the mask vector and pack the lower double word from each quadword lane. 4727 vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc); 4728 vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit); 4729 4730 // Recompute the mask for the remaining special values. 4731 pxor(xtmp2, xtmp3); 4732 // Extract mask corresponding to non-negative source lanes. 4733 vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc); 4734 4735 // Shuffle the mask vector and pack the lower double word from each quadword lane. 4736 vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc); 4737 pand(xtmp3, xtmp2); 4738 4739 // Replace destination lanes holding the special value (0x80000000) with max int 4740 // if the corresponding source lane holds a +ve value. 4741 vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit); 4742 bind(done); 4743 } 4744 4745 4746 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero, 4747 XMMRegister xtmp, Register rscratch, int vec_enc) { 4748 switch(to_elem_bt) { 4749 case T_SHORT: 4750 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing"); 4751 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch); 4752 vpackusdw(dst, dst, zero, vec_enc); 4753 if (vec_enc == Assembler::AVX_256bit) { 4754 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc); 4755 } 4756 break; 4757 case T_BYTE: 4758 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing"); 4759 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch); 4760 vpackusdw(dst, dst, zero, vec_enc); 4761 if (vec_enc == Assembler::AVX_256bit) { 4762 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc); 4763 } 4764 vpackuswb(dst, dst, zero, vec_enc); 4765 break; 4766 default: assert(false, "%s", type2name(to_elem_bt)); 4767 } 4768 } 4769 4770 /* 4771 * Algorithm for vector D2L and F2I conversions:- 4772 * a) Perform vector D2L/F2I cast. 4773 * b) Take the fast path if no result vector lane contains the value 0x80000000, which 4774 * would signify that the corresponding source value may be one of the special floating point 4775 * values (NaN, -Inf, Inf, Max, -Min). 4776 * c) Set the destination to zero if the source is a NaN value. 4777 * d) Replace 0x80000000 with MaxInt if the source lane contains a +ve value.
4778 */ 4779 4780 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4781 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 4782 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 4783 int to_elem_sz = type2aelembytes(to_elem_bt); 4784 assert(to_elem_sz <= 4, ""); 4785 vcvttps2dq(dst, src, vec_enc); 4786 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc); 4787 if (to_elem_sz < 4) { 4788 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 4789 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc); 4790 } 4791 } 4792 4793 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4794 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip, 4795 Register rscratch, int vec_enc) { 4796 int to_elem_sz = type2aelembytes(to_elem_bt); 4797 assert(to_elem_sz <= 4, ""); 4798 vcvttps2dq(dst, src, vec_enc); 4799 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc); 4800 switch(to_elem_bt) { 4801 case T_INT: 4802 break; 4803 case T_SHORT: 4804 evpmovdw(dst, dst, vec_enc); 4805 break; 4806 case T_BYTE: 4807 evpmovdb(dst, dst, vec_enc); 4808 break; 4809 default: assert(false, "%s", type2name(to_elem_bt)); 4810 } 4811 } 4812 4813 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 4814 KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip, 4815 Register rscratch, int vec_enc) { 4816 evcvttps2qq(dst, src, vec_enc); 4817 vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc); 4818 } 4819 4820 // Handling for downcasting from double to integer or sub-word types on AVX2. 4821 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4822 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, 4823 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 4824 int to_elem_sz = type2aelembytes(to_elem_bt); 4825 assert(to_elem_sz < 8, ""); 4826 vcvttpd2dq(dst, src, vec_enc); 4827 vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch, 4828 float_sign_flip, vec_enc); 4829 if (to_elem_sz < 4) { 4830 // xtmp4 holds all zero lanes. 
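// Note: vcvttpd2dq above halves the element count, so an AVX2-sized (at most 256-bit)
// double source yields at most four ints, which fit in 128 bits; the sub-word packing
// below therefore uses a 128-bit encoding.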
4831 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit); 4832 } 4833 } 4834 4835 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, 4836 XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, 4837 KRegister ktmp2, AddressLiteral sign_flip, 4838 Register rscratch, int vec_enc) { 4839 if (VM_Version::supports_avx512dq()) { 4840 evcvttpd2qq(dst, src, vec_enc); 4841 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 4842 switch(to_elem_bt) { 4843 case T_LONG: 4844 break; 4845 case T_INT: 4846 evpmovsqd(dst, dst, vec_enc); 4847 break; 4848 case T_SHORT: 4849 evpmovsqd(dst, dst, vec_enc); 4850 evpmovdw(dst, dst, vec_enc); 4851 break; 4852 case T_BYTE: 4853 evpmovsqd(dst, dst, vec_enc); 4854 evpmovdb(dst, dst, vec_enc); 4855 break; 4856 default: assert(false, "%s", type2name(to_elem_bt)); 4857 } 4858 } else { 4859 assert(type2aelembytes(to_elem_bt) <= 4, ""); 4860 vcvttpd2dq(dst, src, vec_enc); 4861 vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 4862 switch(to_elem_bt) { 4863 case T_INT: 4864 break; 4865 case T_SHORT: 4866 evpmovdw(dst, dst, vec_enc); 4867 break; 4868 case T_BYTE: 4869 evpmovdb(dst, dst, vec_enc); 4870 break; 4871 default: assert(false, "%s", type2name(to_elem_bt)); 4872 } 4873 } 4874 } 4875 4876 #ifdef _LP64 4877 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src, 4878 AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 4879 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 4880 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 4881 // and re-instantiate original MXCSR.RC mode after that. 4882 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 4883 4884 mov64(tmp, julong_cast(0.5L)); 4885 evpbroadcastq(xtmp1, tmp, vec_enc); 4886 vaddpd(xtmp1, src , xtmp1, vec_enc); 4887 evcvtpd2qq(dst, xtmp1, vec_enc); 4888 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 4889 double_sign_flip, vec_enc);; 4890 4891 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 4892 } 4893 4894 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src, 4895 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 4896 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 4897 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 4898 // and re-instantiate original MXCSR.RC mode after that. 
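// In scalar terms (a documentation-only note): with MXCSR.RC forced to round toward
// negative infinity, converting (x + 0.5f) behaves like (int)Math.floor(x + 0.5f),
// which is the java.lang.Math.round definition for in-range inputs; NaN and the
// saturating bounds are repaired afterwards by the special-cases routine.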
4899 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 4900 4901 movl(tmp, jint_cast(0.5)); 4902 movq(xtmp1, tmp); 4903 vbroadcastss(xtmp1, xtmp1, vec_enc); 4904 vaddps(xtmp1, src , xtmp1, vec_enc); 4905 vcvtps2dq(dst, xtmp1, vec_enc); 4906 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 4907 float_sign_flip, vec_enc); 4908 4909 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 4910 } 4911 4912 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src, 4913 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 4914 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) { 4915 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 4916 // and re-instantiate original MXCSR.RC mode after that. 4917 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 4918 4919 movl(tmp, jint_cast(0.5)); 4920 movq(xtmp1, tmp); 4921 vbroadcastss(xtmp1, xtmp1, vec_enc); 4922 vaddps(xtmp1, src , xtmp1, vec_enc); 4923 vcvtps2dq(dst, xtmp1, vec_enc); 4924 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc); 4925 4926 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 4927 } 4928 #endif // _LP64 4929 4930 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 4931 BasicType from_elem_bt, BasicType to_elem_bt) { 4932 switch (from_elem_bt) { 4933 case T_BYTE: 4934 switch (to_elem_bt) { 4935 case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break; 4936 case T_INT: vpmovzxbd(dst, src, vlen_enc); break; 4937 case T_LONG: vpmovzxbq(dst, src, vlen_enc); break; 4938 default: ShouldNotReachHere(); 4939 } 4940 break; 4941 case T_SHORT: 4942 switch (to_elem_bt) { 4943 case T_INT: vpmovzxwd(dst, src, vlen_enc); break; 4944 case T_LONG: vpmovzxwq(dst, src, vlen_enc); break; 4945 default: ShouldNotReachHere(); 4946 } 4947 break; 4948 case T_INT: 4949 assert(to_elem_bt == T_LONG, ""); 4950 vpmovzxdq(dst, src, vlen_enc); 4951 break; 4952 default: 4953 ShouldNotReachHere(); 4954 } 4955 } 4956 4957 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 4958 BasicType from_elem_bt, BasicType to_elem_bt) { 4959 switch (from_elem_bt) { 4960 case T_BYTE: 4961 switch (to_elem_bt) { 4962 case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break; 4963 case T_INT: vpmovsxbd(dst, src, vlen_enc); break; 4964 case T_LONG: vpmovsxbq(dst, src, vlen_enc); break; 4965 default: ShouldNotReachHere(); 4966 } 4967 break; 4968 case T_SHORT: 4969 switch (to_elem_bt) { 4970 case T_INT: vpmovsxwd(dst, src, vlen_enc); break; 4971 case T_LONG: vpmovsxwq(dst, src, vlen_enc); break; 4972 default: ShouldNotReachHere(); 4973 } 4974 break; 4975 case T_INT: 4976 assert(to_elem_bt == T_LONG, ""); 4977 vpmovsxdq(dst, src, vlen_enc); 4978 break; 4979 default: 4980 ShouldNotReachHere(); 4981 } 4982 } 4983 4984 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src, 4985 BasicType dst_bt, BasicType src_bt, int vlen) { 4986 int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen); 4987 assert(vlen_enc != AVX_512bit, ""); 4988 4989 int dst_bt_size = type2aelembytes(dst_bt); 4990 int src_bt_size = type2aelembytes(src_bt); 4991 if (dst_bt_size > src_bt_size) { 4992 switch (dst_bt_size / src_bt_size) { 4993 case 2: vpmovsxbw(dst, src, vlen_enc); break; 4994 case 4: vpmovsxbd(dst, src, 
vlen_enc); break; 4995 case 8: vpmovsxbq(dst, src, vlen_enc); break; 4996 default: ShouldNotReachHere(); 4997 } 4998 } else { 4999 assert(dst_bt_size < src_bt_size, ""); 5000 switch (src_bt_size / dst_bt_size) { 5001 case 2: { 5002 if (vlen_enc == AVX_128bit) { 5003 vpacksswb(dst, src, src, vlen_enc); 5004 } else { 5005 vpacksswb(dst, src, src, vlen_enc); 5006 vpermq(dst, dst, 0x08, vlen_enc); 5007 } 5008 break; 5009 } 5010 case 4: { 5011 if (vlen_enc == AVX_128bit) { 5012 vpackssdw(dst, src, src, vlen_enc); 5013 vpacksswb(dst, dst, dst, vlen_enc); 5014 } else { 5015 vpackssdw(dst, src, src, vlen_enc); 5016 vpermq(dst, dst, 0x08, vlen_enc); 5017 vpacksswb(dst, dst, dst, AVX_128bit); 5018 } 5019 break; 5020 } 5021 case 8: { 5022 if (vlen_enc == AVX_128bit) { 5023 vpshufd(dst, src, 0x08, vlen_enc); 5024 vpackssdw(dst, dst, dst, vlen_enc); 5025 vpacksswb(dst, dst, dst, vlen_enc); 5026 } else { 5027 vpshufd(dst, src, 0x08, vlen_enc); 5028 vpermq(dst, dst, 0x08, vlen_enc); 5029 vpackssdw(dst, dst, dst, AVX_128bit); 5030 vpacksswb(dst, dst, dst, AVX_128bit); 5031 } 5032 break; 5033 } 5034 default: ShouldNotReachHere(); 5035 } 5036 } 5037 } 5038 5039 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3, 5040 bool merge, BasicType bt, int vlen_enc) { 5041 if (bt == T_INT) { 5042 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 5043 } else { 5044 assert(bt == T_LONG, ""); 5045 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5046 } 5047 } 5048 5049 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3, 5050 bool merge, BasicType bt, int vlen_enc) { 5051 if (bt == T_INT) { 5052 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 5053 } else { 5054 assert(bt == T_LONG, ""); 5055 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5056 } 5057 } 5058 5059 #ifdef _LP64 5060 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1, 5061 Register rtmp2, XMMRegister xtmp, int mask_len, 5062 int vec_enc) { 5063 int index = 0; 5064 int vindex = 0; 5065 mov64(rtmp1, 0x0101010101010101L); 5066 pdepq(rtmp1, src, rtmp1); 5067 if (mask_len > 8) { 5068 movq(rtmp2, src); 5069 vpxor(xtmp, xtmp, xtmp, vec_enc); 5070 movq(xtmp, rtmp1); 5071 } 5072 movq(dst, rtmp1); 5073 5074 mask_len -= 8; 5075 while (mask_len > 0) { 5076 assert ((mask_len & 0x7) == 0, "mask must be multiple of 8"); 5077 index++; 5078 if ((index % 2) == 0) { 5079 pxor(xtmp, xtmp); 5080 } 5081 mov64(rtmp1, 0x0101010101010101L); 5082 shrq(rtmp2, 8); 5083 pdepq(rtmp1, rtmp2, rtmp1); 5084 pinsrq(xtmp, rtmp1, index % 2); 5085 vindex = index / 2; 5086 if (vindex) { 5087 // Write entire 16 byte vector when both 64 bit 5088 // lanes are update to save redundant instructions. 
5089 if (index % 2) { 5090 vinsertf128(dst, dst, xtmp, vindex); 5091 } 5092 } else { 5093 vmovdqu(dst, xtmp); 5094 } 5095 mask_len -= 8; 5096 } 5097 } 5098 5099 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) { 5100 switch(opc) { 5101 case Op_VectorMaskTrueCount: 5102 popcntq(dst, tmp); 5103 break; 5104 case Op_VectorMaskLastTrue: 5105 if (VM_Version::supports_lzcnt()) { 5106 lzcntq(tmp, tmp); 5107 movl(dst, 63); 5108 subl(dst, tmp); 5109 } else { 5110 movl(dst, -1); 5111 bsrq(tmp, tmp); 5112 cmov32(Assembler::notZero, dst, tmp); 5113 } 5114 break; 5115 case Op_VectorMaskFirstTrue: 5116 if (VM_Version::supports_bmi1()) { 5117 if (masklen < 32) { 5118 orl(tmp, 1 << masklen); 5119 tzcntl(dst, tmp); 5120 } else if (masklen == 32) { 5121 tzcntl(dst, tmp); 5122 } else { 5123 assert(masklen == 64, ""); 5124 tzcntq(dst, tmp); 5125 } 5126 } else { 5127 if (masklen < 32) { 5128 orl(tmp, 1 << masklen); 5129 bsfl(dst, tmp); 5130 } else { 5131 assert(masklen == 32 || masklen == 64, ""); 5132 movl(dst, masklen); 5133 if (masklen == 32) { 5134 bsfl(tmp, tmp); 5135 } else { 5136 bsfq(tmp, tmp); 5137 } 5138 cmov32(Assembler::notZero, dst, tmp); 5139 } 5140 } 5141 break; 5142 case Op_VectorMaskToLong: 5143 assert(dst == tmp, "Dst and tmp should be the same for toLong operations"); 5144 break; 5145 default: assert(false, "Unhandled mask operation"); 5146 } 5147 } 5148 5149 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp, 5150 int masklen, int masksize, int vec_enc) { 5151 assert(VM_Version::supports_popcnt(), ""); 5152 5153 if(VM_Version::supports_avx512bw()) { 5154 kmovql(tmp, mask); 5155 } else { 5156 assert(masklen <= 16, ""); 5157 kmovwl(tmp, mask); 5158 } 5159 5160 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5161 // operations needs to be clipped. 5162 if (masksize < 16 && opc != Op_VectorMaskFirstTrue) { 5163 andq(tmp, (1 << masklen) - 1); 5164 } 5165 5166 vector_mask_operation_helper(opc, dst, tmp, masklen); 5167 } 5168 5169 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp, 5170 Register tmp, int masklen, BasicType bt, int vec_enc) { 5171 assert(vec_enc == AVX_128bit && VM_Version::supports_avx() || 5172 vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4), ""); 5173 assert(VM_Version::supports_popcnt(), ""); 5174 5175 bool need_clip = false; 5176 switch(bt) { 5177 case T_BOOLEAN: 5178 // While masks of other types contain 0, -1; boolean masks contain lane values of 0, 1 5179 vpxor(xtmp, xtmp, xtmp, vec_enc); 5180 vpsubb(xtmp, xtmp, mask, vec_enc); 5181 vpmovmskb(tmp, xtmp, vec_enc); 5182 need_clip = masklen < 16; 5183 break; 5184 case T_BYTE: 5185 vpmovmskb(tmp, mask, vec_enc); 5186 need_clip = masklen < 16; 5187 break; 5188 case T_SHORT: 5189 vpacksswb(xtmp, mask, mask, vec_enc); 5190 if (masklen >= 16) { 5191 vpermpd(xtmp, xtmp, 8, vec_enc); 5192 } 5193 vpmovmskb(tmp, xtmp, Assembler::AVX_128bit); 5194 need_clip = masklen < 16; 5195 break; 5196 case T_INT: 5197 case T_FLOAT: 5198 vmovmskps(tmp, mask, vec_enc); 5199 need_clip = masklen < 4; 5200 break; 5201 case T_LONG: 5202 case T_DOUBLE: 5203 vmovmskpd(tmp, mask, vec_enc); 5204 need_clip = masklen < 2; 5205 break; 5206 default: assert(false, "Unhandled type, %s", type2name(bt)); 5207 } 5208 5209 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5210 // operations needs to be clipped. 
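// For instance, an 8-lane byte mask evaluated in a 128-bit register still produces
// 16 bits from vpmovmskb; the unused upper bits are cleared here so that the
// popcnt/bsf/lzcnt based helper only sees the logical mask length.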
5211 if (need_clip && opc != Op_VectorMaskFirstTrue) { 5212 // need_clip implies masklen < 32 5213 andq(tmp, (1 << masklen) - 1); 5214 } 5215 5216 vector_mask_operation_helper(opc, dst, tmp, masklen); 5217 } 5218 5219 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1, 5220 Register rtmp2, int mask_len) { 5221 kmov(rtmp1, src); 5222 andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len))); 5223 mov64(rtmp2, -1L); 5224 pextq(rtmp2, rtmp2, rtmp1); 5225 kmov(dst, rtmp2); 5226 } 5227 5228 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask, 5229 bool merge, BasicType bt, int vec_enc) { 5230 if (opcode == Op_CompressV) { 5231 switch(bt) { 5232 case T_BYTE: 5233 evpcompressb(dst, mask, src, merge, vec_enc); 5234 break; 5235 case T_CHAR: 5236 case T_SHORT: 5237 evpcompressw(dst, mask, src, merge, vec_enc); 5238 break; 5239 case T_INT: 5240 evpcompressd(dst, mask, src, merge, vec_enc); 5241 break; 5242 case T_FLOAT: 5243 evcompressps(dst, mask, src, merge, vec_enc); 5244 break; 5245 case T_LONG: 5246 evpcompressq(dst, mask, src, merge, vec_enc); 5247 break; 5248 case T_DOUBLE: 5249 evcompresspd(dst, mask, src, merge, vec_enc); 5250 break; 5251 default: 5252 fatal("Unsupported type %s", type2name(bt)); 5253 break; 5254 } 5255 } else { 5256 assert(opcode == Op_ExpandV, ""); 5257 switch(bt) { 5258 case T_BYTE: 5259 evpexpandb(dst, mask, src, merge, vec_enc); 5260 break; 5261 case T_CHAR: 5262 case T_SHORT: 5263 evpexpandw(dst, mask, src, merge, vec_enc); 5264 break; 5265 case T_INT: 5266 evpexpandd(dst, mask, src, merge, vec_enc); 5267 break; 5268 case T_FLOAT: 5269 evexpandps(dst, mask, src, merge, vec_enc); 5270 break; 5271 case T_LONG: 5272 evpexpandq(dst, mask, src, merge, vec_enc); 5273 break; 5274 case T_DOUBLE: 5275 evexpandpd(dst, mask, src, merge, vec_enc); 5276 break; 5277 default: 5278 fatal("Unsupported type %s", type2name(bt)); 5279 break; 5280 } 5281 } 5282 } 5283 #endif 5284 5285 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5286 KRegister ktmp1, int vec_enc) { 5287 if (opcode == Op_SignumVD) { 5288 vsubpd(dst, zero, one, vec_enc); 5289 // if src < 0 ? -1 : 1 5290 evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5291 evblendmpd(dst, ktmp1, one, dst, true, vec_enc); 5292 // if src == NaN, -0.0 or 0.0 return src. 5293 evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5294 evblendmpd(dst, ktmp1, dst, src, true, vec_enc); 5295 } else { 5296 assert(opcode == Op_SignumVF, ""); 5297 vsubps(dst, zero, one, vec_enc); 5298 // if src < 0 ? -1 : 1 5299 evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5300 evblendmps(dst, ktmp1, one, dst, true, vec_enc); 5301 // if src == NaN, -0.0 or 0.0 return src. 5302 evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5303 evblendmps(dst, ktmp1, dst, src, true, vec_enc); 5304 } 5305 } 5306 5307 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5308 XMMRegister xtmp1, int vec_enc) { 5309 if (opcode == Op_SignumVD) { 5310 vsubpd(dst, zero, one, vec_enc); 5311 // if src < 0 ? -1 : 1 5312 vblendvpd(dst, one, dst, src, vec_enc); 5313 // if src == NaN, -0.0 or 0.0 return src. 5314 vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc); 5315 vblendvpd(dst, dst, src, xtmp1, vec_enc); 5316 } else { 5317 assert(opcode == Op_SignumVF, ""); 5318 vsubps(dst, zero, one, vec_enc); 5319 // if src < 0 ? 
-1 : 1 5320 vblendvps(dst, one, dst, src, vec_enc); 5321 // if src == NaN, -0.0 or 0.0 return src. 5322 vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc); 5323 vblendvps(dst, dst, src, xtmp1, vec_enc); 5324 } 5325 } 5326 5327 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) { 5328 if (VM_Version::supports_avx512bw()) { 5329 if (mask_len > 32) { 5330 kmovql(dst, src); 5331 } else { 5332 kmovdl(dst, src); 5333 if (mask_len != 32) { 5334 kshiftrdl(dst, dst, 32 - mask_len); 5335 } 5336 } 5337 } else { 5338 assert(mask_len <= 16, ""); 5339 kmovwl(dst, src); 5340 if (mask_len != 16) { 5341 kshiftrwl(dst, dst, 16 - mask_len); 5342 } 5343 } 5344 } 5345 5346 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) { 5347 int lane_size = type2aelembytes(bt); 5348 bool is_LP64 = LP64_ONLY(true) NOT_LP64(false); 5349 if ((is_LP64 || lane_size < 8) && 5350 ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) || 5351 (is_subword_type(bt) && VM_Version::supports_avx512vlbw()))) { 5352 movptr(rtmp, imm32); 5353 switch(lane_size) { 5354 case 1 : evpbroadcastb(dst, rtmp, vec_enc); break; 5355 case 2 : evpbroadcastw(dst, rtmp, vec_enc); break; 5356 case 4 : evpbroadcastd(dst, rtmp, vec_enc); break; 5357 case 8 : evpbroadcastq(dst, rtmp, vec_enc); break; 5358 default: fatal("Unsupported lane size %d", lane_size); 5359 break; 5360 } 5361 } else { 5362 movptr(rtmp, imm32); 5363 LP64_ONLY(movq(dst, rtmp)) NOT_LP64(movdl(dst, rtmp)); 5364 switch(lane_size) { 5365 case 1 : vpbroadcastb(dst, dst, vec_enc); break; 5366 case 2 : vpbroadcastw(dst, dst, vec_enc); break; 5367 case 4 : vpbroadcastd(dst, dst, vec_enc); break; 5368 case 8 : vpbroadcastq(dst, dst, vec_enc); break; 5369 default: fatal("Unsupported lane size %d", lane_size); 5370 break; 5371 } 5372 } 5373 } 5374 5375 // 5376 // Following is the lookup table based popcount computation algorithm:- 5377 // Index Bit set count 5378 // [ 0000 -> 0, 5379 // 0001 -> 1, 5380 // 0010 -> 1, 5381 // 0011 -> 2, 5382 // 0100 -> 1, 5383 // 0101 -> 2, 5384 // 0110 -> 2, 5385 // 0111 -> 3, 5386 // 1000 -> 1, 5387 // 1001 -> 2, 5388 // 1010 -> 2, 5389 // 1011 -> 3, 5390 // 1100 -> 2, 5391 // 1101 -> 3, 1110 -> 3, 5392 // 1111 -> 4 ] 5393 // a. Count the number of 1s in the 4 LSB bits of each byte. These bits are used as 5394 // shuffle indices for lookup table access. 5395 // b. Right shift each byte of the vector lane by 4 positions. 5396 // c. Count the number of 1s in the 4 MSB bits of each byte. These bits are used as 5397 // shuffle indices for lookup table access. 5398 // d. Add the bitset count of the upper and lower 4 bits of each byte. 5399 // e. Unpack double words to quad words and compute, via sum of absolute differences, the bitset 5400 // count of all the bytes of a quadword. 5401 // f. Perform step e. for the upper 128bit vector lane. 5402 // g. Pack the bitset count of quadwords back to double word. 5403 // h. Unpacking and packing operations are not needed for a 64bit vector lane.
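// A scalar sketch of steps a-d for a single byte (documentation only; the table and
// the helper name are illustrative, not code used by the assembler):
//   static const uint8_t LUT[16] = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 };
//   uint8_t popcount_byte(uint8_t b) {
//     return LUT[b & 0x0F] + LUT[b >> 4];
//   }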
5404 5405 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5406 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5407 assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), ""); 5408 vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc); 5409 vpsrlw(dst, src, 4, vec_enc); 5410 vpand(dst, dst, xtmp1, vec_enc); 5411 vpand(xtmp1, src, xtmp1, vec_enc); 5412 vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg); 5413 vpshufb(xtmp1, xtmp2, xtmp1, vec_enc); 5414 vpshufb(dst, xtmp2, dst, vec_enc); 5415 vpaddb(dst, dst, xtmp1, vec_enc); 5416 } 5417 5418 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5419 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5420 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5421 // Following code is as per steps e,f,g and h of above algorithm. 5422 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5423 vpunpckhdq(dst, xtmp1, xtmp2, vec_enc); 5424 vpsadbw(dst, dst, xtmp2, vec_enc); 5425 vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc); 5426 vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc); 5427 vpackuswb(dst, xtmp1, dst, vec_enc); 5428 } 5429 5430 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5431 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5432 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5433 // Add the popcount of upper and lower bytes of word. 5434 vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc); 5435 vpsrlw(dst, xtmp1, 8, vec_enc); 5436 vpand(xtmp1, xtmp1, xtmp2, vec_enc); 5437 vpaddw(dst, dst, xtmp1, vec_enc); 5438 } 5439 5440 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5441 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5442 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5443 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5444 vpsadbw(dst, xtmp1, xtmp2, vec_enc); 5445 } 5446 5447 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5448 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5449 switch(bt) { 5450 case T_LONG: 5451 vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5452 break; 5453 case T_INT: 5454 vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5455 break; 5456 case T_CHAR: 5457 case T_SHORT: 5458 vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5459 break; 5460 case T_BYTE: 5461 case T_BOOLEAN: 5462 vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5463 break; 5464 default: 5465 fatal("Unsupported type %s", type2name(bt)); 5466 break; 5467 } 5468 } 5469 5470 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src, 5471 KRegister mask, bool merge, int vec_enc) { 5472 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 5473 switch(bt) { 5474 case T_LONG: 5475 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 5476 evpopcntq(dst, mask, src, merge, vec_enc); 5477 break; 5478 case T_INT: 5479 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 5480 evpopcntd(dst, mask, src, merge, vec_enc); 5481 break; 5482 case T_CHAR: 5483 case T_SHORT: 5484 assert(VM_Version::supports_avx512_bitalg(), ""); 5485 evpopcntw(dst, mask, src, merge, vec_enc); 5486 break; 5487 case T_BYTE: 5488 case T_BOOLEAN: 5489 assert(VM_Version::supports_avx512_bitalg(), ""); 5490 evpopcntb(dst, mask, 
src, merge, vec_enc); 5491 break; 5492 default: 5493 fatal("Unsupported type %s", type2name(bt)); 5494 break; 5495 } 5496 } 5497 5498 #ifndef _LP64 5499 void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) { 5500 assert(VM_Version::supports_avx512bw(), ""); 5501 kmovdl(tmp, src); 5502 kunpckdql(dst, tmp, tmp); 5503 } 5504 #endif 5505 5506 // Bit reversal algorithm first reverses the bits of each byte followed by 5507 // a byte level reversal for multi-byte primitive types (short/int/long). 5508 // Algorithm performs a lookup table access to get reverse bit sequence 5509 // corresponding to a 4 bit value. Thus a reverse bit sequence for a byte 5510 // is obtained by swapping the reverse bit sequences of upper and lower 5511 // nibble of a byte. 5512 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5513 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5514 if (VM_Version::supports_avx512vlbw()) { 5515 5516 // Get the reverse bit sequence of lower nibble of each byte. 5517 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg); 5518 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 5519 evpandq(dst, xtmp2, src, vec_enc); 5520 vpshufb(dst, xtmp1, dst, vec_enc); 5521 vpsllq(dst, dst, 4, vec_enc); 5522 5523 // Get the reverse bit sequence of upper nibble of each byte. 5524 vpandn(xtmp2, xtmp2, src, vec_enc); 5525 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 5526 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 5527 5528 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 5529 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 5530 evporq(xtmp2, dst, xtmp2, vec_enc); 5531 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 5532 5533 } else if(vec_enc == Assembler::AVX_512bit) { 5534 // Shift based bit reversal. 5535 assert(bt == T_LONG || bt == T_INT, ""); 5536 5537 // Swap lower and upper nibble of each byte. 5538 vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc); 5539 5540 // Swap two least and most significant bits of each nibble. 5541 vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc); 5542 5543 // Swap adjacent pair of bits. 5544 evmovdqul(xtmp1, k0, dst, true, vec_enc); 5545 vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc); 5546 5547 evmovdqul(xtmp1, k0, dst, true, vec_enc); 5548 vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc); 5549 } else { 5550 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp); 5551 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 5552 5553 // Get the reverse bit sequence of lower nibble of each byte. 5554 vpand(dst, xtmp2, src, vec_enc); 5555 vpshufb(dst, xtmp1, dst, vec_enc); 5556 vpsllq(dst, dst, 4, vec_enc); 5557 5558 // Get the reverse bit sequence of upper nibble of each byte. 5559 vpandn(xtmp2, xtmp2, src, vec_enc); 5560 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 5561 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 5562 5563 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 5564 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 
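// Worked example (documentation only): for the byte 0xB1 = 10110001 the lower nibble
// 0001 reverses to 1000 and is shifted into the upper half (0x80), while the upper
// nibble 1011 reverses to 1101 and lands in the lower half (0x0D); OR-ing the two
// gives 0x8D = 10001101, the bit-reversed byte.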
5565 vpor(xtmp2, dst, xtmp2, vec_enc); 5566 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 5567 } 5568 } 5569 5570 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc, 5571 XMMRegister xtmp, Register rscratch) { 5572 assert(VM_Version::supports_gfni(), ""); 5573 assert(rscratch != noreg || always_reachable(mask), "missing"); 5574 5575 // Galois field instruction based bit reversal based on following algorithm. 5576 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 5577 vpbroadcastq(xtmp, mask, vec_enc, rscratch); 5578 vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc); 5579 vector_reverse_byte(bt, dst, xtmp, vec_enc); 5580 } 5581 5582 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src, 5583 XMMRegister xtmp1, Register rtmp, int vec_enc) { 5584 vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc); 5585 evpandq(dst, xtmp1, src, vec_enc); 5586 vpsllq(dst, dst, nbits, vec_enc); 5587 vpandn(xtmp1, xtmp1, src, vec_enc); 5588 vpsrlq(xtmp1, xtmp1, nbits, vec_enc); 5589 evporq(dst, dst, xtmp1, vec_enc); 5590 } 5591 5592 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5593 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5594 // Shift based bit reversal. 5595 assert(VM_Version::supports_evex(), ""); 5596 switch(bt) { 5597 case T_LONG: 5598 // Swap upper and lower double word of each quad word. 5599 evprorq(xtmp1, k0, src, 32, true, vec_enc); 5600 evprord(xtmp1, k0, xtmp1, 16, true, vec_enc); 5601 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 5602 break; 5603 case T_INT: 5604 // Swap upper and lower word of each double word. 5605 evprord(xtmp1, k0, src, 16, true, vec_enc); 5606 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 5607 break; 5608 case T_CHAR: 5609 case T_SHORT: 5610 // Swap upper and lower byte of each word. 5611 vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc); 5612 break; 5613 case T_BYTE: 5614 evmovdquq(dst, k0, src, true, vec_enc); 5615 break; 5616 default: 5617 fatal("Unsupported type %s", type2name(bt)); 5618 break; 5619 } 5620 } 5621 5622 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) { 5623 if (bt == T_BYTE) { 5624 if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) { 5625 evmovdquq(dst, k0, src, true, vec_enc); 5626 } else { 5627 vmovdqu(dst, src); 5628 } 5629 return; 5630 } 5631 // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using 5632 // pre-computed shuffle indices. 
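// For T_INT, for example, the permutation constant is in effect
// { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 } per 128-bit lane, i.e. the
// bytes of every int element are swapped end-for-end (an illustrative reading of the
// stub constant, not a verbatim copy of it).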
5633 switch(bt) { 5634 case T_LONG: 5635 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg); 5636 break; 5637 case T_INT: 5638 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg); 5639 break; 5640 case T_CHAR: 5641 case T_SHORT: 5642 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg); 5643 break; 5644 default: 5645 fatal("Unsupported type %s", type2name(bt)); 5646 break; 5647 } 5648 vpshufb(dst, src, dst, vec_enc); 5649 } 5650 5651 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src, 5652 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 5653 KRegister ktmp, Register rtmp, bool merge, int vec_enc) { 5654 assert(is_integral_type(bt), ""); 5655 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 5656 assert(VM_Version::supports_avx512cd(), ""); 5657 switch(bt) { 5658 case T_LONG: 5659 evplzcntq(dst, ktmp, src, merge, vec_enc); 5660 break; 5661 case T_INT: 5662 evplzcntd(dst, ktmp, src, merge, vec_enc); 5663 break; 5664 case T_SHORT: 5665 vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc); 5666 vpunpcklwd(xtmp2, xtmp1, src, vec_enc); 5667 evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc); 5668 vpunpckhwd(dst, xtmp1, src, vec_enc); 5669 evplzcntd(dst, ktmp, dst, merge, vec_enc); 5670 vpackusdw(dst, xtmp2, dst, vec_enc); 5671 break; 5672 case T_BYTE: 5673 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 5674 // accessing the lookup table. 5675 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 5676 // accessing the lookup table. 5677 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 5678 assert(VM_Version::supports_avx512bw(), ""); 5679 evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp); 5680 vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc); 5681 vpand(xtmp2, dst, src, vec_enc); 5682 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 5683 vpsrlw(xtmp3, src, 4, vec_enc); 5684 vpand(xtmp3, dst, xtmp3, vec_enc); 5685 vpshufb(dst, xtmp1, xtmp3, vec_enc); 5686 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 5687 evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc); 5688 evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc); 5689 break; 5690 default: 5691 fatal("Unsupported type %s", type2name(bt)); 5692 break; 5693 } 5694 } 5695 5696 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5697 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 5698 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp); 5699 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 5700 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 5701 // accessing the lookup table. 5702 vpand(dst, xtmp2, src, vec_enc); 5703 vpshufb(dst, xtmp1, dst, vec_enc); 5704 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 5705 // accessing the lookup table. 5706 vpsrlw(xtmp3, src, 4, vec_enc); 5707 vpand(xtmp3, xtmp2, xtmp3, vec_enc); 5708 vpshufb(xtmp2, xtmp1, xtmp3, vec_enc); 5709 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 
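// Worked example (documentation only): for the byte 0x07 the 4 MSB bits are zero, so
// T2 = 4 and T1 = lzcnt4(0111) = 1, giving 4 + 1 = 5 = lzcnt8(0x07); for 0x35 the
// 4 MSB bits are 0011, so the result is simply T2 = 2.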
5710 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 5711 vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc); 5712 vpaddb(dst, dst, xtmp2, vec_enc); 5713 vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc); 5714 } 5715 5716 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5717 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 5718 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 5719 // Add zero counts of lower byte and upper byte of a word if 5720 // upper byte holds a zero value. 5721 vpsrlw(xtmp3, src, 8, vec_enc); 5722 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx. 5723 vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc); 5724 vpsllw(xtmp2, dst, 8, vec_enc); 5725 vpaddw(xtmp2, xtmp2, dst, vec_enc); 5726 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 5727 vpsrlw(dst, dst, 8, vec_enc); 5728 } 5729 5730 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5731 XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) { 5732 // Since IEEE 754 floating point format represents mantissa in 1.0 format 5733 // hence biased exponent can be used to compute leading zero count as per 5734 // following formula:- 5735 // LZCNT = 32 - (biased_exp - 127) 5736 // Special handling has been introduced for Zero, Max_Int and -ve source values. 5737 5738 // Broadcast 0xFF 5739 vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc); 5740 vpsrld(xtmp1, xtmp1, 24, vec_enc); 5741 5742 // Extract biased exponent. 5743 vcvtdq2ps(dst, src, vec_enc); 5744 vpsrld(dst, dst, 23, vec_enc); 5745 vpand(dst, dst, xtmp1, vec_enc); 5746 5747 // Broadcast 127. 5748 vpsrld(xtmp1, xtmp1, 1, vec_enc); 5749 // Exponent = biased_exp - 127 5750 vpsubd(dst, dst, xtmp1, vec_enc); 5751 5752 // Exponent = Exponent + 1 5753 vpsrld(xtmp3, xtmp1, 6, vec_enc); 5754 vpaddd(dst, dst, xtmp3, vec_enc); 5755 5756 // Replace -ve exponent with zero, exponent is -ve when src 5757 // lane contains a zero value. 5758 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5759 vblendvps(dst, dst, xtmp2, dst, vec_enc); 5760 5761 // Rematerialize broadcast 32. 5762 vpslld(xtmp1, xtmp3, 5, vec_enc); 5763 // Exponent is 32 if corresponding source lane contains max_int value. 5764 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 5765 // LZCNT = 32 - exponent 5766 vpsubd(dst, xtmp1, dst, vec_enc); 5767 5768 // Replace LZCNT with a value 1 if corresponding source lane 5769 // contains max_int value. 5770 vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc); 5771 5772 // Replace biased_exp with 0 if source lane value is less than zero. 5773 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5774 vblendvps(dst, dst, xtmp2, src, vec_enc); 5775 } 5776 5777 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5778 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 5779 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 5780 // Add zero counts of lower word and upper word of a double word if 5781 // upper word holds a zero value. 5782 vpsrld(xtmp3, src, 16, vec_enc); 5783 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx. 5784 vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc); 5785 vpslld(xtmp2, dst, 16, vec_enc); 5786 vpaddd(xtmp2, xtmp2, dst, vec_enc); 5787 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 5788 vpsrld(dst, dst, 16, vec_enc); 5789 // Add zero counts of lower doubleword and upper doubleword of a 5790 // quadword if upper doubleword holds a zero value. 
5791 vpsrlq(xtmp3, src, 32, vec_enc); 5792 vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc); 5793 vpsllq(xtmp2, dst, 32, vec_enc); 5794 vpaddq(xtmp2, xtmp2, dst, vec_enc); 5795 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 5796 vpsrlq(dst, dst, 32, vec_enc); 5797 } 5798 5799 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, 5800 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 5801 Register rtmp, int vec_enc) { 5802 assert(is_integral_type(bt), "unexpected type"); 5803 assert(vec_enc < Assembler::AVX_512bit, ""); 5804 switch(bt) { 5805 case T_LONG: 5806 vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 5807 break; 5808 case T_INT: 5809 vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc); 5810 break; 5811 case T_SHORT: 5812 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 5813 break; 5814 case T_BYTE: 5815 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 5816 break; 5817 default: 5818 fatal("Unsupported type %s", type2name(bt)); 5819 break; 5820 } 5821 } 5822 5823 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) { 5824 switch(bt) { 5825 case T_BYTE: 5826 vpsubb(dst, src1, src2, vec_enc); 5827 break; 5828 case T_SHORT: 5829 vpsubw(dst, src1, src2, vec_enc); 5830 break; 5831 case T_INT: 5832 vpsubd(dst, src1, src2, vec_enc); 5833 break; 5834 case T_LONG: 5835 vpsubq(dst, src1, src2, vec_enc); 5836 break; 5837 default: 5838 fatal("Unsupported type %s", type2name(bt)); 5839 break; 5840 } 5841 } 5842 5843 // Trailing zero count computation is based on leading zero count operation as per 5844 // following equation. All AVX3 targets support AVX512CD feature which offers 5845 // direct vector instruction to compute leading zero count. 
5846 // CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x) 5847 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src, 5848 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 5849 XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) { 5850 assert(is_integral_type(bt), ""); 5851 // xtmp = -1 5852 vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc); 5853 // xtmp = xtmp + src 5854 vpadd(bt, xtmp4, xtmp4, src, vec_enc); 5855 // xtmp = xtmp & ~src 5856 vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc); 5857 vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc); 5858 vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc); 5859 vpsub(bt, dst, xtmp4, dst, vec_enc); 5860 } 5861 5862 // Trailing zero count computation for AVX2 targets is based on the popcount operation as per the following equation: 5863 // CTZ = PRIM_TYPE_WIDTH - POPC(x | -x) 5864 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5865 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 5866 assert(is_integral_type(bt), ""); 5867 // xtmp = 0 5868 vpxor(xtmp3, xtmp3, xtmp3, vec_enc); 5869 // xtmp = 0 - src 5870 vpsub(bt, xtmp3, xtmp3, src, vec_enc); 5871 // xtmp = xtmp | src 5872 vpor(xtmp3, xtmp3, src, vec_enc); 5873 vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc); 5874 vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc); 5875 vpsub(bt, dst, xtmp1, dst, vec_enc); 5876 } 5877 5878 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) { 5879 Label done; 5880 Label neg_divisor_fastpath; 5881 cmpl(divisor, 0); 5882 jccb(Assembler::less, neg_divisor_fastpath); 5883 xorl(rdx, rdx); 5884 divl(divisor); 5885 jmpb(done); 5886 bind(neg_divisor_fastpath); 5887 // Fastpath for divisor < 0: 5888 // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1) 5889 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned() 5890 movl(rdx, rax); 5891 subl(rdx, divisor); 5892 if (VM_Version::supports_bmi1()) { 5893 andnl(rax, rdx, rax); 5894 } else { 5895 notl(rdx); 5896 andl(rax, rdx); 5897 } 5898 shrl(rax, 31); 5899 bind(done); 5900 } 5901 5902 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) { 5903 Label done; 5904 Label neg_divisor_fastpath; 5905 cmpl(divisor, 0); 5906 jccb(Assembler::less, neg_divisor_fastpath); 5907 xorl(rdx, rdx); 5908 divl(divisor); 5909 jmpb(done); 5910 bind(neg_divisor_fastpath); 5911 // Fastpath when divisor < 0: 5912 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor) 5913 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned() 5914 movl(rdx, rax); 5915 subl(rax, divisor); 5916 if (VM_Version::supports_bmi1()) { 5917 andnl(rax, rax, rdx); 5918 } else { 5919 notl(rax); 5920 andl(rax, rdx); 5921 } 5922 sarl(rax, 31); 5923 andl(rax, divisor); 5924 subl(rdx, rax); 5925 bind(done); 5926 } 5927 5928 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) { 5929 Label done; 5930 Label neg_divisor_fastpath; 5931 5932 cmpl(divisor, 0); 5933 jccb(Assembler::less, neg_divisor_fastpath); 5934 xorl(rdx, rdx); 5935 divl(divisor); 5936 jmpb(done); 5937 bind(neg_divisor_fastpath); 5938 // Fastpath for divisor < 0: 5939 // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1) 5940 //
remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor) 5941 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in 5942 // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned() 5943 movl(rdx, rax); 5944 subl(rax, divisor); 5945 if (VM_Version::supports_bmi1()) { 5946 andnl(rax, rax, rdx); 5947 } else { 5948 notl(rax); 5949 andl(rax, rdx); 5950 } 5951 movl(tmp, rax); 5952 shrl(rax, 31); // quotient 5953 sarl(tmp, 31); 5954 andl(tmp, divisor); 5955 subl(rdx, tmp); // remainder 5956 bind(done); 5957 } 5958 5959 #ifdef _LP64 5960 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1, 5961 XMMRegister xtmp2, Register rtmp) { 5962 if(VM_Version::supports_gfni()) { 5963 // Galois field instruction based bit reversal based on following algorithm. 5964 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 5965 mov64(rtmp, 0x8040201008040201L); 5966 movq(xtmp1, src); 5967 movq(xtmp2, rtmp); 5968 gf2p8affineqb(xtmp1, xtmp2, 0); 5969 movq(dst, xtmp1); 5970 } else { 5971 // Swap even and odd numbered bits. 5972 movl(rtmp, src); 5973 andl(rtmp, 0x55555555); 5974 shll(rtmp, 1); 5975 movl(dst, src); 5976 andl(dst, 0xAAAAAAAA); 5977 shrl(dst, 1); 5978 orl(dst, rtmp); 5979 5980 // Swap LSB and MSB 2 bits of each nibble. 5981 movl(rtmp, dst); 5982 andl(rtmp, 0x33333333); 5983 shll(rtmp, 2); 5984 andl(dst, 0xCCCCCCCC); 5985 shrl(dst, 2); 5986 orl(dst, rtmp); 5987 5988 // Swap LSB and MSB 4 bits of each byte. 5989 movl(rtmp, dst); 5990 andl(rtmp, 0x0F0F0F0F); 5991 shll(rtmp, 4); 5992 andl(dst, 0xF0F0F0F0); 5993 shrl(dst, 4); 5994 orl(dst, rtmp); 5995 } 5996 bswapl(dst); 5997 } 5998 5999 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1, 6000 XMMRegister xtmp2, Register rtmp1, Register rtmp2) { 6001 if(VM_Version::supports_gfni()) { 6002 // Galois field instruction based bit reversal based on following algorithm. 6003 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 6004 mov64(rtmp1, 0x8040201008040201L); 6005 movq(xtmp1, src); 6006 movq(xtmp2, rtmp1); 6007 gf2p8affineqb(xtmp1, xtmp2, 0); 6008 movq(dst, xtmp1); 6009 } else { 6010 // Swap even and odd numbered bits. 6011 movq(rtmp1, src); 6012 mov64(rtmp2, 0x5555555555555555L); 6013 andq(rtmp1, rtmp2); 6014 shlq(rtmp1, 1); 6015 movq(dst, src); 6016 notq(rtmp2); 6017 andq(dst, rtmp2); 6018 shrq(dst, 1); 6019 orq(dst, rtmp1); 6020 6021 // Swap LSB and MSB 2 bits of each nibble. 6022 movq(rtmp1, dst); 6023 mov64(rtmp2, 0x3333333333333333L); 6024 andq(rtmp1, rtmp2); 6025 shlq(rtmp1, 2); 6026 notq(rtmp2); 6027 andq(dst, rtmp2); 6028 shrq(dst, 2); 6029 orq(dst, rtmp1); 6030 6031 // Swap LSB and MSB 4 bits of each byte. 
6032 movq(rtmp1, dst); 6033 mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL); 6034 andq(rtmp1, rtmp2); 6035 shlq(rtmp1, 4); 6036 notq(rtmp2); 6037 andq(dst, rtmp2); 6038 shrq(dst, 4); 6039 orq(dst, rtmp1); 6040 } 6041 bswapq(dst); 6042 } 6043 6044 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) { 6045 Label done; 6046 Label neg_divisor_fastpath; 6047 cmpq(divisor, 0); 6048 jccb(Assembler::less, neg_divisor_fastpath); 6049 xorl(rdx, rdx); 6050 divq(divisor); 6051 jmpb(done); 6052 bind(neg_divisor_fastpath); 6053 // Fastpath for divisor < 0: 6054 // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1) 6055 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned() 6056 movq(rdx, rax); 6057 subq(rdx, divisor); 6058 if (VM_Version::supports_bmi1()) { 6059 andnq(rax, rdx, rax); 6060 } else { 6061 notq(rdx); 6062 andq(rax, rdx); 6063 } 6064 shrq(rax, 63); 6065 bind(done); 6066 } 6067 6068 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) { 6069 Label done; 6070 Label neg_divisor_fastpath; 6071 cmpq(divisor, 0); 6072 jccb(Assembler::less, neg_divisor_fastpath); 6073 xorq(rdx, rdx); 6074 divq(divisor); 6075 jmp(done); 6076 bind(neg_divisor_fastpath); 6077 // Fastpath when divisor < 0: 6078 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor) 6079 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned() 6080 movq(rdx, rax); 6081 subq(rax, divisor); 6082 if (VM_Version::supports_bmi1()) { 6083 andnq(rax, rax, rdx); 6084 } else { 6085 notq(rax); 6086 andq(rax, rdx); 6087 } 6088 sarq(rax, 63); 6089 andq(rax, divisor); 6090 subq(rdx, rax); 6091 bind(done); 6092 } 6093 6094 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) { 6095 Label done; 6096 Label neg_divisor_fastpath; 6097 cmpq(divisor, 0); 6098 jccb(Assembler::less, neg_divisor_fastpath); 6099 xorq(rdx, rdx); 6100 divq(divisor); 6101 jmp(done); 6102 bind(neg_divisor_fastpath); 6103 // Fastpath for divisor < 0: 6104 // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1) 6105 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor) 6106 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in 6107 // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned() 6108 movq(rdx, rax); 6109 subq(rax, divisor); 6110 if (VM_Version::supports_bmi1()) { 6111 andnq(rax, rax, rdx); 6112 } else { 6113 notq(rax); 6114 andq(rax, rdx); 6115 } 6116 movq(tmp, rax); 6117 shrq(rax, 63); // quotient 6118 sarq(tmp, 63); 6119 andq(tmp, divisor); 6120 subq(rdx, tmp); // remainder 6121 bind(done); 6122 } 6123 #endif 6124 6125 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1, 6126 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp, 6127 int vlen_enc) { 6128 assert(VM_Version::supports_avx512bw(), ""); 6129 // Byte shuffles are inlane operations and indices are determined using 6130 // lower 4 bit of each shuffle lane, thus all shuffle indices are 6131 // normalized to index range 0-15. This makes sure that all the multiples 6132 // of an index value are placed at same relative position in 128 bit 6133 // lane i.e. elements corresponding to shuffle indices 16, 32 and 64 6134 // will be 16th element in their respective 128 bit lanes. 
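// More precisely, shuffle indices 16, 32 and 48 map, like index 0, to byte 0 of the
// second, third and fourth 128-bit source lanes respectively. For example, index 35:
// 35 & 0x0F = 3 selects byte 3 within a lane, and the range checks below
// (32 <= 35 < 48) pick the third 128-bit source lane, so source byte 35 is selected.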
6135 movl(rtmp, 16); 6136 evpbroadcastb(xtmp1, rtmp, vlen_enc); 6137 6138 // Compute a mask for the shuffle vector by comparing indices with the expression INDEX < 16, 6139 // broadcast the first 128 bit lane across the entire vector, shuffle the vector lanes using the 6140 // original shuffle indices and move the shuffled lanes corresponding to the true 6141 // mask to the destination vector. 6142 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc); 6143 evshufi64x2(xtmp2, src, src, 0x0, vlen_enc); 6144 evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc); 6145 6146 // Perform the above steps with the lane comparison expression INDEX >= 16 && INDEX < 32 6147 // and broadcasting the second 128 bit lane. 6148 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc); 6149 vpsllq(xtmp2, xtmp1, 0x1, vlen_enc); 6150 evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc); 6151 evshufi64x2(xtmp3, src, src, 0x55, vlen_enc); 6152 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc); 6153 6154 // Perform the above steps with the lane comparison expression INDEX >= 32 && INDEX < 48 6155 // and broadcasting the third 128 bit lane. 6156 evpcmpb(ktmp, k0, shuffle, xtmp2, Assembler::nlt, true, vlen_enc); 6157 vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc); 6158 evpcmpb(ktmp, ktmp, shuffle, xtmp1, Assembler::lt, true, vlen_enc); 6159 evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc); 6160 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc); 6161 6162 // Perform the above steps with the lane comparison expression INDEX >= 48 && INDEX < 64 6163 // and broadcasting the fourth 128 bit lane. 6164 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc); 6165 vpsllq(xtmp2, xtmp2, 0x1, vlen_enc); 6166 evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc); 6167 evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc); 6168 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc); 6169 } 6170 6171 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst, 6172 XMMRegister shuffle, XMMRegister src, int vlen_enc) { 6173 if (vlen_enc == AVX_128bit) { 6174 vpermilps(dst, src, shuffle, vlen_enc); 6175 } else if (bt == T_INT) { 6176 vpermd(dst, shuffle, src, vlen_enc); 6177 } else { 6178 assert(bt == T_FLOAT, ""); 6179 vpermps(dst, shuffle, src, vlen_enc); 6180 } 6181 }