1 /* 2 * Copyright (c) 2020, 2023, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 22 * 23 */ 24 25 #include "precompiled.hpp" 26 #include "asm/assembler.hpp" 27 #include "asm/assembler.inline.hpp" 28 #include "gc/shared/barrierSet.hpp" 29 #include "gc/shared/barrierSetAssembler.hpp" 30 #include "oops/methodData.hpp" 31 #include "opto/c2_MacroAssembler.hpp" 32 #include "opto/intrinsicnode.hpp" 33 #include "opto/output.hpp" 34 #include "opto/opcodes.hpp" 35 #include "opto/subnode.hpp" 36 #include "runtime/objectMonitor.hpp" 37 #include "runtime/stubRoutines.hpp" 38 39 #ifdef PRODUCT 40 #define BLOCK_COMMENT(str) /* nothing */ 41 #define STOP(error) stop(error) 42 #else 43 #define BLOCK_COMMENT(str) block_comment(str) 44 #define STOP(error) block_comment(error); stop(error) 45 #endif 46 47 // C2 compiled method's prolog code. 48 void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) { 49 50 // WARNING: Initial instruction MUST be 5 bytes or longer so that 51 // NativeJump::patch_verified_entry will be able to patch out the entry 52 // code safely. The push to verify stack depth is ok at 5 bytes, 53 // the frame allocation can be either 3 or 6 bytes. So if we don't do 54 // stack bang then we must use the 6 byte frame allocation even if 55 // we have no frame. :-( 56 assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect"); 57 58 assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned"); 59 // Remove word for return addr 60 framesize -= wordSize; 61 stack_bang_size -= wordSize; 62 63 // Calls to C2R adapters often do not accept exceptional returns. 64 // We require that their callers must bang for them. But be careful, because 65 // some VM calls (such as call site linkage) can use several kilobytes of 66 // stack. But the stack safety zone should account for that. 67 // See bugs 4446381, 4468289, 4497237. 68 if (stack_bang_size > 0) { 69 generate_stack_overflow_check(stack_bang_size); 70 71 // We always push rbp, so that on return to interpreter rbp, will be 72 // restored correctly and we can correct the stack. 73 push(rbp); 74 // Save caller's stack pointer into RBP if the frame pointer is preserved. 
75 if (PreserveFramePointer) { 76 mov(rbp, rsp); 77 } 78 // Remove word for ebp 79 framesize -= wordSize; 80 81 // Create frame 82 if (framesize) { 83 subptr(rsp, framesize); 84 } 85 } else { 86 // Create frame (force generation of a 4 byte immediate value) 87 subptr_imm32(rsp, framesize); 88 89 // Save RBP register now. 90 framesize -= wordSize; 91 movptr(Address(rsp, framesize), rbp); 92 // Save caller's stack pointer into RBP if the frame pointer is preserved. 93 if (PreserveFramePointer) { 94 movptr(rbp, rsp); 95 if (framesize > 0) { 96 addptr(rbp, framesize); 97 } 98 } 99 } 100 101 if (VerifyStackAtCalls) { // Majik cookie to verify stack depth 102 framesize -= wordSize; 103 movptr(Address(rsp, framesize), (int32_t)0xbadb100d); 104 } 105 106 #ifndef _LP64 107 // If method sets FPU control word do it now 108 if (fp_mode_24b) { 109 fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24())); 110 } 111 if (UseSSE >= 2 && VerifyFPU) { 112 verify_FPU(0, "FPU stack must be clean on entry"); 113 } 114 #endif 115 116 #ifdef ASSERT 117 if (VerifyStackAtCalls) { 118 Label L; 119 push(rax); 120 mov(rax, rsp); 121 andptr(rax, StackAlignmentInBytes-1); 122 cmpptr(rax, StackAlignmentInBytes-wordSize); 123 pop(rax); 124 jcc(Assembler::equal, L); 125 STOP("Stack is not properly aligned!"); 126 bind(L); 127 } 128 #endif 129 130 if (!is_stub) { 131 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 132 #ifdef _LP64 133 if (BarrierSet::barrier_set()->barrier_set_nmethod() != NULL) { 134 // We put the non-hot code of the nmethod entry barrier out-of-line in a stub. 135 Label dummy_slow_path; 136 Label dummy_continuation; 137 Label* slow_path = &dummy_slow_path; 138 Label* continuation = &dummy_continuation; 139 if (!Compile::current()->output()->in_scratch_emit_size()) { 140 // Use real labels from actual stub when not emitting code for the purpose of measuring its size 141 C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub(); 142 Compile::current()->output()->add_stub(stub); 143 slow_path = &stub->entry(); 144 continuation = &stub->continuation(); 145 } 146 bs->nmethod_entry_barrier(this, slow_path, continuation); 147 } 148 #else 149 // Don't bother with out-of-line nmethod entry barrier stub for x86_32. 
150 bs->nmethod_entry_barrier(this, NULL /* slow_path */, NULL /* continuation */); 151 #endif 152 } 153 } 154 155 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) { 156 switch (vlen_in_bytes) { 157 case 4: // fall-through 158 case 8: // fall-through 159 case 16: return Assembler::AVX_128bit; 160 case 32: return Assembler::AVX_256bit; 161 case 64: return Assembler::AVX_512bit; 162 163 default: { 164 ShouldNotReachHere(); 165 return Assembler::AVX_NoVec; 166 } 167 } 168 } 169 170 #if INCLUDE_RTM_OPT 171 172 // Update rtm_counters based on abort status 173 // input: abort_status 174 // rtm_counters (RTMLockingCounters*) 175 // flags are killed 176 void C2_MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) { 177 178 atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset())); 179 if (PrintPreciseRTMLockingStatistics) { 180 for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) { 181 Label check_abort; 182 testl(abort_status, (1<<i)); 183 jccb(Assembler::equal, check_abort); 184 atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx)))); 185 bind(check_abort); 186 } 187 } 188 } 189 190 // Branch if (random & (count-1) != 0), count is 2^n 191 // tmp, scr and flags are killed 192 void C2_MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) { 193 assert(tmp == rax, ""); 194 assert(scr == rdx, ""); 195 rdtsc(); // modifies EDX:EAX 196 andptr(tmp, count-1); 197 jccb(Assembler::notZero, brLabel); 198 } 199 200 // Perform abort ratio calculation, set no_rtm bit if high ratio 201 // input: rtm_counters_Reg (RTMLockingCounters* address) 202 // tmpReg, rtm_counters_Reg and flags are killed 203 void C2_MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg, 204 Register rtm_counters_Reg, 205 RTMLockingCounters* rtm_counters, 206 Metadata* method_data) { 207 Label L_done, L_check_always_rtm1, L_check_always_rtm2; 208 209 if (RTMLockingCalculationDelay > 0) { 210 // Delay calculation 211 movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr())); 212 testptr(tmpReg, tmpReg); 213 jccb(Assembler::equal, L_done); 214 } 215 // Abort ratio calculation only if abort_count > RTMAbortThreshold 216 // Aborted transactions = abort_count * 100 217 // All transactions = total_count * RTMTotalCountIncrRate 218 // Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio) 219 220 movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset())); 221 cmpptr(tmpReg, RTMAbortThreshold); 222 jccb(Assembler::below, L_check_always_rtm2); 223 imulptr(tmpReg, tmpReg, 100); 224 225 Register scrReg = rtm_counters_Reg; 226 movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset())); 227 imulptr(scrReg, scrReg, RTMTotalCountIncrRate); 228 imulptr(scrReg, scrReg, RTMAbortRatio); 229 cmpptr(tmpReg, scrReg); 230 jccb(Assembler::below, L_check_always_rtm1); 231 if (method_data != NULL) { 232 // set rtm_state to "no rtm" in MDO 233 mov_metadata(tmpReg, method_data); 234 lock(); 235 orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), NoRTM); 236 } 237 jmpb(L_done); 238 bind(L_check_always_rtm1); 239 // Reload RTMLockingCounters* address 240 lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters)); 241 bind(L_check_always_rtm2); 242 movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset())); 243 cmpptr(tmpReg, 
RTMLockingThreshold / RTMTotalCountIncrRate); 244 jccb(Assembler::below, L_done); 245 if (method_data != NULL) { 246 // set rtm_state to "always rtm" in MDO 247 mov_metadata(tmpReg, method_data); 248 lock(); 249 orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), UseRTM); 250 } 251 bind(L_done); 252 } 253 254 // Update counters and perform abort ratio calculation 255 // input: abort_status_Reg 256 // rtm_counters_Reg, flags are killed 257 void C2_MacroAssembler::rtm_profiling(Register abort_status_Reg, 258 Register rtm_counters_Reg, 259 RTMLockingCounters* rtm_counters, 260 Metadata* method_data, 261 bool profile_rtm) { 262 263 assert(rtm_counters != NULL, "should not be NULL when profiling RTM"); 264 // update rtm counters based on rax value at abort 265 // reads abort_status_Reg, updates flags 266 lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters)); 267 rtm_counters_update(abort_status_Reg, rtm_counters_Reg); 268 if (profile_rtm) { 269 // Save abort status because abort_status_Reg is used by following code. 270 if (RTMRetryCount > 0) { 271 push(abort_status_Reg); 272 } 273 assert(rtm_counters != NULL, "should not be NULL when profiling RTM"); 274 rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data); 275 // restore abort status 276 if (RTMRetryCount > 0) { 277 pop(abort_status_Reg); 278 } 279 } 280 } 281 282 // Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4) 283 // inputs: retry_count_Reg 284 // : abort_status_Reg 285 // output: retry_count_Reg decremented by 1 286 // flags are killed 287 void C2_MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) { 288 Label doneRetry; 289 assert(abort_status_Reg == rax, ""); 290 // The abort reason bits are in eax (see all states in rtmLocking.hpp) 291 // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4) 292 // if reason is in 0x6 and retry count != 0 then retry 293 andptr(abort_status_Reg, 0x6); 294 jccb(Assembler::zero, doneRetry); 295 testl(retry_count_Reg, retry_count_Reg); 296 jccb(Assembler::zero, doneRetry); 297 pause(); 298 decrementl(retry_count_Reg); 299 jmp(retryLabel); 300 bind(doneRetry); 301 } 302 303 // Spin and retry if lock is busy, 304 // inputs: box_Reg (monitor address) 305 // : retry_count_Reg 306 // output: retry_count_Reg decremented by 1 307 // : clear z flag if retry count exceeded 308 // tmp_Reg, scr_Reg, flags are killed 309 void C2_MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg, 310 Register tmp_Reg, Register scr_Reg, Label& retryLabel) { 311 Label SpinLoop, SpinExit, doneRetry; 312 int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner); 313 314 testl(retry_count_Reg, retry_count_Reg); 315 jccb(Assembler::zero, doneRetry); 316 decrementl(retry_count_Reg); 317 movptr(scr_Reg, RTMSpinLoopCount); 318 319 bind(SpinLoop); 320 pause(); 321 decrementl(scr_Reg); 322 jccb(Assembler::lessEqual, SpinExit); 323 movptr(tmp_Reg, Address(box_Reg, owner_offset)); 324 testptr(tmp_Reg, tmp_Reg); 325 jccb(Assembler::notZero, SpinLoop); 326 327 bind(SpinExit); 328 jmp(retryLabel); 329 bind(doneRetry); 330 incrementl(retry_count_Reg); // clear z flag 331 } 332 333 // Use RTM for normal stack locks 334 // Input: objReg (object to lock) 335 void C2_MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg, 336 Register retry_on_abort_count_Reg, 337 RTMLockingCounters* stack_rtm_counters, 338 Metadata* method_data, bool 
profile_rtm, 339 Label& DONE_LABEL, Label& IsInflated) { 340 assert(UseRTMForStackLocks, "why call this otherwise?"); 341 assert(tmpReg == rax, ""); 342 assert(scrReg == rdx, ""); 343 Label L_rtm_retry, L_decrement_retry, L_on_abort; 344 345 if (RTMRetryCount > 0) { 346 movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort 347 bind(L_rtm_retry); 348 } 349 movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); 350 testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral 351 jcc(Assembler::notZero, IsInflated); 352 353 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 354 Label L_noincrement; 355 if (RTMTotalCountIncrRate > 1) { 356 // tmpReg, scrReg and flags are killed 357 branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement); 358 } 359 assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM"); 360 atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg); 361 bind(L_noincrement); 362 } 363 xbegin(L_on_abort); 364 movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword 365 andptr(tmpReg, markWord::lock_mask_in_place); // look at 2 lock bits 366 cmpptr(tmpReg, markWord::unlocked_value); // bits = 01 unlocked 367 jcc(Assembler::equal, DONE_LABEL); // all done if unlocked 368 369 Register abort_status_Reg = tmpReg; // status of abort is stored in RAX 370 if (UseRTMXendForLockBusy) { 371 xend(); 372 movptr(abort_status_Reg, 0x2); // Set the abort status to 2 (so we can retry) 373 jmp(L_decrement_retry); 374 } 375 else { 376 xabort(0); 377 } 378 bind(L_on_abort); 379 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 380 rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm); 381 } 382 bind(L_decrement_retry); 383 if (RTMRetryCount > 0) { 384 // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4) 385 rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry); 386 } 387 } 388 389 // Use RTM for inflating locks 390 // inputs: objReg (object to lock) 391 // boxReg (on-stack box address (displaced header location) - KILLED) 392 // tmpReg (ObjectMonitor address + markWord::monitor_value) 393 void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg, 394 Register scrReg, Register retry_on_busy_count_Reg, 395 Register retry_on_abort_count_Reg, 396 RTMLockingCounters* rtm_counters, 397 Metadata* method_data, bool profile_rtm, 398 Label& DONE_LABEL) { 399 assert(UseRTMLocking, "why call this otherwise?"); 400 assert(tmpReg == rax, ""); 401 assert(scrReg == rdx, ""); 402 Label L_rtm_retry, L_decrement_retry, L_on_abort; 403 int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner); 404 405 movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value())); 406 movptr(boxReg, tmpReg); // Save ObjectMonitor address 407 408 if (RTMRetryCount > 0) { 409 movl(retry_on_busy_count_Reg, RTMRetryCount); // Retry on lock busy 410 movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort 411 bind(L_rtm_retry); 412 } 413 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 414 Label L_noincrement; 415 if (RTMTotalCountIncrRate > 1) { 416 // tmpReg, scrReg and flags are killed 417 branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement); 418 } 419 assert(rtm_counters != NULL, "should not be NULL when profiling RTM"); 420 atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), 
                   scrReg);
    bind(L_noincrement);
  }
  xbegin(L_on_abort);
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
  movptr(tmpReg, Address(tmpReg, owner_offset));
  testptr(tmpReg, tmpReg);
  jcc(Assembler::zero, DONE_LABEL);
  if (UseRTMXendForLockBusy) {
    xend();
    jmp(L_decrement_retry);
  }
  else {
    xabort(0);
  }
  bind(L_on_abort);
  Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
  }
  if (RTMRetryCount > 0) {
    // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
  }

  movptr(tmpReg, Address(boxReg, owner_offset)) ;
  testptr(tmpReg, tmpReg) ;
  jccb(Assembler::notZero, L_decrement_retry) ;

  // Appears unlocked - try to swing _owner from null to non-null.
  // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
#ifdef _LP64
  Register threadReg = r15_thread;
#else
  get_thread(scrReg);
  Register threadReg = scrReg;
#endif
  lock();
  cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg

  if (RTMRetryCount > 0) {
    // On success we are done; otherwise retry.
    jccb(Assembler::equal, DONE_LABEL) ;
    bind(L_decrement_retry);
    // Spin and retry if lock is busy.
    rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
  }
  else {
    bind(L_decrement_retry);
  }
}

#endif // INCLUDE_RTM_OPT

// fast_lock and fast_unlock used by C2

// Because the transitions from emitted code to the runtime
// monitorenter/exit helper stubs are so slow it's critical that
// we inline both the stack-locking fast path and the inflated fast path.
//
// See also: cmpFastLock and cmpFastUnlock.
//
// What follows is a specialized inline transliteration of the code
// in enter() and exit(). If we're concerned about I$ bloat another
// option would be to emit TrySlowEnter and TrySlowExit methods
// at startup-time. These methods would accept arguments as
// (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
// indications in the icc.ZFlag. fast_lock and fast_unlock would simply
// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
// In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
//
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods. That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
// to park() or unpark() threads. We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) explicit barriers or fence operations.
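//
// For illustration only -- a rough sketch of the out-of-line alternative discussed
// above; it is not what this file implements:
//   TrySlowEnter(rax=Obj, rbx=Self, rcx=box, rdx=Scratch)   // result in ICC.ZF
//   TrySlowExit (rax=Obj, rbx=Self, rcx=box, rdx=Scratch)   // result in ICC.ZF
// fast_lock and fast_unlock would then merely marshal those registers and emit the
// calls, trading I$ footprint for call overhead and shared branch-predictor state.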
//
// TODO:
//
// * Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
//   This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
//   Given TLAB allocation, Self is usually manifested in a register, so passing it into
//   the lock operators would typically be faster than reifying Self.
//
// * Ideally I'd define the primitives as:
//      fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
//      fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED.
//   Unfortunately ADLC bugs prevent us from expressing the ideal form.
//   Instead, we're stuck with the rather awkward and brittle register assignments below.
//   Furthermore the register assignments are overconstrained, possibly resulting in
//   sub-optimal code near the synchronization site.
//
// * Eliminate the sp-proximity tests and just use "== Self" tests instead.
//   Alternately, use a better sp-proximity test.
//
// * Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
//   Either one is sufficient to uniquely identify a thread.
//   TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
//
// * Intrinsify notify() and notifyAll() for the common cases where the
//   object is locked by the calling thread but the waitlist is empty.
//   Avoid the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
//
// * Use jccb and jmpb instead of jcc and jmp to improve code density.
//   But beware of excessive branch density on AMD Opterons.
//
// * Both fast_lock and fast_unlock set the ICC.ZF to indicate success
//   or failure of the fast path. If the fast path fails then we pass
//   control to the slow path, typically in C. In fast_lock and
//   fast_unlock we often branch to DONE_LABEL, just to find that C2
//   will emit a conditional branch immediately after the node.
//   So we have branches to branches and lots of ICC.ZF games.
//   Instead, it might be better to have C2 pass a "FailureLabel"
//   into fast_lock and fast_unlock. In the case of success, control
//   will drop through the node. ICC.ZF is undefined at exit.
//   In the case of failure, the node will branch directly to the
//   FailureLabel.


// obj: object to lock
// box: on-stack box address (displaced header location) - KILLED
// rax: tmp -- KILLED
// scr: tmp -- KILLED
void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
                                  Register scrReg, Register cx1Reg, Register cx2Reg,
                                  RTMLockingCounters* rtm_counters,
                                  RTMLockingCounters* stack_rtm_counters,
                                  Metadata* method_data,
                                  bool use_rtm, bool profile_rtm) {
  // Ensure the register assignments are disjoint
  assert(tmpReg == rax, "");

  if (use_rtm) {
    assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
  } else {
    assert(cx1Reg == noreg, "");
    assert(cx2Reg == noreg, "");
    assert_different_registers(objReg, boxReg, tmpReg, scrReg);
  }

  // Possible cases that we'll encounter in fast_lock
  // ------------------------------------------------
  // * Inflated
  //    -- unlocked
  //    -- Locked
  //       = by self
  //       = by other
  // * neutral
  // * stack-locked
  //    -- by self
  //       = sp-proximity test hits
  //       = sp-proximity test generates false-negative
  //    -- by other
  //

  Label IsInflated, DONE_LABEL, NO_COUNT, COUNT;

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(tmpReg, objReg, scrReg);
    movl(tmpReg, Address(tmpReg, Klass::access_flags_offset()));
    testl(tmpReg, JVM_ACC_IS_VALUE_BASED_CLASS);
    jcc(Assembler::notZero, DONE_LABEL);
  }

#if INCLUDE_RTM_OPT
  if (UseRTMForStackLocks && use_rtm) {
    assert(!UseHeavyMonitors, "+UseHeavyMonitors and +UseRTMForStackLocks are mutually exclusive");
    rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
                      stack_rtm_counters, method_data, profile_rtm,
                      DONE_LABEL, IsInflated);
  }
#endif // INCLUDE_RTM_OPT

  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // [FETCH]
  testptr(tmpReg, markWord::monitor_value);  // inflated vs stack-locked|neutral
  jccb(Assembler::notZero, IsInflated);

  if (!UseHeavyMonitors) {
    // Attempt stack-locking ...
    orptr (tmpReg, markWord::unlocked_value);
    movptr(Address(boxReg, 0), tmpReg);       // Anticipate successful CAS
    lock();
    cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Updates tmpReg
    jcc(Assembler::equal, COUNT);             // Success

    // Recursive locking.
    // The object is stack-locked: markword contains stack pointer to BasicLock.
    // Locked by current thread if difference with current SP is less than one page.
    subptr(tmpReg, rsp);
    // The next instruction sets ZFlag == 1 (success) if the difference is less than one page.
    andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - (int)os::vm_page_size())) );
    movptr(Address(boxReg, 0), tmpReg);
  } else {
    // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
    testptr(objReg, objReg);
  }
  jmp(DONE_LABEL);

  bind(IsInflated);
  // The object is inflated. tmpReg contains the ObjectMonitor* + markWord::monitor_value.

#if INCLUDE_RTM_OPT
  // Use the same RTM locking code in 32- and 64-bit VM.
  if (use_rtm) {
    rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
                         rtm_counters, method_data, profile_rtm, DONE_LABEL);
  } else {
#endif // INCLUDE_RTM_OPT

#ifndef _LP64
  // The object is inflated.

  // boxReg refers to the on-stack BasicLock in the current frame.
  // We'd like to write:
  //   set box->_displaced_header = markWord::unused_mark(). Any non-0 value suffices.
  // This is convenient but results in an ST-before-CAS penalty. The following CAS suffers
  // additional latency as we have another ST in the store buffer that must drain.

  // avoid ST-before-CAS
  // register juggle because we need tmpReg for cmpxchgptr below
  movptr(scrReg, boxReg);
  movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]

  // Optimistic form: consider XORL tmpReg,tmpReg
  movptr(tmpReg, NULL_WORD);

  // Appears unlocked - try to swing _owner from null to non-null.
  // Ideally, I'd manifest "Self" with get_thread and then attempt
  // to CAS the register containing Self into m->Owner.
  // But we don't have enough registers, so instead we can either try to CAS
  // rsp or the address of the box (in scr) into &m->owner. If the CAS succeeds
  // we later store "Self" into m->Owner. Transiently storing a stack address
  // (rsp or the address of the box) into m->owner is harmless.
  // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
  lock();
  cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  movptr(Address(scrReg, 0), 3);            // box->_displaced_header = 3
  // If we weren't able to swing _owner from NULL to the BasicLock
  // then take the slow path.
  jccb  (Assembler::notZero, NO_COUNT);
  // update _owner from BasicLock to thread
  get_thread (scrReg);                      // beware: clobbers ICCs
  movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
  xorptr(boxReg, boxReg);                   // set icc.ZFlag = 1 to indicate success

  // If the CAS fails we can either retry or pass control to the slow path.
  // We use the latter tactic.
  // Pass the CAS result in the icc.ZFlag into DONE_LABEL
  // If the CAS was successful ...
  //   Self has acquired the lock
  //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
  // Intentional fall-through into DONE_LABEL ...
#else // _LP64
  // It's inflated and we use scrReg for ObjectMonitor* in this section.
  movq(scrReg, tmpReg);
  xorq(tmpReg, tmpReg);
  lock();
  cmpxchgptr(r15_thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // Unconditionally set box->_displaced_header = markWord::unused_mark().
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
  // Propagate ICC.ZF from CAS above into DONE_LABEL.
  jccb(Assembler::equal, COUNT);          // CAS above succeeded; propagate ZF = 1 (success)

  cmpptr(r15_thread, rax);                // Check if we are already the owner (recursive lock)
  jccb(Assembler::notEqual, NO_COUNT);    // If not recursive, ZF = 0 at this point (fail)
  incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
#endif // _LP64
#if INCLUDE_RTM_OPT
  } // use_rtm()
#endif
  bind(DONE_LABEL);

  // ZFlag == 1 count in fast path
  // ZFlag == 0 count in slow path
  jccb(Assembler::notZero, NO_COUNT); // jump if ZFlag == 0

  bind(COUNT);
  // Count monitors in fast path
#ifndef _LP64
  get_thread(tmpReg);
  incrementl(Address(tmpReg, JavaThread::held_monitor_count_offset()));
#else // _LP64
  incrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
#endif

  xorl(tmpReg, tmpReg); // Set ZF == 1

  bind(NO_COUNT);

  // At NO_COUNT the icc ZFlag is set as follows ...
  // fast_unlock uses the same protocol.
  // ZFlag == 1 -> Success
  // ZFlag == 0 -> Failure - force control through the slow path
}

// obj: object to unlock
// box: box address (displaced header location), killed. Must be EAX.
// tmp: killed, cannot be obj nor box.
//
// Some commentary on balanced locking:
//
// fast_lock and fast_unlock are emitted only for provably balanced lock sites.
// Methods that don't have provably balanced locking are forced to run in the
// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
// The interpreter provides two properties:
// I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired by the current activation (frame). Recall that the
//      interpreter maintains an on-stack list of locks currently held by
//      a frame.
// I2:  If a method attempts to unlock an object that is not held by the
//      frame the interpreter throws IMSX (IllegalMonitorStateException).
//
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
// B() doesn't have provably balanced locking so it runs in the interpreter.
// Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O
// is still locked by A().
//
// The only other source of unbalanced locking would be JNI. The "Java Native Interface:
// Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
// should not be unlocked by "normal" java-level locking and vice-versa. The specification
// doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably, given that the spec legislates the JNI case as undefined, our implementation
// could reasonably *avoid* checking the owner in fast_unlock().
// In the interest of performance we elide the m->Owner == Self check in unlock.
// A perfectly viable alternative is to elide the owner check except when
// Xcheck:jni is enabled.
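//
// A rough sketch of how the ZF protocol above is consumed (illustration only; the
// actual consumers are the cmpFastLock/cmpFastUnlock nodes and their slow-path stubs):
//   fast_unlock(obj, box, tmp);   // leaves ICC.ZF
//   jne  slow_path                // ZF == 0 -> call the runtime monitorexit helper
// so the code below must leave ZF == 1 only when the unlock has fully completed.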

void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
  assert(boxReg == rax, "");
  assert_different_registers(objReg, boxReg, tmpReg);

  Label DONE_LABEL, Stacked, COUNT, NO_COUNT;

#if INCLUDE_RTM_OPT
  if (UseRTMForStackLocks && use_rtm) {
    assert(!UseHeavyMonitors, "+UseHeavyMonitors and +UseRTMForStackLocks are mutually exclusive");
    Label L_regular_unlock;
    movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
    andptr(tmpReg, markWord::lock_mask_in_place);                     // look at 2 lock bits
    cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 01 unlocked
    jccb(Assembler::notEqual, L_regular_unlock);                      // if !HLE RegularLock
    xend();                                                           // otherwise end...
    jmp(DONE_LABEL);                                                  // ... and we're done
    bind(L_regular_unlock);
  }
#endif

  if (!UseHeavyMonitors) {
    cmpptr(Address(boxReg, 0), NULL_WORD);                            // Examine the displaced header
    jcc   (Assembler::zero, COUNT);                                   // 0 indicates recursive stack-lock
  }
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));   // Examine the object's markword
  if (!UseHeavyMonitors) {
    testptr(tmpReg, markWord::monitor_value);                         // Inflated?
    jccb  (Assembler::zero, Stacked);
  }

  // It's inflated.
#if INCLUDE_RTM_OPT
  if (use_rtm) {
    Label L_regular_inflated_unlock;
    int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
    movptr(boxReg, Address(tmpReg, owner_offset));
    testptr(boxReg, boxReg);
    jccb(Assembler::notZero, L_regular_inflated_unlock);
    xend();
    jmpb(DONE_LABEL);
    bind(L_regular_inflated_unlock);
  }
#endif

  // Despite our balanced locking property we still check that m->_owner == Self
  // as java routines or native JNI code called by this thread might
  // have released the lock.
  // Refer to the comments in synchronizer.cpp for how we might encode extra
  // state in _succ so we can avoid fetching EntryList|cxq.
  //
  // If there's no contention try a 1-0 exit. That is, exit without
  // a costly MEMBAR or CAS. See synchronizer.cpp for details on how
  // we detect and recover from the race that the 1-0 exit admits.
  //
  // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
  // before it STs null into _owner, releasing the lock. Updates
  // to data protected by the critical section must be visible before
  // we drop the lock (and thus before any other thread could acquire
  // the lock and observe the fields protected by the lock).
  // IA32's memory model is TSO, so STs are ordered with respect to
  // each other and there's no need for an explicit barrier (fence).
  // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
#ifndef _LP64
  // Note that we could employ various encoding schemes to reduce
  // the number of loads below (currently 4) to just 2 or 3.
  // Refer to the comments in synchronizer.cpp.
  // In practice the chain of fetches doesn't seem to impact performance, however.
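  //
  // Roughly, the loads below implement (illustrative pseudo-code only, not emitted):
  //   if (m->_recursions != 0)             goto DONE;   // ZF == 0, slow path
  //   if ((m->_EntryList | m->_cxq) != 0)  goto DONE;   // contended, slow path
  //   m->_owner = NULL;                                 // 1-0 exit: plain store, no fence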
  xorptr(boxReg, boxReg);
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jccb  (Assembler::notZero, DONE_LABEL);
  movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
  jccb  (Assembler::notZero, DONE_LABEL);
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
  jmpb  (DONE_LABEL);
#else // _LP64
  // It's inflated
  Label CheckSucc, LNotRecursive, LSuccess, LGoSlowPath;

  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
  jccb(Assembler::equal, LNotRecursive);

  // Recursive inflated unlock
  decq(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jmpb(LSuccess);

  bind(LNotRecursive);
  movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
  jccb  (Assembler::notZero, CheckSucc);
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
  jmpb  (DONE_LABEL);

  // Try to avoid passing control into the slow_path ...
  bind  (CheckSucc);

  // The following optional optimization can be elided if necessary.
  // Effectively: if (succ == null) goto slow path
  // The code reduces the window for a race, however,
  // and thus benefits performance.
  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
  jccb  (Assembler::zero, LGoSlowPath);

  xorptr(boxReg, boxReg);
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);

  // Memory barrier/fence
  // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
  // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
  // This is faster on Nehalem and AMD Shanghai/Barcelona.
  // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
  // We might also restructure (ST Owner=0;barrier;LD _Succ) to
  // (mov box,0; xchgq box, &m->Owner; LD _succ) .
  lock(); addl(Address(rsp, 0), 0);

  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
  jccb  (Assembler::notZero, LSuccess);

  // Rare inopportune interleaving - race.
  // The successor vanished in the small window above.
  // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
  // We need to ensure progress and succession.
  // Try to reacquire the lock.
  // If that fails then the new owner is responsible for succession and this
  // thread needs to take no further action and can exit via the fast path (success).
  // If the re-acquire succeeds then pass control into the slow path.
  // As implemented, this latter mode is horrible because we generate more
  // coherence traffic on the lock *and* artificially extend the critical section
  // length by virtue of passing control into the slow path.

  // box is really RAX -- the following CMPXCHG depends on that binding
  // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
  lock();
  cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // There's no successor so we tried to regrab the lock.
893 // If that didn't work, then another thread grabbed the 894 // lock so we're done (and exit was a success). 895 jccb (Assembler::notEqual, LSuccess); 896 // Intentional fall-through into slow path 897 898 bind (LGoSlowPath); 899 orl (boxReg, 1); // set ICC.ZF=0 to indicate failure 900 jmpb (DONE_LABEL); 901 902 bind (LSuccess); 903 testl (boxReg, 0); // set ICC.ZF=1 to indicate success 904 jmpb (DONE_LABEL); 905 906 #endif 907 if (!UseHeavyMonitors) { 908 bind (Stacked); 909 movptr(tmpReg, Address (boxReg, 0)); // re-fetch 910 lock(); 911 cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box 912 // Intentional fall-thru into DONE_LABEL 913 } 914 bind(DONE_LABEL); 915 916 // ZFlag == 1 count in fast path 917 // ZFlag == 0 count in slow path 918 jccb(Assembler::notZero, NO_COUNT); 919 920 bind(COUNT); 921 // Count monitors in fast path 922 #ifndef _LP64 923 get_thread(tmpReg); 924 decrementl(Address(tmpReg, JavaThread::held_monitor_count_offset())); 925 #else // _LP64 926 decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset())); 927 #endif 928 929 xorl(tmpReg, tmpReg); // Set ZF == 1 930 931 bind(NO_COUNT); 932 } 933 934 //------------------------------------------------------------------------------------------- 935 // Generic instructions support for use in .ad files C2 code generation 936 937 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) { 938 if (dst != src) { 939 movdqu(dst, src); 940 } 941 if (opcode == Op_AbsVD) { 942 andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg); 943 } else { 944 assert((opcode == Op_NegVD),"opcode should be Op_NegD"); 945 xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg); 946 } 947 } 948 949 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) { 950 if (opcode == Op_AbsVD) { 951 vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg); 952 } else { 953 assert((opcode == Op_NegVD),"opcode should be Op_NegD"); 954 vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg); 955 } 956 } 957 958 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) { 959 if (dst != src) { 960 movdqu(dst, src); 961 } 962 if (opcode == Op_AbsVF) { 963 andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg); 964 } else { 965 assert((opcode == Op_NegVF),"opcode should be Op_NegF"); 966 xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg); 967 } 968 } 969 970 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) { 971 if (opcode == Op_AbsVF) { 972 vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg); 973 } else { 974 assert((opcode == Op_NegVF),"opcode should be Op_NegF"); 975 vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg); 976 } 977 } 978 979 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) { 980 assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity"); 981 assert(tmp == xnoreg || elem_bt == T_LONG, "unused"); 982 983 if (opcode == Op_MinV) { 984 if (elem_bt == T_BYTE) { 985 pminsb(dst, src); 986 } else if (elem_bt == T_SHORT) { 987 pminsw(dst, src); 988 } else if (elem_bt == T_INT) { 989 pminsd(dst, src); 990 } else { 991 assert(elem_bt == 
T_LONG, "required"); 992 assert(tmp == xmm0, "required"); 993 assert_different_registers(dst, src, tmp); 994 movdqu(xmm0, dst); 995 pcmpgtq(xmm0, src); 996 blendvpd(dst, src); // xmm0 as mask 997 } 998 } else { // opcode == Op_MaxV 999 if (elem_bt == T_BYTE) { 1000 pmaxsb(dst, src); 1001 } else if (elem_bt == T_SHORT) { 1002 pmaxsw(dst, src); 1003 } else if (elem_bt == T_INT) { 1004 pmaxsd(dst, src); 1005 } else { 1006 assert(elem_bt == T_LONG, "required"); 1007 assert(tmp == xmm0, "required"); 1008 assert_different_registers(dst, src, tmp); 1009 movdqu(xmm0, src); 1010 pcmpgtq(xmm0, dst); 1011 blendvpd(dst, src); // xmm0 as mask 1012 } 1013 } 1014 } 1015 1016 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt, 1017 XMMRegister dst, XMMRegister src1, XMMRegister src2, 1018 int vlen_enc) { 1019 assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity"); 1020 1021 if (opcode == Op_MinV) { 1022 if (elem_bt == T_BYTE) { 1023 vpminsb(dst, src1, src2, vlen_enc); 1024 } else if (elem_bt == T_SHORT) { 1025 vpminsw(dst, src1, src2, vlen_enc); 1026 } else if (elem_bt == T_INT) { 1027 vpminsd(dst, src1, src2, vlen_enc); 1028 } else { 1029 assert(elem_bt == T_LONG, "required"); 1030 if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) { 1031 vpminsq(dst, src1, src2, vlen_enc); 1032 } else { 1033 assert_different_registers(dst, src1, src2); 1034 vpcmpgtq(dst, src1, src2, vlen_enc); 1035 vblendvpd(dst, src1, src2, dst, vlen_enc); 1036 } 1037 } 1038 } else { // opcode == Op_MaxV 1039 if (elem_bt == T_BYTE) { 1040 vpmaxsb(dst, src1, src2, vlen_enc); 1041 } else if (elem_bt == T_SHORT) { 1042 vpmaxsw(dst, src1, src2, vlen_enc); 1043 } else if (elem_bt == T_INT) { 1044 vpmaxsd(dst, src1, src2, vlen_enc); 1045 } else { 1046 assert(elem_bt == T_LONG, "required"); 1047 if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) { 1048 vpmaxsq(dst, src1, src2, vlen_enc); 1049 } else { 1050 assert_different_registers(dst, src1, src2); 1051 vpcmpgtq(dst, src1, src2, vlen_enc); 1052 vblendvpd(dst, src2, src1, dst, vlen_enc); 1053 } 1054 } 1055 } 1056 } 1057 1058 // Float/Double min max 1059 1060 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt, 1061 XMMRegister dst, XMMRegister a, XMMRegister b, 1062 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 1063 int vlen_enc) { 1064 assert(UseAVX > 0, "required"); 1065 assert(opcode == Op_MinV || opcode == Op_MinReductionV || 1066 opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity"); 1067 assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity"); 1068 assert_different_registers(a, b, tmp, atmp, btmp); 1069 1070 bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV); 1071 bool is_double_word = is_double_word_type(elem_bt); 1072 1073 if (!is_double_word && is_min) { 1074 vblendvps(atmp, a, b, a, vlen_enc); 1075 vblendvps(btmp, b, a, a, vlen_enc); 1076 vminps(tmp, atmp, btmp, vlen_enc); 1077 vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1078 vblendvps(dst, tmp, atmp, btmp, vlen_enc); 1079 } else if (!is_double_word && !is_min) { 1080 vblendvps(btmp, b, a, b, vlen_enc); 1081 vblendvps(atmp, a, b, b, vlen_enc); 1082 vmaxps(tmp, atmp, btmp, vlen_enc); 1083 vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1084 vblendvps(dst, tmp, atmp, btmp, vlen_enc); 1085 } else if (is_double_word && is_min) { 1086 vblendvpd(atmp, a, b, a, vlen_enc); 1087 vblendvpd(btmp, b, a, a, vlen_enc); 1088 vminpd(tmp, atmp, btmp, vlen_enc); 1089 vcmppd(btmp, atmp, atmp, 
Assembler::UNORD_Q, vlen_enc); 1090 vblendvpd(dst, tmp, atmp, btmp, vlen_enc); 1091 } else { 1092 assert(is_double_word && !is_min, "sanity"); 1093 vblendvpd(btmp, b, a, b, vlen_enc); 1094 vblendvpd(atmp, a, b, b, vlen_enc); 1095 vmaxpd(tmp, atmp, btmp, vlen_enc); 1096 vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1097 vblendvpd(dst, tmp, atmp, btmp, vlen_enc); 1098 } 1099 } 1100 1101 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt, 1102 XMMRegister dst, XMMRegister a, XMMRegister b, 1103 KRegister ktmp, XMMRegister atmp, XMMRegister btmp, 1104 int vlen_enc) { 1105 assert(UseAVX > 2, "required"); 1106 assert(opcode == Op_MinV || opcode == Op_MinReductionV || 1107 opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity"); 1108 assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity"); 1109 assert_different_registers(dst, a, b, atmp, btmp); 1110 1111 bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV); 1112 bool is_double_word = is_double_word_type(elem_bt); 1113 bool merge = true; 1114 1115 if (!is_double_word && is_min) { 1116 evpmovd2m(ktmp, a, vlen_enc); 1117 evblendmps(atmp, ktmp, a, b, merge, vlen_enc); 1118 evblendmps(btmp, ktmp, b, a, merge, vlen_enc); 1119 vminps(dst, atmp, btmp, vlen_enc); 1120 evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1121 evmovdqul(dst, ktmp, atmp, merge, vlen_enc); 1122 } else if (!is_double_word && !is_min) { 1123 evpmovd2m(ktmp, b, vlen_enc); 1124 evblendmps(atmp, ktmp, a, b, merge, vlen_enc); 1125 evblendmps(btmp, ktmp, b, a, merge, vlen_enc); 1126 vmaxps(dst, atmp, btmp, vlen_enc); 1127 evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1128 evmovdqul(dst, ktmp, atmp, merge, vlen_enc); 1129 } else if (is_double_word && is_min) { 1130 evpmovq2m(ktmp, a, vlen_enc); 1131 evblendmpd(atmp, ktmp, a, b, merge, vlen_enc); 1132 evblendmpd(btmp, ktmp, b, a, merge, vlen_enc); 1133 vminpd(dst, atmp, btmp, vlen_enc); 1134 evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1135 evmovdquq(dst, ktmp, atmp, merge, vlen_enc); 1136 } else { 1137 assert(is_double_word && !is_min, "sanity"); 1138 evpmovq2m(ktmp, b, vlen_enc); 1139 evblendmpd(atmp, ktmp, a, b, merge, vlen_enc); 1140 evblendmpd(btmp, ktmp, b, a, merge, vlen_enc); 1141 vmaxpd(dst, atmp, btmp, vlen_enc); 1142 evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1143 evmovdquq(dst, ktmp, atmp, merge, vlen_enc); 1144 } 1145 } 1146 1147 // Float/Double signum 1148 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) { 1149 assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity"); 1150 1151 Label DONE_LABEL; 1152 1153 if (opcode == Op_SignumF) { 1154 assert(UseSSE > 0, "required"); 1155 ucomiss(dst, zero); 1156 jcc(Assembler::equal, DONE_LABEL); // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument 1157 jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN 1158 movflt(dst, one); 1159 jcc(Assembler::above, DONE_LABEL); 1160 xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg); 1161 } else if (opcode == Op_SignumD) { 1162 assert(UseSSE > 1, "required"); 1163 ucomisd(dst, zero); 1164 jcc(Assembler::equal, DONE_LABEL); // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument 1165 jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN 1166 movdbl(dst, one); 1167 jcc(Assembler::above, DONE_LABEL); 1168 xorpd(dst, 
ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg); 1169 } 1170 1171 bind(DONE_LABEL); 1172 } 1173 1174 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) { 1175 if (sign) { 1176 pmovsxbw(dst, src); 1177 } else { 1178 pmovzxbw(dst, src); 1179 } 1180 } 1181 1182 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1183 if (sign) { 1184 vpmovsxbw(dst, src, vector_len); 1185 } else { 1186 vpmovzxbw(dst, src, vector_len); 1187 } 1188 } 1189 1190 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1191 if (sign) { 1192 vpmovsxbd(dst, src, vector_len); 1193 } else { 1194 vpmovzxbd(dst, src, vector_len); 1195 } 1196 } 1197 1198 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1199 if (sign) { 1200 vpmovsxwd(dst, src, vector_len); 1201 } else { 1202 vpmovzxwd(dst, src, vector_len); 1203 } 1204 } 1205 1206 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src, 1207 int shift, int vector_len) { 1208 if (opcode == Op_RotateLeftV) { 1209 if (etype == T_INT) { 1210 evprold(dst, src, shift, vector_len); 1211 } else { 1212 assert(etype == T_LONG, "expected type T_LONG"); 1213 evprolq(dst, src, shift, vector_len); 1214 } 1215 } else { 1216 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV"); 1217 if (etype == T_INT) { 1218 evprord(dst, src, shift, vector_len); 1219 } else { 1220 assert(etype == T_LONG, "expected type T_LONG"); 1221 evprorq(dst, src, shift, vector_len); 1222 } 1223 } 1224 } 1225 1226 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src, 1227 XMMRegister shift, int vector_len) { 1228 if (opcode == Op_RotateLeftV) { 1229 if (etype == T_INT) { 1230 evprolvd(dst, src, shift, vector_len); 1231 } else { 1232 assert(etype == T_LONG, "expected type T_LONG"); 1233 evprolvq(dst, src, shift, vector_len); 1234 } 1235 } else { 1236 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV"); 1237 if (etype == T_INT) { 1238 evprorvd(dst, src, shift, vector_len); 1239 } else { 1240 assert(etype == T_LONG, "expected type T_LONG"); 1241 evprorvq(dst, src, shift, vector_len); 1242 } 1243 } 1244 } 1245 1246 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) { 1247 if (opcode == Op_RShiftVI) { 1248 psrad(dst, shift); 1249 } else if (opcode == Op_LShiftVI) { 1250 pslld(dst, shift); 1251 } else { 1252 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI"); 1253 psrld(dst, shift); 1254 } 1255 } 1256 1257 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) { 1258 switch (opcode) { 1259 case Op_RShiftVI: psrad(dst, shift); break; 1260 case Op_LShiftVI: pslld(dst, shift); break; 1261 case Op_URShiftVI: psrld(dst, shift); break; 1262 1263 default: assert(false, "%s", NodeClassNames[opcode]); 1264 } 1265 } 1266 1267 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 1268 if (opcode == Op_RShiftVI) { 1269 vpsrad(dst, nds, shift, vector_len); 1270 } else if (opcode == Op_LShiftVI) { 1271 vpslld(dst, nds, shift, vector_len); 1272 } else { 1273 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI"); 1274 vpsrld(dst, nds, shift, vector_len); 1275 } 1276 } 1277 1278 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1279 switch 
(opcode) { 1280 case Op_RShiftVI: vpsrad(dst, src, shift, vlen_enc); break; 1281 case Op_LShiftVI: vpslld(dst, src, shift, vlen_enc); break; 1282 case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break; 1283 1284 default: assert(false, "%s", NodeClassNames[opcode]); 1285 } 1286 } 1287 1288 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) { 1289 switch (opcode) { 1290 case Op_RShiftVB: // fall-through 1291 case Op_RShiftVS: psraw(dst, shift); break; 1292 1293 case Op_LShiftVB: // fall-through 1294 case Op_LShiftVS: psllw(dst, shift); break; 1295 1296 case Op_URShiftVS: // fall-through 1297 case Op_URShiftVB: psrlw(dst, shift); break; 1298 1299 default: assert(false, "%s", NodeClassNames[opcode]); 1300 } 1301 } 1302 1303 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1304 switch (opcode) { 1305 case Op_RShiftVB: // fall-through 1306 case Op_RShiftVS: vpsraw(dst, src, shift, vlen_enc); break; 1307 1308 case Op_LShiftVB: // fall-through 1309 case Op_LShiftVS: vpsllw(dst, src, shift, vlen_enc); break; 1310 1311 case Op_URShiftVS: // fall-through 1312 case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break; 1313 1314 default: assert(false, "%s", NodeClassNames[opcode]); 1315 } 1316 } 1317 1318 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) { 1319 switch (opcode) { 1320 case Op_RShiftVL: psrlq(dst, shift); break; // using srl to implement sra on pre-avs512 systems 1321 case Op_LShiftVL: psllq(dst, shift); break; 1322 case Op_URShiftVL: psrlq(dst, shift); break; 1323 1324 default: assert(false, "%s", NodeClassNames[opcode]); 1325 } 1326 } 1327 1328 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) { 1329 if (opcode == Op_RShiftVL) { 1330 psrlq(dst, shift); // using srl to implement sra on pre-avs512 systems 1331 } else if (opcode == Op_LShiftVL) { 1332 psllq(dst, shift); 1333 } else { 1334 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL"); 1335 psrlq(dst, shift); 1336 } 1337 } 1338 1339 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1340 switch (opcode) { 1341 case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break; 1342 case Op_LShiftVL: vpsllq(dst, src, shift, vlen_enc); break; 1343 case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break; 1344 1345 default: assert(false, "%s", NodeClassNames[opcode]); 1346 } 1347 } 1348 1349 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 1350 if (opcode == Op_RShiftVL) { 1351 evpsraq(dst, nds, shift, vector_len); 1352 } else if (opcode == Op_LShiftVL) { 1353 vpsllq(dst, nds, shift, vector_len); 1354 } else { 1355 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL"); 1356 vpsrlq(dst, nds, shift, vector_len); 1357 } 1358 } 1359 1360 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1361 switch (opcode) { 1362 case Op_RShiftVB: // fall-through 1363 case Op_RShiftVS: // fall-through 1364 case Op_RShiftVI: vpsravd(dst, src, shift, vlen_enc); break; 1365 1366 case Op_LShiftVB: // fall-through 1367 case Op_LShiftVS: // fall-through 1368 case Op_LShiftVI: vpsllvd(dst, src, shift, vlen_enc); break; 1369 1370 case Op_URShiftVB: // fall-through 1371 case Op_URShiftVS: // fall-through 1372 case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break; 1373 1374 default: assert(false, 
"%s", NodeClassNames[opcode]); 1375 } 1376 } 1377 1378 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1379 switch (opcode) { 1380 case Op_RShiftVB: // fall-through 1381 case Op_RShiftVS: evpsravw(dst, src, shift, vlen_enc); break; 1382 1383 case Op_LShiftVB: // fall-through 1384 case Op_LShiftVS: evpsllvw(dst, src, shift, vlen_enc); break; 1385 1386 case Op_URShiftVB: // fall-through 1387 case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break; 1388 1389 default: assert(false, "%s", NodeClassNames[opcode]); 1390 } 1391 } 1392 1393 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) { 1394 assert(UseAVX >= 2, "required"); 1395 switch (opcode) { 1396 case Op_RShiftVL: { 1397 if (UseAVX > 2) { 1398 assert(tmp == xnoreg, "not used"); 1399 if (!VM_Version::supports_avx512vl()) { 1400 vlen_enc = Assembler::AVX_512bit; 1401 } 1402 evpsravq(dst, src, shift, vlen_enc); 1403 } else { 1404 vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask())); 1405 vpsrlvq(dst, src, shift, vlen_enc); 1406 vpsrlvq(tmp, tmp, shift, vlen_enc); 1407 vpxor(dst, dst, tmp, vlen_enc); 1408 vpsubq(dst, dst, tmp, vlen_enc); 1409 } 1410 break; 1411 } 1412 case Op_LShiftVL: { 1413 assert(tmp == xnoreg, "not used"); 1414 vpsllvq(dst, src, shift, vlen_enc); 1415 break; 1416 } 1417 case Op_URShiftVL: { 1418 assert(tmp == xnoreg, "not used"); 1419 vpsrlvq(dst, src, shift, vlen_enc); 1420 break; 1421 } 1422 default: assert(false, "%s", NodeClassNames[opcode]); 1423 } 1424 } 1425 1426 // Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst 1427 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) { 1428 assert(opcode == Op_LShiftVB || 1429 opcode == Op_RShiftVB || 1430 opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]); 1431 bool sign = (opcode != Op_URShiftVB); 1432 assert(vector_len == 0, "required"); 1433 vextendbd(sign, dst, src, 1); 1434 vpmovzxbd(vtmp, shift, 1); 1435 varshiftd(opcode, dst, dst, vtmp, 1); 1436 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg); 1437 vextracti128_high(vtmp, dst); 1438 vpackusdw(dst, dst, vtmp, 0); 1439 } 1440 1441 // Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst 1442 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) { 1443 assert(opcode == Op_LShiftVB || 1444 opcode == Op_RShiftVB || 1445 opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]); 1446 bool sign = (opcode != Op_URShiftVB); 1447 int ext_vector_len = vector_len + 1; 1448 vextendbw(sign, dst, src, ext_vector_len); 1449 vpmovzxbw(vtmp, shift, ext_vector_len); 1450 varshiftw(opcode, dst, dst, vtmp, ext_vector_len); 1451 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg); 1452 if (vector_len == 0) { 1453 vextracti128_high(vtmp, dst); 1454 vpackuswb(dst, dst, vtmp, vector_len); 1455 } else { 1456 vextracti64x4_high(vtmp, dst); 1457 vpackuswb(dst, dst, vtmp, vector_len); 1458 vpermq(dst, dst, 0xD8, vector_len); 1459 } 1460 } 1461 1462 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) { 1463 switch(typ) { 1464 case T_BYTE: 1465 pinsrb(dst, val, idx); 1466 break; 1467 case T_SHORT: 1468 pinsrw(dst, val, idx); 1469 
break; 1470 case T_INT: 1471 pinsrd(dst, val, idx); 1472 break; 1473 case T_LONG: 1474 pinsrq(dst, val, idx); 1475 break; 1476 default: 1477 assert(false,"Should not reach here."); 1478 break; 1479 } 1480 } 1481 1482 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) { 1483 switch(typ) { 1484 case T_BYTE: 1485 vpinsrb(dst, src, val, idx); 1486 break; 1487 case T_SHORT: 1488 vpinsrw(dst, src, val, idx); 1489 break; 1490 case T_INT: 1491 vpinsrd(dst, src, val, idx); 1492 break; 1493 case T_LONG: 1494 vpinsrq(dst, src, val, idx); 1495 break; 1496 default: 1497 assert(false,"Should not reach here."); 1498 break; 1499 } 1500 } 1501 1502 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) { 1503 switch(typ) { 1504 case T_INT: 1505 vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len); 1506 break; 1507 case T_FLOAT: 1508 vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len); 1509 break; 1510 case T_LONG: 1511 vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len); 1512 break; 1513 case T_DOUBLE: 1514 vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len); 1515 break; 1516 default: 1517 assert(false,"Should not reach here."); 1518 break; 1519 } 1520 } 1521 1522 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) { 1523 switch(typ) { 1524 case T_INT: 1525 evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len); 1526 break; 1527 case T_FLOAT: 1528 evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len); 1529 break; 1530 case T_LONG: 1531 evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len); 1532 break; 1533 case T_DOUBLE: 1534 evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len); 1535 break; 1536 default: 1537 assert(false,"Should not reach here."); 1538 break; 1539 } 1540 } 1541 1542 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) { 1543 switch(typ) { 1544 case T_INT: 1545 evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len); 1546 break; 1547 case T_FLOAT: 1548 evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len); 1549 break; 1550 case T_LONG: 1551 evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len); 1552 break; 1553 case T_DOUBLE: 1554 evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len); 1555 break; 1556 default: 1557 assert(false,"Should not reach here."); 1558 break; 1559 } 1560 } 1561 1562 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) { 1563 if (vlen_in_bytes <= 16) { 1564 pxor (dst, dst); 1565 psubb(dst, src); 1566 switch (elem_bt) { 1567 case T_BYTE: /* nothing to do */ break; 1568 case T_SHORT: pmovsxbw(dst, dst); break; 1569 case T_INT: pmovsxbd(dst, dst); break; 1570 case T_FLOAT: pmovsxbd(dst, dst); break; 1571 case T_LONG: pmovsxbq(dst, dst); break; 1572 case T_DOUBLE: pmovsxbq(dst, dst); break; 1573 1574 default: assert(false, "%s", type2name(elem_bt)); 1575 } 1576 } else { 1577 assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, ""); 1578 int vlen_enc = vector_length_encoding(vlen_in_bytes); 1579 1580 vpxor (dst, dst, dst, vlen_enc); 1581 vpsubb(dst, dst, src, is_legacy ? 
AVX_256bit : vlen_enc); 1582 1583 switch (elem_bt) { 1584 case T_BYTE: /* nothing to do */ break; 1585 case T_SHORT: vpmovsxbw(dst, dst, vlen_enc); break; 1586 case T_INT: vpmovsxbd(dst, dst, vlen_enc); break; 1587 case T_FLOAT: vpmovsxbd(dst, dst, vlen_enc); break; 1588 case T_LONG: vpmovsxbq(dst, dst, vlen_enc); break; 1589 case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break; 1590 1591 default: assert(false, "%s", type2name(elem_bt)); 1592 } 1593 } 1594 } 1595 1596 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) { 1597 if (novlbwdq) { 1598 vpmovsxbd(xtmp, src, vlen_enc); 1599 evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()), 1600 Assembler::eq, true, vlen_enc, noreg); 1601 } else { 1602 vpxor(xtmp, xtmp, xtmp, vlen_enc); 1603 vpsubb(xtmp, xtmp, src, vlen_enc); 1604 evpmovb2m(dst, xtmp, vlen_enc); 1605 } 1606 } 1607 1608 void C2_MacroAssembler::load_vector(XMMRegister dst, Address src, int vlen_in_bytes) { 1609 switch (vlen_in_bytes) { 1610 case 4: movdl(dst, src); break; 1611 case 8: movq(dst, src); break; 1612 case 16: movdqu(dst, src); break; 1613 case 32: vmovdqu(dst, src); break; 1614 case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break; 1615 default: ShouldNotReachHere(); 1616 } 1617 } 1618 1619 void C2_MacroAssembler::load_vector(XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) { 1620 assert(rscratch != noreg || always_reachable(src), "missing"); 1621 1622 if (reachable(src)) { 1623 load_vector(dst, as_Address(src), vlen_in_bytes); 1624 } else { 1625 lea(rscratch, src); 1626 load_vector(dst, Address(rscratch, 0), vlen_in_bytes); 1627 } 1628 } 1629 1630 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) { 1631 int vlen_enc = vector_length_encoding(vlen); 1632 if (VM_Version::supports_avx()) { 1633 if (bt == T_LONG) { 1634 if (VM_Version::supports_avx2()) { 1635 vpbroadcastq(dst, src, vlen_enc); 1636 } else { 1637 vmovddup(dst, src, vlen_enc); 1638 } 1639 } else if (bt == T_DOUBLE) { 1640 if (vlen_enc != Assembler::AVX_128bit) { 1641 vbroadcastsd(dst, src, vlen_enc, noreg); 1642 } else { 1643 vmovddup(dst, src, vlen_enc); 1644 } 1645 } else { 1646 if (VM_Version::supports_avx2() && is_integral_type(bt)) { 1647 vpbroadcastd(dst, src, vlen_enc); 1648 } else { 1649 vbroadcastss(dst, src, vlen_enc); 1650 } 1651 } 1652 } else if (VM_Version::supports_sse3()) { 1653 movddup(dst, src); 1654 } else { 1655 movq(dst, src); 1656 if (vlen == 16) { 1657 punpcklqdq(dst, dst); 1658 } 1659 } 1660 } 1661 1662 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) { 1663 // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64. 1664 int offset = exact_log2(type2aelembytes(bt)) << 6; 1665 if (is_floating_point_type(bt)) { 1666 offset += 128; 1667 } 1668 ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset); 1669 load_vector(dst, addr, vlen_in_bytes); 1670 } 1671 1672 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles. 
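// The reduction helpers below all follow the same pattern: repeatedly fold the
// upper half of the vector onto the lower half with the element-wise operation
// (vextract*/pshufd followed by the op), then combine the surviving lane with
// the scalar input src1. A rough scalar model of the integer reductions, for
// exposition only (OP, vlen and src2[] are placeholders, not emitted code):
//
//   acc = src1;
//   for (int i = 0; i < vlen; i++) { acc = OP(acc, src2[i]); }
//   dst = acc;
//
// reduce_operation_128/256 below provide the element-wise OP for one fold step.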
1673 1674 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) { 1675 int vector_len = Assembler::AVX_128bit; 1676 1677 switch (opcode) { 1678 case Op_AndReductionV: pand(dst, src); break; 1679 case Op_OrReductionV: por (dst, src); break; 1680 case Op_XorReductionV: pxor(dst, src); break; 1681 case Op_MinReductionV: 1682 switch (typ) { 1683 case T_BYTE: pminsb(dst, src); break; 1684 case T_SHORT: pminsw(dst, src); break; 1685 case T_INT: pminsd(dst, src); break; 1686 case T_LONG: assert(UseAVX > 2, "required"); 1687 vpminsq(dst, dst, src, Assembler::AVX_128bit); break; 1688 default: assert(false, "wrong type"); 1689 } 1690 break; 1691 case Op_MaxReductionV: 1692 switch (typ) { 1693 case T_BYTE: pmaxsb(dst, src); break; 1694 case T_SHORT: pmaxsw(dst, src); break; 1695 case T_INT: pmaxsd(dst, src); break; 1696 case T_LONG: assert(UseAVX > 2, "required"); 1697 vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break; 1698 default: assert(false, "wrong type"); 1699 } 1700 break; 1701 case Op_AddReductionVF: addss(dst, src); break; 1702 case Op_AddReductionVD: addsd(dst, src); break; 1703 case Op_AddReductionVI: 1704 switch (typ) { 1705 case T_BYTE: paddb(dst, src); break; 1706 case T_SHORT: paddw(dst, src); break; 1707 case T_INT: paddd(dst, src); break; 1708 default: assert(false, "wrong type"); 1709 } 1710 break; 1711 case Op_AddReductionVL: paddq(dst, src); break; 1712 case Op_MulReductionVF: mulss(dst, src); break; 1713 case Op_MulReductionVD: mulsd(dst, src); break; 1714 case Op_MulReductionVI: 1715 switch (typ) { 1716 case T_SHORT: pmullw(dst, src); break; 1717 case T_INT: pmulld(dst, src); break; 1718 default: assert(false, "wrong type"); 1719 } 1720 break; 1721 case Op_MulReductionVL: assert(UseAVX > 2, "required"); 1722 evpmullq(dst, dst, src, vector_len); break; 1723 default: assert(false, "wrong opcode"); 1724 } 1725 } 1726 1727 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 1728 int vector_len = Assembler::AVX_256bit; 1729 1730 switch (opcode) { 1731 case Op_AndReductionV: vpand(dst, src1, src2, vector_len); break; 1732 case Op_OrReductionV: vpor (dst, src1, src2, vector_len); break; 1733 case Op_XorReductionV: vpxor(dst, src1, src2, vector_len); break; 1734 case Op_MinReductionV: 1735 switch (typ) { 1736 case T_BYTE: vpminsb(dst, src1, src2, vector_len); break; 1737 case T_SHORT: vpminsw(dst, src1, src2, vector_len); break; 1738 case T_INT: vpminsd(dst, src1, src2, vector_len); break; 1739 case T_LONG: assert(UseAVX > 2, "required"); 1740 vpminsq(dst, src1, src2, vector_len); break; 1741 default: assert(false, "wrong type"); 1742 } 1743 break; 1744 case Op_MaxReductionV: 1745 switch (typ) { 1746 case T_BYTE: vpmaxsb(dst, src1, src2, vector_len); break; 1747 case T_SHORT: vpmaxsw(dst, src1, src2, vector_len); break; 1748 case T_INT: vpmaxsd(dst, src1, src2, vector_len); break; 1749 case T_LONG: assert(UseAVX > 2, "required"); 1750 vpmaxsq(dst, src1, src2, vector_len); break; 1751 default: assert(false, "wrong type"); 1752 } 1753 break; 1754 case Op_AddReductionVI: 1755 switch (typ) { 1756 case T_BYTE: vpaddb(dst, src1, src2, vector_len); break; 1757 case T_SHORT: vpaddw(dst, src1, src2, vector_len); break; 1758 case T_INT: vpaddd(dst, src1, src2, vector_len); break; 1759 default: assert(false, "wrong type"); 1760 } 1761 break; 1762 case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break; 1763 case Op_MulReductionVI: 1764 switch (typ) { 1765 
case T_SHORT: vpmullw(dst, src1, src2, vector_len); break; 1766 case T_INT: vpmulld(dst, src1, src2, vector_len); break; 1767 default: assert(false, "wrong type"); 1768 } 1769 break; 1770 case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break; 1771 default: assert(false, "wrong opcode"); 1772 } 1773 } 1774 1775 void C2_MacroAssembler::reduce_fp(int opcode, int vlen, 1776 XMMRegister dst, XMMRegister src, 1777 XMMRegister vtmp1, XMMRegister vtmp2) { 1778 switch (opcode) { 1779 case Op_AddReductionVF: 1780 case Op_MulReductionVF: 1781 reduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 1782 break; 1783 1784 case Op_AddReductionVD: 1785 case Op_MulReductionVD: 1786 reduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 1787 break; 1788 1789 default: assert(false, "wrong opcode"); 1790 } 1791 } 1792 1793 void C2_MacroAssembler::reduceB(int opcode, int vlen, 1794 Register dst, Register src1, XMMRegister src2, 1795 XMMRegister vtmp1, XMMRegister vtmp2) { 1796 switch (vlen) { 1797 case 8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1798 case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1799 case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1800 case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1801 1802 default: assert(false, "wrong vector length"); 1803 } 1804 } 1805 1806 void C2_MacroAssembler::mulreduceB(int opcode, int vlen, 1807 Register dst, Register src1, XMMRegister src2, 1808 XMMRegister vtmp1, XMMRegister vtmp2) { 1809 switch (vlen) { 1810 case 8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1811 case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1812 case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1813 case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1814 1815 default: assert(false, "wrong vector length"); 1816 } 1817 } 1818 1819 void C2_MacroAssembler::reduceS(int opcode, int vlen, 1820 Register dst, Register src1, XMMRegister src2, 1821 XMMRegister vtmp1, XMMRegister vtmp2) { 1822 switch (vlen) { 1823 case 4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1824 case 8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1825 case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1826 case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1827 1828 default: assert(false, "wrong vector length"); 1829 } 1830 } 1831 1832 void C2_MacroAssembler::reduceI(int opcode, int vlen, 1833 Register dst, Register src1, XMMRegister src2, 1834 XMMRegister vtmp1, XMMRegister vtmp2) { 1835 switch (vlen) { 1836 case 2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1837 case 4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1838 case 8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1839 case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1840 1841 default: assert(false, "wrong vector length"); 1842 } 1843 } 1844 1845 #ifdef _LP64 1846 void C2_MacroAssembler::reduceL(int opcode, int vlen, 1847 Register dst, Register src1, XMMRegister src2, 1848 XMMRegister vtmp1, XMMRegister vtmp2) { 1849 switch (vlen) { 1850 case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1851 case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1852 case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1853 1854 default: assert(false, "wrong vector length"); 1855 } 1856 } 1857 #endif // _LP64 1858 1859 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister 
src, XMMRegister vtmp1, XMMRegister vtmp2) { 1860 switch (vlen) { 1861 case 2: 1862 assert(vtmp2 == xnoreg, ""); 1863 reduce2F(opcode, dst, src, vtmp1); 1864 break; 1865 case 4: 1866 assert(vtmp2 == xnoreg, ""); 1867 reduce4F(opcode, dst, src, vtmp1); 1868 break; 1869 case 8: 1870 reduce8F(opcode, dst, src, vtmp1, vtmp2); 1871 break; 1872 case 16: 1873 reduce16F(opcode, dst, src, vtmp1, vtmp2); 1874 break; 1875 default: assert(false, "wrong vector length"); 1876 } 1877 } 1878 1879 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 1880 switch (vlen) { 1881 case 2: 1882 assert(vtmp2 == xnoreg, ""); 1883 reduce2D(opcode, dst, src, vtmp1); 1884 break; 1885 case 4: 1886 reduce4D(opcode, dst, src, vtmp1, vtmp2); 1887 break; 1888 case 8: 1889 reduce8D(opcode, dst, src, vtmp1, vtmp2); 1890 break; 1891 default: assert(false, "wrong vector length"); 1892 } 1893 } 1894 1895 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1896 if (opcode == Op_AddReductionVI) { 1897 if (vtmp1 != src2) { 1898 movdqu(vtmp1, src2); 1899 } 1900 phaddd(vtmp1, vtmp1); 1901 } else { 1902 pshufd(vtmp1, src2, 0x1); 1903 reduce_operation_128(T_INT, opcode, vtmp1, src2); 1904 } 1905 movdl(vtmp2, src1); 1906 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 1907 movdl(dst, vtmp1); 1908 } 1909 1910 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1911 if (opcode == Op_AddReductionVI) { 1912 if (vtmp1 != src2) { 1913 movdqu(vtmp1, src2); 1914 } 1915 phaddd(vtmp1, src2); 1916 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1917 } else { 1918 pshufd(vtmp2, src2, 0xE); 1919 reduce_operation_128(T_INT, opcode, vtmp2, src2); 1920 reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1921 } 1922 } 1923 1924 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1925 if (opcode == Op_AddReductionVI) { 1926 vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit); 1927 vextracti128_high(vtmp2, vtmp1); 1928 vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit); 1929 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1930 } else { 1931 vextracti128_high(vtmp1, src2); 1932 reduce_operation_128(T_INT, opcode, vtmp1, src2); 1933 reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1934 } 1935 } 1936 1937 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1938 vextracti64x4_high(vtmp2, src2); 1939 reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2); 1940 reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1941 } 1942 1943 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1944 pshufd(vtmp2, src2, 0x1); 1945 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 1946 movdqu(vtmp1, vtmp2); 1947 psrldq(vtmp1, 2); 1948 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 1949 movdqu(vtmp2, vtmp1); 1950 psrldq(vtmp2, 1); 1951 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 1952 movdl(vtmp2, src1); 1953 pmovsxbd(vtmp1, vtmp1); 1954 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 1955 pextrb(dst, vtmp1, 0x0); 1956 movsbl(dst, dst); 1957 } 1958 1959 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1960 
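// Fold the upper 8 bytes onto the lower 8: pshufd with 0xE moves the high
// 64 bits of src2 into the low half of vtmp1, the element-wise op combines the
// two halves, and reduce8B finishes the job.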
pshufd(vtmp1, src2, 0xE); 1961 reduce_operation_128(T_BYTE, opcode, vtmp1, src2); 1962 reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1963 } 1964 1965 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1966 vextracti128_high(vtmp2, src2); 1967 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 1968 reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1969 } 1970 1971 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1972 vextracti64x4_high(vtmp1, src2); 1973 reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2); 1974 reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1975 } 1976 1977 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1978 pmovsxbw(vtmp2, src2); 1979 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1980 } 1981 1982 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1983 if (UseAVX > 1) { 1984 int vector_len = Assembler::AVX_256bit; 1985 vpmovsxbw(vtmp1, src2, vector_len); 1986 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1987 } else { 1988 pmovsxbw(vtmp2, src2); 1989 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1990 pshufd(vtmp2, src2, 0x1); 1991 pmovsxbw(vtmp2, src2); 1992 reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 1993 } 1994 } 1995 1996 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1997 if (UseAVX > 2 && VM_Version::supports_avx512bw()) { 1998 int vector_len = Assembler::AVX_512bit; 1999 vpmovsxbw(vtmp1, src2, vector_len); 2000 reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2001 } else { 2002 assert(UseAVX >= 2,"Should not reach here."); 2003 mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); 2004 vextracti128_high(vtmp2, src2); 2005 mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2006 } 2007 } 2008 2009 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2010 mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); 2011 vextracti64x4_high(vtmp2, src2); 2012 mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2013 } 2014 2015 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2016 if (opcode == Op_AddReductionVI) { 2017 if (vtmp1 != src2) { 2018 movdqu(vtmp1, src2); 2019 } 2020 phaddw(vtmp1, vtmp1); 2021 phaddw(vtmp1, vtmp1); 2022 } else { 2023 pshufd(vtmp2, src2, 0x1); 2024 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2025 movdqu(vtmp1, vtmp2); 2026 psrldq(vtmp1, 2); 2027 reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2); 2028 } 2029 movdl(vtmp2, src1); 2030 pmovsxwd(vtmp1, vtmp1); 2031 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2032 pextrw(dst, vtmp1, 0x0); 2033 movswl(dst, dst); 2034 } 2035 2036 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2037 if (opcode == Op_AddReductionVI) { 2038 if (vtmp1 != src2) { 2039 movdqu(vtmp1, src2); 2040 } 2041 phaddw(vtmp1, src2); 2042 } else { 2043 pshufd(vtmp1, src2, 0xE); 2044 reduce_operation_128(T_SHORT, opcode, vtmp1, src2); 2045 } 2046 reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2047 } 2048 2049 void 
C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2050 if (opcode == Op_AddReductionVI) { 2051 int vector_len = Assembler::AVX_256bit; 2052 vphaddw(vtmp2, src2, src2, vector_len); 2053 vpermq(vtmp2, vtmp2, 0xD8, vector_len); 2054 } else { 2055 vextracti128_high(vtmp2, src2); 2056 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2057 } 2058 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2059 } 2060 2061 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2062 int vector_len = Assembler::AVX_256bit; 2063 vextracti64x4_high(vtmp1, src2); 2064 reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2); 2065 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2066 } 2067 2068 #ifdef _LP64 2069 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2070 pshufd(vtmp2, src2, 0xE); 2071 reduce_operation_128(T_LONG, opcode, vtmp2, src2); 2072 movdq(vtmp1, src1); 2073 reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2); 2074 movdq(dst, vtmp1); 2075 } 2076 2077 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2078 vextracti128_high(vtmp1, src2); 2079 reduce_operation_128(T_LONG, opcode, vtmp1, src2); 2080 reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2081 } 2082 2083 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2084 vextracti64x4_high(vtmp2, src2); 2085 reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2); 2086 reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2087 } 2088 2089 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) { 2090 mov64(temp, -1L); 2091 bzhiq(temp, temp, len); 2092 kmovql(dst, temp); 2093 } 2094 #endif // _LP64 2095 2096 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2097 reduce_operation_128(T_FLOAT, opcode, dst, src); 2098 pshufd(vtmp, src, 0x1); 2099 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2100 } 2101 2102 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2103 reduce2F(opcode, dst, src, vtmp); 2104 pshufd(vtmp, src, 0x2); 2105 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2106 pshufd(vtmp, src, 0x3); 2107 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2108 } 2109 2110 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2111 reduce4F(opcode, dst, src, vtmp2); 2112 vextractf128_high(vtmp2, src); 2113 reduce4F(opcode, dst, vtmp2, vtmp1); 2114 } 2115 2116 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2117 reduce8F(opcode, dst, src, vtmp1, vtmp2); 2118 vextracti64x4_high(vtmp1, src); 2119 reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2); 2120 } 2121 2122 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2123 reduce_operation_128(T_DOUBLE, opcode, dst, src); 2124 pshufd(vtmp, src, 0xE); 2125 reduce_operation_128(T_DOUBLE, opcode, dst, vtmp); 2126 } 2127 2128 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2129 reduce2D(opcode, dst, src, vtmp2); 2130 vextractf128_high(vtmp2, src); 2131 
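// vtmp2 now holds the upper 128-bit lane of src; fold it into the result
// already accumulated in dst from the lower lane.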
reduce2D(opcode, dst, vtmp2, vtmp1); 2132 } 2133 2134 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2135 reduce4D(opcode, dst, src, vtmp1, vtmp2); 2136 vextracti64x4_high(vtmp1, src); 2137 reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2); 2138 } 2139 2140 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) { 2141 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2142 } 2143 2144 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) { 2145 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2146 } 2147 2148 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask, 2149 int vec_enc) { 2150 switch(elem_bt) { 2151 case T_INT: 2152 case T_FLOAT: 2153 vmaskmovps(dst, src, mask, vec_enc); 2154 break; 2155 case T_LONG: 2156 case T_DOUBLE: 2157 vmaskmovpd(dst, src, mask, vec_enc); 2158 break; 2159 default: 2160 fatal("Unsupported type %s", type2name(elem_bt)); 2161 break; 2162 } 2163 } 2164 2165 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask, 2166 int vec_enc) { 2167 switch(elem_bt) { 2168 case T_INT: 2169 case T_FLOAT: 2170 vmaskmovps(dst, src, mask, vec_enc); 2171 break; 2172 case T_LONG: 2173 case T_DOUBLE: 2174 vmaskmovpd(dst, src, mask, vec_enc); 2175 break; 2176 default: 2177 fatal("Unsupported type %s", type2name(elem_bt)); 2178 break; 2179 } 2180 } 2181 2182 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid, 2183 XMMRegister dst, XMMRegister src, 2184 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2185 XMMRegister xmm_0, XMMRegister xmm_1) { 2186 int permconst[] = {1, 14}; 2187 XMMRegister wsrc = src; 2188 XMMRegister wdst = xmm_0; 2189 XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1; 2190 2191 int vlen_enc = Assembler::AVX_128bit; 2192 if (vlen == 16) { 2193 vlen_enc = Assembler::AVX_256bit; 2194 } 2195 2196 for (int i = log2(vlen) - 1; i >=0; i--) { 2197 if (i == 0 && !is_dst_valid) { 2198 wdst = dst; 2199 } 2200 if (i == 3) { 2201 vextracti64x4_high(wtmp, wsrc); 2202 } else if (i == 2) { 2203 vextracti128_high(wtmp, wsrc); 2204 } else { // i = [0,1] 2205 vpermilps(wtmp, wsrc, permconst[i], vlen_enc); 2206 } 2207 vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2208 wsrc = wdst; 2209 vlen_enc = Assembler::AVX_128bit; 2210 } 2211 if (is_dst_valid) { 2212 vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2213 } 2214 } 2215 2216 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src, 2217 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2218 XMMRegister xmm_0, XMMRegister xmm_1) { 2219 XMMRegister wsrc = src; 2220 XMMRegister wdst = xmm_0; 2221 XMMRegister wtmp = (xmm_1 == xnoreg) ? 
xmm_0: xmm_1; 2222 int vlen_enc = Assembler::AVX_128bit; 2223 if (vlen == 8) { 2224 vlen_enc = Assembler::AVX_256bit; 2225 } 2226 for (int i = log2(vlen) - 1; i >=0; i--) { 2227 if (i == 0 && !is_dst_valid) { 2228 wdst = dst; 2229 } 2230 if (i == 1) { 2231 vextracti128_high(wtmp, wsrc); 2232 } else if (i == 2) { 2233 vextracti64x4_high(wtmp, wsrc); 2234 } else { 2235 assert(i == 0, "%d", i); 2236 vpermilpd(wtmp, wsrc, 1, vlen_enc); 2237 } 2238 vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2239 wsrc = wdst; 2240 vlen_enc = Assembler::AVX_128bit; 2241 } 2242 if (is_dst_valid) { 2243 vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2244 } 2245 } 2246 2247 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) { 2248 switch (bt) { 2249 case T_BYTE: pextrb(dst, src, idx); break; 2250 case T_SHORT: pextrw(dst, src, idx); break; 2251 case T_INT: pextrd(dst, src, idx); break; 2252 case T_LONG: pextrq(dst, src, idx); break; 2253 2254 default: 2255 assert(false,"Should not reach here."); 2256 break; 2257 } 2258 } 2259 2260 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) { 2261 int esize = type2aelembytes(typ); 2262 int elem_per_lane = 16/esize; 2263 int lane = elemindex / elem_per_lane; 2264 int eindex = elemindex % elem_per_lane; 2265 2266 if (lane >= 2) { 2267 assert(UseAVX > 2, "required"); 2268 vextractf32x4(dst, src, lane & 3); 2269 return dst; 2270 } else if (lane > 0) { 2271 assert(UseAVX > 0, "required"); 2272 vextractf128(dst, src, lane); 2273 return dst; 2274 } else { 2275 return src; 2276 } 2277 } 2278 2279 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) { 2280 int esize = type2aelembytes(typ); 2281 int elem_per_lane = 16/esize; 2282 int eindex = elemindex % elem_per_lane; 2283 assert(is_integral_type(typ),"required"); 2284 2285 if (eindex == 0) { 2286 if (typ == T_LONG) { 2287 movq(dst, src); 2288 } else { 2289 movdl(dst, src); 2290 if (typ == T_BYTE) 2291 movsbl(dst, dst); 2292 else if (typ == T_SHORT) 2293 movswl(dst, dst); 2294 } 2295 } else { 2296 extract(typ, dst, src, eindex); 2297 } 2298 } 2299 2300 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) { 2301 int esize = type2aelembytes(typ); 2302 int elem_per_lane = 16/esize; 2303 int eindex = elemindex % elem_per_lane; 2304 assert((typ == T_FLOAT || typ == T_DOUBLE),"required"); 2305 2306 if (eindex == 0) { 2307 movq(dst, src); 2308 } else { 2309 if (typ == T_FLOAT) { 2310 if (UseAVX == 0) { 2311 movdqu(dst, src); 2312 shufps(dst, dst, eindex); 2313 } else { 2314 vshufps(dst, src, src, eindex, Assembler::AVX_128bit); 2315 } 2316 } else { 2317 if (UseAVX == 0) { 2318 movdqu(dst, src); 2319 psrldq(dst, eindex*esize); 2320 } else { 2321 vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit); 2322 } 2323 movq(dst, dst); 2324 } 2325 } 2326 // Zero upper bits 2327 if (typ == T_FLOAT) { 2328 if (UseAVX == 0) { 2329 assert(vtmp != xnoreg, "required."); 2330 movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg); 2331 pand(dst, vtmp); 2332 } else { 2333 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg); 2334 } 2335 } 2336 } 2337 2338 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) { 2339 switch(typ) { 2340 
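// The comparison is performed on integer lanes of the element width, so
// T_FLOAT and T_DOUBLE reuse the 32/64-bit integer compares below (their bit
// patterns are compared as integers).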
case T_BYTE: 2341 case T_BOOLEAN: 2342 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2343 break; 2344 case T_SHORT: 2345 case T_CHAR: 2346 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2347 break; 2348 case T_INT: 2349 case T_FLOAT: 2350 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2351 break; 2352 case T_LONG: 2353 case T_DOUBLE: 2354 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2355 break; 2356 default: 2357 assert(false,"Should not reach here."); 2358 break; 2359 } 2360 } 2361 2362 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) { 2363 assert(rscratch != noreg || always_reachable(src2), "missing"); 2364 2365 switch(typ) { 2366 case T_BOOLEAN: 2367 case T_BYTE: 2368 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2369 break; 2370 case T_CHAR: 2371 case T_SHORT: 2372 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2373 break; 2374 case T_INT: 2375 case T_FLOAT: 2376 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2377 break; 2378 case T_LONG: 2379 case T_DOUBLE: 2380 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2381 break; 2382 default: 2383 assert(false,"Should not reach here."); 2384 break; 2385 } 2386 } 2387 2388 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) { 2389 switch(typ) { 2390 case T_BYTE: 2391 evpblendmb(dst, kmask, src1, src2, merge, vector_len); 2392 break; 2393 case T_SHORT: 2394 evpblendmw(dst, kmask, src1, src2, merge, vector_len); 2395 break; 2396 case T_INT: 2397 case T_FLOAT: 2398 evpblendmd(dst, kmask, src1, src2, merge, vector_len); 2399 break; 2400 case T_LONG: 2401 case T_DOUBLE: 2402 evpblendmq(dst, kmask, src1, src2, merge, vector_len); 2403 break; 2404 default: 2405 assert(false,"Should not reach here."); 2406 break; 2407 } 2408 } 2409 2410 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) { 2411 assert(vlen_in_bytes <= 32, ""); 2412 int esize = type2aelembytes(bt); 2413 if (vlen_in_bytes == 32) { 2414 assert(vtmp == xnoreg, "required."); 2415 if (esize >= 4) { 2416 vtestps(src1, src2, AVX_256bit); 2417 } else { 2418 vptest(src1, src2, AVX_256bit); 2419 } 2420 return; 2421 } 2422 if (vlen_in_bytes < 16) { 2423 // Duplicate the lower part to fill the whole register, 2424 // Don't need to do so for src2 2425 assert(vtmp != xnoreg, "required"); 2426 int shuffle_imm = (vlen_in_bytes == 4) ? 
0x00 : 0x04; 2427 pshufd(vtmp, src1, shuffle_imm); 2428 } else { 2429 assert(vtmp == xnoreg, "required"); 2430 vtmp = src1; 2431 } 2432 if (esize >= 4 && VM_Version::supports_avx()) { 2433 vtestps(vtmp, src2, AVX_128bit); 2434 } else { 2435 ptest(vtmp, src2); 2436 } 2437 } 2438 2439 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) { 2440 assert(UseAVX >= 2, "required"); 2441 #ifdef ASSERT 2442 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2443 bool is_bw_supported = VM_Version::supports_avx512bw(); 2444 if (is_bw && !is_bw_supported) { 2445 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2446 assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16), 2447 "XMM register should be 0-15"); 2448 } 2449 #endif // ASSERT 2450 switch (elem_bt) { 2451 case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return; 2452 case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return; 2453 case T_INT: vpaddd(dst, src1, src2, vlen_enc); return; 2454 case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return; 2455 case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return; 2456 case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return; 2457 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2458 } 2459 } 2460 2461 #ifdef _LP64 2462 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) { 2463 assert(UseAVX >= 2, "required"); 2464 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2465 bool is_vl = vlen_enc != Assembler::AVX_512bit; 2466 if ((UseAVX > 2) && 2467 (!is_bw || VM_Version::supports_avx512bw()) && 2468 (!is_vl || VM_Version::supports_avx512vl())) { 2469 switch (elem_bt) { 2470 case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return; 2471 case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return; 2472 case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return; 2473 case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return; 2474 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2475 } 2476 } else { 2477 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2478 assert((dst->encoding() < 16),"XMM register should be 0-15"); 2479 switch (elem_bt) { 2480 case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return; 2481 case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return; 2482 case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return; 2483 case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return; 2484 case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return; 2485 case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return; 2486 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2487 } 2488 } 2489 } 2490 #endif 2491 2492 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) { 2493 switch (to_elem_bt) { 2494 case T_SHORT: 2495 vpmovsxbw(dst, src, vlen_enc); 2496 break; 2497 case T_INT: 2498 vpmovsxbd(dst, src, vlen_enc); 2499 break; 2500 case T_FLOAT: 2501 vpmovsxbd(dst, src, vlen_enc); 2502 vcvtdq2ps(dst, dst, vlen_enc); 2503 break; 2504 case T_LONG: 2505 vpmovsxbq(dst, src, vlen_enc); 2506 break; 2507 case T_DOUBLE: { 2508 int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? 
Assembler::AVX_256bit : Assembler::AVX_128bit; 2509 vpmovsxbd(dst, src, mid_vlen_enc); 2510 vcvtdq2pd(dst, dst, vlen_enc); 2511 break; 2512 } 2513 default: 2514 fatal("Unsupported type %s", type2name(to_elem_bt)); 2515 break; 2516 } 2517 } 2518 2519 //------------------------------------------------------------------------------------------- 2520 2521 // IndexOf for constant substrings with size >= 8 chars 2522 // which don't need to be loaded through stack. 2523 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2, 2524 Register cnt1, Register cnt2, 2525 int int_cnt2, Register result, 2526 XMMRegister vec, Register tmp, 2527 int ae) { 2528 ShortBranchVerifier sbv(this); 2529 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2530 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2531 2532 // This method uses the pcmpestri instruction with bound registers 2533 // inputs: 2534 // xmm - substring 2535 // rax - substring length (elements count) 2536 // mem - scanned string 2537 // rdx - string length (elements count) 2538 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2539 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2540 // outputs: 2541 // rcx - matched index in string 2542 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2543 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2544 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2545 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2546 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1; 2547 2548 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, 2549 RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR, 2550 MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE; 2551 2552 // Note, inline_string_indexOf() generates checks: 2553 // if (substr.count > string.count) return -1; 2554 // if (substr.count == 0) return 0; 2555 assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars"); 2556 2557 // Load substring. 2558 if (ae == StrIntrinsicNode::UL) { 2559 pmovzxbw(vec, Address(str2, 0)); 2560 } else { 2561 movdqu(vec, Address(str2, 0)); 2562 } 2563 movl(cnt2, int_cnt2); 2564 movptr(result, str1); // string addr 2565 2566 if (int_cnt2 > stride) { 2567 jmpb(SCAN_TO_SUBSTR); 2568 2569 // Reload substr for rescan, this code 2570 // is executed only for large substrings (> 8 chars) 2571 bind(RELOAD_SUBSTR); 2572 if (ae == StrIntrinsicNode::UL) { 2573 pmovzxbw(vec, Address(str2, 0)); 2574 } else { 2575 movdqu(vec, Address(str2, 0)); 2576 } 2577 negptr(cnt2); // Jumped here with negative cnt2, convert to positive 2578 2579 bind(RELOAD_STR); 2580 // We came here after the beginning of the substring was 2581 // matched but the rest of it was not so we need to search 2582 // again. Start from the next element after the previous match. 2583 2584 // cnt2 is number of substring reminding elements and 2585 // cnt1 is number of string reminding elements when cmp failed. 
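// For illustration (hypothetical values): with int_cnt2 == 12, a mismatch that
// leaves cnt2 == 5 and cnt1 == 30 restores cnt1 below to 30 - 5 + 12 == 37,
// i.e. the remaining length counted from the start of the candidate match.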
2586 // Restored cnt1 = cnt1 - cnt2 + int_cnt2 2587 subl(cnt1, cnt2); 2588 addl(cnt1, int_cnt2); 2589 movl(cnt2, int_cnt2); // Now restore cnt2 2590 2591 decrementl(cnt1); // Shift to next element 2592 cmpl(cnt1, cnt2); 2593 jcc(Assembler::negative, RET_NOT_FOUND); // Left less than substring 2594 2595 addptr(result, (1<<scale1)); 2596 2597 } // (int_cnt2 > 8) 2598 2599 // Scan string for start of substr in 16-byte vectors 2600 bind(SCAN_TO_SUBSTR); 2601 pcmpestri(vec, Address(result, 0), mode); 2602 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 2603 subl(cnt1, stride); 2604 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 2605 cmpl(cnt1, cnt2); 2606 jccb(Assembler::negative, RET_NOT_FOUND); // Left less than substring 2607 addptr(result, 16); 2608 jmpb(SCAN_TO_SUBSTR); 2609 2610 // Found a potential substr 2611 bind(FOUND_CANDIDATE); 2612 // Matched whole vector if first element matched (tmp(rcx) == 0). 2613 if (int_cnt2 == stride) { 2614 jccb(Assembler::overflow, RET_FOUND); // OF == 1 2615 } else { // int_cnt2 > 8 2616 jccb(Assembler::overflow, FOUND_SUBSTR); 2617 } 2618 // After pcmpestri tmp(rcx) contains matched element index 2619 // Compute start addr of substr 2620 lea(result, Address(result, tmp, scale1)); 2621 2622 // Make sure string is still long enough 2623 subl(cnt1, tmp); 2624 cmpl(cnt1, cnt2); 2625 if (int_cnt2 == stride) { 2626 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 2627 } else { // int_cnt2 > 8 2628 jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD); 2629 } 2630 // Left less than substring. 2631 2632 bind(RET_NOT_FOUND); 2633 movl(result, -1); 2634 jmp(EXIT); 2635 2636 if (int_cnt2 > stride) { 2637 // This code is optimized for the case when whole substring 2638 // is matched if its head is matched. 2639 bind(MATCH_SUBSTR_HEAD); 2640 pcmpestri(vec, Address(result, 0), mode); 2641 // Reload only string if it does not match 2642 jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0 2643 2644 Label CONT_SCAN_SUBSTR; 2645 // Compare the rest of substring (> 8 chars). 2646 bind(FOUND_SUBSTR); 2647 // First 8 chars are already matched.
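// cnt2 becomes a negative index biased by stride: the loop below counts it up
// toward zero while Address(str2/result, cnt2, scale, tail_off) walks the
// remaining tail of the substring, so no separate cursor register is needed.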
2648 negptr(cnt2); 2649 addptr(cnt2, stride); 2650 2651 bind(SCAN_SUBSTR); 2652 subl(cnt1, stride); 2653 cmpl(cnt2, -stride); // Do not read beyond substring 2654 jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR); 2655 // Back-up strings to avoid reading beyond substring: 2656 // cnt1 = cnt1 - cnt2 + 8 2657 addl(cnt1, cnt2); // cnt2 is negative 2658 addl(cnt1, stride); 2659 movl(cnt2, stride); negptr(cnt2); 2660 bind(CONT_SCAN_SUBSTR); 2661 if (int_cnt2 < (int)G) { 2662 int tail_off1 = int_cnt2<<scale1; 2663 int tail_off2 = int_cnt2<<scale2; 2664 if (ae == StrIntrinsicNode::UL) { 2665 pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2)); 2666 } else { 2667 movdqu(vec, Address(str2, cnt2, scale2, tail_off2)); 2668 } 2669 pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode); 2670 } else { 2671 // calculate index in register to avoid integer overflow (int_cnt2*2) 2672 movl(tmp, int_cnt2); 2673 addptr(tmp, cnt2); 2674 if (ae == StrIntrinsicNode::UL) { 2675 pmovzxbw(vec, Address(str2, tmp, scale2, 0)); 2676 } else { 2677 movdqu(vec, Address(str2, tmp, scale2, 0)); 2678 } 2679 pcmpestri(vec, Address(result, tmp, scale1, 0), mode); 2680 } 2681 // Need to reload strings pointers if not matched whole vector 2682 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 2683 addptr(cnt2, stride); 2684 jcc(Assembler::negative, SCAN_SUBSTR); 2685 // Fall through if found full substring 2686 2687 } // (int_cnt2 > 8) 2688 2689 bind(RET_FOUND); 2690 // Found result if we matched full small substring. 2691 // Compute substr offset 2692 subptr(result, str1); 2693 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 2694 shrl(result, 1); // index 2695 } 2696 bind(EXIT); 2697 2698 } // string_indexofC8 2699 2700 // Small strings are loaded through stack if they cross page boundary. 2701 void C2_MacroAssembler::string_indexof(Register str1, Register str2, 2702 Register cnt1, Register cnt2, 2703 int int_cnt2, Register result, 2704 XMMRegister vec, Register tmp, 2705 int ae) { 2706 ShortBranchVerifier sbv(this); 2707 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2708 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2709 2710 // 2711 // int_cnt2 is length of small (< 8 chars) constant substring 2712 // or (-1) for non constant substring in which case its length 2713 // is in cnt2 register. 2714 // 2715 // Note, inline_string_indexOf() generates checks: 2716 // if (substr.count > string.count) return -1; 2717 // if (substr.count == 0) return 0; 2718 // 2719 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2720 assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0"); 2721 // This method uses the pcmpestri instruction with bound registers 2722 // inputs: 2723 // xmm - substring 2724 // rax - substring length (elements count) 2725 // mem - scanned string 2726 // rdx - string length (elements count) 2727 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2728 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2729 // outputs: 2730 // rcx - matched index in string 2731 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2732 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2733 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2734 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? 
Address::times_1 : scale1; 2735 2736 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR, 2737 RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR, 2738 FOUND_CANDIDATE; 2739 2740 { //======================================================== 2741 // We don't know where these strings are located 2742 // and we can't read beyond them. Load them through stack. 2743 Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR; 2744 2745 movptr(tmp, rsp); // save old SP 2746 2747 if (int_cnt2 > 0) { // small (< 8 chars) constant substring 2748 if (int_cnt2 == (1>>scale2)) { // One byte 2749 assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding"); 2750 load_unsigned_byte(result, Address(str2, 0)); 2751 movdl(vec, result); // move 32 bits 2752 } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) { // Three bytes 2753 // Not enough header space in 32-bit VM: 12+3 = 15. 2754 movl(result, Address(str2, -1)); 2755 shrl(result, 8); 2756 movdl(vec, result); // move 32 bits 2757 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) { // One char 2758 load_unsigned_short(result, Address(str2, 0)); 2759 movdl(vec, result); // move 32 bits 2760 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars 2761 movdl(vec, Address(str2, 0)); // move 32 bits 2762 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars 2763 movq(vec, Address(str2, 0)); // move 64 bits 2764 } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7}) 2765 // Array header size is 12 bytes in 32-bit VM 2766 // + 6 bytes for 3 chars == 18 bytes, 2767 // enough space to load vec and shift. 2768 assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity"); 2769 if (ae == StrIntrinsicNode::UL) { 2770 int tail_off = int_cnt2-8; 2771 pmovzxbw(vec, Address(str2, tail_off)); 2772 psrldq(vec, -2*tail_off); 2773 } 2774 else { 2775 int tail_off = int_cnt2*(1<<scale2); 2776 movdqu(vec, Address(str2, tail_off-16)); 2777 psrldq(vec, 16-tail_off); 2778 } 2779 } 2780 } else { // not constant substring 2781 cmpl(cnt2, stride); 2782 jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough 2783 2784 // We can read beyond string if srt+16 does not cross page boundary 2785 // since heaps are aligned and mapped by pages. 2786 assert(os::vm_page_size() < (int)G, "default page should be small"); 2787 movl(result, str2); // We need only low 32 bits 2788 andl(result, ((int)os::vm_page_size()-1)); 2789 cmpl(result, ((int)os::vm_page_size()-16)); 2790 jccb(Assembler::belowEqual, CHECK_STR); 2791 2792 // Move small strings to stack to allow load 16 bytes into vec. 2793 subptr(rsp, 16); 2794 int stk_offset = wordSize-(1<<scale2); 2795 push(cnt2); 2796 2797 bind(COPY_SUBSTR); 2798 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) { 2799 load_unsigned_byte(result, Address(str2, cnt2, scale2, -1)); 2800 movb(Address(rsp, cnt2, scale2, stk_offset), result); 2801 } else if (ae == StrIntrinsicNode::UU) { 2802 load_unsigned_short(result, Address(str2, cnt2, scale2, -2)); 2803 movw(Address(rsp, cnt2, scale2, stk_offset), result); 2804 } 2805 decrement(cnt2); 2806 jccb(Assembler::notZero, COPY_SUBSTR); 2807 2808 pop(cnt2); 2809 movptr(str2, rsp); // New substring address 2810 } // non constant 2811 2812 bind(CHECK_STR); 2813 cmpl(cnt1, stride); 2814 jccb(Assembler::aboveEqual, BIG_STRINGS); 2815 2816 // Check cross page boundary. 
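// Same test as for str2 above: if (str1 & (page_size - 1)) <= page_size - 16,
// a 16-byte load starting at str1 cannot cross into the next (possibly
// unmapped) page. E.g. with a 4K page, the low 12 bits must be <= 4080.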
2817 movl(result, str1); // We need only low 32 bits 2818 andl(result, ((int)os::vm_page_size()-1)); 2819 cmpl(result, ((int)os::vm_page_size()-16)); 2820 jccb(Assembler::belowEqual, BIG_STRINGS); 2821 2822 subptr(rsp, 16); 2823 int stk_offset = -(1<<scale1); 2824 if (int_cnt2 < 0) { // not constant 2825 push(cnt2); 2826 stk_offset += wordSize; 2827 } 2828 movl(cnt2, cnt1); 2829 2830 bind(COPY_STR); 2831 if (ae == StrIntrinsicNode::LL) { 2832 load_unsigned_byte(result, Address(str1, cnt2, scale1, -1)); 2833 movb(Address(rsp, cnt2, scale1, stk_offset), result); 2834 } else { 2835 load_unsigned_short(result, Address(str1, cnt2, scale1, -2)); 2836 movw(Address(rsp, cnt2, scale1, stk_offset), result); 2837 } 2838 decrement(cnt2); 2839 jccb(Assembler::notZero, COPY_STR); 2840 2841 if (int_cnt2 < 0) { // not constant 2842 pop(cnt2); 2843 } 2844 movptr(str1, rsp); // New string address 2845 2846 bind(BIG_STRINGS); 2847 // Load substring. 2848 if (int_cnt2 < 0) { // -1 2849 if (ae == StrIntrinsicNode::UL) { 2850 pmovzxbw(vec, Address(str2, 0)); 2851 } else { 2852 movdqu(vec, Address(str2, 0)); 2853 } 2854 push(cnt2); // substr count 2855 push(str2); // substr addr 2856 push(str1); // string addr 2857 } else { 2858 // Small (< 8 chars) constant substrings are loaded already. 2859 movl(cnt2, int_cnt2); 2860 } 2861 push(tmp); // original SP 2862 2863 } // Finished loading 2864 2865 //======================================================== 2866 // Start search 2867 // 2868 2869 movptr(result, str1); // string addr 2870 2871 if (int_cnt2 < 0) { // Only for non constant substring 2872 jmpb(SCAN_TO_SUBSTR); 2873 2874 // SP saved at sp+0 2875 // String saved at sp+1*wordSize 2876 // Substr saved at sp+2*wordSize 2877 // Substr count saved at sp+3*wordSize 2878 2879 // Reload substr for rescan, this code 2880 // is executed only for large substrings (> 8 chars) 2881 bind(RELOAD_SUBSTR); 2882 movptr(str2, Address(rsp, 2*wordSize)); 2883 movl(cnt2, Address(rsp, 3*wordSize)); 2884 if (ae == StrIntrinsicNode::UL) { 2885 pmovzxbw(vec, Address(str2, 0)); 2886 } else { 2887 movdqu(vec, Address(str2, 0)); 2888 } 2889 // We came here after the beginning of the substring was 2890 // matched but the rest of it was not so we need to search 2891 // again. Start from the next element after the previous match. 2892 subptr(str1, result); // Restore counter 2893 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 2894 shrl(str1, 1); 2895 } 2896 addl(cnt1, str1); 2897 decrementl(cnt1); // Shift to next element 2898 cmpl(cnt1, cnt2); 2899 jcc(Assembler::negative, RET_NOT_FOUND); // Left less then substring 2900 2901 addptr(result, (1<<scale1)); 2902 } // non constant 2903 2904 // Scan string for start of substr in 16-byte vectors 2905 bind(SCAN_TO_SUBSTR); 2906 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2907 pcmpestri(vec, Address(result, 0), mode); 2908 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 2909 subl(cnt1, stride); 2910 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 2911 cmpl(cnt1, cnt2); 2912 jccb(Assembler::negative, RET_NOT_FOUND); // Left less then substring 2913 addptr(result, 16); 2914 2915 bind(ADJUST_STR); 2916 cmpl(cnt1, stride); // Do not read beyond string 2917 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 2918 // Back-up string to avoid reading beyond string. 
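// Reposition result so the final 16-byte window ends exactly at the end of the
// string, and treat it as one full vector (cnt1 = stride) for the last
// pcmpestri round.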
2919 lea(result, Address(result, cnt1, scale1, -16)); 2920 movl(cnt1, stride); 2921 jmpb(SCAN_TO_SUBSTR); 2922 2923 // Found a potential substr 2924 bind(FOUND_CANDIDATE); 2925 // After pcmpestri tmp(rcx) contains matched element index 2926 2927 // Make sure string is still long enough 2928 subl(cnt1, tmp); 2929 cmpl(cnt1, cnt2); 2930 jccb(Assembler::greaterEqual, FOUND_SUBSTR); 2931 // Left less then substring. 2932 2933 bind(RET_NOT_FOUND); 2934 movl(result, -1); 2935 jmp(CLEANUP); 2936 2937 bind(FOUND_SUBSTR); 2938 // Compute start addr of substr 2939 lea(result, Address(result, tmp, scale1)); 2940 if (int_cnt2 > 0) { // Constant substring 2941 // Repeat search for small substring (< 8 chars) 2942 // from new point without reloading substring. 2943 // Have to check that we don't read beyond string. 2944 cmpl(tmp, stride-int_cnt2); 2945 jccb(Assembler::greater, ADJUST_STR); 2946 // Fall through if matched whole substring. 2947 } else { // non constant 2948 assert(int_cnt2 == -1, "should be != 0"); 2949 2950 addl(tmp, cnt2); 2951 // Found result if we matched whole substring. 2952 cmpl(tmp, stride); 2953 jcc(Assembler::lessEqual, RET_FOUND); 2954 2955 // Repeat search for small substring (<= 8 chars) 2956 // from new point 'str1' without reloading substring. 2957 cmpl(cnt2, stride); 2958 // Have to check that we don't read beyond string. 2959 jccb(Assembler::lessEqual, ADJUST_STR); 2960 2961 Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG; 2962 // Compare the rest of substring (> 8 chars). 2963 movptr(str1, result); 2964 2965 cmpl(tmp, cnt2); 2966 // First 8 chars are already matched. 2967 jccb(Assembler::equal, CHECK_NEXT); 2968 2969 bind(SCAN_SUBSTR); 2970 pcmpestri(vec, Address(str1, 0), mode); 2971 // Need to reload strings pointers if not matched whole vector 2972 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 2973 2974 bind(CHECK_NEXT); 2975 subl(cnt2, stride); 2976 jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring 2977 addptr(str1, 16); 2978 if (ae == StrIntrinsicNode::UL) { 2979 addptr(str2, 8); 2980 } else { 2981 addptr(str2, 16); 2982 } 2983 subl(cnt1, stride); 2984 cmpl(cnt2, stride); // Do not read beyond substring 2985 jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR); 2986 // Back-up strings to avoid reading beyond substring. 
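// For UL the substring is Latin1: its last 8-char block spans 8 bytes (hence
// the -8 offset), while the corresponding UTF-16 text block spans 16 bytes
// (hence -16). cnt1 is then rebased to cnt1 - cnt2 + stride for the new positions.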
2987 2988 if (ae == StrIntrinsicNode::UL) { 2989 lea(str2, Address(str2, cnt2, scale2, -8)); 2990 lea(str1, Address(str1, cnt2, scale1, -16)); 2991 } else { 2992 lea(str2, Address(str2, cnt2, scale2, -16)); 2993 lea(str1, Address(str1, cnt2, scale1, -16)); 2994 } 2995 subl(cnt1, cnt2); 2996 movl(cnt2, stride); 2997 addl(cnt1, stride); 2998 bind(CONT_SCAN_SUBSTR); 2999 if (ae == StrIntrinsicNode::UL) { 3000 pmovzxbw(vec, Address(str2, 0)); 3001 } else { 3002 movdqu(vec, Address(str2, 0)); 3003 } 3004 jmp(SCAN_SUBSTR); 3005 3006 bind(RET_FOUND_LONG); 3007 movptr(str1, Address(rsp, wordSize)); 3008 } // non constant 3009 3010 bind(RET_FOUND); 3011 // Compute substr offset 3012 subptr(result, str1); 3013 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3014 shrl(result, 1); // index 3015 } 3016 bind(CLEANUP); 3017 pop(rsp); // restore SP 3018 3019 } // string_indexof 3020 3021 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3022 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3023 ShortBranchVerifier sbv(this); 3024 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3025 3026 int stride = 8; 3027 3028 Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP, 3029 SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP, 3030 RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT, 3031 FOUND_SEQ_CHAR, DONE_LABEL; 3032 3033 movptr(result, str1); 3034 if (UseAVX >= 2) { 3035 cmpl(cnt1, stride); 3036 jcc(Assembler::less, SCAN_TO_CHAR); 3037 cmpl(cnt1, 2*stride); 3038 jcc(Assembler::less, SCAN_TO_8_CHAR_INIT); 3039 movdl(vec1, ch); 3040 vpbroadcastw(vec1, vec1, Assembler::AVX_256bit); 3041 vpxor(vec2, vec2); 3042 movl(tmp, cnt1); 3043 andl(tmp, 0xFFFFFFF0); //vector count (in chars) 3044 andl(cnt1,0x0000000F); //tail count (in chars) 3045 3046 bind(SCAN_TO_16_CHAR_LOOP); 3047 vmovdqu(vec3, Address(result, 0)); 3048 vpcmpeqw(vec3, vec3, vec1, 1); 3049 vptest(vec2, vec3); 3050 jcc(Assembler::carryClear, FOUND_CHAR); 3051 addptr(result, 32); 3052 subl(tmp, 2*stride); 3053 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); 3054 jmp(SCAN_TO_8_CHAR); 3055 bind(SCAN_TO_8_CHAR_INIT); 3056 movdl(vec1, ch); 3057 pshuflw(vec1, vec1, 0x00); 3058 pshufd(vec1, vec1, 0); 3059 pxor(vec2, vec2); 3060 } 3061 bind(SCAN_TO_8_CHAR); 3062 cmpl(cnt1, stride); 3063 jcc(Assembler::less, SCAN_TO_CHAR); 3064 if (UseAVX < 2) { 3065 movdl(vec1, ch); 3066 pshuflw(vec1, vec1, 0x00); 3067 pshufd(vec1, vec1, 0); 3068 pxor(vec2, vec2); 3069 } 3070 movl(tmp, cnt1); 3071 andl(tmp, 0xFFFFFFF8); //vector count (in chars) 3072 andl(cnt1,0x00000007); //tail count (in chars) 3073 3074 bind(SCAN_TO_8_CHAR_LOOP); 3075 movdqu(vec3, Address(result, 0)); 3076 pcmpeqw(vec3, vec1); 3077 ptest(vec2, vec3); 3078 jcc(Assembler::carryClear, FOUND_CHAR); 3079 addptr(result, 16); 3080 subl(tmp, stride); 3081 jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP); 3082 bind(SCAN_TO_CHAR); 3083 testl(cnt1, cnt1); 3084 jcc(Assembler::zero, RET_NOT_FOUND); 3085 bind(SCAN_TO_CHAR_LOOP); 3086 load_unsigned_short(tmp, Address(result, 0)); 3087 cmpl(ch, tmp); 3088 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3089 addptr(result, 2); 3090 subl(cnt1, 1); 3091 jccb(Assembler::zero, RET_NOT_FOUND); 3092 jmp(SCAN_TO_CHAR_LOOP); 3093 3094 bind(RET_NOT_FOUND); 3095 movl(result, -1); 3096 jmpb(DONE_LABEL); 3097 3098 bind(FOUND_CHAR); 3099 if (UseAVX >= 2) { 3100 vpmovmskb(tmp, vec3); 3101 } else { 3102 pmovmskb(tmp, vec3); 3103 } 3104 bsfl(ch, tmp); 3105 addptr(result, ch); 3106 3107 bind(FOUND_SEQ_CHAR); 3108 
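// result holds the address of the matching char; (result - str1) is a byte
// offset and shrl by 1 divides by sizeof(jchar) to get the char index, e.g. a
// match 10 bytes into the string is index 5.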
subptr(result, str1); 3109 shrl(result, 1); 3110 3111 bind(DONE_LABEL); 3112 } // string_indexof_char 3113 3114 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3115 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3116 ShortBranchVerifier sbv(this); 3117 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3118 3119 int stride = 16; 3120 3121 Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP, 3122 SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP, 3123 RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT, 3124 FOUND_SEQ_CHAR, DONE_LABEL; 3125 3126 movptr(result, str1); 3127 if (UseAVX >= 2) { 3128 cmpl(cnt1, stride); 3129 jcc(Assembler::less, SCAN_TO_CHAR_INIT); 3130 cmpl(cnt1, stride*2); 3131 jcc(Assembler::less, SCAN_TO_16_CHAR_INIT); 3132 movdl(vec1, ch); 3133 vpbroadcastb(vec1, vec1, Assembler::AVX_256bit); 3134 vpxor(vec2, vec2); 3135 movl(tmp, cnt1); 3136 andl(tmp, 0xFFFFFFE0); //vector count (in chars) 3137 andl(cnt1,0x0000001F); //tail count (in chars) 3138 3139 bind(SCAN_TO_32_CHAR_LOOP); 3140 vmovdqu(vec3, Address(result, 0)); 3141 vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit); 3142 vptest(vec2, vec3); 3143 jcc(Assembler::carryClear, FOUND_CHAR); 3144 addptr(result, 32); 3145 subl(tmp, stride*2); 3146 jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP); 3147 jmp(SCAN_TO_16_CHAR); 3148 3149 bind(SCAN_TO_16_CHAR_INIT); 3150 movdl(vec1, ch); 3151 pxor(vec2, vec2); 3152 pshufb(vec1, vec2); 3153 } 3154 3155 bind(SCAN_TO_16_CHAR); 3156 cmpl(cnt1, stride); 3157 jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left 3158 if (UseAVX < 2) { 3159 movdl(vec1, ch); 3160 pxor(vec2, vec2); 3161 pshufb(vec1, vec2); 3162 } 3163 movl(tmp, cnt1); 3164 andl(tmp, 0xFFFFFFF0); //vector count (in bytes) 3165 andl(cnt1,0x0000000F); //tail count (in bytes) 3166 3167 bind(SCAN_TO_16_CHAR_LOOP); 3168 movdqu(vec3, Address(result, 0)); 3169 pcmpeqb(vec3, vec1); 3170 ptest(vec2, vec3); 3171 jcc(Assembler::carryClear, FOUND_CHAR); 3172 addptr(result, 16); 3173 subl(tmp, stride); 3174 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items... 
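// Fewer than stride (16) bytes remain at this point, so fall through to a simple
// byte-at-a-time tail scan. Roughly equivalent Java for the remaining tail
// (an illustrative sketch only; names are not from the actual intrinsic source):
//   for (int i = 0; i < tailCount; i++) {
//     if (bytes[pos + i] == ch) return pos + i;
//   }
//   return -1;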
3175 3176 bind(SCAN_TO_CHAR_INIT); 3177 testl(cnt1, cnt1); 3178 jcc(Assembler::zero, RET_NOT_FOUND); 3179 bind(SCAN_TO_CHAR_LOOP); 3180 load_unsigned_byte(tmp, Address(result, 0)); 3181 cmpl(ch, tmp); 3182 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3183 addptr(result, 1); 3184 subl(cnt1, 1); 3185 jccb(Assembler::zero, RET_NOT_FOUND); 3186 jmp(SCAN_TO_CHAR_LOOP); 3187 3188 bind(RET_NOT_FOUND); 3189 movl(result, -1); 3190 jmpb(DONE_LABEL); 3191 3192 bind(FOUND_CHAR); 3193 if (UseAVX >= 2) { 3194 vpmovmskb(tmp, vec3); 3195 } else { 3196 pmovmskb(tmp, vec3); 3197 } 3198 bsfl(ch, tmp); 3199 addptr(result, ch); 3200 3201 bind(FOUND_SEQ_CHAR); 3202 subptr(result, str1); 3203 3204 bind(DONE_LABEL); 3205 } // stringL_indexof_char 3206 3207 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) { 3208 switch (eltype) { 3209 case T_BOOLEAN: return sizeof(jboolean); 3210 case T_BYTE: return sizeof(jbyte); 3211 case T_SHORT: return sizeof(jshort); 3212 case T_CHAR: return sizeof(jchar); 3213 case T_INT: return sizeof(jint); 3214 default: 3215 ShouldNotReachHere(); 3216 return -1; 3217 } 3218 } 3219 3220 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) { 3221 switch (eltype) { 3222 // T_BOOLEAN used as surrogate for unsigned byte 3223 case T_BOOLEAN: movzbl(dst, src); break; 3224 case T_BYTE: movsbl(dst, src); break; 3225 case T_SHORT: movswl(dst, src); break; 3226 case T_CHAR: movzwl(dst, src); break; 3227 case T_INT: movl(dst, src); break; 3228 default: 3229 ShouldNotReachHere(); 3230 } 3231 } 3232 3233 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) { 3234 load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8); 3235 } 3236 3237 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) { 3238 load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8); 3239 } 3240 3241 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) { 3242 const int vlen = Assembler::AVX_256bit; 3243 switch (eltype) { 3244 case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3245 case T_BYTE: vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3246 case T_SHORT: vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3247 case T_CHAR: vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3248 case T_INT: 3249 // do nothing 3250 break; 3251 default: 3252 ShouldNotReachHere(); 3253 } 3254 } 3255 3256 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result, 3257 Register index, Register tmp2, Register tmp3, XMMRegister vnext, 3258 XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3, 3259 XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3, 3260 XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3, 3261 BasicType eltype) { 3262 ShortBranchVerifier sbv(this); 3263 assert(UseAVX >= 2, "AVX2 intrinsics are required"); 3264 assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3); 3265 assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3); 3266 3267 Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN, 3268 SHORT_UNROLLED_LOOP_EXIT, 3269 UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME, 3270 UNROLLED_VECTOR_LOOP_BEGIN, 3271 END; 3272 switch (eltype) { 3273 case T_BOOLEAN: 
BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break; 3274 case T_CHAR: BLOCK_COMMENT("arrays_hashcode(char) {"); break; 3275 case T_BYTE: BLOCK_COMMENT("arrays_hashcode(byte) {"); break; 3276 case T_SHORT: BLOCK_COMMENT("arrays_hashcode(short) {"); break; 3277 case T_INT: BLOCK_COMMENT("arrays_hashcode(int) {"); break; 3278 default: BLOCK_COMMENT("arrays_hashcode {"); break; 3279 } 3280 3281 // For "renaming" for readibility of the code 3282 XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 }, 3283 vresult[] = { vresult0, vresult1, vresult2, vresult3 }, 3284 vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 }; 3285 3286 const int elsize = arrays_hashcode_elsize(eltype); 3287 3288 /* 3289 if (cnt1 >= 2) { 3290 if (cnt1 >= 32) { 3291 UNROLLED VECTOR LOOP 3292 } 3293 UNROLLED SCALAR LOOP 3294 } 3295 SINGLE SCALAR 3296 */ 3297 3298 cmpl(cnt1, 32); 3299 jcc(Assembler::less, SHORT_UNROLLED_BEGIN); 3300 3301 // cnt1 >= 32 && generate_vectorized_loop 3302 xorl(index, index); 3303 3304 // vresult = IntVector.zero(I256); 3305 for (int idx = 0; idx < 4; idx++) { 3306 vpxor(vresult[idx], vresult[idx]); 3307 } 3308 // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]); 3309 Register bound = tmp2; 3310 Register next = tmp3; 3311 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint)))); 3312 movl(next, Address(tmp2, 0)); 3313 movdl(vnext, next); 3314 vpbroadcastd(vnext, vnext, Assembler::AVX_256bit); 3315 3316 // index = 0; 3317 // bound = cnt1 & ~(32 - 1); 3318 movl(bound, cnt1); 3319 andl(bound, ~(32 - 1)); 3320 // for (; index < bound; index += 32) { 3321 bind(UNROLLED_VECTOR_LOOP_BEGIN); 3322 // result *= next; 3323 imull(result, next); 3324 // loop fission to upfront the cost of fetching from memory, OOO execution 3325 // can then hopefully do a better job of prefetching 3326 for (int idx = 0; idx < 4; idx++) { 3327 arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype); 3328 } 3329 // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7]; 3330 for (int idx = 0; idx < 4; idx++) { 3331 vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit); 3332 arrays_hashcode_elvcast(vtmp[idx], eltype); 3333 vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit); 3334 } 3335 // index += 32; 3336 addl(index, 32); 3337 // index < bound; 3338 cmpl(index, bound); 3339 jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN); 3340 // } 3341 3342 lea(ary1, Address(ary1, bound, Address::times(elsize))); 3343 subl(cnt1, bound); 3344 // release bound 3345 3346 // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1); 3347 for (int idx = 0; idx < 4; idx++) { 3348 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint)))); 3349 arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT); 3350 vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit); 3351 } 3352 // result += vresult.reduceLanes(ADD); 3353 for (int idx = 0; idx < 4; idx++) { 3354 reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]); 3355 } 3356 3357 // } else if (cnt1 < 32) { 3358 3359 bind(SHORT_UNROLLED_BEGIN); 3360 // int i = 1; 3361 movl(index, 1); 3362 cmpl(index, cnt1); 3363 jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT); 3364 3365 // for (; i < cnt1 ; i += 2) { 3366 bind(SHORT_UNROLLED_LOOP_BEGIN); 3367 movl(tmp3, 961); 3368 imull(result, tmp3); 3369 
arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3370 movl(tmp3, tmp2); 3371 shll(tmp3, 5); 3372 subl(tmp3, tmp2); 3373 addl(result, tmp3); 3374 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype); 3375 addl(result, tmp3); 3376 addl(index, 2); 3377 cmpl(index, cnt1); 3378 jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN); 3379 3380 // } 3381 // if (i >= cnt1) { 3382 bind(SHORT_UNROLLED_LOOP_EXIT); 3383 jccb(Assembler::greater, END); 3384 movl(tmp2, result); 3385 shll(result, 5); 3386 subl(result, tmp2); 3387 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3388 addl(result, tmp3); 3389 // } 3390 bind(END); 3391 3392 BLOCK_COMMENT("} // arrays_hashcode"); 3393 3394 } // arrays_hashcode 3395 3396 // helper function for string_compare 3397 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2, 3398 Address::ScaleFactor scale, Address::ScaleFactor scale1, 3399 Address::ScaleFactor scale2, Register index, int ae) { 3400 if (ae == StrIntrinsicNode::LL) { 3401 load_unsigned_byte(elem1, Address(str1, index, scale, 0)); 3402 load_unsigned_byte(elem2, Address(str2, index, scale, 0)); 3403 } else if (ae == StrIntrinsicNode::UU) { 3404 load_unsigned_short(elem1, Address(str1, index, scale, 0)); 3405 load_unsigned_short(elem2, Address(str2, index, scale, 0)); 3406 } else { 3407 load_unsigned_byte(elem1, Address(str1, index, scale1, 0)); 3408 load_unsigned_short(elem2, Address(str2, index, scale2, 0)); 3409 } 3410 } 3411 3412 // Compare strings, used for char[] and byte[]. 3413 void C2_MacroAssembler::string_compare(Register str1, Register str2, 3414 Register cnt1, Register cnt2, Register result, 3415 XMMRegister vec1, int ae, KRegister mask) { 3416 ShortBranchVerifier sbv(this); 3417 Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL; 3418 Label COMPARE_WIDE_VECTORS_LOOP_FAILED; // used only _LP64 && AVX3 3419 int stride, stride2, adr_stride, adr_stride1, adr_stride2; 3420 int stride2x2 = 0x40; 3421 Address::ScaleFactor scale = Address::no_scale; 3422 Address::ScaleFactor scale1 = Address::no_scale; 3423 Address::ScaleFactor scale2 = Address::no_scale; 3424 3425 if (ae != StrIntrinsicNode::LL) { 3426 stride2x2 = 0x20; 3427 } 3428 3429 if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) { 3430 shrl(cnt2, 1); 3431 } 3432 // Compute the minimum of the string lengths and the 3433 // difference of the string lengths (stack). 3434 // Do the conditional move stuff 3435 movl(result, cnt1); 3436 subl(cnt1, cnt2); 3437 push(cnt1); 3438 cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2) 3439 3440 // Is the minimum length zero? 
3441 testl(cnt2, cnt2); 3442 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3443 if (ae == StrIntrinsicNode::LL) { 3444 // Load first bytes 3445 load_unsigned_byte(result, Address(str1, 0)); // result = str1[0] 3446 load_unsigned_byte(cnt1, Address(str2, 0)); // cnt1 = str2[0] 3447 } else if (ae == StrIntrinsicNode::UU) { 3448 // Load first characters 3449 load_unsigned_short(result, Address(str1, 0)); 3450 load_unsigned_short(cnt1, Address(str2, 0)); 3451 } else { 3452 load_unsigned_byte(result, Address(str1, 0)); 3453 load_unsigned_short(cnt1, Address(str2, 0)); 3454 } 3455 subl(result, cnt1); 3456 jcc(Assembler::notZero, POP_LABEL); 3457 3458 if (ae == StrIntrinsicNode::UU) { 3459 // Divide length by 2 to get number of chars 3460 shrl(cnt2, 1); 3461 } 3462 cmpl(cnt2, 1); 3463 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3464 3465 // Check if the strings start at the same location and setup scale and stride 3466 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3467 cmpptr(str1, str2); 3468 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3469 if (ae == StrIntrinsicNode::LL) { 3470 scale = Address::times_1; 3471 stride = 16; 3472 } else { 3473 scale = Address::times_2; 3474 stride = 8; 3475 } 3476 } else { 3477 scale1 = Address::times_1; 3478 scale2 = Address::times_2; 3479 // scale not used 3480 stride = 8; 3481 } 3482 3483 if (UseAVX >= 2 && UseSSE42Intrinsics) { 3484 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR; 3485 Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR; 3486 Label COMPARE_WIDE_VECTORS_LOOP_AVX2; 3487 Label COMPARE_TAIL_LONG; 3488 Label COMPARE_WIDE_VECTORS_LOOP_AVX3; // used only _LP64 && AVX3 3489 3490 int pcmpmask = 0x19; 3491 if (ae == StrIntrinsicNode::LL) { 3492 pcmpmask &= ~0x01; 3493 } 3494 3495 // Setup to compare 16-chars (32-bytes) vectors, 3496 // start from first character again because it has aligned address. 3497 if (ae == StrIntrinsicNode::LL) { 3498 stride2 = 32; 3499 } else { 3500 stride2 = 16; 3501 } 3502 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3503 adr_stride = stride << scale; 3504 } else { 3505 adr_stride1 = 8; //stride << scale1; 3506 adr_stride2 = 16; //stride << scale2; 3507 } 3508 3509 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3510 // rax and rdx are used by pcmpestri as elements counters 3511 movl(result, cnt2); 3512 andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count 3513 jcc(Assembler::zero, COMPARE_TAIL_LONG); 3514 3515 // fast path : compare first 2 8-char vectors. 
3516 bind(COMPARE_16_CHARS); 3517 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3518 movdqu(vec1, Address(str1, 0)); 3519 } else { 3520 pmovzxbw(vec1, Address(str1, 0)); 3521 } 3522 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3523 jccb(Assembler::below, COMPARE_INDEX_CHAR); 3524 3525 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3526 movdqu(vec1, Address(str1, adr_stride)); 3527 pcmpestri(vec1, Address(str2, adr_stride), pcmpmask); 3528 } else { 3529 pmovzxbw(vec1, Address(str1, adr_stride1)); 3530 pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask); 3531 } 3532 jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS); 3533 addl(cnt1, stride); 3534 3535 // Compare the characters at index in cnt1 3536 bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character 3537 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 3538 subl(result, cnt2); 3539 jmp(POP_LABEL); 3540 3541 // Setup the registers to start vector comparison loop 3542 bind(COMPARE_WIDE_VECTORS); 3543 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3544 lea(str1, Address(str1, result, scale)); 3545 lea(str2, Address(str2, result, scale)); 3546 } else { 3547 lea(str1, Address(str1, result, scale1)); 3548 lea(str2, Address(str2, result, scale2)); 3549 } 3550 subl(result, stride2); 3551 subl(cnt2, stride2); 3552 jcc(Assembler::zero, COMPARE_WIDE_TAIL); 3553 negptr(result); 3554 3555 // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest) 3556 bind(COMPARE_WIDE_VECTORS_LOOP); 3557 3558 #ifdef _LP64 3559 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 3560 cmpl(cnt2, stride2x2); 3561 jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2); 3562 testl(cnt2, stride2x2-1); // cnt2 holds the vector count 3563 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2); // means we cannot subtract by 0x40 3564 3565 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 3566 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3567 evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit); 3568 evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3569 } else { 3570 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit); 3571 evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3572 } 3573 kortestql(mask, mask); 3574 jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED); // miscompare 3575 addptr(result, stride2x2); // update since we already compared at this addr 3576 subl(cnt2, stride2x2); // and sub the size too 3577 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3); 3578 3579 vpxor(vec1, vec1); 3580 jmpb(COMPARE_WIDE_TAIL); 3581 }//if (VM_Version::supports_avx512vlbw()) 3582 #endif // _LP64 3583 3584 3585 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3586 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3587 vmovdqu(vec1, Address(str1, result, scale)); 3588 vpxor(vec1, Address(str2, result, scale)); 3589 } else { 3590 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit); 3591 vpxor(vec1, Address(str2, result, scale2)); 3592 } 3593 vptest(vec1, vec1); 3594 jcc(Assembler::notZero, VECTOR_NOT_EQUAL); 3595 addptr(result, stride2); 3596 subl(cnt2, stride2); 3597 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP); 3598 // clean upper bits of YMM registers 
3599 vpxor(vec1, vec1); 3600 3601 // compare wide vectors tail 3602 bind(COMPARE_WIDE_TAIL); 3603 testptr(result, result); 3604 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3605 3606 movl(result, stride2); 3607 movl(cnt2, result); 3608 negptr(result); 3609 jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3610 3611 // Identifies the mismatching (higher or lower) 16 bytes in the 32-byte vectors. 3612 bind(VECTOR_NOT_EQUAL); 3613 // clean upper bits of YMM registers 3614 vpxor(vec1, vec1); 3615 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3616 lea(str1, Address(str1, result, scale)); 3617 lea(str2, Address(str2, result, scale)); 3618 } else { 3619 lea(str1, Address(str1, result, scale1)); 3620 lea(str2, Address(str2, result, scale2)); 3621 } 3622 jmp(COMPARE_16_CHARS); 3623 3624 // Compare tail chars, length between 1 and 15 chars 3625 bind(COMPARE_TAIL_LONG); 3626 movl(cnt2, result); 3627 cmpl(cnt2, stride); 3628 jcc(Assembler::less, COMPARE_SMALL_STR); 3629 3630 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3631 movdqu(vec1, Address(str1, 0)); 3632 } else { 3633 pmovzxbw(vec1, Address(str1, 0)); 3634 } 3635 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3636 jcc(Assembler::below, COMPARE_INDEX_CHAR); 3637 subptr(cnt2, stride); 3638 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3639 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3640 lea(str1, Address(str1, result, scale)); 3641 lea(str2, Address(str2, result, scale)); 3642 } else { 3643 lea(str1, Address(str1, result, scale1)); 3644 lea(str2, Address(str2, result, scale2)); 3645 } 3646 negptr(cnt2); 3647 jmpb(WHILE_HEAD_LABEL); 3648 3649 bind(COMPARE_SMALL_STR); 3650 } else if (UseSSE42Intrinsics) { 3651 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL; 3652 int pcmpmask = 0x19; 3653 // Setup to compare 8-char (16-byte) vectors, 3654 // start from first character again because it has aligned address.
3655 movl(result, cnt2); 3656 andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count 3657 if (ae == StrIntrinsicNode::LL) { 3658 pcmpmask &= ~0x01; 3659 } 3660 jcc(Assembler::zero, COMPARE_TAIL); 3661 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3662 lea(str1, Address(str1, result, scale)); 3663 lea(str2, Address(str2, result, scale)); 3664 } else { 3665 lea(str1, Address(str1, result, scale1)); 3666 lea(str2, Address(str2, result, scale2)); 3667 } 3668 negptr(result); 3669 3670 // pcmpestri 3671 // inputs: 3672 // vec1- substring 3673 // rax - negative string length (elements count) 3674 // mem - scanned string 3675 // rdx - string length (elements count) 3676 // pcmpmask - cmp mode: 11000 (string compare with negated result) 3677 // + 00 (unsigned bytes) or + 01 (unsigned shorts) 3678 // outputs: 3679 // rcx - first mismatched element index 3680 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3681 3682 bind(COMPARE_WIDE_VECTORS); 3683 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3684 movdqu(vec1, Address(str1, result, scale)); 3685 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 3686 } else { 3687 pmovzxbw(vec1, Address(str1, result, scale1)); 3688 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 3689 } 3690 // After pcmpestri cnt1(rcx) contains mismatched element index 3691 3692 jccb(Assembler::below, VECTOR_NOT_EQUAL); // CF==1 3693 addptr(result, stride); 3694 subptr(cnt2, stride); 3695 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 3696 3697 // compare wide vectors tail 3698 testptr(result, result); 3699 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3700 3701 movl(cnt2, stride); 3702 movl(result, stride); 3703 negptr(result); 3704 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3705 movdqu(vec1, Address(str1, result, scale)); 3706 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 3707 } else { 3708 pmovzxbw(vec1, Address(str1, result, scale1)); 3709 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 3710 } 3711 jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL); 3712 3713 // Mismatched characters in the vectors 3714 bind(VECTOR_NOT_EQUAL); 3715 addptr(cnt1, result); 3716 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 3717 subl(result, cnt2); 3718 jmpb(POP_LABEL); 3719 3720 bind(COMPARE_TAIL); // limit is zero 3721 movl(cnt2, result); 3722 // Fallthru to tail compare 3723 } 3724 // Shift str2 and str1 to the end of the arrays, negate min 3725 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3726 lea(str1, Address(str1, cnt2, scale)); 3727 lea(str2, Address(str2, cnt2, scale)); 3728 } else { 3729 lea(str1, Address(str1, cnt2, scale1)); 3730 lea(str2, Address(str2, cnt2, scale2)); 3731 } 3732 decrementl(cnt2); // first character was compared already 3733 negptr(cnt2); 3734 3735 // Compare the rest of the elements 3736 bind(WHILE_HEAD_LABEL); 3737 load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae); 3738 subl(result, cnt1); 3739 jccb(Assembler::notZero, POP_LABEL); 3740 increment(cnt2); 3741 jccb(Assembler::notZero, WHILE_HEAD_LABEL); 3742 3743 // Strings are equal up to min length. Return the length difference. 
3744 bind(LENGTH_DIFF_LABEL); 3745 pop(result); 3746 if (ae == StrIntrinsicNode::UU) { 3747 // Divide diff by 2 to get number of chars 3748 sarl(result, 1); 3749 } 3750 jmpb(DONE_LABEL); 3751 3752 #ifdef _LP64 3753 if (VM_Version::supports_avx512vlbw()) { 3754 3755 bind(COMPARE_WIDE_VECTORS_LOOP_FAILED); 3756 3757 kmovql(cnt1, mask); 3758 notq(cnt1); 3759 bsfq(cnt2, cnt1); 3760 if (ae != StrIntrinsicNode::LL) { 3761 // Divide diff by 2 to get number of chars 3762 sarl(cnt2, 1); 3763 } 3764 addq(result, cnt2); 3765 if (ae == StrIntrinsicNode::LL) { 3766 load_unsigned_byte(cnt1, Address(str2, result)); 3767 load_unsigned_byte(result, Address(str1, result)); 3768 } else if (ae == StrIntrinsicNode::UU) { 3769 load_unsigned_short(cnt1, Address(str2, result, scale)); 3770 load_unsigned_short(result, Address(str1, result, scale)); 3771 } else { 3772 load_unsigned_short(cnt1, Address(str2, result, scale2)); 3773 load_unsigned_byte(result, Address(str1, result, scale1)); 3774 } 3775 subl(result, cnt1); 3776 jmpb(POP_LABEL); 3777 }//if (VM_Version::supports_avx512vlbw()) 3778 #endif // _LP64 3779 3780 // Discard the stored length difference 3781 bind(POP_LABEL); 3782 pop(cnt1); 3783 3784 // That's it 3785 bind(DONE_LABEL); 3786 if(ae == StrIntrinsicNode::UL) { 3787 negl(result); 3788 } 3789 3790 } 3791 3792 // Search for Non-ASCII character (Negative byte value) in a byte array, 3793 // return the index of the first such character, otherwise the length 3794 // of the array segment searched. 3795 // ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java 3796 // @IntrinsicCandidate 3797 // public static int countPositives(byte[] ba, int off, int len) { 3798 // for (int i = off; i < off + len; i++) { 3799 // if (ba[i] < 0) { 3800 // return i - off; 3801 // } 3802 // } 3803 // return len; 3804 // } 3805 void C2_MacroAssembler::count_positives(Register ary1, Register len, 3806 Register result, Register tmp1, 3807 XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) { 3808 // rsi: byte array 3809 // rcx: len 3810 // rax: result 3811 ShortBranchVerifier sbv(this); 3812 assert_different_registers(ary1, len, result, tmp1); 3813 assert_different_registers(vec1, vec2); 3814 Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE; 3815 3816 movl(result, len); // copy 3817 // len == 0 3818 testl(len, len); 3819 jcc(Assembler::zero, DONE); 3820 3821 if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512 3822 VM_Version::supports_avx512vlbw() && 3823 VM_Version::supports_bmi2()) { 3824 3825 Label test_64_loop, test_tail, BREAK_LOOP; 3826 Register tmp3_aliased = len; 3827 3828 movl(tmp1, len); 3829 vpxor(vec2, vec2, vec2, Assembler::AVX_512bit); 3830 3831 andl(tmp1, 64 - 1); // tail count (in chars) 0x3F 3832 andl(len, ~(64 - 1)); // vector count (in chars) 3833 jccb(Assembler::zero, test_tail); 3834 3835 lea(ary1, Address(ary1, len, Address::times_1)); 3836 negptr(len); 3837 3838 bind(test_64_loop); 3839 // Check whether our 64 elements of size byte contain negatives 3840 evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit); 3841 kortestql(mask1, mask1); 3842 jcc(Assembler::notZero, BREAK_LOOP); 3843 3844 addptr(len, 64); 3845 jccb(Assembler::notZero, test_64_loop); 3846 3847 bind(test_tail); 3848 // bail out when there is nothing to be done 3849 testl(tmp1, -1); 3850 jcc(Assembler::zero, DONE); 3851 3852 // ~(~0 << len) applied up to two times (for 32-bit scenario) 3853 #ifdef _LP64 3854 mov64(tmp3_aliased, 
0xFFFFFFFFFFFFFFFF); 3855 shlxq(tmp3_aliased, tmp3_aliased, tmp1); 3856 notq(tmp3_aliased); 3857 kmovql(mask2, tmp3_aliased); 3858 #else 3859 Label k_init; 3860 jmp(k_init); 3861 3862 // We cannot read 64 bits from a general purpose register, thus we move the 3863 // data required to compose 64 1's into the instruction stream. 3864 // We emit a 64-byte wide series of elements from 0..63 which will later 3865 // be used as compare targets against the tail count contained in the tmp1 register. 3866 // The result is a k register having tmp1 consecutive 1's set, 3867 // counting from the least significant bit. 3868 address tmp = pc(); 3869 emit_int64(0x0706050403020100); 3870 emit_int64(0x0F0E0D0C0B0A0908); 3871 emit_int64(0x1716151413121110); 3872 emit_int64(0x1F1E1D1C1B1A1918); 3873 emit_int64(0x2726252423222120); 3874 emit_int64(0x2F2E2D2C2B2A2928); 3875 emit_int64(0x3736353433323130); 3876 emit_int64(0x3F3E3D3C3B3A3938); 3877 3878 bind(k_init); 3879 lea(len, InternalAddress(tmp)); 3880 // create mask to test for negative byte inside a vector 3881 evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit); 3882 evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit); 3883 3884 #endif 3885 evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit); 3886 ktestq(mask1, mask2); 3887 jcc(Assembler::zero, DONE); 3888 3889 bind(BREAK_LOOP); 3890 // At least one byte in the last 64 bytes is negative. 3891 // Set up to look at the last 64 bytes as if they were a tail 3892 lea(ary1, Address(ary1, len, Address::times_1)); 3893 addptr(result, len); 3894 // Ignore the very last byte: if all others are positive, 3895 // it must be negative, so we can skip right to the 2+1 byte 3896 // end comparison at this point 3897 orl(result, 63); 3898 movl(len, 63); 3899 // Fallthru to tail compare 3900 } else { 3901 3902 if (UseAVX >= 2 && UseSSE >= 2) { 3903 // With AVX2, use 32-byte vector compare 3904 Label COMPARE_WIDE_VECTORS, BREAK_LOOP; 3905 3906 // Compare 32-byte vectors 3907 testl(len, 0xffffffe0); // vector count (in bytes) 3908 jccb(Assembler::zero, TAIL_START); 3909 3910 andl(len, 0xffffffe0); 3911 lea(ary1, Address(ary1, len, Address::times_1)); 3912 negptr(len); 3913 3914 movl(tmp1, 0x80808080); // create mask to test for negative bytes in the vector 3915 movdl(vec2, tmp1); 3916 vpbroadcastd(vec2, vec2, Assembler::AVX_256bit); 3917 3918 bind(COMPARE_WIDE_VECTORS); 3919 vmovdqu(vec1, Address(ary1, len, Address::times_1)); 3920 vptest(vec1, vec2); 3921 jccb(Assembler::notZero, BREAK_LOOP); 3922 addptr(len, 32); 3923 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 3924 3925 testl(result, 0x0000001f); // any bytes remaining? 3926 jcc(Assembler::zero, DONE); 3927 3928 // Quick test using the already prepared vector mask 3929 movl(len, result); 3930 andl(len, 0x0000001f); 3931 vmovdqu(vec1, Address(ary1, len, Address::times_1, -32)); 3932 vptest(vec1, vec2); 3933 jcc(Assembler::zero, DONE); 3934 // There are negative bytes, jump to the tail to determine exactly where 3935 jmpb(TAIL_START); 3936 3937 bind(BREAK_LOOP); 3938 // At least one byte in the last 32-byte vector is negative.
3939 // Set up to look at the last 32 bytes as if they were a tail 3940 lea(ary1, Address(ary1, len, Address::times_1)); 3941 addptr(result, len); 3942 // Ignore the very last byte: if all others are positive, 3943 // it must be negative, so we can skip right to the 2+1 byte 3944 // end comparison at this point 3945 orl(result, 31); 3946 movl(len, 31); 3947 // Fallthru to tail compare 3948 } else if (UseSSE42Intrinsics) { 3949 // With SSE4.2, use double quad vector compare 3950 Label COMPARE_WIDE_VECTORS, BREAK_LOOP; 3951 3952 // Compare 16-byte vectors 3953 testl(len, 0xfffffff0); // vector count (in bytes) 3954 jcc(Assembler::zero, TAIL_START); 3955 3956 andl(len, 0xfffffff0); 3957 lea(ary1, Address(ary1, len, Address::times_1)); 3958 negptr(len); 3959 3960 movl(tmp1, 0x80808080); 3961 movdl(vec2, tmp1); 3962 pshufd(vec2, vec2, 0); 3963 3964 bind(COMPARE_WIDE_VECTORS); 3965 movdqu(vec1, Address(ary1, len, Address::times_1)); 3966 ptest(vec1, vec2); 3967 jccb(Assembler::notZero, BREAK_LOOP); 3968 addptr(len, 16); 3969 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 3970 3971 testl(result, 0x0000000f); // len is zero, any bytes remaining? 3972 jcc(Assembler::zero, DONE); 3973 3974 // Quick test using the already prepared vector mask 3975 movl(len, result); 3976 andl(len, 0x0000000f); // tail count (in bytes) 3977 movdqu(vec1, Address(ary1, len, Address::times_1, -16)); 3978 ptest(vec1, vec2); 3979 jcc(Assembler::zero, DONE); 3980 jmpb(TAIL_START); 3981 3982 bind(BREAK_LOOP); 3983 // At least one byte in the last 16-byte vector is negative. 3984 // Set up and look at the last 16 bytes as if they were a tail 3985 lea(ary1, Address(ary1, len, Address::times_1)); 3986 addptr(result, len); 3987 // Ignore the very last byte: if all others are positive, 3988 // it must be negative, so we can skip right to the 2+1 byte 3989 // end comparison at this point 3990 orl(result, 15); 3991 movl(len, 15); 3992 // Fallthru to tail compare 3993 } 3994 } 3995 3996 bind(TAIL_START); 3997 // Compare 4-byte vectors 3998 andl(len, 0xfffffffc); // vector count (in bytes) 3999 jccb(Assembler::zero, COMPARE_CHAR); 4000 4001 lea(ary1, Address(ary1, len, Address::times_1)); 4002 negptr(len); 4003 4004 bind(COMPARE_VECTORS); 4005 movl(tmp1, Address(ary1, len, Address::times_1)); 4006 andl(tmp1, 0x80808080); 4007 jccb(Assembler::notZero, TAIL_ADJUST); 4008 addptr(len, 4); 4009 jccb(Assembler::notZero, COMPARE_VECTORS); 4010 4011 // Compare trailing char (final 2-3 bytes), if any 4012 bind(COMPARE_CHAR); 4013 4014 testl(result, 0x2); // tail char 4015 jccb(Assembler::zero, COMPARE_BYTE); 4016 load_unsigned_short(tmp1, Address(ary1, 0)); 4017 andl(tmp1, 0x00008080); 4018 jccb(Assembler::notZero, CHAR_ADJUST); 4019 lea(ary1, Address(ary1, 2)); 4020 4021 bind(COMPARE_BYTE); 4022 testl(result, 0x1); // tail byte 4023 jccb(Assembler::zero, DONE); 4024 load_unsigned_byte(tmp1, Address(ary1, 0)); 4025 testl(tmp1, 0x00000080); 4026 jccb(Assembler::zero, DONE); 4027 subptr(result, 1); 4028 jmpb(DONE); 4029 4030 bind(TAIL_ADJUST); 4031 // there are negative bits in the last 4 byte block. 4032 // Adjust result and check the next three bytes 4033 addptr(result, len); 4034 orl(result, 3); 4035 lea(ary1, Address(ary1, len, Address::times_1)); 4036 jmpb(COMPARE_CHAR); 4037 4038 bind(CHAR_ADJUST); 4039 // We are looking at a char + optional byte tail, and found that one 4040 // of the bytes in the char is negative. Adjust the result, check the 4041 // first byte and readjust if needed. 
4042 andl(result, 0xfffffffc); 4043 testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first 4044 jccb(Assembler::notZero, DONE); 4045 addptr(result, 1); 4046 4047 // That's it 4048 bind(DONE); 4049 if (UseAVX >= 2 && UseSSE >= 2) { 4050 // clean upper bits of YMM registers 4051 vpxor(vec1, vec1); 4052 vpxor(vec2, vec2); 4053 } 4054 } 4055 4056 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings. 4057 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2, 4058 Register limit, Register result, Register chr, 4059 XMMRegister vec1, XMMRegister vec2, bool is_char, KRegister mask) { 4060 ShortBranchVerifier sbv(this); 4061 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE; 4062 4063 int length_offset = arrayOopDesc::length_offset_in_bytes(); 4064 int base_offset = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE); 4065 4066 if (is_array_equ) { 4067 // Check the input args 4068 cmpoop(ary1, ary2); 4069 jcc(Assembler::equal, TRUE_LABEL); 4070 4071 // Need additional checks for arrays_equals. 4072 testptr(ary1, ary1); 4073 jcc(Assembler::zero, FALSE_LABEL); 4074 testptr(ary2, ary2); 4075 jcc(Assembler::zero, FALSE_LABEL); 4076 4077 // Check the lengths 4078 movl(limit, Address(ary1, length_offset)); 4079 cmpl(limit, Address(ary2, length_offset)); 4080 jcc(Assembler::notEqual, FALSE_LABEL); 4081 } 4082 4083 // count == 0 4084 testl(limit, limit); 4085 jcc(Assembler::zero, TRUE_LABEL); 4086 4087 if (is_array_equ) { 4088 // Load array address 4089 lea(ary1, Address(ary1, base_offset)); 4090 lea(ary2, Address(ary2, base_offset)); 4091 } 4092 4093 if (is_array_equ && is_char) { 4094 // arrays_equals when used for char[]. 4095 shll(limit, 1); // byte count != 0 4096 } 4097 movl(result, limit); // copy 4098 4099 if (UseAVX >= 2) { 4100 // With AVX2, use 32-byte vector compare 4101 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 4102 4103 // Compare 32-byte vectors 4104 andl(result, 0x0000001f); // tail count (in bytes) 4105 andl(limit, 0xffffffe0); // vector count (in bytes) 4106 jcc(Assembler::zero, COMPARE_TAIL); 4107 4108 lea(ary1, Address(ary1, limit, Address::times_1)); 4109 lea(ary2, Address(ary2, limit, Address::times_1)); 4110 negptr(limit); 4111 4112 #ifdef _LP64 4113 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 4114 Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3; 4115 4116 cmpl(limit, -64); 4117 jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2); 4118 4119 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 4120 4121 evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit); 4122 evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit); 4123 kortestql(mask, mask); 4124 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4125 addptr(limit, 64); // update since we already compared at this addr 4126 cmpl(limit, -64); 4127 jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3); 4128 4129 // At this point we may still need to compare -limit+result bytes. 4130 // We could execute the next two instruction and just continue via non-wide path: 4131 // cmpl(limit, 0); 4132 // jcc(Assembler::equal, COMPARE_TAIL); // true 4133 // But since we stopped at the points ary{1,2}+limit which are 4134 // not farther than 64 bytes from the ends of arrays ary{1,2}+result 4135 // (|limit| <= 32 and result < 32), 4136 // we may just compare the last 64 bytes. 
4137 // 4138 addptr(result, -64); // it is safe, bc we just came from this area 4139 evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit); 4140 evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit); 4141 kortestql(mask, mask); 4142 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4143 4144 jmp(TRUE_LABEL); 4145 4146 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 4147 4148 }//if (VM_Version::supports_avx512vlbw()) 4149 #endif //_LP64 4150 bind(COMPARE_WIDE_VECTORS); 4151 vmovdqu(vec1, Address(ary1, limit, Address::times_1)); 4152 vmovdqu(vec2, Address(ary2, limit, Address::times_1)); 4153 vpxor(vec1, vec2); 4154 4155 vptest(vec1, vec1); 4156 jcc(Assembler::notZero, FALSE_LABEL); 4157 addptr(limit, 32); 4158 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4159 4160 testl(result, result); 4161 jcc(Assembler::zero, TRUE_LABEL); 4162 4163 vmovdqu(vec1, Address(ary1, result, Address::times_1, -32)); 4164 vmovdqu(vec2, Address(ary2, result, Address::times_1, -32)); 4165 vpxor(vec1, vec2); 4166 4167 vptest(vec1, vec1); 4168 jccb(Assembler::notZero, FALSE_LABEL); 4169 jmpb(TRUE_LABEL); 4170 4171 bind(COMPARE_TAIL); // limit is zero 4172 movl(limit, result); 4173 // Fallthru to tail compare 4174 } else if (UseSSE42Intrinsics) { 4175 // With SSE4.2, use double quad vector compare 4176 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 4177 4178 // Compare 16-byte vectors 4179 andl(result, 0x0000000f); // tail count (in bytes) 4180 andl(limit, 0xfffffff0); // vector count (in bytes) 4181 jcc(Assembler::zero, COMPARE_TAIL); 4182 4183 lea(ary1, Address(ary1, limit, Address::times_1)); 4184 lea(ary2, Address(ary2, limit, Address::times_1)); 4185 negptr(limit); 4186 4187 bind(COMPARE_WIDE_VECTORS); 4188 movdqu(vec1, Address(ary1, limit, Address::times_1)); 4189 movdqu(vec2, Address(ary2, limit, Address::times_1)); 4190 pxor(vec1, vec2); 4191 4192 ptest(vec1, vec1); 4193 jcc(Assembler::notZero, FALSE_LABEL); 4194 addptr(limit, 16); 4195 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4196 4197 testl(result, result); 4198 jcc(Assembler::zero, TRUE_LABEL); 4199 4200 movdqu(vec1, Address(ary1, result, Address::times_1, -16)); 4201 movdqu(vec2, Address(ary2, result, Address::times_1, -16)); 4202 pxor(vec1, vec2); 4203 4204 ptest(vec1, vec1); 4205 jccb(Assembler::notZero, FALSE_LABEL); 4206 jmpb(TRUE_LABEL); 4207 4208 bind(COMPARE_TAIL); // limit is zero 4209 movl(limit, result); 4210 // Fallthru to tail compare 4211 } 4212 4213 // Compare 4-byte vectors 4214 andl(limit, 0xfffffffc); // vector count (in bytes) 4215 jccb(Assembler::zero, COMPARE_CHAR); 4216 4217 lea(ary1, Address(ary1, limit, Address::times_1)); 4218 lea(ary2, Address(ary2, limit, Address::times_1)); 4219 negptr(limit); 4220 4221 bind(COMPARE_VECTORS); 4222 movl(chr, Address(ary1, limit, Address::times_1)); 4223 cmpl(chr, Address(ary2, limit, Address::times_1)); 4224 jccb(Assembler::notEqual, FALSE_LABEL); 4225 addptr(limit, 4); 4226 jcc(Assembler::notZero, COMPARE_VECTORS); 4227 4228 // Compare trailing char (final 2 bytes), if any 4229 bind(COMPARE_CHAR); 4230 testl(result, 0x2); // tail char 4231 jccb(Assembler::zero, COMPARE_BYTE); 4232 load_unsigned_short(chr, Address(ary1, 0)); 4233 load_unsigned_short(limit, Address(ary2, 0)); 4234 cmpl(chr, limit); 4235 jccb(Assembler::notEqual, FALSE_LABEL); 4236 4237 if (is_array_equ && is_char) { 4238 bind(COMPARE_BYTE); 4239 } else { 4240 lea(ary1, Address(ary1, 2)); 4241 lea(ary2, Address(ary2, 2)); 4242 4243 bind(COMPARE_BYTE); 4244 testl(result, 0x1); 
// tail byte 4245 jccb(Assembler::zero, TRUE_LABEL); 4246 load_unsigned_byte(chr, Address(ary1, 0)); 4247 load_unsigned_byte(limit, Address(ary2, 0)); 4248 cmpl(chr, limit); 4249 jccb(Assembler::notEqual, FALSE_LABEL); 4250 } 4251 bind(TRUE_LABEL); 4252 movl(result, 1); // return true 4253 jmpb(DONE); 4254 4255 bind(FALSE_LABEL); 4256 xorl(result, result); // return false 4257 4258 // That's it 4259 bind(DONE); 4260 if (UseAVX >= 2) { 4261 // clean upper bits of YMM registers 4262 vpxor(vec1, vec1); 4263 vpxor(vec2, vec2); 4264 } 4265 } 4266 4267 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4268 XMMRegister src1, int imm8, bool merge, int vlen_enc) { 4269 switch(ideal_opc) { 4270 case Op_LShiftVS: 4271 Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break; 4272 case Op_LShiftVI: 4273 Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break; 4274 case Op_LShiftVL: 4275 Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break; 4276 case Op_RShiftVS: 4277 Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break; 4278 case Op_RShiftVI: 4279 Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break; 4280 case Op_RShiftVL: 4281 Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break; 4282 case Op_URShiftVS: 4283 Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break; 4284 case Op_URShiftVI: 4285 Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break; 4286 case Op_URShiftVL: 4287 Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break; 4288 case Op_RotateRightV: 4289 evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4290 case Op_RotateLeftV: 4291 evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4292 default: 4293 fatal("Unsupported masked operation"); break; 4294 } 4295 } 4296 4297 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4298 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc, 4299 bool is_varshift) { 4300 switch (ideal_opc) { 4301 case Op_AddVB: 4302 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4303 case Op_AddVS: 4304 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4305 case Op_AddVI: 4306 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4307 case Op_AddVL: 4308 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4309 case Op_AddVF: 4310 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4311 case Op_AddVD: 4312 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4313 case Op_SubVB: 4314 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4315 case Op_SubVS: 4316 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4317 case Op_SubVI: 4318 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4319 case Op_SubVL: 4320 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4321 case Op_SubVF: 4322 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4323 case Op_SubVD: 4324 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4325 case Op_MulVS: 4326 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4327 case Op_MulVI: 4328 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4329 case Op_MulVL: 4330 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4331 case Op_MulVF: 4332 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4333 case Op_MulVD: 4334 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4335 case Op_DivVF: 4336 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 
4337 case Op_DivVD: 4338 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4339 case Op_SqrtVF: 4340 evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break; 4341 case Op_SqrtVD: 4342 evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break; 4343 case Op_AbsVB: 4344 evpabsb(dst, mask, src2, merge, vlen_enc); break; 4345 case Op_AbsVS: 4346 evpabsw(dst, mask, src2, merge, vlen_enc); break; 4347 case Op_AbsVI: 4348 evpabsd(dst, mask, src2, merge, vlen_enc); break; 4349 case Op_AbsVL: 4350 evpabsq(dst, mask, src2, merge, vlen_enc); break; 4351 case Op_FmaVF: 4352 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4353 case Op_FmaVD: 4354 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4355 case Op_VectorRearrange: 4356 evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break; 4357 case Op_LShiftVS: 4358 evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4359 case Op_LShiftVI: 4360 evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4361 case Op_LShiftVL: 4362 evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4363 case Op_RShiftVS: 4364 evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4365 case Op_RShiftVI: 4366 evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4367 case Op_RShiftVL: 4368 evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4369 case Op_URShiftVS: 4370 evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4371 case Op_URShiftVI: 4372 evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4373 case Op_URShiftVL: 4374 evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4375 case Op_RotateLeftV: 4376 evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4377 case Op_RotateRightV: 4378 evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4379 case Op_MaxV: 4380 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4381 case Op_MinV: 4382 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4383 case Op_XorV: 4384 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4385 case Op_OrV: 4386 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4387 case Op_AndV: 4388 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4389 default: 4390 fatal("Unsupported masked operation"); break; 4391 } 4392 } 4393 4394 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4395 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4396 switch (ideal_opc) { 4397 case Op_AddVB: 4398 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4399 case Op_AddVS: 4400 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4401 case Op_AddVI: 4402 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4403 case Op_AddVL: 4404 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4405 case Op_AddVF: 4406 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4407 case Op_AddVD: 4408 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4409 case Op_SubVB: 4410 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4411 case Op_SubVS: 4412 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4413 case Op_SubVI: 4414 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4415 case Op_SubVL: 4416 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4417 case Op_SubVF: 4418 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4419 case Op_SubVD: 4420 evsubpd(dst, mask, src1, src2, merge, vlen_enc); 
break; 4421 case Op_MulVS: 4422 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4423 case Op_MulVI: 4424 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4425 case Op_MulVL: 4426 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4427 case Op_MulVF: 4428 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4429 case Op_MulVD: 4430 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4431 case Op_DivVF: 4432 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4433 case Op_DivVD: 4434 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4435 case Op_FmaVF: 4436 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4437 case Op_FmaVD: 4438 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4439 case Op_MaxV: 4440 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4441 case Op_MinV: 4442 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4443 case Op_XorV: 4444 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4445 case Op_OrV: 4446 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4447 case Op_AndV: 4448 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4449 default: 4450 fatal("Unsupported masked operation"); break; 4451 } 4452 } 4453 4454 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst, 4455 KRegister src1, KRegister src2) { 4456 BasicType etype = T_ILLEGAL; 4457 switch(mask_len) { 4458 case 2: 4459 case 4: 4460 case 8: etype = T_BYTE; break; 4461 case 16: etype = T_SHORT; break; 4462 case 32: etype = T_INT; break; 4463 case 64: etype = T_LONG; break; 4464 default: fatal("Unsupported type"); break; 4465 } 4466 assert(etype != T_ILLEGAL, ""); 4467 switch(ideal_opc) { 4468 case Op_AndVMask: 4469 kand(etype, dst, src1, src2); break; 4470 case Op_OrVMask: 4471 kor(etype, dst, src1, src2); break; 4472 case Op_XorVMask: 4473 kxor(etype, dst, src1, src2); break; 4474 default: 4475 fatal("Unsupported masked operation"); break; 4476 } 4477 } 4478 4479 /* 4480 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 4481 * If src is NaN, the result is 0. 4482 * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE, 4483 * the result is equal to the value of Integer.MIN_VALUE. 4484 * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE, 4485 * the result is equal to the value of Integer.MAX_VALUE. 4486 */ 4487 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4488 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 4489 Register rscratch, AddressLiteral float_sign_flip, 4490 int vec_enc) { 4491 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4492 Label done; 4493 vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch); 4494 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 4495 vptest(xtmp2, xtmp2, vec_enc); 4496 jccb(Assembler::equal, done); 4497 4498 vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc); 4499 vpxor(xtmp1, xtmp1, xtmp4, vec_enc); 4500 4501 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 4502 vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc); 4503 vblendvps(dst, dst, xtmp4, xtmp3, vec_enc); 4504 4505 // Recompute the mask for remaining special value. 4506 vpxor(xtmp2, xtmp2, xtmp3, vec_enc); 4507 // Extract SRC values corresponding to TRUE mask lanes. 
4508 vpand(xtmp4, xtmp2, src, vec_enc); 4509 // Flip mask bits so that MSB bit of MASK lanes corresponding to +ve special 4510 // values are set. 4511 vpxor(xtmp3, xtmp2, xtmp4, vec_enc); 4512 4513 vblendvps(dst, dst, xtmp1, xtmp3, vec_enc); 4514 bind(done); 4515 } 4516 4517 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4518 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4519 Register rscratch, AddressLiteral float_sign_flip, 4520 int vec_enc) { 4521 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4522 Label done; 4523 evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 4524 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 4525 kortestwl(ktmp1, ktmp1); 4526 jccb(Assembler::equal, done); 4527 4528 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4529 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4530 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 4531 4532 kxorwl(ktmp1, ktmp1, ktmp2); 4533 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4534 vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4535 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 4536 bind(done); 4537 } 4538 4539 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4540 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4541 Register rscratch, AddressLiteral double_sign_flip, 4542 int vec_enc) { 4543 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing"); 4544 4545 Label done; 4546 evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch); 4547 Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc); 4548 kortestwl(ktmp1, ktmp1); 4549 jccb(Assembler::equal, done); 4550 4551 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4552 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4553 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc); 4554 4555 kxorwl(ktmp1, ktmp1, ktmp2); 4556 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4557 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4558 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc); 4559 bind(done); 4560 } 4561 4562 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4563 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4564 Register rscratch, AddressLiteral float_sign_flip, 4565 int vec_enc) { 4566 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4567 Label done; 4568 evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 4569 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 4570 kortestwl(ktmp1, ktmp1); 4571 jccb(Assembler::equal, done); 4572 4573 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4574 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4575 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 4576 4577 kxorwl(ktmp1, ktmp1, ktmp2); 4578 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4579 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4580 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 4581 bind(done); 4582 } 4583 4584 /* 4585 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 4586 * If src is NaN, the result is 0. 4587 * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE, 4588 * the result is equal to the value of Long.MIN_VALUE. 
4589 * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE, 4590 * the result is equal to the value of Long.MAX_VALUE. 4591 */ 4592 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4593 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4594 Register rscratch, AddressLiteral double_sign_flip, 4595 int vec_enc) { 4596 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing"); 4597 4598 Label done; 4599 evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch); 4600 evpcmpeqq(ktmp1, xtmp1, dst, vec_enc); 4601 kortestwl(ktmp1, ktmp1); 4602 jccb(Assembler::equal, done); 4603 4604 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4605 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4606 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc); 4607 4608 kxorwl(ktmp1, ktmp1, ktmp2); 4609 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4610 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4611 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc); 4612 bind(done); 4613 } 4614 4615 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero, 4616 XMMRegister xtmp, int index, int vec_enc) { 4617 assert(vec_enc < Assembler::AVX_512bit, ""); 4618 if (vec_enc == Assembler::AVX_256bit) { 4619 vextractf128_high(xtmp, src); 4620 vshufps(dst, src, xtmp, index, vec_enc); 4621 } else { 4622 vshufps(dst, src, zero, index, vec_enc); 4623 } 4624 } 4625 4626 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 4627 XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch, 4628 AddressLiteral float_sign_flip, int src_vec_enc) { 4629 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4630 4631 Label done; 4632 // Compare the destination lanes with float_sign_flip 4633 // value to get mask for all special values. 4634 movdqu(xtmp1, float_sign_flip, rscratch); 4635 vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit); 4636 ptest(xtmp2, xtmp2); 4637 jccb(Assembler::equal, done); 4638 4639 // Flip float_sign_flip to get max integer value. 4640 vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit); 4641 pxor(xtmp1, xtmp4); 4642 4643 // Set destination lanes corresponding to unordered source lanes to zero. 4644 vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc); 4645 vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc); 4646 4647 // Shuffle the mask vector and pack the lower double word from each quadword lane. 4648 vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc); 4649 vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit); 4650 4651 // Recompute the mask for remaining special value. 4652 pxor(xtmp2, xtmp3); 4653 // Extract mask corresponding to non-negative source lanes. 4654 vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc); 4655 4656 // Shuffle the mask vector and pack the lower double word from each quadword lane. 4657 vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc); 4658 pand(xtmp3, xtmp2); 4659 4660 // Replace destination lanes holding the special value (0x80000000) with max int 4661 // if the corresponding source lane holds a +ve value.
4662 vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit); 4663 bind(done); 4664 } 4665 4666 4667 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero, 4668 XMMRegister xtmp, Register rscratch, int vec_enc) { 4669 switch(to_elem_bt) { 4670 case T_SHORT: 4671 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing"); 4672 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch); 4673 vpackusdw(dst, dst, zero, vec_enc); 4674 if (vec_enc == Assembler::AVX_256bit) { 4675 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc); 4676 } 4677 break; 4678 case T_BYTE: 4679 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing"); 4680 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch); 4681 vpackusdw(dst, dst, zero, vec_enc); 4682 if (vec_enc == Assembler::AVX_256bit) { 4683 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc); 4684 } 4685 vpackuswb(dst, dst, zero, vec_enc); 4686 break; 4687 default: assert(false, "%s", type2name(to_elem_bt)); 4688 } 4689 } 4690 4691 /* 4692 * Algorithm for vector D2L and F2I conversions:- 4693 * a) Perform vector D2L/F2I cast. 4694 * b) Choose fast path if none of the result vector lane contains 0x80000000 value. 4695 * It signifies that source value could be any of the special floating point 4696 * values(NaN,-Inf,Inf,Max,-Min). 4697 * c) Set destination to zero if source is NaN value. 4698 * d) Replace 0x80000000 with MaxInt if source lane contains a +ve value. 4699 */ 4700 4701 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4702 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 4703 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 4704 int to_elem_sz = type2aelembytes(to_elem_bt); 4705 assert(to_elem_sz <= 4, ""); 4706 vcvttps2dq(dst, src, vec_enc); 4707 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc); 4708 if (to_elem_sz < 4) { 4709 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 4710 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc); 4711 } 4712 } 4713 4714 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4715 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip, 4716 Register rscratch, int vec_enc) { 4717 int to_elem_sz = type2aelembytes(to_elem_bt); 4718 assert(to_elem_sz <= 4, ""); 4719 vcvttps2dq(dst, src, vec_enc); 4720 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc); 4721 switch(to_elem_bt) { 4722 case T_INT: 4723 break; 4724 case T_SHORT: 4725 evpmovdw(dst, dst, vec_enc); 4726 break; 4727 case T_BYTE: 4728 evpmovdb(dst, dst, vec_enc); 4729 break; 4730 default: assert(false, "%s", type2name(to_elem_bt)); 4731 } 4732 } 4733 4734 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 4735 KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip, 4736 Register rscratch, int vec_enc) { 4737 evcvttps2qq(dst, src, vec_enc); 4738 vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, 
vec_enc); 4739 } 4740 4741 // Handling for downcasting from double to integer or sub-word types on AVX2. 4742 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4743 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, 4744 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 4745 int to_elem_sz = type2aelembytes(to_elem_bt); 4746 assert(to_elem_sz < 8, ""); 4747 vcvttpd2dq(dst, src, vec_enc); 4748 vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch, 4749 float_sign_flip, vec_enc); 4750 if (to_elem_sz < 4) { 4751 // xtmp4 holds all zero lanes. 4752 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit); 4753 } 4754 } 4755 4756 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, 4757 XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, 4758 KRegister ktmp2, AddressLiteral sign_flip, 4759 Register rscratch, int vec_enc) { 4760 if (VM_Version::supports_avx512dq()) { 4761 evcvttpd2qq(dst, src, vec_enc); 4762 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 4763 switch(to_elem_bt) { 4764 case T_LONG: 4765 break; 4766 case T_INT: 4767 evpmovsqd(dst, dst, vec_enc); 4768 break; 4769 case T_SHORT: 4770 evpmovsqd(dst, dst, vec_enc); 4771 evpmovdw(dst, dst, vec_enc); 4772 break; 4773 case T_BYTE: 4774 evpmovsqd(dst, dst, vec_enc); 4775 evpmovdb(dst, dst, vec_enc); 4776 break; 4777 default: assert(false, "%s", type2name(to_elem_bt)); 4778 } 4779 } else { 4780 assert(type2aelembytes(to_elem_bt) <= 4, ""); 4781 vcvttpd2dq(dst, src, vec_enc); 4782 vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 4783 switch(to_elem_bt) { 4784 case T_INT: 4785 break; 4786 case T_SHORT: 4787 evpmovdw(dst, dst, vec_enc); 4788 break; 4789 case T_BYTE: 4790 evpmovdb(dst, dst, vec_enc); 4791 break; 4792 default: assert(false, "%s", type2name(to_elem_bt)); 4793 } 4794 } 4795 } 4796 4797 #ifdef _LP64 4798 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src, 4799 AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 4800 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 4801 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 4802 // and re-instantiate original MXCSR.RC mode after that. 4803 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 4804 4805 mov64(tmp, julong_cast(0.5L)); 4806 evpbroadcastq(xtmp1, tmp, vec_enc); 4807 vaddpd(xtmp1, src , xtmp1, vec_enc); 4808 evcvtpd2qq(dst, xtmp1, vec_enc); 4809 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 4810 double_sign_flip, vec_enc);; 4811 4812 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 4813 } 4814 4815 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src, 4816 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 4817 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 4818 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 4819 // and re-instantiate original MXCSR.RC mode after that. 
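// Exposition only: the vector rounding sequence below implements Math.round semantics as
// floor(val + 0.5), which is why MXCSR.RC is temporarily switched to round-towards -inf
// for the conversion. A few scalar data points, assuming RC = round-down:
//
//   round(2.3)  -> floor(2.8)  ->  2
//   round(2.5)  -> floor(3.0)  ->  3
//   round(-2.5) -> floor(-2.0) -> -2   // ties round up, matching Java's Math.round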
4820 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 4821 4822 movl(tmp, jint_cast(0.5)); 4823 movq(xtmp1, tmp); 4824 vbroadcastss(xtmp1, xtmp1, vec_enc); 4825 vaddps(xtmp1, src , xtmp1, vec_enc); 4826 vcvtps2dq(dst, xtmp1, vec_enc); 4827 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 4828 float_sign_flip, vec_enc); 4829 4830 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 4831 } 4832 4833 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src, 4834 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 4835 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) { 4836 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 4837 // and re-instantiate original MXCSR.RC mode after that. 4838 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 4839 4840 movl(tmp, jint_cast(0.5)); 4841 movq(xtmp1, tmp); 4842 vbroadcastss(xtmp1, xtmp1, vec_enc); 4843 vaddps(xtmp1, src , xtmp1, vec_enc); 4844 vcvtps2dq(dst, xtmp1, vec_enc); 4845 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc); 4846 4847 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 4848 } 4849 #endif // _LP64 4850 4851 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 4852 BasicType from_elem_bt, BasicType to_elem_bt) { 4853 switch (from_elem_bt) { 4854 case T_BYTE: 4855 switch (to_elem_bt) { 4856 case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break; 4857 case T_INT: vpmovzxbd(dst, src, vlen_enc); break; 4858 case T_LONG: vpmovzxbq(dst, src, vlen_enc); break; 4859 default: ShouldNotReachHere(); 4860 } 4861 break; 4862 case T_SHORT: 4863 switch (to_elem_bt) { 4864 case T_INT: vpmovzxwd(dst, src, vlen_enc); break; 4865 case T_LONG: vpmovzxwq(dst, src, vlen_enc); break; 4866 default: ShouldNotReachHere(); 4867 } 4868 break; 4869 case T_INT: 4870 assert(to_elem_bt == T_LONG, ""); 4871 vpmovzxdq(dst, src, vlen_enc); 4872 break; 4873 default: 4874 ShouldNotReachHere(); 4875 } 4876 } 4877 4878 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 4879 BasicType from_elem_bt, BasicType to_elem_bt) { 4880 switch (from_elem_bt) { 4881 case T_BYTE: 4882 switch (to_elem_bt) { 4883 case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break; 4884 case T_INT: vpmovsxbd(dst, src, vlen_enc); break; 4885 case T_LONG: vpmovsxbq(dst, src, vlen_enc); break; 4886 default: ShouldNotReachHere(); 4887 } 4888 break; 4889 case T_SHORT: 4890 switch (to_elem_bt) { 4891 case T_INT: vpmovsxwd(dst, src, vlen_enc); break; 4892 case T_LONG: vpmovsxwq(dst, src, vlen_enc); break; 4893 default: ShouldNotReachHere(); 4894 } 4895 break; 4896 case T_INT: 4897 assert(to_elem_bt == T_LONG, ""); 4898 vpmovsxdq(dst, src, vlen_enc); 4899 break; 4900 default: 4901 ShouldNotReachHere(); 4902 } 4903 } 4904 4905 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src, 4906 BasicType dst_bt, BasicType src_bt, int vlen) { 4907 int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen); 4908 assert(vlen_enc != AVX_512bit, ""); 4909 4910 int dst_bt_size = type2aelembytes(dst_bt); 4911 int src_bt_size = type2aelembytes(src_bt); 4912 if (dst_bt_size > src_bt_size) { 4913 switch (dst_bt_size / src_bt_size) { 4914 case 2: vpmovsxbw(dst, src, vlen_enc); break; 4915 case 4: vpmovsxbd(dst, src, 
vlen_enc); break; 4916 case 8: vpmovsxbq(dst, src, vlen_enc); break; 4917 default: ShouldNotReachHere(); 4918 } 4919 } else { 4920 assert(dst_bt_size < src_bt_size, ""); 4921 switch (src_bt_size / dst_bt_size) { 4922 case 2: { 4923 if (vlen_enc == AVX_128bit) { 4924 vpacksswb(dst, src, src, vlen_enc); 4925 } else { 4926 vpacksswb(dst, src, src, vlen_enc); 4927 vpermq(dst, dst, 0x08, vlen_enc); 4928 } 4929 break; 4930 } 4931 case 4: { 4932 if (vlen_enc == AVX_128bit) { 4933 vpackssdw(dst, src, src, vlen_enc); 4934 vpacksswb(dst, dst, dst, vlen_enc); 4935 } else { 4936 vpackssdw(dst, src, src, vlen_enc); 4937 vpermq(dst, dst, 0x08, vlen_enc); 4938 vpacksswb(dst, dst, dst, AVX_128bit); 4939 } 4940 break; 4941 } 4942 case 8: { 4943 if (vlen_enc == AVX_128bit) { 4944 vpshufd(dst, src, 0x08, vlen_enc); 4945 vpackssdw(dst, dst, dst, vlen_enc); 4946 vpacksswb(dst, dst, dst, vlen_enc); 4947 } else { 4948 vpshufd(dst, src, 0x08, vlen_enc); 4949 vpermq(dst, dst, 0x08, vlen_enc); 4950 vpackssdw(dst, dst, dst, AVX_128bit); 4951 vpacksswb(dst, dst, dst, AVX_128bit); 4952 } 4953 break; 4954 } 4955 default: ShouldNotReachHere(); 4956 } 4957 } 4958 } 4959 4960 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3, 4961 bool merge, BasicType bt, int vlen_enc) { 4962 if (bt == T_INT) { 4963 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 4964 } else { 4965 assert(bt == T_LONG, ""); 4966 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 4967 } 4968 } 4969 4970 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3, 4971 bool merge, BasicType bt, int vlen_enc) { 4972 if (bt == T_INT) { 4973 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 4974 } else { 4975 assert(bt == T_LONG, ""); 4976 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 4977 } 4978 } 4979 4980 #ifdef _LP64 4981 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1, 4982 Register rtmp2, XMMRegister xtmp, int mask_len, 4983 int vec_enc) { 4984 int index = 0; 4985 int vindex = 0; 4986 mov64(rtmp1, 0x0101010101010101L); 4987 pdepq(rtmp1, src, rtmp1); 4988 if (mask_len > 8) { 4989 movq(rtmp2, src); 4990 vpxor(xtmp, xtmp, xtmp, vec_enc); 4991 movq(xtmp, rtmp1); 4992 } 4993 movq(dst, rtmp1); 4994 4995 mask_len -= 8; 4996 while (mask_len > 0) { 4997 assert ((mask_len & 0x7) == 0, "mask must be multiple of 8"); 4998 index++; 4999 if ((index % 2) == 0) { 5000 pxor(xtmp, xtmp); 5001 } 5002 mov64(rtmp1, 0x0101010101010101L); 5003 shrq(rtmp2, 8); 5004 pdepq(rtmp1, rtmp2, rtmp1); 5005 pinsrq(xtmp, rtmp1, index % 2); 5006 vindex = index / 2; 5007 if (vindex) { 5008 // Write entire 16 byte vector when both 64 bit 5009 // lanes are update to save redundant instructions. 
5010 if (index % 2) { 5011 vinsertf128(dst, dst, xtmp, vindex); 5012 } 5013 } else { 5014 vmovdqu(dst, xtmp); 5015 } 5016 mask_len -= 8; 5017 } 5018 } 5019 5020 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) { 5021 switch(opc) { 5022 case Op_VectorMaskTrueCount: 5023 popcntq(dst, tmp); 5024 break; 5025 case Op_VectorMaskLastTrue: 5026 if (VM_Version::supports_lzcnt()) { 5027 lzcntq(tmp, tmp); 5028 movl(dst, 63); 5029 subl(dst, tmp); 5030 } else { 5031 movl(dst, -1); 5032 bsrq(tmp, tmp); 5033 cmov32(Assembler::notZero, dst, tmp); 5034 } 5035 break; 5036 case Op_VectorMaskFirstTrue: 5037 if (VM_Version::supports_bmi1()) { 5038 if (masklen < 32) { 5039 orl(tmp, 1 << masklen); 5040 tzcntl(dst, tmp); 5041 } else if (masklen == 32) { 5042 tzcntl(dst, tmp); 5043 } else { 5044 assert(masklen == 64, ""); 5045 tzcntq(dst, tmp); 5046 } 5047 } else { 5048 if (masklen < 32) { 5049 orl(tmp, 1 << masklen); 5050 bsfl(dst, tmp); 5051 } else { 5052 assert(masklen == 32 || masklen == 64, ""); 5053 movl(dst, masklen); 5054 if (masklen == 32) { 5055 bsfl(tmp, tmp); 5056 } else { 5057 bsfq(tmp, tmp); 5058 } 5059 cmov32(Assembler::notZero, dst, tmp); 5060 } 5061 } 5062 break; 5063 case Op_VectorMaskToLong: 5064 assert(dst == tmp, "Dst and tmp should be the same for toLong operations"); 5065 break; 5066 default: assert(false, "Unhandled mask operation"); 5067 } 5068 } 5069 5070 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp, 5071 int masklen, int masksize, int vec_enc) { 5072 assert(VM_Version::supports_popcnt(), ""); 5073 5074 if(VM_Version::supports_avx512bw()) { 5075 kmovql(tmp, mask); 5076 } else { 5077 assert(masklen <= 16, ""); 5078 kmovwl(tmp, mask); 5079 } 5080 5081 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5082 // operations needs to be clipped. 5083 if (masksize < 16 && opc != Op_VectorMaskFirstTrue) { 5084 andq(tmp, (1 << masklen) - 1); 5085 } 5086 5087 vector_mask_operation_helper(opc, dst, tmp, masklen); 5088 } 5089 5090 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp, 5091 Register tmp, int masklen, BasicType bt, int vec_enc) { 5092 assert(vec_enc == AVX_128bit && VM_Version::supports_avx() || 5093 vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4), ""); 5094 assert(VM_Version::supports_popcnt(), ""); 5095 5096 bool need_clip = false; 5097 switch(bt) { 5098 case T_BOOLEAN: 5099 // While masks of other types contain 0, -1; boolean masks contain lane values of 0, 1 5100 vpxor(xtmp, xtmp, xtmp, vec_enc); 5101 vpsubb(xtmp, xtmp, mask, vec_enc); 5102 vpmovmskb(tmp, xtmp, vec_enc); 5103 need_clip = masklen < 16; 5104 break; 5105 case T_BYTE: 5106 vpmovmskb(tmp, mask, vec_enc); 5107 need_clip = masklen < 16; 5108 break; 5109 case T_SHORT: 5110 vpacksswb(xtmp, mask, mask, vec_enc); 5111 if (masklen >= 16) { 5112 vpermpd(xtmp, xtmp, 8, vec_enc); 5113 } 5114 vpmovmskb(tmp, xtmp, Assembler::AVX_128bit); 5115 need_clip = masklen < 16; 5116 break; 5117 case T_INT: 5118 case T_FLOAT: 5119 vmovmskps(tmp, mask, vec_enc); 5120 need_clip = masklen < 4; 5121 break; 5122 case T_LONG: 5123 case T_DOUBLE: 5124 vmovmskpd(tmp, mask, vec_enc); 5125 need_clip = masklen < 2; 5126 break; 5127 default: assert(false, "Unhandled type, %s", type2name(bt)); 5128 } 5129 5130 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5131 // operations needs to be clipped. 
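// Exposition only: for a partial vector the bits of tmp above `masklen` may be stale after
// the vpmovmskb/vmovmskps/vmovmskpd above, so they are clipped before the popcnt/bsf/bsr in
// vector_mask_operation_helper, e.g. for masklen == 4:
//
//   andq(tmp, (1 << 4) - 1);   // keep only mask bits 0..3
//
// Op_VectorMaskFirstTrue can skip the clip (for masklen < 32) because the helper ORs a
// sentinel bit at position `masklen` before the tzcnt/bsf, which bounds the result.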
5132 if (need_clip && opc != Op_VectorMaskFirstTrue) { 5133 // need_clip implies masklen < 32 5134 andq(tmp, (1 << masklen) - 1); 5135 } 5136 5137 vector_mask_operation_helper(opc, dst, tmp, masklen); 5138 } 5139 5140 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1, 5141 Register rtmp2, int mask_len) { 5142 kmov(rtmp1, src); 5143 andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len))); 5144 mov64(rtmp2, -1L); 5145 pextq(rtmp2, rtmp2, rtmp1); 5146 kmov(dst, rtmp2); 5147 } 5148 5149 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask, 5150 bool merge, BasicType bt, int vec_enc) { 5151 if (opcode == Op_CompressV) { 5152 switch(bt) { 5153 case T_BYTE: 5154 evpcompressb(dst, mask, src, merge, vec_enc); 5155 break; 5156 case T_CHAR: 5157 case T_SHORT: 5158 evpcompressw(dst, mask, src, merge, vec_enc); 5159 break; 5160 case T_INT: 5161 evpcompressd(dst, mask, src, merge, vec_enc); 5162 break; 5163 case T_FLOAT: 5164 evcompressps(dst, mask, src, merge, vec_enc); 5165 break; 5166 case T_LONG: 5167 evpcompressq(dst, mask, src, merge, vec_enc); 5168 break; 5169 case T_DOUBLE: 5170 evcompresspd(dst, mask, src, merge, vec_enc); 5171 break; 5172 default: 5173 fatal("Unsupported type %s", type2name(bt)); 5174 break; 5175 } 5176 } else { 5177 assert(opcode == Op_ExpandV, ""); 5178 switch(bt) { 5179 case T_BYTE: 5180 evpexpandb(dst, mask, src, merge, vec_enc); 5181 break; 5182 case T_CHAR: 5183 case T_SHORT: 5184 evpexpandw(dst, mask, src, merge, vec_enc); 5185 break; 5186 case T_INT: 5187 evpexpandd(dst, mask, src, merge, vec_enc); 5188 break; 5189 case T_FLOAT: 5190 evexpandps(dst, mask, src, merge, vec_enc); 5191 break; 5192 case T_LONG: 5193 evpexpandq(dst, mask, src, merge, vec_enc); 5194 break; 5195 case T_DOUBLE: 5196 evexpandpd(dst, mask, src, merge, vec_enc); 5197 break; 5198 default: 5199 fatal("Unsupported type %s", type2name(bt)); 5200 break; 5201 } 5202 } 5203 } 5204 #endif 5205 5206 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5207 KRegister ktmp1, int vec_enc) { 5208 if (opcode == Op_SignumVD) { 5209 vsubpd(dst, zero, one, vec_enc); 5210 // if src < 0 ? -1 : 1 5211 evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5212 evblendmpd(dst, ktmp1, one, dst, true, vec_enc); 5213 // if src == NaN, -0.0 or 0.0 return src. 5214 evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5215 evblendmpd(dst, ktmp1, dst, src, true, vec_enc); 5216 } else { 5217 assert(opcode == Op_SignumVF, ""); 5218 vsubps(dst, zero, one, vec_enc); 5219 // if src < 0 ? -1 : 1 5220 evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5221 evblendmps(dst, ktmp1, one, dst, true, vec_enc); 5222 // if src == NaN, -0.0 or 0.0 return src. 5223 evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5224 evblendmps(dst, ktmp1, dst, src, true, vec_enc); 5225 } 5226 } 5227 5228 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5229 XMMRegister xtmp1, int vec_enc) { 5230 if (opcode == Op_SignumVD) { 5231 vsubpd(dst, zero, one, vec_enc); 5232 // if src < 0 ? -1 : 1 5233 vblendvpd(dst, one, dst, src, vec_enc); 5234 // if src == NaN, -0.0 or 0.0 return src. 5235 vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc); 5236 vblendvpd(dst, dst, src, xtmp1, vec_enc); 5237 } else { 5238 assert(opcode == Op_SignumVF, ""); 5239 vsubps(dst, zero, one, vec_enc); 5240 // if src < 0 ? 
-1 : 1 5241 vblendvps(dst, one, dst, src, vec_enc); 5242 // if src == NaN, -0.0 or 0.0 return src. 5243 vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc); 5244 vblendvps(dst, dst, src, xtmp1, vec_enc); 5245 } 5246 } 5247 5248 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) { 5249 if (VM_Version::supports_avx512bw()) { 5250 if (mask_len > 32) { 5251 kmovql(dst, src); 5252 } else { 5253 kmovdl(dst, src); 5254 if (mask_len != 32) { 5255 kshiftrdl(dst, dst, 32 - mask_len); 5256 } 5257 } 5258 } else { 5259 assert(mask_len <= 16, ""); 5260 kmovwl(dst, src); 5261 if (mask_len != 16) { 5262 kshiftrwl(dst, dst, 16 - mask_len); 5263 } 5264 } 5265 } 5266 5267 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) { 5268 int lane_size = type2aelembytes(bt); 5269 bool is_LP64 = LP64_ONLY(true) NOT_LP64(false); 5270 if ((is_LP64 || lane_size < 8) && 5271 ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) || 5272 (is_subword_type(bt) && VM_Version::supports_avx512vlbw()))) { 5273 movptr(rtmp, imm32); 5274 switch(lane_size) { 5275 case 1 : evpbroadcastb(dst, rtmp, vec_enc); break; 5276 case 2 : evpbroadcastw(dst, rtmp, vec_enc); break; 5277 case 4 : evpbroadcastd(dst, rtmp, vec_enc); break; 5278 case 8 : evpbroadcastq(dst, rtmp, vec_enc); break; 5279 default : fatal("Unsupported lane size %d", lane_size); 5280 break; 5281 } 5282 } else { 5283 movptr(rtmp, imm32); 5284 LP64_ONLY(movq(dst, rtmp)) NOT_LP64(movdl(dst, rtmp)); 5285 switch(lane_size) { 5286 case 1 : vpbroadcastb(dst, dst, vec_enc); break; 5287 case 2 : vpbroadcastw(dst, dst, vec_enc); break; 5288 case 4 : vpbroadcastd(dst, dst, vec_enc); break; 5289 case 8 : vpbroadcastq(dst, dst, vec_enc); break; 5290 default : fatal("Unsupported lane size %d", lane_size); 5291 break; 5292 } 5293 } 5294 } 5295 5296 // 5297 // Following is lookup table based popcount computation algorithm:- 5298 // Index Bit set count 5299 // [ 0000 -> 0, 5300 // 0001 -> 1, 5301 // 0010 -> 1, 5302 // 0011 -> 2, 5303 // 0100 -> 1, 5304 // 0101 -> 2, 5305 // 0110 -> 2, 5306 // 0111 -> 3, 5307 // 1000 -> 1, 5308 // 1001 -> 2, 5309 // 1010 -> 2, 5310 // 1011 -> 3, 5311 // 1100 -> 2, 5312 // 1101 -> 3, // 1110 -> 3, 5313 // 1111 -> 4 ] 5314 // a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as 5315 // shuffle indices for lookup table access. 5316 // b. Right shift each byte of vector lane by 4 positions. 5317 // c. Count the number of 1s in 4 MSB bits of each byte. These bits are used as 5318 // shuffle indices for lookup table access. 5319 // d. Add the bitset count of upper and lower 4 bits of each byte. 5320 // e. Unpack double words to quad words and compute sum of absolute difference of bitset 5321 // count of all the bytes of a quadword. 5322 // f. Perform step e. for upper 128bit vector lane. 5323 // g. Pack the bitset count of quadwords back to double word. 5324 // h. Unpacking and packing operations are not needed for 64bit vector lane.
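// Exposition only -- a scalar equivalent of the lookup-table popcount used below
// (assumes <cstdint>; not emitted code):
//
//   static const uint8_t POPCNT_LUT[16] = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};
//   uint8_t popcount_byte(uint8_t b) {
//     return POPCNT_LUT[b & 0x0F] + POPCNT_LUT[b >> 4];   // steps a-d above
//   }
//
// vpshufb performs 16 such table lookups per 128-bit lane in parallel, with the LUT
// replicated across each lane as the shuffle source.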
5325 5326 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5327 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5328 assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), ""); 5329 vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc); 5330 vpsrlw(dst, src, 4, vec_enc); 5331 vpand(dst, dst, xtmp1, vec_enc); 5332 vpand(xtmp1, src, xtmp1, vec_enc); 5333 vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg); 5334 vpshufb(xtmp1, xtmp2, xtmp1, vec_enc); 5335 vpshufb(dst, xtmp2, dst, vec_enc); 5336 vpaddb(dst, dst, xtmp1, vec_enc); 5337 } 5338 5339 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5340 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5341 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5342 // Following code is as per steps e,f,g and h of above algorithm. 5343 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5344 vpunpckhdq(dst, xtmp1, xtmp2, vec_enc); 5345 vpsadbw(dst, dst, xtmp2, vec_enc); 5346 vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc); 5347 vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc); 5348 vpackuswb(dst, xtmp1, dst, vec_enc); 5349 } 5350 5351 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5352 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5353 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5354 // Add the popcount of upper and lower bytes of word. 5355 vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc); 5356 vpsrlw(dst, xtmp1, 8, vec_enc); 5357 vpand(xtmp1, xtmp1, xtmp2, vec_enc); 5358 vpaddw(dst, dst, xtmp1, vec_enc); 5359 } 5360 5361 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5362 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5363 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5364 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5365 vpsadbw(dst, xtmp1, xtmp2, vec_enc); 5366 } 5367 5368 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5369 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5370 switch(bt) { 5371 case T_LONG: 5372 vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5373 break; 5374 case T_INT: 5375 vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5376 break; 5377 case T_CHAR: 5378 case T_SHORT: 5379 vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5380 break; 5381 case T_BYTE: 5382 case T_BOOLEAN: 5383 vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5384 break; 5385 default: 5386 fatal("Unsupported type %s", type2name(bt)); 5387 break; 5388 } 5389 } 5390 5391 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src, 5392 KRegister mask, bool merge, int vec_enc) { 5393 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 5394 switch(bt) { 5395 case T_LONG: 5396 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 5397 evpopcntq(dst, mask, src, merge, vec_enc); 5398 break; 5399 case T_INT: 5400 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 5401 evpopcntd(dst, mask, src, merge, vec_enc); 5402 break; 5403 case T_CHAR: 5404 case T_SHORT: 5405 assert(VM_Version::supports_avx512_bitalg(), ""); 5406 evpopcntw(dst, mask, src, merge, vec_enc); 5407 break; 5408 case T_BYTE: 5409 case T_BOOLEAN: 5410 assert(VM_Version::supports_avx512_bitalg(), ""); 5411 evpopcntb(dst, mask, 
src, merge, vec_enc); 5412 break; 5413 default: 5414 fatal("Unsupported type %s", type2name(bt)); 5415 break; 5416 } 5417 } 5418 5419 #ifndef _LP64 5420 void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) { 5421 assert(VM_Version::supports_avx512bw(), ""); 5422 kmovdl(tmp, src); 5423 kunpckdql(dst, tmp, tmp); 5424 } 5425 #endif 5426 5427 // Bit reversal algorithm first reverses the bits of each byte followed by 5428 // a byte level reversal for multi-byte primitive types (short/int/long). 5429 // Algorithm performs a lookup table access to get reverse bit sequence 5430 // corresponding to a 4 bit value. Thus a reverse bit sequence for a byte 5431 // is obtained by swapping the reverse bit sequences of upper and lower 5432 // nibble of a byte. 5433 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5434 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5435 if (VM_Version::supports_avx512vlbw()) { 5436 5437 // Get the reverse bit sequence of lower nibble of each byte. 5438 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg); 5439 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 5440 evpandq(dst, xtmp2, src, vec_enc); 5441 vpshufb(dst, xtmp1, dst, vec_enc); 5442 vpsllq(dst, dst, 4, vec_enc); 5443 5444 // Get the reverse bit sequence of upper nibble of each byte. 5445 vpandn(xtmp2, xtmp2, src, vec_enc); 5446 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 5447 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 5448 5449 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 5450 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 5451 evporq(xtmp2, dst, xtmp2, vec_enc); 5452 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 5453 5454 } else if(vec_enc == Assembler::AVX_512bit) { 5455 // Shift based bit reversal. 5456 assert(bt == T_LONG || bt == T_INT, ""); 5457 5458 // Swap lower and upper nibble of each byte. 5459 vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc); 5460 5461 // Swap two least and most significant bits of each nibble. 5462 vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc); 5463 5464 // Swap adjacent pair of bits. 5465 evmovdqul(xtmp1, k0, dst, true, vec_enc); 5466 vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc); 5467 5468 evmovdqul(xtmp1, k0, dst, true, vec_enc); 5469 vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc); 5470 } else { 5471 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp); 5472 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 5473 5474 // Get the reverse bit sequence of lower nibble of each byte. 5475 vpand(dst, xtmp2, src, vec_enc); 5476 vpshufb(dst, xtmp1, dst, vec_enc); 5477 vpsllq(dst, dst, 4, vec_enc); 5478 5479 // Get the reverse bit sequence of upper nibble of each byte. 5480 vpandn(xtmp2, xtmp2, src, vec_enc); 5481 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 5482 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 5483 5484 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 5485 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 
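// Exposition only -- worked example for one byte on the AVX path below: src = 0b1011'0001
//   lower nibble 0001 -> LUT reverse 1000, shifted left by 4    -> 1000'0000
//   upper nibble 1011 -> LUT reverse 1101, placed in low nibble -> 0000'1101
//   vpor of the two halves                                      -> 1000'1101
// i.e. 0b1011'0001 with its bit order reversed.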
5486 vpor(xtmp2, dst, xtmp2, vec_enc); 5487 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 5488 } 5489 } 5490 5491 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc, 5492 XMMRegister xtmp, Register rscratch) { 5493 assert(VM_Version::supports_gfni(), ""); 5494 assert(rscratch != noreg || always_reachable(mask), "missing"); 5495 5496 // Galois field instruction based bit reversal based on following algorithm. 5497 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 5498 vpbroadcastq(xtmp, mask, vec_enc, rscratch); 5499 vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc); 5500 vector_reverse_byte(bt, dst, xtmp, vec_enc); 5501 } 5502 5503 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src, 5504 XMMRegister xtmp1, Register rtmp, int vec_enc) { 5505 vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc); 5506 evpandq(dst, xtmp1, src, vec_enc); 5507 vpsllq(dst, dst, nbits, vec_enc); 5508 vpandn(xtmp1, xtmp1, src, vec_enc); 5509 vpsrlq(xtmp1, xtmp1, nbits, vec_enc); 5510 evporq(dst, dst, xtmp1, vec_enc); 5511 } 5512 5513 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5514 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5515 // Shift based bit reversal. 5516 assert(VM_Version::supports_evex(), ""); 5517 switch(bt) { 5518 case T_LONG: 5519 // Swap upper and lower double word of each quad word. 5520 evprorq(xtmp1, k0, src, 32, true, vec_enc); 5521 evprord(xtmp1, k0, xtmp1, 16, true, vec_enc); 5522 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 5523 break; 5524 case T_INT: 5525 // Swap upper and lower word of each double word. 5526 evprord(xtmp1, k0, src, 16, true, vec_enc); 5527 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 5528 break; 5529 case T_CHAR: 5530 case T_SHORT: 5531 // Swap upper and lower byte of each word. 5532 vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc); 5533 break; 5534 case T_BYTE: 5535 evmovdquq(dst, k0, src, true, vec_enc); 5536 break; 5537 default: 5538 fatal("Unsupported type %s", type2name(bt)); 5539 break; 5540 } 5541 } 5542 5543 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) { 5544 if (bt == T_BYTE) { 5545 if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) { 5546 evmovdquq(dst, k0, src, true, vec_enc); 5547 } else { 5548 vmovdqu(dst, src); 5549 } 5550 return; 5551 } 5552 // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using 5553 // pre-computed shuffle indices. 
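// Exposition only: the permutation masks are assumed to hold, per 128-bit lane, indices such as
//   T_INT : { 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12 }
// so that the vpshufb below swaps the bytes inside every element (a 2-byte swap for T_SHORT,
// an 8-byte swap for T_LONG). Layout shown for illustration only; the authoritative tables
// live in StubRoutines::x86.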
5554 switch(bt) { 5555 case T_LONG: 5556 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg); 5557 break; 5558 case T_INT: 5559 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg); 5560 break; 5561 case T_CHAR: 5562 case T_SHORT: 5563 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg); 5564 break; 5565 default: 5566 fatal("Unsupported type %s", type2name(bt)); 5567 break; 5568 } 5569 vpshufb(dst, src, dst, vec_enc); 5570 } 5571 5572 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src, 5573 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 5574 KRegister ktmp, Register rtmp, bool merge, int vec_enc) { 5575 assert(is_integral_type(bt), ""); 5576 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 5577 assert(VM_Version::supports_avx512cd(), ""); 5578 switch(bt) { 5579 case T_LONG: 5580 evplzcntq(dst, ktmp, src, merge, vec_enc); 5581 break; 5582 case T_INT: 5583 evplzcntd(dst, ktmp, src, merge, vec_enc); 5584 break; 5585 case T_SHORT: 5586 vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc); 5587 vpunpcklwd(xtmp2, xtmp1, src, vec_enc); 5588 evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc); 5589 vpunpckhwd(dst, xtmp1, src, vec_enc); 5590 evplzcntd(dst, ktmp, dst, merge, vec_enc); 5591 vpackusdw(dst, xtmp2, dst, vec_enc); 5592 break; 5593 case T_BYTE: 5594 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 5595 // accessing the lookup table. 5596 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 5597 // accessing the lookup table. 5598 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 5599 assert(VM_Version::supports_avx512bw(), ""); 5600 evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp); 5601 vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc); 5602 vpand(xtmp2, dst, src, vec_enc); 5603 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 5604 vpsrlw(xtmp3, src, 4, vec_enc); 5605 vpand(xtmp3, dst, xtmp3, vec_enc); 5606 vpshufb(dst, xtmp1, xtmp3, vec_enc); 5607 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 5608 evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc); 5609 evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc); 5610 break; 5611 default: 5612 fatal("Unsupported type %s", type2name(bt)); 5613 break; 5614 } 5615 } 5616 5617 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5618 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 5619 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp); 5620 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 5621 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 5622 // accessing the lookup table. 5623 vpand(dst, xtmp2, src, vec_enc); 5624 vpshufb(dst, xtmp1, dst, vec_enc); 5625 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 5626 // accessing the lookup table. 5627 vpsrlw(xtmp3, src, 4, vec_enc); 5628 vpand(xtmp3, xtmp2, xtmp3, vec_enc); 5629 vpshufb(xtmp2, xtmp1, xtmp3, vec_enc); 5630 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 
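// Exposition only -- worked example for one byte of the AVX lzcnt path below: src = 0x0A (0000'1010)
//   T2 = clz_lut[high nibble 0000] = 4, T1 = clz_lut[low nibble 1010] = 0
//   high nibble is zero, so dst = T1 + T2 = 4, i.e. an 8-bit clz of 4.
// For src = 0x4A the high nibble 0100 is non-zero, so dst = T2 = clz_lut[0100] = 1.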
5631 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 5632 vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc); 5633 vpaddb(dst, dst, xtmp2, vec_enc); 5634 vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc); 5635 } 5636 5637 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5638 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 5639 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 5640 // Add zero counts of lower byte and upper byte of a word if 5641 // upper byte holds a zero value. 5642 vpsrlw(xtmp3, src, 8, vec_enc); 5643 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx. 5644 vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc); 5645 vpsllw(xtmp2, dst, 8, vec_enc); 5646 vpaddw(xtmp2, xtmp2, dst, vec_enc); 5647 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 5648 vpsrlw(dst, dst, 8, vec_enc); 5649 } 5650 5651 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5652 XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) { 5653 // Since IEEE 754 floating point format represents mantissa in 1.0 format 5654 // hence biased exponent can be used to compute leading zero count as per 5655 // following formula:- 5656 // LZCNT = 32 - (biased_exp - 127) 5657 // Special handling has been introduced for Zero, Max_Int and -ve source values. 5658 5659 // Broadcast 0xFF 5660 vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc); 5661 vpsrld(xtmp1, xtmp1, 24, vec_enc); 5662 5663 // Extract biased exponent. 5664 vcvtdq2ps(dst, src, vec_enc); 5665 vpsrld(dst, dst, 23, vec_enc); 5666 vpand(dst, dst, xtmp1, vec_enc); 5667 5668 // Broadcast 127. 5669 vpsrld(xtmp1, xtmp1, 1, vec_enc); 5670 // Exponent = biased_exp - 127 5671 vpsubd(dst, dst, xtmp1, vec_enc); 5672 5673 // Exponent = Exponent + 1 5674 vpsrld(xtmp3, xtmp1, 6, vec_enc); 5675 vpaddd(dst, dst, xtmp3, vec_enc); 5676 5677 // Replace -ve exponent with zero, exponent is -ve when src 5678 // lane contains a zero value. 5679 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5680 vblendvps(dst, dst, xtmp2, dst, vec_enc); 5681 5682 // Rematerialize broadcast 32. 5683 vpslld(xtmp1, xtmp3, 5, vec_enc); 5684 // Exponent is 32 if corresponding source lane contains max_int value. 5685 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 5686 // LZCNT = 32 - exponent 5687 vpsubd(dst, xtmp1, dst, vec_enc); 5688 5689 // Replace LZCNT with a value 1 if corresponding source lane 5690 // contains max_int value. 5691 vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc); 5692 5693 // Replace biased_exp with 0 if source lane value is less than zero. 5694 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5695 vblendvps(dst, dst, xtmp2, src, vec_enc); 5696 } 5697 5698 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5699 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 5700 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 5701 // Add zero counts of lower word and upper word of a double word if 5702 // upper word holds a zero value. 5703 vpsrld(xtmp3, src, 16, vec_enc); 5704 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx. 5705 vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc); 5706 vpslld(xtmp2, dst, 16, vec_enc); 5707 vpaddd(xtmp2, xtmp2, dst, vec_enc); 5708 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 5709 vpsrld(dst, dst, 16, vec_enc); 5710 // Add zero counts of lower doubleword and upper doubleword of a 5711 // quadword if upper doubleword holds a zero value. 
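// Exposition only: at this point dst holds a leading zero count per doubleword; the
// compare/blend/shift sequence below combines them per quadword as
//   clz64(q) = (upper_dword(q) == 0) ? 32 + clz32(lower_dword(q)) : clz32(upper_dword(q))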
5712 vpsrlq(xtmp3, src, 32, vec_enc); 5713 vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc); 5714 vpsllq(xtmp2, dst, 32, vec_enc); 5715 vpaddq(xtmp2, xtmp2, dst, vec_enc); 5716 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 5717 vpsrlq(dst, dst, 32, vec_enc); 5718 } 5719 5720 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, 5721 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 5722 Register rtmp, int vec_enc) { 5723 assert(is_integral_type(bt), "unexpected type"); 5724 assert(vec_enc < Assembler::AVX_512bit, ""); 5725 switch(bt) { 5726 case T_LONG: 5727 vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 5728 break; 5729 case T_INT: 5730 vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc); 5731 break; 5732 case T_SHORT: 5733 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 5734 break; 5735 case T_BYTE: 5736 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 5737 break; 5738 default: 5739 fatal("Unsupported type %s", type2name(bt)); 5740 break; 5741 } 5742 } 5743 5744 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) { 5745 switch(bt) { 5746 case T_BYTE: 5747 vpsubb(dst, src1, src2, vec_enc); 5748 break; 5749 case T_SHORT: 5750 vpsubw(dst, src1, src2, vec_enc); 5751 break; 5752 case T_INT: 5753 vpsubd(dst, src1, src2, vec_enc); 5754 break; 5755 case T_LONG: 5756 vpsubq(dst, src1, src2, vec_enc); 5757 break; 5758 default: 5759 fatal("Unsupported type %s", type2name(bt)); 5760 break; 5761 } 5762 } 5763 5764 // Trailing zero count computation is based on leading zero count operation as per 5765 // following equation. All AVX3 targets support AVX512CD feature which offers 5766 // direct vector instruction to compute leading zero count. 
5767 // CTZ = PRIM_TYPE_WIDTH - CLZ((x - 1) & ~x) 5768 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src, 5769 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 5770 XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) { 5771 assert(is_integral_type(bt), ""); 5772 // xtmp = -1 5773 vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc); 5774 // xtmp = xtmp + src 5775 vpadd(bt, xtmp4, xtmp4, src, vec_enc); 5776 // xtmp = xtmp & ~src 5777 vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc); 5778 vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc); 5779 vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc); 5780 vpsub(bt, dst, xtmp4, dst, vec_enc); 5781 } 5782 5783 // Trailing zero count computation for AVX2 targets is based on popcount operation as per the following equation: 5784 // CTZ = PRIM_TYPE_WIDTH - POPC(x | -x) 5785 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5786 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 5787 assert(is_integral_type(bt), ""); 5788 // xtmp = 0 5789 vpxor(xtmp3, xtmp3, xtmp3, vec_enc); 5790 // xtmp = 0 - src 5791 vpsub(bt, xtmp3, xtmp3, src, vec_enc); 5792 // xtmp = xtmp | src 5793 vpor(xtmp3, xtmp3, src, vec_enc); 5794 vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc); 5795 vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc); 5796 vpsub(bt, dst, xtmp1, dst, vec_enc); 5797 } 5798 5799 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) { 5800 Label done; 5801 Label neg_divisor_fastpath; 5802 cmpl(divisor, 0); 5803 jccb(Assembler::less, neg_divisor_fastpath); 5804 xorl(rdx, rdx); 5805 divl(divisor); 5806 jmpb(done); 5807 bind(neg_divisor_fastpath); 5808 // Fastpath for divisor < 0: 5809 // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1) 5810 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned() 5811 movl(rdx, rax); 5812 subl(rdx, divisor); 5813 if (VM_Version::supports_bmi1()) { 5814 andnl(rax, rdx, rax); 5815 } else { 5816 notl(rdx); 5817 andl(rax, rdx); 5818 } 5819 shrl(rax, 31); 5820 bind(done); 5821 } 5822 5823 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) { 5824 Label done; 5825 Label neg_divisor_fastpath; 5826 cmpl(divisor, 0); 5827 jccb(Assembler::less, neg_divisor_fastpath); 5828 xorl(rdx, rdx); 5829 divl(divisor); 5830 jmpb(done); 5831 bind(neg_divisor_fastpath); 5832 // Fastpath when divisor < 0: 5833 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor) 5834 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned() 5835 movl(rdx, rax); 5836 subl(rax, divisor); 5837 if (VM_Version::supports_bmi1()) { 5838 andnl(rax, rax, rdx); 5839 } else { 5840 notl(rax); 5841 andl(rax, rdx); 5842 } 5843 sarl(rax, 31); 5844 andl(rax, divisor); 5845 subl(rdx, rax); 5846 bind(done); 5847 } 5848 5849 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) { 5850 Label done; 5851 Label neg_divisor_fastpath; 5852 5853 cmpl(divisor, 0); 5854 jccb(Assembler::less, neg_divisor_fastpath); 5855 xorl(rdx, rdx); 5856 divl(divisor); 5857 jmpb(done); 5858 bind(neg_divisor_fastpath); 5859 // Fastpath for divisor < 0: 5860 // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1) 5861 //
remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor) 5862 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in 5863 // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned() 5864 movl(rdx, rax); 5865 subl(rax, divisor); 5866 if (VM_Version::supports_bmi1()) { 5867 andnl(rax, rax, rdx); 5868 } else { 5869 notl(rax); 5870 andl(rax, rdx); 5871 } 5872 movl(tmp, rax); 5873 shrl(rax, 31); // quotient 5874 sarl(tmp, 31); 5875 andl(tmp, divisor); 5876 subl(rdx, tmp); // remainder 5877 bind(done); 5878 } 5879 5880 #ifdef _LP64 5881 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1, 5882 XMMRegister xtmp2, Register rtmp) { 5883 if(VM_Version::supports_gfni()) { 5884 // Galois field instruction based bit reversal based on following algorithm. 5885 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 5886 mov64(rtmp, 0x8040201008040201L); 5887 movq(xtmp1, src); 5888 movq(xtmp2, rtmp); 5889 gf2p8affineqb(xtmp1, xtmp2, 0); 5890 movq(dst, xtmp1); 5891 } else { 5892 // Swap even and odd numbered bits. 5893 movl(rtmp, src); 5894 andl(rtmp, 0x55555555); 5895 shll(rtmp, 1); 5896 movl(dst, src); 5897 andl(dst, 0xAAAAAAAA); 5898 shrl(dst, 1); 5899 orl(dst, rtmp); 5900 5901 // Swap LSB and MSB 2 bits of each nibble. 5902 movl(rtmp, dst); 5903 andl(rtmp, 0x33333333); 5904 shll(rtmp, 2); 5905 andl(dst, 0xCCCCCCCC); 5906 shrl(dst, 2); 5907 orl(dst, rtmp); 5908 5909 // Swap LSB and MSB 4 bits of each byte. 5910 movl(rtmp, dst); 5911 andl(rtmp, 0x0F0F0F0F); 5912 shll(rtmp, 4); 5913 andl(dst, 0xF0F0F0F0); 5914 shrl(dst, 4); 5915 orl(dst, rtmp); 5916 } 5917 bswapl(dst); 5918 } 5919 5920 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1, 5921 XMMRegister xtmp2, Register rtmp1, Register rtmp2) { 5922 if(VM_Version::supports_gfni()) { 5923 // Galois field instruction based bit reversal based on following algorithm. 5924 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 5925 mov64(rtmp1, 0x8040201008040201L); 5926 movq(xtmp1, src); 5927 movq(xtmp2, rtmp1); 5928 gf2p8affineqb(xtmp1, xtmp2, 0); 5929 movq(dst, xtmp1); 5930 } else { 5931 // Swap even and odd numbered bits. 5932 movq(rtmp1, src); 5933 mov64(rtmp2, 0x5555555555555555L); 5934 andq(rtmp1, rtmp2); 5935 shlq(rtmp1, 1); 5936 movq(dst, src); 5937 notq(rtmp2); 5938 andq(dst, rtmp2); 5939 shrq(dst, 1); 5940 orq(dst, rtmp1); 5941 5942 // Swap LSB and MSB 2 bits of each nibble. 5943 movq(rtmp1, dst); 5944 mov64(rtmp2, 0x3333333333333333L); 5945 andq(rtmp1, rtmp2); 5946 shlq(rtmp1, 2); 5947 notq(rtmp2); 5948 andq(dst, rtmp2); 5949 shrq(dst, 2); 5950 orq(dst, rtmp1); 5951 5952 // Swap LSB and MSB 4 bits of each byte. 
5953 movq(rtmp1, dst); 5954 mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL); 5955 andq(rtmp1, rtmp2); 5956 shlq(rtmp1, 4); 5957 notq(rtmp2); 5958 andq(dst, rtmp2); 5959 shrq(dst, 4); 5960 orq(dst, rtmp1); 5961 } 5962 bswapq(dst); 5963 } 5964 5965 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) { 5966 Label done; 5967 Label neg_divisor_fastpath; 5968 cmpq(divisor, 0); 5969 jccb(Assembler::less, neg_divisor_fastpath); 5970 xorl(rdx, rdx); 5971 divq(divisor); 5972 jmpb(done); 5973 bind(neg_divisor_fastpath); 5974 // Fastpath for divisor < 0: 5975 // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1) 5976 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned() 5977 movq(rdx, rax); 5978 subq(rdx, divisor); 5979 if (VM_Version::supports_bmi1()) { 5980 andnq(rax, rdx, rax); 5981 } else { 5982 notq(rdx); 5983 andq(rax, rdx); 5984 } 5985 shrq(rax, 63); 5986 bind(done); 5987 } 5988 5989 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) { 5990 Label done; 5991 Label neg_divisor_fastpath; 5992 cmpq(divisor, 0); 5993 jccb(Assembler::less, neg_divisor_fastpath); 5994 xorq(rdx, rdx); 5995 divq(divisor); 5996 jmp(done); 5997 bind(neg_divisor_fastpath); 5998 // Fastpath when divisor < 0: 5999 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor) 6000 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned() 6001 movq(rdx, rax); 6002 subq(rax, divisor); 6003 if (VM_Version::supports_bmi1()) { 6004 andnq(rax, rax, rdx); 6005 } else { 6006 notq(rax); 6007 andq(rax, rdx); 6008 } 6009 sarq(rax, 63); 6010 andq(rax, divisor); 6011 subq(rdx, rax); 6012 bind(done); 6013 } 6014 6015 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) { 6016 Label done; 6017 Label neg_divisor_fastpath; 6018 cmpq(divisor, 0); 6019 jccb(Assembler::less, neg_divisor_fastpath); 6020 xorq(rdx, rdx); 6021 divq(divisor); 6022 jmp(done); 6023 bind(neg_divisor_fastpath); 6024 // Fastpath for divisor < 0: 6025 // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1) 6026 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor) 6027 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in 6028 // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned() 6029 movq(rdx, rax); 6030 subq(rax, divisor); 6031 if (VM_Version::supports_bmi1()) { 6032 andnq(rax, rax, rdx); 6033 } else { 6034 notq(rax); 6035 andq(rax, rdx); 6036 } 6037 movq(tmp, rax); 6038 shrq(rax, 63); // quotient 6039 sarq(tmp, 63); 6040 andq(tmp, divisor); 6041 subq(rdx, tmp); // remainder 6042 bind(done); 6043 } 6044 #endif 6045 6046 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1, 6047 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp, 6048 int vlen_enc) { 6049 assert(VM_Version::supports_avx512bw(), ""); 6050 // Byte shuffles are inlane operations and indices are determined using 6051 // lower 4 bit of each shuffle lane, thus all shuffle indices are 6052 // normalized to index range 0-15. This makes sure that all the multiples 6053 // of an index value are placed at same relative position in 128 bit 6054 // lane i.e. elements corresponding to shuffle indices 16, 32 and 64 6055 // will be 16th element in their respective 128 bit lanes. 
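// Exposition only: a 512-bit byte shuffle needs 6-bit indices, but vpshufb only honours the
// low 4 bits within each 128-bit lane. The four masked shuffle steps below therefore handle
// one source lane each; step k broadcasts source lane k and commits only destination bytes
// whose shuffle index i satisfies 16*k <= i < 16*(k+1). E.g. index 37 (= 2*16 + 5) is
// committed in step k == 2 and selects byte 5 of source lane 2.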
6056 movl(rtmp, 16); 6057 evpbroadcastb(xtmp1, rtmp, vlen_enc); 6058 6059 // Compute a mask for shuffle vector by comparing indices with expression INDEX < 16. 6060 // Broadcast first 128 bit lane across entire vector, shuffle the vector lanes using 6061 // original shuffle indices and move the shuffled lanes corresponding to true 6062 // mask to destination vector. 6063 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc); 6064 evshufi64x2(xtmp2, src, src, 0x0, vlen_enc); 6065 evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc); 6066 6067 // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32 6068 // and broadcasting second 128 bit lane. 6069 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc); 6070 vpsllq(xtmp2, xtmp1, 0x1, vlen_enc); 6071 evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc); 6072 evshufi64x2(xtmp3, src, src, 0x55, vlen_enc); 6073 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc); 6074 6075 // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48 6076 // and broadcasting third 128 bit lane. 6077 evpcmpb(ktmp, k0, shuffle, xtmp2, Assembler::nlt, true, vlen_enc); 6078 vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc); 6079 evpcmpb(ktmp, ktmp, shuffle, xtmp1, Assembler::lt, true, vlen_enc); 6080 evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc); 6081 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc); 6082 6083 // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64 6084 // and broadcasting fourth 128 bit lane. 6085 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc); 6086 vpsllq(xtmp2, xtmp2, 0x1, vlen_enc); 6087 evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc); 6088 evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc); 6089 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc); 6090 } 6091