/*
 * Copyright (c) 2020, 2023, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "oops/methodData.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/output.hpp"
#include "opto/opcodes.hpp"
#include "opto/subnode.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/stubRoutines.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

// C2 compiled method's prolog code.
void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub, int max_monitors) {

  // WARNING: Initial instruction MUST be 5 bytes or longer so that
  // NativeJump::patch_verified_entry will be able to patch out the entry
  // code safely. The push to verify stack depth is ok at 5 bytes,
  // the frame allocation can be either 3 or 6 bytes. So if we don't do
  // stack bang then we must use the 6 byte frame allocation even if
  // we have no frame. :-(
  assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");

  assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  // Remove word for return addr
  framesize -= wordSize;
  stack_bang_size -= wordSize;

  // Calls to C2R adapters often do not accept exceptional returns.
  // We require that their callers must bang for them. But be careful, because
  // some VM calls (such as call site linkage) can use several kilobytes of
  // stack. But the stack safety zone should account for that.
  // See bugs 4446381, 4468289, 4497237.
  if (stack_bang_size > 0) {
    generate_stack_overflow_check(stack_bang_size);

    // We always push rbp, so that on return to interpreter rbp, will be
    // restored correctly and we can correct the stack.
    push(rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      mov(rbp, rsp);
    }
    // Remove word for ebp
    framesize -= wordSize;

    // Create frame
    if (framesize) {
      subptr(rsp, framesize);
    }
  } else {
    // Create frame (force generation of a 4 byte immediate value)
    subptr_imm32(rsp, framesize);

    // Save RBP register now.
    framesize -= wordSize;
    movptr(Address(rsp, framesize), rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      movptr(rbp, rsp);
      if (framesize > 0) {
        addptr(rbp, framesize);
      }
    }
  }

  if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
    framesize -= wordSize;
    movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
  }

#ifndef _LP64
  // If method sets FPU control word do it now
  if (fp_mode_24b) {
    fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24()));
  }
  if (UseSSE >= 2 && VerifyFPU) {
    verify_FPU(0, "FPU stack must be clean on entry");
  }
#endif

#ifdef ASSERT
  if (VerifyStackAtCalls) {
    Label L;
    push(rax);
    mov(rax, rsp);
    andptr(rax, StackAlignmentInBytes-1);
    cmpptr(rax, StackAlignmentInBytes-wordSize);
    pop(rax);
    jcc(Assembler::equal, L);
    STOP("Stack is not properly aligned!");
    bind(L);
  }
#endif

#ifdef _LP64
  if (UseFastLocking && max_monitors > 0) {
    C2CheckLockStackStub* stub = new (Compile::current()->comp_arena()) C2CheckLockStackStub();
    Compile::current()->output()->add_stub(stub);
    assert(!is_stub, "only methods have monitors");
    Register thread = r15_thread;
    movptr(rax, Address(thread, JavaThread::lock_stack_current_offset()));
    addptr(rax, max_monitors * oopSize);
    cmpptr(rax, Address(thread, JavaThread::lock_stack_limit_offset()));
    jcc(Assembler::greaterEqual, stub->entry());
    bind(stub->continuation());
  }
#endif

  if (!is_stub) {
    BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
#ifdef _LP64
    if (BarrierSet::barrier_set()->barrier_set_nmethod() != NULL) {
      // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
      Label dummy_slow_path;
      Label dummy_continuation;
      Label* slow_path = &dummy_slow_path;
      Label* continuation = &dummy_continuation;
      if (!Compile::current()->output()->in_scratch_emit_size()) {
        // Use real labels from actual stub when not emitting code for the purpose of measuring its size
        C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
        Compile::current()->output()->add_stub(stub);
        slow_path = &stub->entry();
        continuation = &stub->continuation();
      }
      bs->nmethod_entry_barrier(this, slow_path, continuation);
    }
#else
    // Don't bother with out-of-line nmethod entry barrier stub for x86_32.
    bs->nmethod_entry_barrier(this, NULL /* slow_path */, NULL /* continuation */);
#endif
  }
}

inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
  switch (vlen_in_bytes) {
    case  4: // fall-through
    case  8: // fall-through
    case 16: return Assembler::AVX_128bit;
    case 32: return Assembler::AVX_256bit;
    case 64: return Assembler::AVX_512bit;

    default: {
      ShouldNotReachHere();
      return Assembler::AVX_NoVec;
    }
  }
}

#if INCLUDE_RTM_OPT

// Update rtm_counters based on abort status
// input: abort_status
//        rtm_counters (RTMLockingCounters*)
// flags are killed
void C2_MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {

  atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
  if (PrintPreciseRTMLockingStatistics) {
    for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
      Label check_abort;
      testl(abort_status, (1<<i));
      jccb(Assembler::equal, check_abort);
      atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
      bind(check_abort);
    }
  }
}

// Branch if (random & (count-1) != 0), count is 2^n
// tmp, scr and flags are killed
void C2_MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
  assert(tmp == rax, "");
  assert(scr == rdx, "");
  rdtsc(); // modifies EDX:EAX
  andptr(tmp, count-1);
  jccb(Assembler::notZero, brLabel);
}

// Perform abort ratio calculation, set no_rtm bit if high ratio
// input:  rtm_counters_Reg (RTMLockingCounters* address)
// tmpReg, rtm_counters_Reg and flags are killed
void C2_MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
                                                    Register rtm_counters_Reg,
                                                    RTMLockingCounters* rtm_counters,
                                                    Metadata* method_data) {
  Label L_done, L_check_always_rtm1, L_check_always_rtm2;

  if (RTMLockingCalculationDelay > 0) {
    // Delay calculation
    movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()));
    testptr(tmpReg, tmpReg);
    jccb(Assembler::equal, L_done);
  }
  // Abort ratio calculation only if abort_count > RTMAbortThreshold
  //   Aborted transactions = abort_count * 100
  //   All transactions = total_count * RTMTotalCountIncrRate
  //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
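  //
  // A worked example of the formula above (illustrative numbers only, not VM
  // defaults, and assuming abort_count is already above RTMAbortThreshold):
  // with abort_count = 600, total_count = 1000, RTMTotalCountIncrRate = 1 and
  // RTMAbortRatio = 50, the comparison is
  //   Aborted             = 600 * 100          = 60000
  //   All * RTMAbortRatio = 1000 * 1 * 50      = 50000
  // Since 60000 >= 50000 (an abort ratio of 60% >= 50%), the code below would
  // set the no_rtm bit in the MDO.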

  movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
  cmpptr(tmpReg, RTMAbortThreshold);
  jccb(Assembler::below, L_check_always_rtm2);
  imulptr(tmpReg, tmpReg, 100);

  Register scrReg = rtm_counters_Reg;
  movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
  imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
  imulptr(scrReg, scrReg, RTMAbortRatio);
  cmpptr(tmpReg, scrReg);
  jccb(Assembler::below, L_check_always_rtm1);
  if (method_data != NULL) {
    // set rtm_state to "no rtm" in MDO
    mov_metadata(tmpReg, method_data);
    lock();
    orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), NoRTM);
  }
  jmpb(L_done);
  bind(L_check_always_rtm1);
  // Reload RTMLockingCounters* address
  lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
  bind(L_check_always_rtm2);
  movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
  cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
  jccb(Assembler::below, L_done);
  if (method_data != NULL) {
    // set rtm_state to "always rtm" in MDO
    mov_metadata(tmpReg, method_data);
    lock();
    orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), UseRTM);
  }
  bind(L_done);
}

// Update counters and perform abort ratio calculation
// input:  abort_status_Reg
// rtm_counters_Reg, flags are killed
void C2_MacroAssembler::rtm_profiling(Register abort_status_Reg,
                                      Register rtm_counters_Reg,
                                      RTMLockingCounters* rtm_counters,
                                      Metadata* method_data,
                                      bool profile_rtm) {

  assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
  // update rtm counters based on rax value at abort
  // reads abort_status_Reg, updates flags
  lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
  rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
  if (profile_rtm) {
    // Save abort status because abort_status_Reg is used by following code.
    if (RTMRetryCount > 0) {
      push(abort_status_Reg);
    }
    assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
    rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
    // restore abort status
    if (RTMRetryCount > 0) {
      pop(abort_status_Reg);
    }
  }
}

// Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
// inputs: retry_count_Reg
//       : abort_status_Reg
// output: retry_count_Reg decremented by 1
// flags are killed
void C2_MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
  Label doneRetry;
  assert(abort_status_Reg == rax, "");
  // The abort reason bits are in eax (see all states in rtmLocking.hpp)
  // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
  // if reason is in 0x6 and retry count != 0 then retry
  andptr(abort_status_Reg, 0x6);
  jccb(Assembler::zero, doneRetry);
  testl(retry_count_Reg, retry_count_Reg);
  jccb(Assembler::zero, doneRetry);
  pause();
  decrementl(retry_count_Reg);
  jmp(retryLabel);
  bind(doneRetry);
}

// Spin and retry if lock is busy,
// inputs: box_Reg (monitor address)
//       : retry_count_Reg
// output: retry_count_Reg decremented by 1
//       : clear z flag if retry count exceeded
// tmp_Reg, scr_Reg, flags are killed
void C2_MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
                                               Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
  Label SpinLoop, SpinExit, doneRetry;
  int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);

  testl(retry_count_Reg, retry_count_Reg);
  jccb(Assembler::zero, doneRetry);
  decrementl(retry_count_Reg);
  movptr(scr_Reg, RTMSpinLoopCount);

  bind(SpinLoop);
  pause();
  decrementl(scr_Reg);
  jccb(Assembler::lessEqual, SpinExit);
  movptr(tmp_Reg, Address(box_Reg, owner_offset));
  testptr(tmp_Reg, tmp_Reg);
  jccb(Assembler::notZero, SpinLoop);

  bind(SpinExit);
  jmp(retryLabel);
  bind(doneRetry);
  incrementl(retry_count_Reg); // clear z flag
}

// Use RTM for normal stack locks
// Input: objReg (object to lock)
void C2_MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
                                          Register retry_on_abort_count_Reg,
                                          RTMLockingCounters* stack_rtm_counters,
                                          Metadata* method_data, bool profile_rtm,
                                          Label& DONE_LABEL, Label& IsInflated) {
  assert(UseRTMForStackLocks, "why call this otherwise?");
  assert(tmpReg == rax, "");
  assert(scrReg == rdx, "");
  Label L_rtm_retry, L_decrement_retry, L_on_abort;

  if (RTMRetryCount > 0) {
    movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
    bind(L_rtm_retry);
  }
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
  testptr(tmpReg, markWord::monitor_value);  // inflated vs stack-locked|neutral
  jcc(Assembler::notZero, IsInflated);

  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    Label L_noincrement;
    if (RTMTotalCountIncrRate > 1) {
      // tmpReg, scrReg and flags are killed
      branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
    }
    assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
    atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
    bind(L_noincrement);
  }
  xbegin(L_on_abort);
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
  andptr(tmpReg, markWord::lock_mask_in_place);                     // look at 2 lock bits
  cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 01 unlocked
  jcc(Assembler::equal, DONE_LABEL);                                // all done if unlocked

  Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
  if (UseRTMXendForLockBusy) {
    xend();
    movptr(abort_status_Reg, 0x2);    // Set the abort status to 2 (so we can retry)
    jmp(L_decrement_retry);
  }
  else {
    xabort(0);
  }
  bind(L_on_abort);
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
  }
  bind(L_decrement_retry);
  if (RTMRetryCount > 0) {
    // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
  }
}

// Use RTM for inflating locks
// inputs: objReg (object to lock)
//         boxReg (on-stack box address (displaced header location) - KILLED)
//         tmpReg (ObjectMonitor address + markWord::monitor_value)
void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
                                             Register scrReg, Register retry_on_busy_count_Reg,
                                             Register retry_on_abort_count_Reg,
                                             RTMLockingCounters* rtm_counters,
                                             Metadata* method_data, bool profile_rtm,
                                             Label& DONE_LABEL) {
  assert(UseRTMLocking, "why call this otherwise?");
  assert(tmpReg == rax, "");
  assert(scrReg == rdx, "");
  Label L_rtm_retry, L_decrement_retry, L_on_abort;
  int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);

  movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
  movptr(boxReg, tmpReg); // Save ObjectMonitor address

  if (RTMRetryCount > 0) {
    movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
    movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
    bind(L_rtm_retry);
  }
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    Label L_noincrement;
    if (RTMTotalCountIncrRate > 1) {
      // tmpReg, scrReg and flags are killed
      branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
    }
    assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
    atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
    bind(L_noincrement);
  }
  xbegin(L_on_abort);
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
  movptr(tmpReg, Address(tmpReg, owner_offset));
  testptr(tmpReg, tmpReg);
  jcc(Assembler::zero, DONE_LABEL);
  if (UseRTMXendForLockBusy) {
    xend();
    jmp(L_decrement_retry);
  }
  else {
    xabort(0);
  }
  bind(L_on_abort);
  Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
  }
  if (RTMRetryCount > 0) {
    // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
  }

  movptr(tmpReg, Address(boxReg, owner_offset));
  testptr(tmpReg, tmpReg);
  jccb(Assembler::notZero, L_decrement_retry);

  // Appears unlocked - try to swing _owner from null to non-null.
  // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
#ifdef _LP64
  Register threadReg = r15_thread;
#else
  get_thread(scrReg);
  Register threadReg = scrReg;
#endif
  lock();
  cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg

  if (RTMRetryCount > 0) {
    // success done else retry
    jccb(Assembler::equal, DONE_LABEL);
    bind(L_decrement_retry);
    // Spin and retry if lock is busy.
    rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
  }
  else {
    bind(L_decrement_retry);
  }
}

#endif // INCLUDE_RTM_OPT

// fast_lock and fast_unlock used by C2

// Because the transitions from emitted code to the runtime
// monitorenter/exit helper stubs are so slow it's critical that
// we inline both the stack-locking fast path and the inflated fast path.
//
// See also: cmpFastLock and cmpFastUnlock.
//
// What follows is a specialized inline transliteration of the code
// in enter() and exit(). If we're concerned about I$ bloat another
// option would be to emit TrySlowEnter and TrySlowExit methods
// at startup-time. These methods would accept arguments as
// (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
// indications in the icc.ZFlag. fast_lock and fast_unlock would simply
// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
// In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
//
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods. That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
// to park() or unpark() threads. We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) explicit barriers or fence operations.
//
// TODO:
//
// *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
//    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
//    Given TLAB allocation, Self is usually manifested in a register, so passing it into
//    the lock operators would typically be faster than reifying Self.
//
// *  Ideally I'd define the primitives as:
//       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
//       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
//    Unfortunately ADLC bugs prevent us from expressing the ideal form.
//    Instead, we're stuck with the rather awkward and brittle register assignments below.
//    Furthermore the register assignments are overconstrained, possibly resulting in
//    sub-optimal code near the synchronization site.
//
// *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
//    Alternately, use a better sp-proximity test.
//
// *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
//    Either one is sufficient to uniquely identify a thread.
//    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
//
// *  Intrinsify notify() and notifyAll() for the common cases where the
//    object is locked by the calling thread but the waitlist is empty.
//    Avoid the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
//
// *  use jccb and jmpb instead of jcc and jmp to improve code density.
//    But beware of excessive branch density on AMD Opterons.
//
// *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
//    or failure of the fast path. If the fast path fails then we pass
//    control to the slow path, typically in C. In fast_lock and
//    fast_unlock we often branch to DONE_LABEL, just to find that C2
//    will emit a conditional branch immediately after the node.
//    So we have branches to branches and lots of ICC.ZF games.
//    Instead, it might be better to have C2 pass a "FailureLabel"
//    into fast_lock and fast_unlock. In the case of success, control
//    will drop through the node. ICC.ZF is undefined at exit.
//    In the case of failure, the node will branch directly to the
//    FailureLabel.
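
// A minimal sketch (assumed, not the actual .ad expansion) of how the ZF
// contract described above is consumed by the code C2 emits around the
// cmpFastLock/cmpFastUnlock nodes:
//
//   fast_lock(obj, box, rax, scr, ...);    // ZF == 1 on fast-path success
//   jcc(Assembler::notEqual, slow_path);   // ZF == 0 -> call the runtime helper
//   // ... fast path continues, lock held ...
//
// The register choices and the slow_path label above are illustrative only.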

// obj: object to lock
// box: on-stack box address (displaced header location) - KILLED
// rax,: tmp -- KILLED
// scr: tmp -- KILLED
void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
                                  Register scrReg, Register cx1Reg, Register cx2Reg, Register thread,
                                  RTMLockingCounters* rtm_counters,
                                  RTMLockingCounters* stack_rtm_counters,
                                  Metadata* method_data,
                                  bool use_rtm, bool profile_rtm) {
  // Ensure the register assignments are disjoint
  assert(tmpReg == rax, "");

  if (use_rtm) {
    assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
  } else {
    assert(cx1Reg == noreg, "");
    assert(cx2Reg == noreg, "");
    assert_different_registers(objReg, boxReg, tmpReg, scrReg);
  }

  // Possible cases that we'll encounter in fast_lock
  // ------------------------------------------------
  // * Inflated
  //    -- unlocked
  //    -- Locked
  //       = by self
  //       = by other
  // * neutral
  // * stack-locked
  //    -- by self
  //       = sp-proximity test hits
  //       = sp-proximity test generates false-negative
  //    -- by other
  //

  Label IsInflated, DONE_LABEL, NO_COUNT, COUNT;

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(tmpReg, objReg, scrReg);
    movl(tmpReg, Address(tmpReg, Klass::access_flags_offset()));
    testl(tmpReg, JVM_ACC_IS_VALUE_BASED_CLASS);
    jcc(Assembler::notZero, DONE_LABEL);
  }

#if INCLUDE_RTM_OPT
  if (UseRTMForStackLocks && use_rtm) {
    assert(!UseHeavyMonitors, "+UseHeavyMonitors and +UseRTMForStackLocks are mutually exclusive");
    rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
                      stack_rtm_counters, method_data, profile_rtm,
                      DONE_LABEL, IsInflated);
  }
#endif // INCLUDE_RTM_OPT

  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));  // [FETCH]
  testptr(tmpReg, markWord::monitor_value);  // inflated vs stack-locked|neutral
  jccb(Assembler::notZero, IsInflated);

  if (!UseHeavyMonitors) {
    if (UseFastLocking) {
#ifdef _LP64
      fast_lock_impl(objReg, tmpReg, thread, scrReg, NO_COUNT, false);
      jmp(COUNT);
#else
      // We can not emit the lock-stack-check in verified_entry() because we don't have enough
      // registers (for thread ptr). Therefore we have to emit the lock-stack-check in
      // fast_lock_impl(). However, that check can take a slow-path with ZF=1, therefore
      // we need to handle it specially and force ZF=0 before taking the actual slow-path.
      Label slow;
      fast_lock_impl(objReg, tmpReg, thread, scrReg, slow);
      jmp(COUNT);
      bind(slow);
      testptr(objReg, objReg); // ZF=0 to indicate failure
      jmp(NO_COUNT);
#endif
    } else {
      // Attempt stack-locking ...
      orptr (tmpReg, markWord::unlocked_value);
      movptr(Address(boxReg, 0), tmpReg);  // Anticipate successful CAS
      lock();
      cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Updates tmpReg
      jcc(Assembler::equal, COUNT);        // Success

      // Recursive locking.
      // The object is stack-locked: markword contains stack pointer to BasicLock.
      // Locked by current thread if difference with current SP is less than one page.
      subptr(tmpReg, rsp);
      // Next instruction sets ZFlag == 1 (Success) if difference is less than one page.
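      // (Explanatory note, assuming a 4 KiB page on LP64: the constant below is
      //  7 - 4096 == 0xFFFFF007, so the AND leaves zero -- ZF == 1, a recursive
      //  stack-lock held by this thread -- only when the displaced mark lies
      //  within one page above rsp and the difference is 8-byte aligned; the
      //  32-bit constant 0xFFFFF003 plays the same role with 4-byte alignment.)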
      andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - (int)os::vm_page_size())) );
      movptr(Address(boxReg, 0), tmpReg);
    }
  } else {
    // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
    testptr(objReg, objReg);
  }
  jmp(DONE_LABEL);

  bind(IsInflated);
  // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value

#if INCLUDE_RTM_OPT
  // Use the same RTM locking code in 32- and 64-bit VM.
  if (use_rtm) {
    rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
                         rtm_counters, method_data, profile_rtm, DONE_LABEL);
  } else {
#endif // INCLUDE_RTM_OPT

#ifndef _LP64
  // The object is inflated.

  // boxReg refers to the on-stack BasicLock in the current frame.
  // We'd like to write:
  //   set box->_displaced_header = markWord::unused_mark(). Any non-0 value suffices.
  // This is convenient but results in a ST-before-CAS penalty. The following CAS suffers
  // additional latency as we have another ST in the store buffer that must drain.

  // avoid ST-before-CAS
  // register juggle because we need tmpReg for cmpxchgptr below
  movptr(scrReg, boxReg);
  movptr(boxReg, tmpReg); // consider: LEA box, [tmp-2]

  // Optimistic form: consider XORL tmpReg,tmpReg
  movptr(tmpReg, NULL_WORD);

  // Appears unlocked - try to swing _owner from null to non-null.
  // Ideally, I'd manifest "Self" with get_thread and then attempt
  // to CAS the register containing Self into m->Owner.
  // But we don't have enough registers, so instead we can either try to CAS
  // rsp or the address of the box (in scr) into &m->owner. If the CAS succeeds
  // we later store "Self" into m->Owner. Transiently storing a stack address
  // (rsp or the address of the box) into m->owner is harmless.
  // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
  lock();
  cmpxchgptr(thread, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  movptr(Address(scrReg, 0), 3); // box->_displaced_header = 3

  // If the CAS fails we can either retry or pass control to the slow path.
  // We use the latter tactic.
  // Pass the CAS result in the icc.ZFlag into DONE_LABEL
  // If the CAS was successful ...
  //   Self has acquired the lock
  //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
  // Intentional fall-through into DONE_LABEL ...
#else // _LP64
  // It's inflated and we use scrReg for ObjectMonitor* in this section.
  movq(scrReg, tmpReg);
  xorq(tmpReg, tmpReg);
  lock();
  cmpxchgptr(r15_thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // Unconditionally set box->_displaced_header = markWord::unused_mark().
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
  // Propagate ICC.ZF from CAS above into DONE_LABEL.
  jccb(Assembler::equal, COUNT);       // CAS above succeeded; propagate ZF = 1 (success)

  cmpptr(r15_thread, rax);             // Check if we are already the owner (recursive lock)
  jccb(Assembler::notEqual, NO_COUNT); // If not recursive, ZF = 0 at this point (fail)
  incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
#endif // _LP64
#if INCLUDE_RTM_OPT
  } // use_rtm()
#endif
  bind(DONE_LABEL);

  // ZFlag == 1 count in fast path
  // ZFlag == 0 count in slow path
  jccb(Assembler::notZero, NO_COUNT); // jump if ZFlag == 0

  bind(COUNT);
  // Count monitors in fast path
#ifndef _LP64
  get_thread(tmpReg);
  incrementl(Address(tmpReg, JavaThread::held_monitor_count_offset()));
#else // _LP64
  incrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
#endif

  xorl(tmpReg, tmpReg); // Set ZF == 1

  bind(NO_COUNT);

  // At NO_COUNT the icc ZFlag is set as follows ...
  // fast_unlock uses the same protocol.
  // ZFlag == 1 -> Success
  // ZFlag == 0 -> Failure - force control through the slow path
}

// obj: object to unlock
// box: box address (displaced header location), killed. Must be EAX.
// tmp: killed, cannot be obj nor box.
//
// Some commentary on balanced locking:
//
// fast_lock and fast_unlock are emitted only for provably balanced lock sites.
// Methods that don't have provably balanced locking are forced to run in the
// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
// The interpreter provides two properties:
// I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired by the current activation (frame). Recall that the
//      interpreter maintains an on-stack list of locks currently held by
//      a frame.
// I2:  If a method attempts to unlock an object that is not held by the
//      frame the interpreter throws IMSX.
//
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
// B() doesn't have provably balanced locking so it runs in the interpreter.
// Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O
// is still locked by A().
//
// The only other source of unbalanced locking would be JNI. The "Java Native Interface:
// Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
// should not be unlocked by "normal" java-level locking and vice-versa. The specification
// doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably given that the spec legislates the JNI case as undefined our implementation
// could reasonably *avoid* checking owner in fast_unlock().
// In the interest of performance we elide the m->Owner==Self check in unlock.
// A perfectly viable alternative is to elide the owner check except when
// Xcheck:jni is enabled.

void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
  assert(boxReg == rax, "");
  assert_different_registers(objReg, boxReg, tmpReg);

  Label DONE_LABEL, Stacked, COUNT, NO_COUNT;

#if INCLUDE_RTM_OPT
  if (UseRTMForStackLocks && use_rtm) {
    assert(!UseHeavyMonitors, "+UseHeavyMonitors and +UseRTMForStackLocks are mutually exclusive");
    Label L_regular_unlock;
    movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
    andptr(tmpReg, markWord::lock_mask_in_place);                     // look at 2 lock bits
    cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 01 unlocked
    jccb(Assembler::notEqual, L_regular_unlock);                      // if !HLE RegularLock
    xend();                                                           // otherwise end...
    jmp(DONE_LABEL);                                                  // ... and we're done
    bind(L_regular_unlock);
  }
#endif

  if (!UseHeavyMonitors && !UseFastLocking) {
    cmpptr(Address(boxReg, 0), NULL_WORD);                            // Examine the displaced header
    jcc(Assembler::zero, COUNT);                                      // 0 indicates recursive stack-lock
  }
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));   // Examine the object's markword
  if (!UseHeavyMonitors) {
    testptr(tmpReg, markWord::monitor_value);                         // Inflated?
#if INCLUDE_RTM_OPT
    if (UseFastLocking && use_rtm) {
      jcc(Assembler::zero, Stacked);
    } else
#endif
    jccb(Assembler::zero, Stacked);
    if (UseFastLocking) {
      // If the owner is ANONYMOUS, we need to fix it.
      testb(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t) (intptr_t) ANONYMOUS_OWNER);
#ifdef _LP64
      C2HandleAnonOMOwnerStub* stub = new (Compile::current()->comp_arena()) C2HandleAnonOMOwnerStub(tmpReg);
      Compile::current()->output()->add_stub(stub);
      jcc(Assembler::notEqual, stub->entry());
      bind(stub->continuation());
#else
      // We can't easily implement this optimization on 32 bit because we don't have a thread register.
      // Call the slow-path instead.
      jcc(Assembler::notEqual, NO_COUNT);
#endif
    }
  }

  // It's inflated.
#if INCLUDE_RTM_OPT
  if (use_rtm) {
    Label L_regular_inflated_unlock;
    int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
    movptr(boxReg, Address(tmpReg, owner_offset));
    testptr(boxReg, boxReg);
    jccb(Assembler::notZero, L_regular_inflated_unlock);
    xend();
    jmp(DONE_LABEL);
    bind(L_regular_inflated_unlock);
  }
#endif

  // Despite our balanced locking property we still check that m->_owner == Self
  // as java routines or native JNI code called by this thread might
  // have released the lock.
  // Refer to the comments in synchronizer.cpp for how we might encode extra
  // state in _succ so we can avoid fetching EntryList|cxq.
  //
  // If there's no contention try a 1-0 exit. That is, exit without
  // a costly MEMBAR or CAS. See synchronizer.cpp for details on how
  // we detect and recover from the race that the 1-0 exit admits.
  //
  // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
  // before it STs null into _owner, releasing the lock. Updates
  // to data protected by the critical section must be visible before
  // we drop the lock (and thus before any other thread could acquire
  // the lock and observe the fields protected by the lock).
  // IA32's memory-model is SPO, so STs are ordered with respect to
  // each other and there's no need for an explicit barrier (fence).
  // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
#ifndef _LP64
  // Note that we could employ various encoding schemes to reduce
  // the number of loads below (currently 4) to just 2 or 3.
  // Refer to the comments in synchronizer.cpp.
  // In practice the chain of fetches doesn't seem to impact performance, however.
  xorptr(boxReg, boxReg);
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jccb (Assembler::notZero, DONE_LABEL);
  movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
  jccb (Assembler::notZero, DONE_LABEL);
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
  jmpb (DONE_LABEL);
#else // _LP64
  // It's inflated
  Label CheckSucc, LNotRecursive, LSuccess, LGoSlowPath;

  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
  jccb(Assembler::equal, LNotRecursive);

  // Recursive inflated unlock
  decq(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jmpb(LSuccess);

  bind(LNotRecursive);
  movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
  jccb  (Assembler::notZero, CheckSucc);
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
  jmpb  (DONE_LABEL);

  // Try to avoid passing control into the slow_path ...
  bind  (CheckSucc);

  // The following optional optimization can be elided if necessary
  // Effectively: if (succ == null) goto slow path
  // The code reduces the window for a race, however,
  // and thus benefits performance.
  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
  jccb  (Assembler::zero, LGoSlowPath);

  xorptr(boxReg, boxReg);
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);

  // Memory barrier/fence
  // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
  // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
  // This is faster on Nehalem and AMD Shanghai/Barcelona.
  // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
  // We might also restructure (ST Owner=0;barrier;LD _Succ) to
  // (mov box,0; xchgq box, &m->Owner; LD _succ) .
  lock(); addl(Address(rsp, 0), 0);

  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
  jccb  (Assembler::notZero, LSuccess);

  // Rare inopportune interleaving - race.
  // The successor vanished in the small window above.
  // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
  // We need to ensure progress and succession.
  // Try to reacquire the lock.
  // If that fails then the new owner is responsible for succession and this
  // thread needs to take no further action and can exit via the fast path (success).
  // If the re-acquire succeeds then pass control into the slow path.
  // As implemented, this latter mode is horrible because we generated more
  // coherence traffic on the lock *and* artificially extended the critical section
  // length by virtue of passing control into the slow path.

  // box is really RAX -- the following CMPXCHG depends on that binding
  // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
  lock();
  cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // There's no successor so we tried to regrab the lock.
  // If that didn't work, then another thread grabbed the
  // lock so we're done (and exit was a success).
  jccb  (Assembler::notEqual, LSuccess);
  // Intentional fall-through into slow path

  bind  (LGoSlowPath);
  orl   (boxReg, 1); // set ICC.ZF=0 to indicate failure
  jmpb  (DONE_LABEL);

  bind  (LSuccess);
  testl (boxReg, 0); // set ICC.ZF=1 to indicate success
  jmpb  (DONE_LABEL);

#endif
  if (!UseHeavyMonitors) {
    bind  (Stacked);
    if (UseFastLocking) {
      mov(boxReg, tmpReg);
      fast_unlock_impl(objReg, boxReg, tmpReg, NO_COUNT);
      jmp(COUNT);
    } else {
      movptr(tmpReg, Address (boxReg, 0)); // re-fetch
      lock();
      cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
    }
    // Intentional fall-thru into DONE_LABEL
  }
  bind(DONE_LABEL);

  // ZFlag == 1 count in fast path
  // ZFlag == 0 count in slow path
  jccb(Assembler::notZero, NO_COUNT);

  bind(COUNT);
  // Count monitors in fast path
#ifndef _LP64
  get_thread(tmpReg);
  decrementl(Address(tmpReg, JavaThread::held_monitor_count_offset()));
#else // _LP64
  decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
#endif

  xorl(tmpReg, tmpReg); // Set ZF == 1

  bind(NO_COUNT);
}

//-------------------------------------------------------------------------------------------
// Generic instructions support for use in .ad files C2 code generation

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVD) {
    andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
  } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
  }
}

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
  if (opcode == Op_AbsVD) {
    vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
  } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
    vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVF) {
    andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
  } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
  if (opcode == Op_AbsVF) {
    vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
  } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
    vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
  }
}

void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
  assert(tmp == xnoreg || elem_bt == T_LONG, "unused");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      pminsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pminsw(dst, src);
    } else if (elem_bt == T_INT) {
      pminsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      movdqu(xmm0, dst);
      pcmpgtq(xmm0, src);
      blendvpd(dst, src); // xmm0 as mask
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      pmaxsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pmaxsw(dst, src);
    } else if (elem_bt == T_INT) {
      pmaxsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      movdqu(xmm0, src);
      pcmpgtq(xmm0, dst);
      blendvpd(dst, src); // xmm0 as mask
    }
  }
}

void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
                                 XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                 int vlen_enc) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      vpminsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpminsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpminsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpminsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src1, src2, dst, vlen_enc);
      }
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      vpmaxsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpmaxsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpmaxsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpmaxsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src2, src1, dst, vlen_enc);
      }
    }
  }
}

// Float/Double min max

void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
                                   XMMRegister dst, XMMRegister a, XMMRegister b,
                                   XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                   int vlen_enc) {
  assert(UseAVX > 0, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(a, b, tmp, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);

  if (!is_double_word && is_min) {
    vblendvps(atmp, a, b, a, vlen_enc);
    vblendvps(btmp, b, a, a, vlen_enc);
    vminps(tmp, atmp, btmp, vlen_enc);
    vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvps(dst, tmp, atmp, btmp, vlen_enc);
  } else if (!is_double_word && !is_min) {
    vblendvps(btmp, b, a, b, vlen_enc);
    vblendvps(atmp, a, b, b, vlen_enc);
    vmaxps(tmp, atmp, btmp, vlen_enc);
    vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvps(dst, tmp, atmp, btmp, vlen_enc);
  } else if (is_double_word && is_min) {
    vblendvpd(atmp, a, b, a, vlen_enc);
    vblendvpd(btmp, b, a, a, vlen_enc);
    vminpd(tmp, atmp, btmp, vlen_enc);
    vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
  } else {
    assert(is_double_word && !is_min, "sanity");
    vblendvpd(btmp, b, a, b, vlen_enc);
    vblendvpd(atmp, a, b, b, vlen_enc);
    vmaxpd(tmp, atmp, btmp, vlen_enc);
    vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
  }
}

void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
                                    XMMRegister dst, XMMRegister a, XMMRegister b,
                                    KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
                                    int vlen_enc) {
  assert(UseAVX > 2, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(dst, a, b, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);
  bool merge = true;

  if (!is_double_word && is_min) {
    evpmovd2m(ktmp, a, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vminps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (!is_double_word && !is_min) {
    evpmovd2m(ktmp, b, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (is_double_word && is_min) {
    evpmovq2m(ktmp, a, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vminpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  } else {
    assert(is_double_word && !is_min, "sanity");
    evpmovq2m(ktmp, b, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  }
}

// Float/Double signum
void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
  assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");

  Label DONE_LABEL;

  if (opcode == Op_SignumF) {
    assert(UseSSE > 0, "required");
    ucomiss(dst, zero);
    jcc(Assembler::equal, DONE_LABEL);  // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
    jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN
    movflt(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
  } else if (opcode == Op_SignumD) {
    assert(UseSSE > 1, "required");
    ucomisd(dst, zero);
    jcc(Assembler::equal, DONE_LABEL);  // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
    jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN
    movdbl(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
  }

  bind(DONE_LABEL);
}

void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
  if (sign) {
    pmovsxbw(dst, src);
  } else {
    pmovzxbw(dst, src);
  }
}

void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbw(dst, src, vector_len);
  } else {
    vpmovzxbw(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbd(dst, src, vector_len);
  } else {
    vpmovzxbd(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxwd(dst, src, vector_len);
  } else {
    vpmovzxwd(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
                                     int shift, int vector_len) {
  if (opcode == Op_RotateLeftV) {
    if (etype == T_INT) {
      evprold(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprolq(dst, src, shift, vector_len);
    }
  } else {
    assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
    if (etype == T_INT) {
      evprord(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprorq(dst, src, shift, vector_len);
    }
  }
}

void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
                                     XMMRegister shift, int vector_len) {
  if (opcode == Op_RotateLeftV) {
    if (etype == T_INT) {
      evprolvd(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprolvq(dst, src, shift, vector_len);
    }
  } else {
    assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
    if (etype == T_INT) {
      evprorvd(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprorvq(dst, src, shift, vector_len);
    }
  }
}

void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
  if (opcode == Op_RShiftVI) {
    psrad(dst, shift);
  } else if (opcode == Op_LShiftVI) {
    pslld(dst, shift);
  } else {
    assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
    psrld(dst, shift);
  }
}

void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVI:  psrad(dst, shift); break;
    case Op_LShiftVI:  pslld(dst, shift); break;
    case Op_URShiftVI: psrld(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
  if (opcode == Op_RShiftVI) {
    vpsrad(dst, nds, shift, vector_len);
  } else if (opcode == Op_LShiftVI) {
    vpslld(dst, nds, shift, vector_len);
  } else {
    assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
    vpsrld(dst, nds, shift, vector_len);
  }
}

void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
    case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
    case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  psraw(dst, shift); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  psllw(dst, shift); break;

    case Op_URShiftVS: // fall-through
    case Op_URShiftVB: psrlw(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;

    case Op_URShiftVS: // fall-through
    case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
    case Op_LShiftVL:  psllq(dst, shift); break;
    case Op_URShiftVL: psrlq(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
  if (opcode == Op_RShiftVL) {
    psrlq(dst, shift); // using srl to implement sra on pre-avx512 systems
  } else if (opcode == Op_LShiftVL) {
    psllq(dst, shift);
  } else {
    assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
    psrlq(dst, shift);
  }
}

void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVL:  evpsraq(dst, src, shift, vlen_enc); break;
    case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
    case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
  if (opcode == Op_RShiftVL) {
    evpsraq(dst, nds, shift, vector_len);
  } else if (opcode == Op_LShiftVL) {
    vpsllq(dst, nds, shift, vector_len);
  } else {
    assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
    vpsrlq(dst, nds, shift, vector_len);
  }
}

void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
// fall-through 1413 case Op_RShiftVS: // fall-through 1414 case Op_RShiftVI: vpsravd(dst, src, shift, vlen_enc); break; 1415 1416 case Op_LShiftVB: // fall-through 1417 case Op_LShiftVS: // fall-through 1418 case Op_LShiftVI: vpsllvd(dst, src, shift, vlen_enc); break; 1419 1420 case Op_URShiftVB: // fall-through 1421 case Op_URShiftVS: // fall-through 1422 case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break; 1423 1424 default: assert(false, "%s", NodeClassNames[opcode]); 1425 } 1426 } 1427 1428 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1429 switch (opcode) { 1430 case Op_RShiftVB: // fall-through 1431 case Op_RShiftVS: evpsravw(dst, src, shift, vlen_enc); break; 1432 1433 case Op_LShiftVB: // fall-through 1434 case Op_LShiftVS: evpsllvw(dst, src, shift, vlen_enc); break; 1435 1436 case Op_URShiftVB: // fall-through 1437 case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break; 1438 1439 default: assert(false, "%s", NodeClassNames[opcode]); 1440 } 1441 } 1442 1443 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) { 1444 assert(UseAVX >= 2, "required"); 1445 switch (opcode) { 1446 case Op_RShiftVL: { 1447 if (UseAVX > 2) { 1448 assert(tmp == xnoreg, "not used"); 1449 if (!VM_Version::supports_avx512vl()) { 1450 vlen_enc = Assembler::AVX_512bit; 1451 } 1452 evpsravq(dst, src, shift, vlen_enc); 1453 } else { 1454 vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask())); 1455 vpsrlvq(dst, src, shift, vlen_enc); 1456 vpsrlvq(tmp, tmp, shift, vlen_enc); 1457 vpxor(dst, dst, tmp, vlen_enc); 1458 vpsubq(dst, dst, tmp, vlen_enc); 1459 } 1460 break; 1461 } 1462 case Op_LShiftVL: { 1463 assert(tmp == xnoreg, "not used"); 1464 vpsllvq(dst, src, shift, vlen_enc); 1465 break; 1466 } 1467 case Op_URShiftVL: { 1468 assert(tmp == xnoreg, "not used"); 1469 vpsrlvq(dst, src, shift, vlen_enc); 1470 break; 1471 } 1472 default: assert(false, "%s", NodeClassNames[opcode]); 1473 } 1474 } 1475 1476 // Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst 1477 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) { 1478 assert(opcode == Op_LShiftVB || 1479 opcode == Op_RShiftVB || 1480 opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]); 1481 bool sign = (opcode != Op_URShiftVB); 1482 assert(vector_len == 0, "required"); 1483 vextendbd(sign, dst, src, 1); 1484 vpmovzxbd(vtmp, shift, 1); 1485 varshiftd(opcode, dst, dst, vtmp, 1); 1486 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg); 1487 vextracti128_high(vtmp, dst); 1488 vpackusdw(dst, dst, vtmp, 0); 1489 } 1490 1491 // Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst 1492 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) { 1493 assert(opcode == Op_LShiftVB || 1494 opcode == Op_RShiftVB || 1495 opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]); 1496 bool sign = (opcode != Op_URShiftVB); 1497 int ext_vector_len = vector_len + 1; 1498 vextendbw(sign, dst, src, ext_vector_len); 1499 vpmovzxbw(vtmp, shift, ext_vector_len); 1500 varshiftw(opcode, dst, dst, vtmp, ext_vector_len); 1501 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg); 1502 if (vector_len == 0) { 
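    // 128-bit byte vector: the widened word intermediate occupies 256 bits, so fold the
    // high 128-bit half back down and pack the words into bytes. (The wider else-path
    // additionally needs vpermq(0xD8) because vpackuswb packs within 128-bit lanes.)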
1503 vextracti128_high(vtmp, dst); 1504 vpackuswb(dst, dst, vtmp, vector_len); 1505 } else { 1506 vextracti64x4_high(vtmp, dst); 1507 vpackuswb(dst, dst, vtmp, vector_len); 1508 vpermq(dst, dst, 0xD8, vector_len); 1509 } 1510 } 1511 1512 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) { 1513 switch(typ) { 1514 case T_BYTE: 1515 pinsrb(dst, val, idx); 1516 break; 1517 case T_SHORT: 1518 pinsrw(dst, val, idx); 1519 break; 1520 case T_INT: 1521 pinsrd(dst, val, idx); 1522 break; 1523 case T_LONG: 1524 pinsrq(dst, val, idx); 1525 break; 1526 default: 1527 assert(false,"Should not reach here."); 1528 break; 1529 } 1530 } 1531 1532 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) { 1533 switch(typ) { 1534 case T_BYTE: 1535 vpinsrb(dst, src, val, idx); 1536 break; 1537 case T_SHORT: 1538 vpinsrw(dst, src, val, idx); 1539 break; 1540 case T_INT: 1541 vpinsrd(dst, src, val, idx); 1542 break; 1543 case T_LONG: 1544 vpinsrq(dst, src, val, idx); 1545 break; 1546 default: 1547 assert(false,"Should not reach here."); 1548 break; 1549 } 1550 } 1551 1552 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) { 1553 switch(typ) { 1554 case T_INT: 1555 vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len); 1556 break; 1557 case T_FLOAT: 1558 vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len); 1559 break; 1560 case T_LONG: 1561 vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len); 1562 break; 1563 case T_DOUBLE: 1564 vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len); 1565 break; 1566 default: 1567 assert(false,"Should not reach here."); 1568 break; 1569 } 1570 } 1571 1572 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) { 1573 switch(typ) { 1574 case T_INT: 1575 evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len); 1576 break; 1577 case T_FLOAT: 1578 evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len); 1579 break; 1580 case T_LONG: 1581 evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len); 1582 break; 1583 case T_DOUBLE: 1584 evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len); 1585 break; 1586 default: 1587 assert(false,"Should not reach here."); 1588 break; 1589 } 1590 } 1591 1592 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) { 1593 switch(typ) { 1594 case T_INT: 1595 evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len); 1596 break; 1597 case T_FLOAT: 1598 evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len); 1599 break; 1600 case T_LONG: 1601 evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len); 1602 break; 1603 case T_DOUBLE: 1604 evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len); 1605 break; 1606 default: 1607 assert(false,"Should not reach here."); 1608 break; 1609 } 1610 } 1611 1612 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) { 1613 if (vlen_in_bytes <= 16) { 1614 pxor (dst, dst); 1615 psubb(dst, src); 1616 switch (elem_bt) { 1617 case T_BYTE: /* nothing to do */ break; 1618 case T_SHORT: pmovsxbw(dst, dst); break; 1619 case T_INT: pmovsxbd(dst, 
dst); break; 1620 case T_FLOAT: pmovsxbd(dst, dst); break; 1621 case T_LONG: pmovsxbq(dst, dst); break; 1622 case T_DOUBLE: pmovsxbq(dst, dst); break; 1623 1624 default: assert(false, "%s", type2name(elem_bt)); 1625 } 1626 } else { 1627 assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, ""); 1628 int vlen_enc = vector_length_encoding(vlen_in_bytes); 1629 1630 vpxor (dst, dst, dst, vlen_enc); 1631 vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc); 1632 1633 switch (elem_bt) { 1634 case T_BYTE: /* nothing to do */ break; 1635 case T_SHORT: vpmovsxbw(dst, dst, vlen_enc); break; 1636 case T_INT: vpmovsxbd(dst, dst, vlen_enc); break; 1637 case T_FLOAT: vpmovsxbd(dst, dst, vlen_enc); break; 1638 case T_LONG: vpmovsxbq(dst, dst, vlen_enc); break; 1639 case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break; 1640 1641 default: assert(false, "%s", type2name(elem_bt)); 1642 } 1643 } 1644 } 1645 1646 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) { 1647 if (novlbwdq) { 1648 vpmovsxbd(xtmp, src, vlen_enc); 1649 evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()), 1650 Assembler::eq, true, vlen_enc, noreg); 1651 } else { 1652 vpxor(xtmp, xtmp, xtmp, vlen_enc); 1653 vpsubb(xtmp, xtmp, src, vlen_enc); 1654 evpmovb2m(dst, xtmp, vlen_enc); 1655 } 1656 } 1657 1658 void C2_MacroAssembler::load_vector(XMMRegister dst, Address src, int vlen_in_bytes) { 1659 switch (vlen_in_bytes) { 1660 case 4: movdl(dst, src); break; 1661 case 8: movq(dst, src); break; 1662 case 16: movdqu(dst, src); break; 1663 case 32: vmovdqu(dst, src); break; 1664 case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break; 1665 default: ShouldNotReachHere(); 1666 } 1667 } 1668 1669 void C2_MacroAssembler::load_vector(XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) { 1670 assert(rscratch != noreg || always_reachable(src), "missing"); 1671 1672 if (reachable(src)) { 1673 load_vector(dst, as_Address(src), vlen_in_bytes); 1674 } else { 1675 lea(rscratch, src); 1676 load_vector(dst, Address(rscratch, 0), vlen_in_bytes); 1677 } 1678 } 1679 1680 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) { 1681 int vlen_enc = vector_length_encoding(vlen); 1682 if (VM_Version::supports_avx()) { 1683 if (bt == T_LONG) { 1684 if (VM_Version::supports_avx2()) { 1685 vpbroadcastq(dst, src, vlen_enc); 1686 } else { 1687 vmovddup(dst, src, vlen_enc); 1688 } 1689 } else if (bt == T_DOUBLE) { 1690 if (vlen_enc != Assembler::AVX_128bit) { 1691 vbroadcastsd(dst, src, vlen_enc, noreg); 1692 } else { 1693 vmovddup(dst, src, vlen_enc); 1694 } 1695 } else { 1696 if (VM_Version::supports_avx2() && is_integral_type(bt)) { 1697 vpbroadcastd(dst, src, vlen_enc); 1698 } else { 1699 vbroadcastss(dst, src, vlen_enc); 1700 } 1701 } 1702 } else if (VM_Version::supports_sse3()) { 1703 movddup(dst, src); 1704 } else { 1705 movq(dst, src); 1706 if (vlen == 16) { 1707 punpcklqdq(dst, dst); 1708 } 1709 } 1710 } 1711 1712 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) { 1713 // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64. 
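  // For example (element size -> offset): B: 1 -> 0, S: 2 -> 64, I: 4 -> 128, L: 8 -> 192,
  // F: 4 -> 128 + 128 = 256, D: 8 -> 192 + 128 = 320, matching the B/S/I/L/F/D layout above.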
1714 int offset = exact_log2(type2aelembytes(bt)) << 6; 1715 if (is_floating_point_type(bt)) { 1716 offset += 128; 1717 } 1718 ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset); 1719 load_vector(dst, addr, vlen_in_bytes); 1720 } 1721 1722 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles. 1723 1724 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) { 1725 int vector_len = Assembler::AVX_128bit; 1726 1727 switch (opcode) { 1728 case Op_AndReductionV: pand(dst, src); break; 1729 case Op_OrReductionV: por (dst, src); break; 1730 case Op_XorReductionV: pxor(dst, src); break; 1731 case Op_MinReductionV: 1732 switch (typ) { 1733 case T_BYTE: pminsb(dst, src); break; 1734 case T_SHORT: pminsw(dst, src); break; 1735 case T_INT: pminsd(dst, src); break; 1736 case T_LONG: assert(UseAVX > 2, "required"); 1737 vpminsq(dst, dst, src, Assembler::AVX_128bit); break; 1738 default: assert(false, "wrong type"); 1739 } 1740 break; 1741 case Op_MaxReductionV: 1742 switch (typ) { 1743 case T_BYTE: pmaxsb(dst, src); break; 1744 case T_SHORT: pmaxsw(dst, src); break; 1745 case T_INT: pmaxsd(dst, src); break; 1746 case T_LONG: assert(UseAVX > 2, "required"); 1747 vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break; 1748 default: assert(false, "wrong type"); 1749 } 1750 break; 1751 case Op_AddReductionVF: addss(dst, src); break; 1752 case Op_AddReductionVD: addsd(dst, src); break; 1753 case Op_AddReductionVI: 1754 switch (typ) { 1755 case T_BYTE: paddb(dst, src); break; 1756 case T_SHORT: paddw(dst, src); break; 1757 case T_INT: paddd(dst, src); break; 1758 default: assert(false, "wrong type"); 1759 } 1760 break; 1761 case Op_AddReductionVL: paddq(dst, src); break; 1762 case Op_MulReductionVF: mulss(dst, src); break; 1763 case Op_MulReductionVD: mulsd(dst, src); break; 1764 case Op_MulReductionVI: 1765 switch (typ) { 1766 case T_SHORT: pmullw(dst, src); break; 1767 case T_INT: pmulld(dst, src); break; 1768 default: assert(false, "wrong type"); 1769 } 1770 break; 1771 case Op_MulReductionVL: assert(UseAVX > 2, "required"); 1772 evpmullq(dst, dst, src, vector_len); break; 1773 default: assert(false, "wrong opcode"); 1774 } 1775 } 1776 1777 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 1778 int vector_len = Assembler::AVX_256bit; 1779 1780 switch (opcode) { 1781 case Op_AndReductionV: vpand(dst, src1, src2, vector_len); break; 1782 case Op_OrReductionV: vpor (dst, src1, src2, vector_len); break; 1783 case Op_XorReductionV: vpxor(dst, src1, src2, vector_len); break; 1784 case Op_MinReductionV: 1785 switch (typ) { 1786 case T_BYTE: vpminsb(dst, src1, src2, vector_len); break; 1787 case T_SHORT: vpminsw(dst, src1, src2, vector_len); break; 1788 case T_INT: vpminsd(dst, src1, src2, vector_len); break; 1789 case T_LONG: assert(UseAVX > 2, "required"); 1790 vpminsq(dst, src1, src2, vector_len); break; 1791 default: assert(false, "wrong type"); 1792 } 1793 break; 1794 case Op_MaxReductionV: 1795 switch (typ) { 1796 case T_BYTE: vpmaxsb(dst, src1, src2, vector_len); break; 1797 case T_SHORT: vpmaxsw(dst, src1, src2, vector_len); break; 1798 case T_INT: vpmaxsd(dst, src1, src2, vector_len); break; 1799 case T_LONG: assert(UseAVX > 2, "required"); 1800 vpmaxsq(dst, src1, src2, vector_len); break; 1801 default: assert(false, "wrong type"); 1802 } 1803 break; 1804 case Op_AddReductionVI: 1805 switch (typ) { 1806 case T_BYTE: vpaddb(dst, src1, 
src2, vector_len); break; 1807 case T_SHORT: vpaddw(dst, src1, src2, vector_len); break; 1808 case T_INT: vpaddd(dst, src1, src2, vector_len); break; 1809 default: assert(false, "wrong type"); 1810 } 1811 break; 1812 case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break; 1813 case Op_MulReductionVI: 1814 switch (typ) { 1815 case T_SHORT: vpmullw(dst, src1, src2, vector_len); break; 1816 case T_INT: vpmulld(dst, src1, src2, vector_len); break; 1817 default: assert(false, "wrong type"); 1818 } 1819 break; 1820 case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break; 1821 default: assert(false, "wrong opcode"); 1822 } 1823 } 1824 1825 void C2_MacroAssembler::reduce_fp(int opcode, int vlen, 1826 XMMRegister dst, XMMRegister src, 1827 XMMRegister vtmp1, XMMRegister vtmp2) { 1828 switch (opcode) { 1829 case Op_AddReductionVF: 1830 case Op_MulReductionVF: 1831 reduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 1832 break; 1833 1834 case Op_AddReductionVD: 1835 case Op_MulReductionVD: 1836 reduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 1837 break; 1838 1839 default: assert(false, "wrong opcode"); 1840 } 1841 } 1842 1843 void C2_MacroAssembler::reduceB(int opcode, int vlen, 1844 Register dst, Register src1, XMMRegister src2, 1845 XMMRegister vtmp1, XMMRegister vtmp2) { 1846 switch (vlen) { 1847 case 8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1848 case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1849 case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1850 case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1851 1852 default: assert(false, "wrong vector length"); 1853 } 1854 } 1855 1856 void C2_MacroAssembler::mulreduceB(int opcode, int vlen, 1857 Register dst, Register src1, XMMRegister src2, 1858 XMMRegister vtmp1, XMMRegister vtmp2) { 1859 switch (vlen) { 1860 case 8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1861 case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1862 case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1863 case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1864 1865 default: assert(false, "wrong vector length"); 1866 } 1867 } 1868 1869 void C2_MacroAssembler::reduceS(int opcode, int vlen, 1870 Register dst, Register src1, XMMRegister src2, 1871 XMMRegister vtmp1, XMMRegister vtmp2) { 1872 switch (vlen) { 1873 case 4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1874 case 8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1875 case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1876 case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1877 1878 default: assert(false, "wrong vector length"); 1879 } 1880 } 1881 1882 void C2_MacroAssembler::reduceI(int opcode, int vlen, 1883 Register dst, Register src1, XMMRegister src2, 1884 XMMRegister vtmp1, XMMRegister vtmp2) { 1885 switch (vlen) { 1886 case 2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1887 case 4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1888 case 8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1889 case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1890 1891 default: assert(false, "wrong vector length"); 1892 } 1893 } 1894 1895 #ifdef _LP64 1896 void C2_MacroAssembler::reduceL(int opcode, int vlen, 1897 Register dst, Register src1, XMMRegister src2, 1898 XMMRegister vtmp1, XMMRegister vtmp2) { 1899 switch (vlen) { 1900 case 2: reduce2L(opcode, dst, src1, src2, vtmp1, 
vtmp2); break; 1901 case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1902 case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1903 1904 default: assert(false, "wrong vector length"); 1905 } 1906 } 1907 #endif // _LP64 1908 1909 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 1910 switch (vlen) { 1911 case 2: 1912 assert(vtmp2 == xnoreg, ""); 1913 reduce2F(opcode, dst, src, vtmp1); 1914 break; 1915 case 4: 1916 assert(vtmp2 == xnoreg, ""); 1917 reduce4F(opcode, dst, src, vtmp1); 1918 break; 1919 case 8: 1920 reduce8F(opcode, dst, src, vtmp1, vtmp2); 1921 break; 1922 case 16: 1923 reduce16F(opcode, dst, src, vtmp1, vtmp2); 1924 break; 1925 default: assert(false, "wrong vector length"); 1926 } 1927 } 1928 1929 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 1930 switch (vlen) { 1931 case 2: 1932 assert(vtmp2 == xnoreg, ""); 1933 reduce2D(opcode, dst, src, vtmp1); 1934 break; 1935 case 4: 1936 reduce4D(opcode, dst, src, vtmp1, vtmp2); 1937 break; 1938 case 8: 1939 reduce8D(opcode, dst, src, vtmp1, vtmp2); 1940 break; 1941 default: assert(false, "wrong vector length"); 1942 } 1943 } 1944 1945 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1946 if (opcode == Op_AddReductionVI) { 1947 if (vtmp1 != src2) { 1948 movdqu(vtmp1, src2); 1949 } 1950 phaddd(vtmp1, vtmp1); 1951 } else { 1952 pshufd(vtmp1, src2, 0x1); 1953 reduce_operation_128(T_INT, opcode, vtmp1, src2); 1954 } 1955 movdl(vtmp2, src1); 1956 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 1957 movdl(dst, vtmp1); 1958 } 1959 1960 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1961 if (opcode == Op_AddReductionVI) { 1962 if (vtmp1 != src2) { 1963 movdqu(vtmp1, src2); 1964 } 1965 phaddd(vtmp1, src2); 1966 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1967 } else { 1968 pshufd(vtmp2, src2, 0xE); 1969 reduce_operation_128(T_INT, opcode, vtmp2, src2); 1970 reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1971 } 1972 } 1973 1974 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1975 if (opcode == Op_AddReductionVI) { 1976 vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit); 1977 vextracti128_high(vtmp2, vtmp1); 1978 vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit); 1979 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1980 } else { 1981 vextracti128_high(vtmp1, src2); 1982 reduce_operation_128(T_INT, opcode, vtmp1, src2); 1983 reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1984 } 1985 } 1986 1987 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1988 vextracti64x4_high(vtmp2, src2); 1989 reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2); 1990 reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1991 } 1992 1993 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1994 pshufd(vtmp2, src2, 0x1); 1995 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 1996 movdqu(vtmp1, vtmp2); 1997 psrldq(vtmp1, 2); 1998 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 1999 movdqu(vtmp2, vtmp1); 2000 psrldq(vtmp2, 1); 2001 reduce_operation_128(T_BYTE, 
opcode, vtmp1, vtmp2); 2002 movdl(vtmp2, src1); 2003 pmovsxbd(vtmp1, vtmp1); 2004 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2005 pextrb(dst, vtmp1, 0x0); 2006 movsbl(dst, dst); 2007 } 2008 2009 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2010 pshufd(vtmp1, src2, 0xE); 2011 reduce_operation_128(T_BYTE, opcode, vtmp1, src2); 2012 reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2013 } 2014 2015 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2016 vextracti128_high(vtmp2, src2); 2017 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 2018 reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2019 } 2020 2021 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2022 vextracti64x4_high(vtmp1, src2); 2023 reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2); 2024 reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2025 } 2026 2027 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2028 pmovsxbw(vtmp2, src2); 2029 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2030 } 2031 2032 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2033 if (UseAVX > 1) { 2034 int vector_len = Assembler::AVX_256bit; 2035 vpmovsxbw(vtmp1, src2, vector_len); 2036 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2037 } else { 2038 pmovsxbw(vtmp2, src2); 2039 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2040 pshufd(vtmp2, src2, 0x1); 2041 pmovsxbw(vtmp2, src2); 2042 reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2043 } 2044 } 2045 2046 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2047 if (UseAVX > 2 && VM_Version::supports_avx512bw()) { 2048 int vector_len = Assembler::AVX_512bit; 2049 vpmovsxbw(vtmp1, src2, vector_len); 2050 reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2051 } else { 2052 assert(UseAVX >= 2,"Should not reach here."); 2053 mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); 2054 vextracti128_high(vtmp2, src2); 2055 mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2056 } 2057 } 2058 2059 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2060 mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); 2061 vextracti64x4_high(vtmp2, src2); 2062 mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2063 } 2064 2065 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2066 if (opcode == Op_AddReductionVI) { 2067 if (vtmp1 != src2) { 2068 movdqu(vtmp1, src2); 2069 } 2070 phaddw(vtmp1, vtmp1); 2071 phaddw(vtmp1, vtmp1); 2072 } else { 2073 pshufd(vtmp2, src2, 0x1); 2074 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2075 movdqu(vtmp1, vtmp2); 2076 psrldq(vtmp1, 2); 2077 reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2); 2078 } 2079 movdl(vtmp2, src1); 2080 pmovsxwd(vtmp1, vtmp1); 2081 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2082 pextrw(dst, vtmp1, 0x0); 2083 movswl(dst, dst); 2084 } 2085 2086 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, 
XMMRegister vtmp2) { 2087 if (opcode == Op_AddReductionVI) { 2088 if (vtmp1 != src2) { 2089 movdqu(vtmp1, src2); 2090 } 2091 phaddw(vtmp1, src2); 2092 } else { 2093 pshufd(vtmp1, src2, 0xE); 2094 reduce_operation_128(T_SHORT, opcode, vtmp1, src2); 2095 } 2096 reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2097 } 2098 2099 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2100 if (opcode == Op_AddReductionVI) { 2101 int vector_len = Assembler::AVX_256bit; 2102 vphaddw(vtmp2, src2, src2, vector_len); 2103 vpermq(vtmp2, vtmp2, 0xD8, vector_len); 2104 } else { 2105 vextracti128_high(vtmp2, src2); 2106 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2107 } 2108 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2109 } 2110 2111 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2112 int vector_len = Assembler::AVX_256bit; 2113 vextracti64x4_high(vtmp1, src2); 2114 reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2); 2115 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2116 } 2117 2118 #ifdef _LP64 2119 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2120 pshufd(vtmp2, src2, 0xE); 2121 reduce_operation_128(T_LONG, opcode, vtmp2, src2); 2122 movdq(vtmp1, src1); 2123 reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2); 2124 movdq(dst, vtmp1); 2125 } 2126 2127 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2128 vextracti128_high(vtmp1, src2); 2129 reduce_operation_128(T_LONG, opcode, vtmp1, src2); 2130 reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2131 } 2132 2133 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2134 vextracti64x4_high(vtmp2, src2); 2135 reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2); 2136 reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2137 } 2138 2139 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) { 2140 mov64(temp, -1L); 2141 bzhiq(temp, temp, len); 2142 kmovql(dst, temp); 2143 } 2144 #endif // _LP64 2145 2146 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2147 reduce_operation_128(T_FLOAT, opcode, dst, src); 2148 pshufd(vtmp, src, 0x1); 2149 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2150 } 2151 2152 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2153 reduce2F(opcode, dst, src, vtmp); 2154 pshufd(vtmp, src, 0x2); 2155 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2156 pshufd(vtmp, src, 0x3); 2157 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2158 } 2159 2160 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2161 reduce4F(opcode, dst, src, vtmp2); 2162 vextractf128_high(vtmp2, src); 2163 reduce4F(opcode, dst, vtmp2, vtmp1); 2164 } 2165 2166 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2167 reduce8F(opcode, dst, src, vtmp1, vtmp2); 2168 vextracti64x4_high(vtmp1, src); 2169 reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2); 2170 } 2171 2172 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2173 
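  // Fold the two doubles: combine dst with src element 0, then bring src element 1 down
  // with pshufd(0xE) and combine again, leaving the scalar result in dst element 0.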
reduce_operation_128(T_DOUBLE, opcode, dst, src); 2174 pshufd(vtmp, src, 0xE); 2175 reduce_operation_128(T_DOUBLE, opcode, dst, vtmp); 2176 } 2177 2178 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2179 reduce2D(opcode, dst, src, vtmp2); 2180 vextractf128_high(vtmp2, src); 2181 reduce2D(opcode, dst, vtmp2, vtmp1); 2182 } 2183 2184 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2185 reduce4D(opcode, dst, src, vtmp1, vtmp2); 2186 vextracti64x4_high(vtmp1, src); 2187 reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2); 2188 } 2189 2190 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) { 2191 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2192 } 2193 2194 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) { 2195 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2196 } 2197 2198 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask, 2199 int vec_enc) { 2200 switch(elem_bt) { 2201 case T_INT: 2202 case T_FLOAT: 2203 vmaskmovps(dst, src, mask, vec_enc); 2204 break; 2205 case T_LONG: 2206 case T_DOUBLE: 2207 vmaskmovpd(dst, src, mask, vec_enc); 2208 break; 2209 default: 2210 fatal("Unsupported type %s", type2name(elem_bt)); 2211 break; 2212 } 2213 } 2214 2215 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask, 2216 int vec_enc) { 2217 switch(elem_bt) { 2218 case T_INT: 2219 case T_FLOAT: 2220 vmaskmovps(dst, src, mask, vec_enc); 2221 break; 2222 case T_LONG: 2223 case T_DOUBLE: 2224 vmaskmovpd(dst, src, mask, vec_enc); 2225 break; 2226 default: 2227 fatal("Unsupported type %s", type2name(elem_bt)); 2228 break; 2229 } 2230 } 2231 2232 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid, 2233 XMMRegister dst, XMMRegister src, 2234 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2235 XMMRegister xmm_0, XMMRegister xmm_1) { 2236 int permconst[] = {1, 14}; 2237 XMMRegister wsrc = src; 2238 XMMRegister wdst = xmm_0; 2239 XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1; 2240 2241 int vlen_enc = Assembler::AVX_128bit; 2242 if (vlen == 16) { 2243 vlen_enc = Assembler::AVX_256bit; 2244 } 2245 2246 for (int i = log2(vlen) - 1; i >=0; i--) { 2247 if (i == 0 && !is_dst_valid) { 2248 wdst = dst; 2249 } 2250 if (i == 3) { 2251 vextracti64x4_high(wtmp, wsrc); 2252 } else if (i == 2) { 2253 vextracti128_high(wtmp, wsrc); 2254 } else { // i = [0,1] 2255 vpermilps(wtmp, wsrc, permconst[i], vlen_enc); 2256 } 2257 vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2258 wsrc = wdst; 2259 vlen_enc = Assembler::AVX_128bit; 2260 } 2261 if (is_dst_valid) { 2262 vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2263 } 2264 } 2265 2266 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src, 2267 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2268 XMMRegister xmm_0, XMMRegister xmm_1) { 2269 XMMRegister wsrc = src; 2270 XMMRegister wdst = xmm_0; 2271 XMMRegister wtmp = (xmm_1 == xnoreg) ? 
xmm_0: xmm_1; 2272 int vlen_enc = Assembler::AVX_128bit; 2273 if (vlen == 8) { 2274 vlen_enc = Assembler::AVX_256bit; 2275 } 2276 for (int i = log2(vlen) - 1; i >=0; i--) { 2277 if (i == 0 && !is_dst_valid) { 2278 wdst = dst; 2279 } 2280 if (i == 1) { 2281 vextracti128_high(wtmp, wsrc); 2282 } else if (i == 2) { 2283 vextracti64x4_high(wtmp, wsrc); 2284 } else { 2285 assert(i == 0, "%d", i); 2286 vpermilpd(wtmp, wsrc, 1, vlen_enc); 2287 } 2288 vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2289 wsrc = wdst; 2290 vlen_enc = Assembler::AVX_128bit; 2291 } 2292 if (is_dst_valid) { 2293 vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2294 } 2295 } 2296 2297 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) { 2298 switch (bt) { 2299 case T_BYTE: pextrb(dst, src, idx); break; 2300 case T_SHORT: pextrw(dst, src, idx); break; 2301 case T_INT: pextrd(dst, src, idx); break; 2302 case T_LONG: pextrq(dst, src, idx); break; 2303 2304 default: 2305 assert(false,"Should not reach here."); 2306 break; 2307 } 2308 } 2309 2310 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) { 2311 int esize = type2aelembytes(typ); 2312 int elem_per_lane = 16/esize; 2313 int lane = elemindex / elem_per_lane; 2314 int eindex = elemindex % elem_per_lane; 2315 2316 if (lane >= 2) { 2317 assert(UseAVX > 2, "required"); 2318 vextractf32x4(dst, src, lane & 3); 2319 return dst; 2320 } else if (lane > 0) { 2321 assert(UseAVX > 0, "required"); 2322 vextractf128(dst, src, lane); 2323 return dst; 2324 } else { 2325 return src; 2326 } 2327 } 2328 2329 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) { 2330 int esize = type2aelembytes(typ); 2331 int elem_per_lane = 16/esize; 2332 int eindex = elemindex % elem_per_lane; 2333 assert(is_integral_type(typ),"required"); 2334 2335 if (eindex == 0) { 2336 if (typ == T_LONG) { 2337 movq(dst, src); 2338 } else { 2339 movdl(dst, src); 2340 if (typ == T_BYTE) 2341 movsbl(dst, dst); 2342 else if (typ == T_SHORT) 2343 movswl(dst, dst); 2344 } 2345 } else { 2346 extract(typ, dst, src, eindex); 2347 } 2348 } 2349 2350 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) { 2351 int esize = type2aelembytes(typ); 2352 int elem_per_lane = 16/esize; 2353 int eindex = elemindex % elem_per_lane; 2354 assert((typ == T_FLOAT || typ == T_DOUBLE),"required"); 2355 2356 if (eindex == 0) { 2357 movq(dst, src); 2358 } else { 2359 if (typ == T_FLOAT) { 2360 if (UseAVX == 0) { 2361 movdqu(dst, src); 2362 shufps(dst, dst, eindex); 2363 } else { 2364 vshufps(dst, src, src, eindex, Assembler::AVX_128bit); 2365 } 2366 } else { 2367 if (UseAVX == 0) { 2368 movdqu(dst, src); 2369 psrldq(dst, eindex*esize); 2370 } else { 2371 vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit); 2372 } 2373 movq(dst, dst); 2374 } 2375 } 2376 // Zero upper bits 2377 if (typ == T_FLOAT) { 2378 if (UseAVX == 0) { 2379 assert(vtmp != xnoreg, "required."); 2380 movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg); 2381 pand(dst, vtmp); 2382 } else { 2383 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg); 2384 } 2385 } 2386 } 2387 2388 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) { 2389 switch(typ) { 2390 
case T_BYTE: 2391 case T_BOOLEAN: 2392 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2393 break; 2394 case T_SHORT: 2395 case T_CHAR: 2396 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2397 break; 2398 case T_INT: 2399 case T_FLOAT: 2400 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2401 break; 2402 case T_LONG: 2403 case T_DOUBLE: 2404 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2405 break; 2406 default: 2407 assert(false,"Should not reach here."); 2408 break; 2409 } 2410 } 2411 2412 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) { 2413 assert(rscratch != noreg || always_reachable(src2), "missing"); 2414 2415 switch(typ) { 2416 case T_BOOLEAN: 2417 case T_BYTE: 2418 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2419 break; 2420 case T_CHAR: 2421 case T_SHORT: 2422 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2423 break; 2424 case T_INT: 2425 case T_FLOAT: 2426 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2427 break; 2428 case T_LONG: 2429 case T_DOUBLE: 2430 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2431 break; 2432 default: 2433 assert(false,"Should not reach here."); 2434 break; 2435 } 2436 } 2437 2438 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) { 2439 switch(typ) { 2440 case T_BYTE: 2441 evpblendmb(dst, kmask, src1, src2, merge, vector_len); 2442 break; 2443 case T_SHORT: 2444 evpblendmw(dst, kmask, src1, src2, merge, vector_len); 2445 break; 2446 case T_INT: 2447 case T_FLOAT: 2448 evpblendmd(dst, kmask, src1, src2, merge, vector_len); 2449 break; 2450 case T_LONG: 2451 case T_DOUBLE: 2452 evpblendmq(dst, kmask, src1, src2, merge, vector_len); 2453 break; 2454 default: 2455 assert(false,"Should not reach here."); 2456 break; 2457 } 2458 } 2459 2460 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) { 2461 assert(vlen_in_bytes <= 32, ""); 2462 int esize = type2aelembytes(bt); 2463 if (vlen_in_bytes == 32) { 2464 assert(vtmp == xnoreg, "required."); 2465 if (esize >= 4) { 2466 vtestps(src1, src2, AVX_256bit); 2467 } else { 2468 vptest(src1, src2, AVX_256bit); 2469 } 2470 return; 2471 } 2472 if (vlen_in_bytes < 16) { 2473 // Duplicate the lower part to fill the whole register, 2474 // Don't need to do so for src2 2475 assert(vtmp != xnoreg, "required"); 2476 int shuffle_imm = (vlen_in_bytes == 4) ? 
0x00 : 0x04; 2477 pshufd(vtmp, src1, shuffle_imm); 2478 } else { 2479 assert(vtmp == xnoreg, "required"); 2480 vtmp = src1; 2481 } 2482 if (esize >= 4 && VM_Version::supports_avx()) { 2483 vtestps(vtmp, src2, AVX_128bit); 2484 } else { 2485 ptest(vtmp, src2); 2486 } 2487 } 2488 2489 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) { 2490 assert(UseAVX >= 2, "required"); 2491 #ifdef ASSERT 2492 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2493 bool is_bw_supported = VM_Version::supports_avx512bw(); 2494 if (is_bw && !is_bw_supported) { 2495 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2496 assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16), 2497 "XMM register should be 0-15"); 2498 } 2499 #endif // ASSERT 2500 switch (elem_bt) { 2501 case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return; 2502 case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return; 2503 case T_INT: vpaddd(dst, src1, src2, vlen_enc); return; 2504 case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return; 2505 case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return; 2506 case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return; 2507 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2508 } 2509 } 2510 2511 #ifdef _LP64 2512 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) { 2513 assert(UseAVX >= 2, "required"); 2514 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2515 bool is_vl = vlen_enc != Assembler::AVX_512bit; 2516 if ((UseAVX > 2) && 2517 (!is_bw || VM_Version::supports_avx512bw()) && 2518 (!is_vl || VM_Version::supports_avx512vl())) { 2519 switch (elem_bt) { 2520 case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return; 2521 case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return; 2522 case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return; 2523 case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return; 2524 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2525 } 2526 } else { 2527 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2528 assert((dst->encoding() < 16),"XMM register should be 0-15"); 2529 switch (elem_bt) { 2530 case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return; 2531 case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return; 2532 case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return; 2533 case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return; 2534 case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return; 2535 case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return; 2536 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2537 } 2538 } 2539 } 2540 #endif 2541 2542 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) { 2543 switch (to_elem_bt) { 2544 case T_SHORT: 2545 vpmovsxbw(dst, src, vlen_enc); 2546 break; 2547 case T_INT: 2548 vpmovsxbd(dst, src, vlen_enc); 2549 break; 2550 case T_FLOAT: 2551 vpmovsxbd(dst, src, vlen_enc); 2552 vcvtdq2ps(dst, dst, vlen_enc); 2553 break; 2554 case T_LONG: 2555 vpmovsxbq(dst, src, vlen_enc); 2556 break; 2557 case T_DOUBLE: { 2558 int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? 
          Assembler::AVX_256bit : Assembler::AVX_128bit;
      vpmovsxbd(dst, src, mid_vlen_enc);
      vcvtdq2pd(dst, dst, vlen_enc);
      break;
    }
    default:
      fatal("Unsupported type %s", type2name(to_elem_bt));
      break;
  }
}

//-------------------------------------------------------------------------------------------

// IndexOf for constant substrings with size >= 8 chars
// which don't need to be loaded through stack.
void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
                                         Register cnt1, Register cnt2,
                                         int int_cnt2, Register result,
                                         XMMRegister vec, Register tmp,
                                         int ae) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");

  // This method uses the pcmpestri instruction with bound registers
  //   inputs:
  //     xmm - substring
  //     rax - substring length (elements count)
  //     mem - scanned string
  //     rdx - string length (elements count)
  //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
  //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
  //   outputs:
  //     rcx - matched index in string
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
  int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
  int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
  Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
  Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;

  Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
        RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
        MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;
  assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");

  // Load substring.
  if (ae == StrIntrinsicNode::UL) {
    pmovzxbw(vec, Address(str2, 0));
  } else {
    movdqu(vec, Address(str2, 0));
  }
  movl(cnt2, int_cnt2);
  movptr(result, str1); // string addr

  if (int_cnt2 > stride) {
    jmpb(SCAN_TO_SUBSTR);

    // Reload substr for rescan, this code
    // is executed only for large substrings (> 8 chars)
    bind(RELOAD_SUBSTR);
    if (ae == StrIntrinsicNode::UL) {
      pmovzxbw(vec, Address(str2, 0));
    } else {
      movdqu(vec, Address(str2, 0));
    }
    negptr(cnt2); // Jumped here with negative cnt2, convert to positive

    bind(RELOAD_STR);
    // We came here after the beginning of the substring was
    // matched but the rest of it was not so we need to search
    // again. Start from the next element after the previous match.

    // cnt2 is the number of remaining substring elements and
    // cnt1 is the number of remaining string elements when the compare failed.
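    // Arithmetic illustration (hypothetical values): with int_cnt2 == 10 and cnt2 == 4 left
    // unmatched when the compare failed, the code below grows cnt1 by int_cnt2 - cnt2 == 6,
    // i.e. by the part of the substring that had already been matched, before rescanning.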
    // Restored cnt1 = cnt1 - cnt2 + int_cnt2
    subl(cnt1, cnt2);
    addl(cnt1, int_cnt2);
    movl(cnt2, int_cnt2); // Now restore cnt2

    decrementl(cnt1);     // Shift to next element
    cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND); // Left less than substring

    addptr(result, (1<<scale1));

  } // (int_cnt2 > 8)

  // Scan string for start of substr in 16-byte vectors
  bind(SCAN_TO_SUBSTR);
  pcmpestri(vec, Address(result, 0), mode);
  jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
  subl(cnt1, stride);
  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
  cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
  addptr(result, 16);
  jmpb(SCAN_TO_SUBSTR);

  // Found a potential substr
  bind(FOUND_CANDIDATE);
  // Matched whole vector if first element matched (tmp(rcx) == 0).
  if (int_cnt2 == stride) {
    jccb(Assembler::overflow, RET_FOUND);    // OF == 1
  } else { // int_cnt2 > 8
    jccb(Assembler::overflow, FOUND_SUBSTR);
  }
  // After pcmpestri tmp(rcx) contains matched element index
  // Compute start addr of substr
  lea(result, Address(result, tmp, scale1));

  // Make sure string is still long enough
  subl(cnt1, tmp);
  cmpl(cnt1, cnt2);
  if (int_cnt2 == stride) {
    jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
  } else { // int_cnt2 > 8
    jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
  }
  // Left less than substring.

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmp(EXIT);

  if (int_cnt2 > stride) {
    // This code is optimized for the case when the whole substring
    // is matched if its head is matched.
    bind(MATCH_SUBSTR_HEAD);
    pcmpestri(vec, Address(result, 0), mode);
    // Reload only the string if it does not match
    jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0

    Label CONT_SCAN_SUBSTR;
    // Compare the rest of substring (> 8 chars).
    bind(FOUND_SUBSTR);
    // First 8 chars are already matched.
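    // Switch cnt2 to a negative index (cnt2 = stride - int_cnt2) so the rest of the
    // substring is compared by stepping cnt2 toward zero in stride-sized chunks;
    // Address(str2, cnt2, scale2, tail_off) below then addresses backward from the
    // substring tail.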
2698 negptr(cnt2); 2699 addptr(cnt2, stride); 2700 2701 bind(SCAN_SUBSTR); 2702 subl(cnt1, stride); 2703 cmpl(cnt2, -stride); // Do not read beyond substring 2704 jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR); 2705 // Back-up strings to avoid reading beyond substring: 2706 // cnt1 = cnt1 - cnt2 + 8 2707 addl(cnt1, cnt2); // cnt2 is negative 2708 addl(cnt1, stride); 2709 movl(cnt2, stride); negptr(cnt2); 2710 bind(CONT_SCAN_SUBSTR); 2711 if (int_cnt2 < (int)G) { 2712 int tail_off1 = int_cnt2<<scale1; 2713 int tail_off2 = int_cnt2<<scale2; 2714 if (ae == StrIntrinsicNode::UL) { 2715 pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2)); 2716 } else { 2717 movdqu(vec, Address(str2, cnt2, scale2, tail_off2)); 2718 } 2719 pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode); 2720 } else { 2721 // calculate index in register to avoid integer overflow (int_cnt2*2) 2722 movl(tmp, int_cnt2); 2723 addptr(tmp, cnt2); 2724 if (ae == StrIntrinsicNode::UL) { 2725 pmovzxbw(vec, Address(str2, tmp, scale2, 0)); 2726 } else { 2727 movdqu(vec, Address(str2, tmp, scale2, 0)); 2728 } 2729 pcmpestri(vec, Address(result, tmp, scale1, 0), mode); 2730 } 2731 // Need to reload strings pointers if not matched whole vector 2732 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 2733 addptr(cnt2, stride); 2734 jcc(Assembler::negative, SCAN_SUBSTR); 2735 // Fall through if found full substring 2736 2737 } // (int_cnt2 > 8) 2738 2739 bind(RET_FOUND); 2740 // Found result if we matched full small substring. 2741 // Compute substr offset 2742 subptr(result, str1); 2743 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 2744 shrl(result, 1); // index 2745 } 2746 bind(EXIT); 2747 2748 } // string_indexofC8 2749 2750 // Small strings are loaded through stack if they cross page boundary. 2751 void C2_MacroAssembler::string_indexof(Register str1, Register str2, 2752 Register cnt1, Register cnt2, 2753 int int_cnt2, Register result, 2754 XMMRegister vec, Register tmp, 2755 int ae) { 2756 ShortBranchVerifier sbv(this); 2757 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2758 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2759 2760 // 2761 // int_cnt2 is length of small (< 8 chars) constant substring 2762 // or (-1) for non constant substring in which case its length 2763 // is in cnt2 register. 2764 // 2765 // Note, inline_string_indexOf() generates checks: 2766 // if (substr.count > string.count) return -1; 2767 // if (substr.count == 0) return 0; 2768 // 2769 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2770 assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0"); 2771 // This method uses the pcmpestri instruction with bound registers 2772 // inputs: 2773 // xmm - substring 2774 // rax - substring length (elements count) 2775 // mem - scanned string 2776 // rdx - string length (elements count) 2777 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2778 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2779 // outputs: 2780 // rcx - matched index in string 2781 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2782 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2783 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2784 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? 
Address::times_1 : scale1; 2785 2786 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR, 2787 RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR, 2788 FOUND_CANDIDATE; 2789 2790 { //======================================================== 2791 // We don't know where these strings are located 2792 // and we can't read beyond them. Load them through stack. 2793 Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR; 2794 2795 movptr(tmp, rsp); // save old SP 2796 2797 if (int_cnt2 > 0) { // small (< 8 chars) constant substring 2798 if (int_cnt2 == (1>>scale2)) { // One byte 2799 assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding"); 2800 load_unsigned_byte(result, Address(str2, 0)); 2801 movdl(vec, result); // move 32 bits 2802 } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) { // Three bytes 2803 // Not enough header space in 32-bit VM: 12+3 = 15. 2804 movl(result, Address(str2, -1)); 2805 shrl(result, 8); 2806 movdl(vec, result); // move 32 bits 2807 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) { // One char 2808 load_unsigned_short(result, Address(str2, 0)); 2809 movdl(vec, result); // move 32 bits 2810 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars 2811 movdl(vec, Address(str2, 0)); // move 32 bits 2812 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars 2813 movq(vec, Address(str2, 0)); // move 64 bits 2814 } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7}) 2815 // Array header size is 12 bytes in 32-bit VM 2816 // + 6 bytes for 3 chars == 18 bytes, 2817 // enough space to load vec and shift. 2818 assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity"); 2819 if (ae == StrIntrinsicNode::UL) { 2820 int tail_off = int_cnt2-8; 2821 pmovzxbw(vec, Address(str2, tail_off)); 2822 psrldq(vec, -2*tail_off); 2823 } 2824 else { 2825 int tail_off = int_cnt2*(1<<scale2); 2826 movdqu(vec, Address(str2, tail_off-16)); 2827 psrldq(vec, 16-tail_off); 2828 } 2829 } 2830 } else { // not constant substring 2831 cmpl(cnt2, stride); 2832 jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough 2833 2834 // We can read beyond string if srt+16 does not cross page boundary 2835 // since heaps are aligned and mapped by pages. 2836 assert(os::vm_page_size() < (int)G, "default page should be small"); 2837 movl(result, str2); // We need only low 32 bits 2838 andl(result, ((int)os::vm_page_size()-1)); 2839 cmpl(result, ((int)os::vm_page_size()-16)); 2840 jccb(Assembler::belowEqual, CHECK_STR); 2841 2842 // Move small strings to stack to allow load 16 bytes into vec. 2843 subptr(rsp, 16); 2844 int stk_offset = wordSize-(1<<scale2); 2845 push(cnt2); 2846 2847 bind(COPY_SUBSTR); 2848 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) { 2849 load_unsigned_byte(result, Address(str2, cnt2, scale2, -1)); 2850 movb(Address(rsp, cnt2, scale2, stk_offset), result); 2851 } else if (ae == StrIntrinsicNode::UU) { 2852 load_unsigned_short(result, Address(str2, cnt2, scale2, -2)); 2853 movw(Address(rsp, cnt2, scale2, stk_offset), result); 2854 } 2855 decrement(cnt2); 2856 jccb(Assembler::notZero, COPY_SUBSTR); 2857 2858 pop(cnt2); 2859 movptr(str2, rsp); // New substring address 2860 } // non constant 2861 2862 bind(CHECK_STR); 2863 cmpl(cnt1, stride); 2864 jccb(Assembler::aboveEqual, BIG_STRINGS); 2865 2866 // Check cross page boundary. 
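    // A 16-byte load from str1 is safe only if it cannot run into the next (possibly
    // unmapped) page: assuming e.g. 4 KiB pages, in-page offsets 0..4080 are fine, while
    // 4081..4095 would cross the boundary and must take the copy-to-stack path below.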
2867 movl(result, str1); // We need only low 32 bits 2868 andl(result, ((int)os::vm_page_size()-1)); 2869 cmpl(result, ((int)os::vm_page_size()-16)); 2870 jccb(Assembler::belowEqual, BIG_STRINGS); 2871 2872 subptr(rsp, 16); 2873 int stk_offset = -(1<<scale1); 2874 if (int_cnt2 < 0) { // not constant 2875 push(cnt2); 2876 stk_offset += wordSize; 2877 } 2878 movl(cnt2, cnt1); 2879 2880 bind(COPY_STR); 2881 if (ae == StrIntrinsicNode::LL) { 2882 load_unsigned_byte(result, Address(str1, cnt2, scale1, -1)); 2883 movb(Address(rsp, cnt2, scale1, stk_offset), result); 2884 } else { 2885 load_unsigned_short(result, Address(str1, cnt2, scale1, -2)); 2886 movw(Address(rsp, cnt2, scale1, stk_offset), result); 2887 } 2888 decrement(cnt2); 2889 jccb(Assembler::notZero, COPY_STR); 2890 2891 if (int_cnt2 < 0) { // not constant 2892 pop(cnt2); 2893 } 2894 movptr(str1, rsp); // New string address 2895 2896 bind(BIG_STRINGS); 2897 // Load substring. 2898 if (int_cnt2 < 0) { // -1 2899 if (ae == StrIntrinsicNode::UL) { 2900 pmovzxbw(vec, Address(str2, 0)); 2901 } else { 2902 movdqu(vec, Address(str2, 0)); 2903 } 2904 push(cnt2); // substr count 2905 push(str2); // substr addr 2906 push(str1); // string addr 2907 } else { 2908 // Small (< 8 chars) constant substrings are loaded already. 2909 movl(cnt2, int_cnt2); 2910 } 2911 push(tmp); // original SP 2912 2913 } // Finished loading 2914 2915 //======================================================== 2916 // Start search 2917 // 2918 2919 movptr(result, str1); // string addr 2920 2921 if (int_cnt2 < 0) { // Only for non constant substring 2922 jmpb(SCAN_TO_SUBSTR); 2923 2924 // SP saved at sp+0 2925 // String saved at sp+1*wordSize 2926 // Substr saved at sp+2*wordSize 2927 // Substr count saved at sp+3*wordSize 2928 2929 // Reload substr for rescan, this code 2930 // is executed only for large substrings (> 8 chars) 2931 bind(RELOAD_SUBSTR); 2932 movptr(str2, Address(rsp, 2*wordSize)); 2933 movl(cnt2, Address(rsp, 3*wordSize)); 2934 if (ae == StrIntrinsicNode::UL) { 2935 pmovzxbw(vec, Address(str2, 0)); 2936 } else { 2937 movdqu(vec, Address(str2, 0)); 2938 } 2939 // We came here after the beginning of the substring was 2940 // matched but the rest of it was not so we need to search 2941 // again. Start from the next element after the previous match. 2942 subptr(str1, result); // Restore counter 2943 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 2944 shrl(str1, 1); 2945 } 2946 addl(cnt1, str1); 2947 decrementl(cnt1); // Shift to next element 2948 cmpl(cnt1, cnt2); 2949 jcc(Assembler::negative, RET_NOT_FOUND); // Left less then substring 2950 2951 addptr(result, (1<<scale1)); 2952 } // non constant 2953 2954 // Scan string for start of substr in 16-byte vectors 2955 bind(SCAN_TO_SUBSTR); 2956 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2957 pcmpestri(vec, Address(result, 0), mode); 2958 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 2959 subl(cnt1, stride); 2960 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 2961 cmpl(cnt1, cnt2); 2962 jccb(Assembler::negative, RET_NOT_FOUND); // Left less then substring 2963 addptr(result, 16); 2964 2965 bind(ADJUST_STR); 2966 cmpl(cnt1, stride); // Do not read beyond string 2967 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 2968 // Back-up string to avoid reading beyond string. 
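  // Fewer than stride chars remain: re-position result so the final 16-byte chunk ends
  // exactly at the end of the string (re-scanning a few already-checked chars is harmless),
  // then run one more full-stride iteration.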
2969 lea(result, Address(result, cnt1, scale1, -16)); 2970 movl(cnt1, stride); 2971 jmpb(SCAN_TO_SUBSTR); 2972 2973 // Found a potential substr 2974 bind(FOUND_CANDIDATE); 2975 // After pcmpestri tmp(rcx) contains matched element index 2976 2977 // Make sure string is still long enough 2978 subl(cnt1, tmp); 2979 cmpl(cnt1, cnt2); 2980 jccb(Assembler::greaterEqual, FOUND_SUBSTR); 2981 // Left less then substring. 2982 2983 bind(RET_NOT_FOUND); 2984 movl(result, -1); 2985 jmp(CLEANUP); 2986 2987 bind(FOUND_SUBSTR); 2988 // Compute start addr of substr 2989 lea(result, Address(result, tmp, scale1)); 2990 if (int_cnt2 > 0) { // Constant substring 2991 // Repeat search for small substring (< 8 chars) 2992 // from new point without reloading substring. 2993 // Have to check that we don't read beyond string. 2994 cmpl(tmp, stride-int_cnt2); 2995 jccb(Assembler::greater, ADJUST_STR); 2996 // Fall through if matched whole substring. 2997 } else { // non constant 2998 assert(int_cnt2 == -1, "should be != 0"); 2999 3000 addl(tmp, cnt2); 3001 // Found result if we matched whole substring. 3002 cmpl(tmp, stride); 3003 jcc(Assembler::lessEqual, RET_FOUND); 3004 3005 // Repeat search for small substring (<= 8 chars) 3006 // from new point 'str1' without reloading substring. 3007 cmpl(cnt2, stride); 3008 // Have to check that we don't read beyond string. 3009 jccb(Assembler::lessEqual, ADJUST_STR); 3010 3011 Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG; 3012 // Compare the rest of substring (> 8 chars). 3013 movptr(str1, result); 3014 3015 cmpl(tmp, cnt2); 3016 // First 8 chars are already matched. 3017 jccb(Assembler::equal, CHECK_NEXT); 3018 3019 bind(SCAN_SUBSTR); 3020 pcmpestri(vec, Address(str1, 0), mode); 3021 // Need to reload strings pointers if not matched whole vector 3022 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 3023 3024 bind(CHECK_NEXT); 3025 subl(cnt2, stride); 3026 jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring 3027 addptr(str1, 16); 3028 if (ae == StrIntrinsicNode::UL) { 3029 addptr(str2, 8); 3030 } else { 3031 addptr(str2, 16); 3032 } 3033 subl(cnt1, stride); 3034 cmpl(cnt2, stride); // Do not read beyond substring 3035 jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR); 3036 // Back-up strings to avoid reading beyond substring. 
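    // Fewer than stride substring chars remain: re-position str1/str2 so the last compare
    // ends exactly at the substring tail (the UL case steps str2 by bytes, hence -8), and
    // fix up the counters to match: cnt1 += stride - cnt2, cnt2 = stride.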
3037 3038 if (ae == StrIntrinsicNode::UL) { 3039 lea(str2, Address(str2, cnt2, scale2, -8)); 3040 lea(str1, Address(str1, cnt2, scale1, -16)); 3041 } else { 3042 lea(str2, Address(str2, cnt2, scale2, -16)); 3043 lea(str1, Address(str1, cnt2, scale1, -16)); 3044 } 3045 subl(cnt1, cnt2); 3046 movl(cnt2, stride); 3047 addl(cnt1, stride); 3048 bind(CONT_SCAN_SUBSTR); 3049 if (ae == StrIntrinsicNode::UL) { 3050 pmovzxbw(vec, Address(str2, 0)); 3051 } else { 3052 movdqu(vec, Address(str2, 0)); 3053 } 3054 jmp(SCAN_SUBSTR); 3055 3056 bind(RET_FOUND_LONG); 3057 movptr(str1, Address(rsp, wordSize)); 3058 } // non constant 3059 3060 bind(RET_FOUND); 3061 // Compute substr offset 3062 subptr(result, str1); 3063 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3064 shrl(result, 1); // index 3065 } 3066 bind(CLEANUP); 3067 pop(rsp); // restore SP 3068 3069 } // string_indexof 3070 3071 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3072 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3073 ShortBranchVerifier sbv(this); 3074 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3075 3076 int stride = 8; 3077 3078 Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP, 3079 SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP, 3080 RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT, 3081 FOUND_SEQ_CHAR, DONE_LABEL; 3082 3083 movptr(result, str1); 3084 if (UseAVX >= 2) { 3085 cmpl(cnt1, stride); 3086 jcc(Assembler::less, SCAN_TO_CHAR); 3087 cmpl(cnt1, 2*stride); 3088 jcc(Assembler::less, SCAN_TO_8_CHAR_INIT); 3089 movdl(vec1, ch); 3090 vpbroadcastw(vec1, vec1, Assembler::AVX_256bit); 3091 vpxor(vec2, vec2); 3092 movl(tmp, cnt1); 3093 andl(tmp, 0xFFFFFFF0); //vector count (in chars) 3094 andl(cnt1,0x0000000F); //tail count (in chars) 3095 3096 bind(SCAN_TO_16_CHAR_LOOP); 3097 vmovdqu(vec3, Address(result, 0)); 3098 vpcmpeqw(vec3, vec3, vec1, 1); 3099 vptest(vec2, vec3); 3100 jcc(Assembler::carryClear, FOUND_CHAR); 3101 addptr(result, 32); 3102 subl(tmp, 2*stride); 3103 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); 3104 jmp(SCAN_TO_8_CHAR); 3105 bind(SCAN_TO_8_CHAR_INIT); 3106 movdl(vec1, ch); 3107 pshuflw(vec1, vec1, 0x00); 3108 pshufd(vec1, vec1, 0); 3109 pxor(vec2, vec2); 3110 } 3111 bind(SCAN_TO_8_CHAR); 3112 cmpl(cnt1, stride); 3113 jcc(Assembler::less, SCAN_TO_CHAR); 3114 if (UseAVX < 2) { 3115 movdl(vec1, ch); 3116 pshuflw(vec1, vec1, 0x00); 3117 pshufd(vec1, vec1, 0); 3118 pxor(vec2, vec2); 3119 } 3120 movl(tmp, cnt1); 3121 andl(tmp, 0xFFFFFFF8); //vector count (in chars) 3122 andl(cnt1,0x00000007); //tail count (in chars) 3123 3124 bind(SCAN_TO_8_CHAR_LOOP); 3125 movdqu(vec3, Address(result, 0)); 3126 pcmpeqw(vec3, vec1); 3127 ptest(vec2, vec3); 3128 jcc(Assembler::carryClear, FOUND_CHAR); 3129 addptr(result, 16); 3130 subl(tmp, stride); 3131 jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP); 3132 bind(SCAN_TO_CHAR); 3133 testl(cnt1, cnt1); 3134 jcc(Assembler::zero, RET_NOT_FOUND); 3135 bind(SCAN_TO_CHAR_LOOP); 3136 load_unsigned_short(tmp, Address(result, 0)); 3137 cmpl(ch, tmp); 3138 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3139 addptr(result, 2); 3140 subl(cnt1, 1); 3141 jccb(Assembler::zero, RET_NOT_FOUND); 3142 jmp(SCAN_TO_CHAR_LOOP); 3143 3144 bind(RET_NOT_FOUND); 3145 movl(result, -1); 3146 jmpb(DONE_LABEL); 3147 3148 bind(FOUND_CHAR); 3149 if (UseAVX >= 2) { 3150 vpmovmskb(tmp, vec3); 3151 } else { 3152 pmovmskb(tmp, vec3); 3153 } 3154 bsfl(ch, tmp); 3155 addptr(result, ch); 3156 3157 bind(FOUND_SEQ_CHAR); 3158 
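  // 'result' holds the address of the matching char; turn it into a char index:
  // subtract the base address, then halve the byte offset (2-byte UTF-16 elements).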
subptr(result, str1); 3159 shrl(result, 1); 3160 3161 bind(DONE_LABEL); 3162 } // string_indexof_char 3163 3164 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3165 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3166 ShortBranchVerifier sbv(this); 3167 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3168 3169 int stride = 16; 3170 3171 Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP, 3172 SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP, 3173 RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT, 3174 FOUND_SEQ_CHAR, DONE_LABEL; 3175 3176 movptr(result, str1); 3177 if (UseAVX >= 2) { 3178 cmpl(cnt1, stride); 3179 jcc(Assembler::less, SCAN_TO_CHAR_INIT); 3180 cmpl(cnt1, stride*2); 3181 jcc(Assembler::less, SCAN_TO_16_CHAR_INIT); 3182 movdl(vec1, ch); 3183 vpbroadcastb(vec1, vec1, Assembler::AVX_256bit); 3184 vpxor(vec2, vec2); 3185 movl(tmp, cnt1); 3186 andl(tmp, 0xFFFFFFE0); //vector count (in chars) 3187 andl(cnt1,0x0000001F); //tail count (in chars) 3188 3189 bind(SCAN_TO_32_CHAR_LOOP); 3190 vmovdqu(vec3, Address(result, 0)); 3191 vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit); 3192 vptest(vec2, vec3); 3193 jcc(Assembler::carryClear, FOUND_CHAR); 3194 addptr(result, 32); 3195 subl(tmp, stride*2); 3196 jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP); 3197 jmp(SCAN_TO_16_CHAR); 3198 3199 bind(SCAN_TO_16_CHAR_INIT); 3200 movdl(vec1, ch); 3201 pxor(vec2, vec2); 3202 pshufb(vec1, vec2); 3203 } 3204 3205 bind(SCAN_TO_16_CHAR); 3206 cmpl(cnt1, stride); 3207 jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left 3208 if (UseAVX < 2) { 3209 movdl(vec1, ch); 3210 pxor(vec2, vec2); 3211 pshufb(vec1, vec2); 3212 } 3213 movl(tmp, cnt1); 3214 andl(tmp, 0xFFFFFFF0); //vector count (in bytes) 3215 andl(cnt1,0x0000000F); //tail count (in bytes) 3216 3217 bind(SCAN_TO_16_CHAR_LOOP); 3218 movdqu(vec3, Address(result, 0)); 3219 pcmpeqb(vec3, vec1); 3220 ptest(vec2, vec3); 3221 jcc(Assembler::carryClear, FOUND_CHAR); 3222 addptr(result, 16); 3223 subl(tmp, stride); 3224 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items... 
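  // Fewer than 16 bytes remain at this point; finish with a scalar
  // byte-at-a-time scan of the tail below.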
3225 3226 bind(SCAN_TO_CHAR_INIT); 3227 testl(cnt1, cnt1); 3228 jcc(Assembler::zero, RET_NOT_FOUND); 3229 bind(SCAN_TO_CHAR_LOOP); 3230 load_unsigned_byte(tmp, Address(result, 0)); 3231 cmpl(ch, tmp); 3232 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3233 addptr(result, 1); 3234 subl(cnt1, 1); 3235 jccb(Assembler::zero, RET_NOT_FOUND); 3236 jmp(SCAN_TO_CHAR_LOOP); 3237 3238 bind(RET_NOT_FOUND); 3239 movl(result, -1); 3240 jmpb(DONE_LABEL); 3241 3242 bind(FOUND_CHAR); 3243 if (UseAVX >= 2) { 3244 vpmovmskb(tmp, vec3); 3245 } else { 3246 pmovmskb(tmp, vec3); 3247 } 3248 bsfl(ch, tmp); 3249 addptr(result, ch); 3250 3251 bind(FOUND_SEQ_CHAR); 3252 subptr(result, str1); 3253 3254 bind(DONE_LABEL); 3255 } // stringL_indexof_char 3256 3257 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) { 3258 switch (eltype) { 3259 case T_BOOLEAN: return sizeof(jboolean); 3260 case T_BYTE: return sizeof(jbyte); 3261 case T_SHORT: return sizeof(jshort); 3262 case T_CHAR: return sizeof(jchar); 3263 case T_INT: return sizeof(jint); 3264 default: 3265 ShouldNotReachHere(); 3266 return -1; 3267 } 3268 } 3269 3270 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) { 3271 switch (eltype) { 3272 // T_BOOLEAN used as surrogate for unsigned byte 3273 case T_BOOLEAN: movzbl(dst, src); break; 3274 case T_BYTE: movsbl(dst, src); break; 3275 case T_SHORT: movswl(dst, src); break; 3276 case T_CHAR: movzwl(dst, src); break; 3277 case T_INT: movl(dst, src); break; 3278 default: 3279 ShouldNotReachHere(); 3280 } 3281 } 3282 3283 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) { 3284 load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8); 3285 } 3286 3287 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) { 3288 load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8); 3289 } 3290 3291 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) { 3292 const int vlen = Assembler::AVX_256bit; 3293 switch (eltype) { 3294 case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3295 case T_BYTE: vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3296 case T_SHORT: vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3297 case T_CHAR: vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3298 case T_INT: 3299 // do nothing 3300 break; 3301 default: 3302 ShouldNotReachHere(); 3303 } 3304 } 3305 3306 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result, 3307 Register index, Register tmp2, Register tmp3, XMMRegister vnext, 3308 XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3, 3309 XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3, 3310 XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3, 3311 BasicType eltype) { 3312 ShortBranchVerifier sbv(this); 3313 assert(UseAVX >= 2, "AVX2 intrinsics are required"); 3314 assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3); 3315 assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3); 3316 3317 Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN, 3318 SHORT_UNROLLED_LOOP_EXIT, 3319 UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME, 3320 UNROLLED_VECTOR_LOOP_BEGIN, 3321 END; 3322 switch (eltype) { 3323 case T_BOOLEAN: 
BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break; 3324 case T_CHAR: BLOCK_COMMENT("arrays_hashcode(char) {"); break; 3325 case T_BYTE: BLOCK_COMMENT("arrays_hashcode(byte) {"); break; 3326 case T_SHORT: BLOCK_COMMENT("arrays_hashcode(short) {"); break; 3327 case T_INT: BLOCK_COMMENT("arrays_hashcode(int) {"); break; 3328 default: BLOCK_COMMENT("arrays_hashcode {"); break; 3329 } 3330 3331 // For "renaming" for readibility of the code 3332 XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 }, 3333 vresult[] = { vresult0, vresult1, vresult2, vresult3 }, 3334 vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 }; 3335 3336 const int elsize = arrays_hashcode_elsize(eltype); 3337 3338 /* 3339 if (cnt1 >= 2) { 3340 if (cnt1 >= 32) { 3341 UNROLLED VECTOR LOOP 3342 } 3343 UNROLLED SCALAR LOOP 3344 } 3345 SINGLE SCALAR 3346 */ 3347 3348 cmpl(cnt1, 32); 3349 jcc(Assembler::less, SHORT_UNROLLED_BEGIN); 3350 3351 // cnt1 >= 32 && generate_vectorized_loop 3352 xorl(index, index); 3353 3354 // vresult = IntVector.zero(I256); 3355 for (int idx = 0; idx < 4; idx++) { 3356 vpxor(vresult[idx], vresult[idx]); 3357 } 3358 // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]); 3359 Register bound = tmp2; 3360 Register next = tmp3; 3361 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint)))); 3362 movl(next, Address(tmp2, 0)); 3363 movdl(vnext, next); 3364 vpbroadcastd(vnext, vnext, Assembler::AVX_256bit); 3365 3366 // index = 0; 3367 // bound = cnt1 & ~(32 - 1); 3368 movl(bound, cnt1); 3369 andl(bound, ~(32 - 1)); 3370 // for (; index < bound; index += 32) { 3371 bind(UNROLLED_VECTOR_LOOP_BEGIN); 3372 // result *= next; 3373 imull(result, next); 3374 // loop fission to upfront the cost of fetching from memory, OOO execution 3375 // can then hopefully do a better job of prefetching 3376 for (int idx = 0; idx < 4; idx++) { 3377 arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype); 3378 } 3379 // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7]; 3380 for (int idx = 0; idx < 4; idx++) { 3381 vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit); 3382 arrays_hashcode_elvcast(vtmp[idx], eltype); 3383 vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit); 3384 } 3385 // index += 32; 3386 addl(index, 32); 3387 // index < bound; 3388 cmpl(index, bound); 3389 jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN); 3390 // } 3391 3392 lea(ary1, Address(ary1, bound, Address::times(elsize))); 3393 subl(cnt1, bound); 3394 // release bound 3395 3396 // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1); 3397 for (int idx = 0; idx < 4; idx++) { 3398 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint)))); 3399 arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT); 3400 vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit); 3401 } 3402 // result += vresult.reduceLanes(ADD); 3403 for (int idx = 0; idx < 4; idx++) { 3404 reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]); 3405 } 3406 3407 // } else if (cnt1 < 32) { 3408 3409 bind(SHORT_UNROLLED_BEGIN); 3410 // int i = 1; 3411 movl(index, 1); 3412 cmpl(index, cnt1); 3413 jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT); 3414 3415 // for (; i < cnt1 ; i += 2) { 3416 bind(SHORT_UNROLLED_LOOP_BEGIN); 3417 movl(tmp3, 961); 3418 imull(result, tmp3); 3419 
arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3420 movl(tmp3, tmp2); 3421 shll(tmp3, 5); 3422 subl(tmp3, tmp2); 3423 addl(result, tmp3); 3424 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype); 3425 addl(result, tmp3); 3426 addl(index, 2); 3427 cmpl(index, cnt1); 3428 jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN); 3429 3430 // } 3431 // if (i >= cnt1) { 3432 bind(SHORT_UNROLLED_LOOP_EXIT); 3433 jccb(Assembler::greater, END); 3434 movl(tmp2, result); 3435 shll(result, 5); 3436 subl(result, tmp2); 3437 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3438 addl(result, tmp3); 3439 // } 3440 bind(END); 3441 3442 BLOCK_COMMENT("} // arrays_hashcode"); 3443 3444 } // arrays_hashcode 3445 3446 // helper function for string_compare 3447 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2, 3448 Address::ScaleFactor scale, Address::ScaleFactor scale1, 3449 Address::ScaleFactor scale2, Register index, int ae) { 3450 if (ae == StrIntrinsicNode::LL) { 3451 load_unsigned_byte(elem1, Address(str1, index, scale, 0)); 3452 load_unsigned_byte(elem2, Address(str2, index, scale, 0)); 3453 } else if (ae == StrIntrinsicNode::UU) { 3454 load_unsigned_short(elem1, Address(str1, index, scale, 0)); 3455 load_unsigned_short(elem2, Address(str2, index, scale, 0)); 3456 } else { 3457 load_unsigned_byte(elem1, Address(str1, index, scale1, 0)); 3458 load_unsigned_short(elem2, Address(str2, index, scale2, 0)); 3459 } 3460 } 3461 3462 // Compare strings, used for char[] and byte[]. 3463 void C2_MacroAssembler::string_compare(Register str1, Register str2, 3464 Register cnt1, Register cnt2, Register result, 3465 XMMRegister vec1, int ae, KRegister mask) { 3466 ShortBranchVerifier sbv(this); 3467 Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL; 3468 Label COMPARE_WIDE_VECTORS_LOOP_FAILED; // used only _LP64 && AVX3 3469 int stride, stride2, adr_stride, adr_stride1, adr_stride2; 3470 int stride2x2 = 0x40; 3471 Address::ScaleFactor scale = Address::no_scale; 3472 Address::ScaleFactor scale1 = Address::no_scale; 3473 Address::ScaleFactor scale2 = Address::no_scale; 3474 3475 if (ae != StrIntrinsicNode::LL) { 3476 stride2x2 = 0x20; 3477 } 3478 3479 if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) { 3480 shrl(cnt2, 1); 3481 } 3482 // Compute the minimum of the string lengths and the 3483 // difference of the string lengths (stack). 3484 // Do the conditional move stuff 3485 movl(result, cnt1); 3486 subl(cnt1, cnt2); 3487 push(cnt1); 3488 cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2) 3489 3490 // Is the minimum length zero? 
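  // If so, skip straight to returning the length difference pushed above.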
3491 testl(cnt2, cnt2); 3492 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3493 if (ae == StrIntrinsicNode::LL) { 3494 // Load first bytes 3495 load_unsigned_byte(result, Address(str1, 0)); // result = str1[0] 3496 load_unsigned_byte(cnt1, Address(str2, 0)); // cnt1 = str2[0] 3497 } else if (ae == StrIntrinsicNode::UU) { 3498 // Load first characters 3499 load_unsigned_short(result, Address(str1, 0)); 3500 load_unsigned_short(cnt1, Address(str2, 0)); 3501 } else { 3502 load_unsigned_byte(result, Address(str1, 0)); 3503 load_unsigned_short(cnt1, Address(str2, 0)); 3504 } 3505 subl(result, cnt1); 3506 jcc(Assembler::notZero, POP_LABEL); 3507 3508 if (ae == StrIntrinsicNode::UU) { 3509 // Divide length by 2 to get number of chars 3510 shrl(cnt2, 1); 3511 } 3512 cmpl(cnt2, 1); 3513 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3514 3515 // Check if the strings start at the same location and setup scale and stride 3516 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3517 cmpptr(str1, str2); 3518 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3519 if (ae == StrIntrinsicNode::LL) { 3520 scale = Address::times_1; 3521 stride = 16; 3522 } else { 3523 scale = Address::times_2; 3524 stride = 8; 3525 } 3526 } else { 3527 scale1 = Address::times_1; 3528 scale2 = Address::times_2; 3529 // scale not used 3530 stride = 8; 3531 } 3532 3533 if (UseAVX >= 2 && UseSSE42Intrinsics) { 3534 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR; 3535 Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR; 3536 Label COMPARE_WIDE_VECTORS_LOOP_AVX2; 3537 Label COMPARE_TAIL_LONG; 3538 Label COMPARE_WIDE_VECTORS_LOOP_AVX3; // used only _LP64 && AVX3 3539 3540 int pcmpmask = 0x19; 3541 if (ae == StrIntrinsicNode::LL) { 3542 pcmpmask &= ~0x01; 3543 } 3544 3545 // Setup to compare 16-chars (32-bytes) vectors, 3546 // start from first character again because it has aligned address. 3547 if (ae == StrIntrinsicNode::LL) { 3548 stride2 = 32; 3549 } else { 3550 stride2 = 16; 3551 } 3552 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3553 adr_stride = stride << scale; 3554 } else { 3555 adr_stride1 = 8; //stride << scale1; 3556 adr_stride2 = 16; //stride << scale2; 3557 } 3558 3559 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3560 // rax and rdx are used by pcmpestri as elements counters 3561 movl(result, cnt2); 3562 andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count 3563 jcc(Assembler::zero, COMPARE_TAIL_LONG); 3564 3565 // fast path : compare first 2 8-char vectors. 
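    // pcmpestri with imm8 0x19 (equal-each, negated result, unsigned words; 0x18 for
    // bytes in the LL case) sets CF when a mismatching element is found within the
    // valid lengths held in rax/rdx and leaves that element's index in rcx.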
3566 bind(COMPARE_16_CHARS); 3567 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3568 movdqu(vec1, Address(str1, 0)); 3569 } else { 3570 pmovzxbw(vec1, Address(str1, 0)); 3571 } 3572 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3573 jccb(Assembler::below, COMPARE_INDEX_CHAR); 3574 3575 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3576 movdqu(vec1, Address(str1, adr_stride)); 3577 pcmpestri(vec1, Address(str2, adr_stride), pcmpmask); 3578 } else { 3579 pmovzxbw(vec1, Address(str1, adr_stride1)); 3580 pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask); 3581 } 3582 jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS); 3583 addl(cnt1, stride); 3584 3585 // Compare the characters at index in cnt1 3586 bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character 3587 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 3588 subl(result, cnt2); 3589 jmp(POP_LABEL); 3590 3591 // Setup the registers to start vector comparison loop 3592 bind(COMPARE_WIDE_VECTORS); 3593 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3594 lea(str1, Address(str1, result, scale)); 3595 lea(str2, Address(str2, result, scale)); 3596 } else { 3597 lea(str1, Address(str1, result, scale1)); 3598 lea(str2, Address(str2, result, scale2)); 3599 } 3600 subl(result, stride2); 3601 subl(cnt2, stride2); 3602 jcc(Assembler::zero, COMPARE_WIDE_TAIL); 3603 negptr(result); 3604 3605 // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest) 3606 bind(COMPARE_WIDE_VECTORS_LOOP); 3607 3608 #ifdef _LP64 3609 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 3610 cmpl(cnt2, stride2x2); 3611 jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2); 3612 testl(cnt2, stride2x2-1); // cnt2 holds the vector count 3613 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2); // means we cannot subtract by 0x40 3614 3615 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 3616 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3617 evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit); 3618 evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3619 } else { 3620 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit); 3621 evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3622 } 3623 kortestql(mask, mask); 3624 jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED); // miscompare 3625 addptr(result, stride2x2); // update since we already compared at this addr 3626 subl(cnt2, stride2x2); // and sub the size too 3627 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3); 3628 3629 vpxor(vec1, vec1); 3630 jmpb(COMPARE_WIDE_TAIL); 3631 }//if (VM_Version::supports_avx512vlbw()) 3632 #endif // _LP64 3633 3634 3635 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3636 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3637 vmovdqu(vec1, Address(str1, result, scale)); 3638 vpxor(vec1, Address(str2, result, scale)); 3639 } else { 3640 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit); 3641 vpxor(vec1, Address(str2, result, scale2)); 3642 } 3643 vptest(vec1, vec1); 3644 jcc(Assembler::notZero, VECTOR_NOT_EQUAL); 3645 addptr(result, stride2); 3646 subl(cnt2, stride2); 3647 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP); 3648 // clean upper bits of YMM registers 
3649 vpxor(vec1, vec1); 3650 3651 // compare wide vectors tail 3652 bind(COMPARE_WIDE_TAIL); 3653 testptr(result, result); 3654 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3655 3656 movl(result, stride2); 3657 movl(cnt2, result); 3658 negptr(result); 3659 jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3660 3661 // Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors. 3662 bind(VECTOR_NOT_EQUAL); 3663 // clean upper bits of YMM registers 3664 vpxor(vec1, vec1); 3665 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3666 lea(str1, Address(str1, result, scale)); 3667 lea(str2, Address(str2, result, scale)); 3668 } else { 3669 lea(str1, Address(str1, result, scale1)); 3670 lea(str2, Address(str2, result, scale2)); 3671 } 3672 jmp(COMPARE_16_CHARS); 3673 3674 // Compare tail chars, length between 1 to 15 chars 3675 bind(COMPARE_TAIL_LONG); 3676 movl(cnt2, result); 3677 cmpl(cnt2, stride); 3678 jcc(Assembler::less, COMPARE_SMALL_STR); 3679 3680 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3681 movdqu(vec1, Address(str1, 0)); 3682 } else { 3683 pmovzxbw(vec1, Address(str1, 0)); 3684 } 3685 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3686 jcc(Assembler::below, COMPARE_INDEX_CHAR); 3687 subptr(cnt2, stride); 3688 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3689 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3690 lea(str1, Address(str1, result, scale)); 3691 lea(str2, Address(str2, result, scale)); 3692 } else { 3693 lea(str1, Address(str1, result, scale1)); 3694 lea(str2, Address(str2, result, scale2)); 3695 } 3696 negptr(cnt2); 3697 jmpb(WHILE_HEAD_LABEL); 3698 3699 bind(COMPARE_SMALL_STR); 3700 } else if (UseSSE42Intrinsics) { 3701 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL; 3702 int pcmpmask = 0x19; 3703 // Setup to compare 8-char (16-byte) vectors, 3704 // start from first character again because it has aligned address. 
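    // (This is the SSE4.2-only path: same structure as the AVX2 loop above,
    // but working on 16-byte pcmpestri vectors.)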
3705 movl(result, cnt2); 3706 andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count 3707 if (ae == StrIntrinsicNode::LL) { 3708 pcmpmask &= ~0x01; 3709 } 3710 jcc(Assembler::zero, COMPARE_TAIL); 3711 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3712 lea(str1, Address(str1, result, scale)); 3713 lea(str2, Address(str2, result, scale)); 3714 } else { 3715 lea(str1, Address(str1, result, scale1)); 3716 lea(str2, Address(str2, result, scale2)); 3717 } 3718 negptr(result); 3719 3720 // pcmpestri 3721 // inputs: 3722 // vec1- substring 3723 // rax - negative string length (elements count) 3724 // mem - scanned string 3725 // rdx - string length (elements count) 3726 // pcmpmask - cmp mode: 11000 (string compare with negated result) 3727 // + 00 (unsigned bytes) or + 01 (unsigned shorts) 3728 // outputs: 3729 // rcx - first mismatched element index 3730 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3731 3732 bind(COMPARE_WIDE_VECTORS); 3733 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3734 movdqu(vec1, Address(str1, result, scale)); 3735 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 3736 } else { 3737 pmovzxbw(vec1, Address(str1, result, scale1)); 3738 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 3739 } 3740 // After pcmpestri cnt1(rcx) contains mismatched element index 3741 3742 jccb(Assembler::below, VECTOR_NOT_EQUAL); // CF==1 3743 addptr(result, stride); 3744 subptr(cnt2, stride); 3745 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 3746 3747 // compare wide vectors tail 3748 testptr(result, result); 3749 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3750 3751 movl(cnt2, stride); 3752 movl(result, stride); 3753 negptr(result); 3754 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3755 movdqu(vec1, Address(str1, result, scale)); 3756 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 3757 } else { 3758 pmovzxbw(vec1, Address(str1, result, scale1)); 3759 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 3760 } 3761 jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL); 3762 3763 // Mismatched characters in the vectors 3764 bind(VECTOR_NOT_EQUAL); 3765 addptr(cnt1, result); 3766 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 3767 subl(result, cnt2); 3768 jmpb(POP_LABEL); 3769 3770 bind(COMPARE_TAIL); // limit is zero 3771 movl(cnt2, result); 3772 // Fallthru to tail compare 3773 } 3774 // Shift str2 and str1 to the end of the arrays, negate min 3775 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3776 lea(str1, Address(str1, cnt2, scale)); 3777 lea(str2, Address(str2, cnt2, scale)); 3778 } else { 3779 lea(str1, Address(str1, cnt2, scale1)); 3780 lea(str2, Address(str2, cnt2, scale2)); 3781 } 3782 decrementl(cnt2); // first character was compared already 3783 negptr(cnt2); 3784 3785 // Compare the rest of the elements 3786 bind(WHILE_HEAD_LABEL); 3787 load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae); 3788 subl(result, cnt1); 3789 jccb(Assembler::notZero, POP_LABEL); 3790 increment(cnt2); 3791 jccb(Assembler::notZero, WHILE_HEAD_LABEL); 3792 3793 // Strings are equal up to min length. Return the length difference. 
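  // Roughly the Java-level contract implemented by this routine (an illustrative
  // sketch only -- the LL/UU/LU/UL encoding handling above is elided):
  //
  //   static int compare(char[] a, char[] b) {
  //     int lim = Math.min(a.length, b.length);
  //     for (int k = 0; k < lim; k++) {
  //       if (a[k] != b[k]) {
  //         return a[k] - b[k];
  //       }
  //     }
  //     return a.length - b.length;
  //   }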
3794 bind(LENGTH_DIFF_LABEL); 3795 pop(result); 3796 if (ae == StrIntrinsicNode::UU) { 3797 // Divide diff by 2 to get number of chars 3798 sarl(result, 1); 3799 } 3800 jmpb(DONE_LABEL); 3801 3802 #ifdef _LP64 3803 if (VM_Version::supports_avx512vlbw()) { 3804 3805 bind(COMPARE_WIDE_VECTORS_LOOP_FAILED); 3806 3807 kmovql(cnt1, mask); 3808 notq(cnt1); 3809 bsfq(cnt2, cnt1); 3810 if (ae != StrIntrinsicNode::LL) { 3811 // Divide diff by 2 to get number of chars 3812 sarl(cnt2, 1); 3813 } 3814 addq(result, cnt2); 3815 if (ae == StrIntrinsicNode::LL) { 3816 load_unsigned_byte(cnt1, Address(str2, result)); 3817 load_unsigned_byte(result, Address(str1, result)); 3818 } else if (ae == StrIntrinsicNode::UU) { 3819 load_unsigned_short(cnt1, Address(str2, result, scale)); 3820 load_unsigned_short(result, Address(str1, result, scale)); 3821 } else { 3822 load_unsigned_short(cnt1, Address(str2, result, scale2)); 3823 load_unsigned_byte(result, Address(str1, result, scale1)); 3824 } 3825 subl(result, cnt1); 3826 jmpb(POP_LABEL); 3827 }//if (VM_Version::supports_avx512vlbw()) 3828 #endif // _LP64 3829 3830 // Discard the stored length difference 3831 bind(POP_LABEL); 3832 pop(cnt1); 3833 3834 // That's it 3835 bind(DONE_LABEL); 3836 if(ae == StrIntrinsicNode::UL) { 3837 negl(result); 3838 } 3839 3840 } 3841 3842 // Search for Non-ASCII character (Negative byte value) in a byte array, 3843 // return the index of the first such character, otherwise the length 3844 // of the array segment searched. 3845 // ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java 3846 // @IntrinsicCandidate 3847 // public static int countPositives(byte[] ba, int off, int len) { 3848 // for (int i = off; i < off + len; i++) { 3849 // if (ba[i] < 0) { 3850 // return i - off; 3851 // } 3852 // } 3853 // return len; 3854 // } 3855 void C2_MacroAssembler::count_positives(Register ary1, Register len, 3856 Register result, Register tmp1, 3857 XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) { 3858 // rsi: byte array 3859 // rcx: len 3860 // rax: result 3861 ShortBranchVerifier sbv(this); 3862 assert_different_registers(ary1, len, result, tmp1); 3863 assert_different_registers(vec1, vec2); 3864 Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE; 3865 3866 movl(result, len); // copy 3867 // len == 0 3868 testl(len, len); 3869 jcc(Assembler::zero, DONE); 3870 3871 if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512 3872 VM_Version::supports_avx512vlbw() && 3873 VM_Version::supports_bmi2()) { 3874 3875 Label test_64_loop, test_tail, BREAK_LOOP; 3876 Register tmp3_aliased = len; 3877 3878 movl(tmp1, len); 3879 vpxor(vec2, vec2, vec2, Assembler::AVX_512bit); 3880 3881 andl(tmp1, 64 - 1); // tail count (in chars) 0x3F 3882 andl(len, ~(64 - 1)); // vector count (in chars) 3883 jccb(Assembler::zero, test_tail); 3884 3885 lea(ary1, Address(ary1, len, Address::times_1)); 3886 negptr(len); 3887 3888 bind(test_64_loop); 3889 // Check whether our 64 elements of size byte contain negatives 3890 evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit); 3891 kortestql(mask1, mask1); 3892 jcc(Assembler::notZero, BREAK_LOOP); 3893 3894 addptr(len, 64); 3895 jccb(Assembler::notZero, test_64_loop); 3896 3897 bind(test_tail); 3898 // bail out when there is nothing to be done 3899 testl(tmp1, -1); 3900 jcc(Assembler::zero, DONE); 3901 3902 // ~(~0 << len) applied up to two times (for 32-bit scenario) 3903 #ifdef _LP64 3904 mov64(tmp3_aliased, 
0xFFFFFFFFFFFFFFFF); 3905 shlxq(tmp3_aliased, tmp3_aliased, tmp1); 3906 notq(tmp3_aliased); 3907 kmovql(mask2, tmp3_aliased); 3908 #else 3909 Label k_init; 3910 jmp(k_init); 3911 3912 // We could not read 64-bits from a general purpose register thus we move 3913 // data required to compose 64 1's to the instruction stream 3914 // We emit 64 byte wide series of elements from 0..63 which later on would 3915 // be used as a compare targets with tail count contained in tmp1 register. 3916 // Result would be a k register having tmp1 consecutive number or 1 3917 // counting from least significant bit. 3918 address tmp = pc(); 3919 emit_int64(0x0706050403020100); 3920 emit_int64(0x0F0E0D0C0B0A0908); 3921 emit_int64(0x1716151413121110); 3922 emit_int64(0x1F1E1D1C1B1A1918); 3923 emit_int64(0x2726252423222120); 3924 emit_int64(0x2F2E2D2C2B2A2928); 3925 emit_int64(0x3736353433323130); 3926 emit_int64(0x3F3E3D3C3B3A3938); 3927 3928 bind(k_init); 3929 lea(len, InternalAddress(tmp)); 3930 // create mask to test for negative byte inside a vector 3931 evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit); 3932 evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit); 3933 3934 #endif 3935 evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit); 3936 ktestq(mask1, mask2); 3937 jcc(Assembler::zero, DONE); 3938 3939 bind(BREAK_LOOP); 3940 // At least one byte in the last 64 bytes is negative. 3941 // Set up to look at the last 64 bytes as if they were a tail 3942 lea(ary1, Address(ary1, len, Address::times_1)); 3943 addptr(result, len); 3944 // Ignore the very last byte: if all others are positive, 3945 // it must be negative, so we can skip right to the 2+1 byte 3946 // end comparison at this point 3947 orl(result, 63); 3948 movl(len, 63); 3949 // Fallthru to tail compare 3950 } else { 3951 3952 if (UseAVX >= 2 && UseSSE >= 2) { 3953 // With AVX2, use 32-byte vector compare 3954 Label COMPARE_WIDE_VECTORS, BREAK_LOOP; 3955 3956 // Compare 32-byte vectors 3957 testl(len, 0xffffffe0); // vector count (in bytes) 3958 jccb(Assembler::zero, TAIL_START); 3959 3960 andl(len, 0xffffffe0); 3961 lea(ary1, Address(ary1, len, Address::times_1)); 3962 negptr(len); 3963 3964 movl(tmp1, 0x80808080); // create mask to test for Unicode chars in vector 3965 movdl(vec2, tmp1); 3966 vpbroadcastd(vec2, vec2, Assembler::AVX_256bit); 3967 3968 bind(COMPARE_WIDE_VECTORS); 3969 vmovdqu(vec1, Address(ary1, len, Address::times_1)); 3970 vptest(vec1, vec2); 3971 jccb(Assembler::notZero, BREAK_LOOP); 3972 addptr(len, 32); 3973 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 3974 3975 testl(result, 0x0000001f); // any bytes remaining? 3976 jcc(Assembler::zero, DONE); 3977 3978 // Quick test using the already prepared vector mask 3979 movl(len, result); 3980 andl(len, 0x0000001f); 3981 vmovdqu(vec1, Address(ary1, len, Address::times_1, -32)); 3982 vptest(vec1, vec2); 3983 jcc(Assembler::zero, DONE); 3984 // There are zeros, jump to the tail to determine exactly where 3985 jmpb(TAIL_START); 3986 3987 bind(BREAK_LOOP); 3988 // At least one byte in the last 32-byte vector is negative. 
3989 // Set up to look at the last 32 bytes as if they were a tail 3990 lea(ary1, Address(ary1, len, Address::times_1)); 3991 addptr(result, len); 3992 // Ignore the very last byte: if all others are positive, 3993 // it must be negative, so we can skip right to the 2+1 byte 3994 // end comparison at this point 3995 orl(result, 31); 3996 movl(len, 31); 3997 // Fallthru to tail compare 3998 } else if (UseSSE42Intrinsics) { 3999 // With SSE4.2, use double quad vector compare 4000 Label COMPARE_WIDE_VECTORS, BREAK_LOOP; 4001 4002 // Compare 16-byte vectors 4003 testl(len, 0xfffffff0); // vector count (in bytes) 4004 jcc(Assembler::zero, TAIL_START); 4005 4006 andl(len, 0xfffffff0); 4007 lea(ary1, Address(ary1, len, Address::times_1)); 4008 negptr(len); 4009 4010 movl(tmp1, 0x80808080); 4011 movdl(vec2, tmp1); 4012 pshufd(vec2, vec2, 0); 4013 4014 bind(COMPARE_WIDE_VECTORS); 4015 movdqu(vec1, Address(ary1, len, Address::times_1)); 4016 ptest(vec1, vec2); 4017 jccb(Assembler::notZero, BREAK_LOOP); 4018 addptr(len, 16); 4019 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 4020 4021 testl(result, 0x0000000f); // len is zero, any bytes remaining? 4022 jcc(Assembler::zero, DONE); 4023 4024 // Quick test using the already prepared vector mask 4025 movl(len, result); 4026 andl(len, 0x0000000f); // tail count (in bytes) 4027 movdqu(vec1, Address(ary1, len, Address::times_1, -16)); 4028 ptest(vec1, vec2); 4029 jcc(Assembler::zero, DONE); 4030 jmpb(TAIL_START); 4031 4032 bind(BREAK_LOOP); 4033 // At least one byte in the last 16-byte vector is negative. 4034 // Set up and look at the last 16 bytes as if they were a tail 4035 lea(ary1, Address(ary1, len, Address::times_1)); 4036 addptr(result, len); 4037 // Ignore the very last byte: if all others are positive, 4038 // it must be negative, so we can skip right to the 2+1 byte 4039 // end comparison at this point 4040 orl(result, 15); 4041 movl(len, 15); 4042 // Fallthru to tail compare 4043 } 4044 } 4045 4046 bind(TAIL_START); 4047 // Compare 4-byte vectors 4048 andl(len, 0xfffffffc); // vector count (in bytes) 4049 jccb(Assembler::zero, COMPARE_CHAR); 4050 4051 lea(ary1, Address(ary1, len, Address::times_1)); 4052 negptr(len); 4053 4054 bind(COMPARE_VECTORS); 4055 movl(tmp1, Address(ary1, len, Address::times_1)); 4056 andl(tmp1, 0x80808080); 4057 jccb(Assembler::notZero, TAIL_ADJUST); 4058 addptr(len, 4); 4059 jccb(Assembler::notZero, COMPARE_VECTORS); 4060 4061 // Compare trailing char (final 2-3 bytes), if any 4062 bind(COMPARE_CHAR); 4063 4064 testl(result, 0x2); // tail char 4065 jccb(Assembler::zero, COMPARE_BYTE); 4066 load_unsigned_short(tmp1, Address(ary1, 0)); 4067 andl(tmp1, 0x00008080); 4068 jccb(Assembler::notZero, CHAR_ADJUST); 4069 lea(ary1, Address(ary1, 2)); 4070 4071 bind(COMPARE_BYTE); 4072 testl(result, 0x1); // tail byte 4073 jccb(Assembler::zero, DONE); 4074 load_unsigned_byte(tmp1, Address(ary1, 0)); 4075 testl(tmp1, 0x00000080); 4076 jccb(Assembler::zero, DONE); 4077 subptr(result, 1); 4078 jmpb(DONE); 4079 4080 bind(TAIL_ADJUST); 4081 // there are negative bits in the last 4 byte block. 4082 // Adjust result and check the next three bytes 4083 addptr(result, len); 4084 orl(result, 3); 4085 lea(ary1, Address(ary1, len, Address::times_1)); 4086 jmpb(COMPARE_CHAR); 4087 4088 bind(CHAR_ADJUST); 4089 // We are looking at a char + optional byte tail, and found that one 4090 // of the bytes in the char is negative. Adjust the result, check the 4091 // first byte and readjust if needed. 
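  // Rounding result down to a multiple of 4 yields the index of the char's first
  // byte; if that (low) byte is the negative one the result is already correct,
  // otherwise the negative byte is the second one and result is bumped by one.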
4092 andl(result, 0xfffffffc); 4093 testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first 4094 jccb(Assembler::notZero, DONE); 4095 addptr(result, 1); 4096 4097 // That's it 4098 bind(DONE); 4099 if (UseAVX >= 2 && UseSSE >= 2) { 4100 // clean upper bits of YMM registers 4101 vpxor(vec1, vec1); 4102 vpxor(vec2, vec2); 4103 } 4104 } 4105 4106 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings. 4107 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2, 4108 Register limit, Register result, Register chr, 4109 XMMRegister vec1, XMMRegister vec2, bool is_char, KRegister mask) { 4110 ShortBranchVerifier sbv(this); 4111 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE; 4112 4113 int length_offset = arrayOopDesc::length_offset_in_bytes(); 4114 int base_offset = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE); 4115 4116 if (is_array_equ) { 4117 // Check the input args 4118 cmpoop(ary1, ary2); 4119 jcc(Assembler::equal, TRUE_LABEL); 4120 4121 // Need additional checks for arrays_equals. 4122 testptr(ary1, ary1); 4123 jcc(Assembler::zero, FALSE_LABEL); 4124 testptr(ary2, ary2); 4125 jcc(Assembler::zero, FALSE_LABEL); 4126 4127 // Check the lengths 4128 movl(limit, Address(ary1, length_offset)); 4129 cmpl(limit, Address(ary2, length_offset)); 4130 jcc(Assembler::notEqual, FALSE_LABEL); 4131 } 4132 4133 // count == 0 4134 testl(limit, limit); 4135 jcc(Assembler::zero, TRUE_LABEL); 4136 4137 if (is_array_equ) { 4138 // Load array address 4139 lea(ary1, Address(ary1, base_offset)); 4140 lea(ary2, Address(ary2, base_offset)); 4141 } 4142 4143 if (is_array_equ && is_char) { 4144 // arrays_equals when used for char[]. 4145 shll(limit, 1); // byte count != 0 4146 } 4147 movl(result, limit); // copy 4148 4149 if (UseAVX >= 2) { 4150 // With AVX2, use 32-byte vector compare 4151 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 4152 4153 // Compare 32-byte vectors 4154 andl(result, 0x0000001f); // tail count (in bytes) 4155 andl(limit, 0xffffffe0); // vector count (in bytes) 4156 jcc(Assembler::zero, COMPARE_TAIL); 4157 4158 lea(ary1, Address(ary1, limit, Address::times_1)); 4159 lea(ary2, Address(ary2, limit, Address::times_1)); 4160 negptr(limit); 4161 4162 #ifdef _LP64 4163 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 4164 Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3; 4165 4166 cmpl(limit, -64); 4167 jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2); 4168 4169 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 4170 4171 evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit); 4172 evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit); 4173 kortestql(mask, mask); 4174 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4175 addptr(limit, 64); // update since we already compared at this addr 4176 cmpl(limit, -64); 4177 jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3); 4178 4179 // At this point we may still need to compare -limit+result bytes. 4180 // We could execute the next two instruction and just continue via non-wide path: 4181 // cmpl(limit, 0); 4182 // jcc(Assembler::equal, COMPARE_TAIL); // true 4183 // But since we stopped at the points ary{1,2}+limit which are 4184 // not farther than 64 bytes from the ends of arrays ary{1,2}+result 4185 // (|limit| <= 32 and result < 32), 4186 // we may just compare the last 64 bytes. 
4187 // 4188 addptr(result, -64); // it is safe, bc we just came from this area 4189 evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit); 4190 evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit); 4191 kortestql(mask, mask); 4192 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4193 4194 jmp(TRUE_LABEL); 4195 4196 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 4197 4198 }//if (VM_Version::supports_avx512vlbw()) 4199 #endif //_LP64 4200 bind(COMPARE_WIDE_VECTORS); 4201 vmovdqu(vec1, Address(ary1, limit, Address::times_1)); 4202 vmovdqu(vec2, Address(ary2, limit, Address::times_1)); 4203 vpxor(vec1, vec2); 4204 4205 vptest(vec1, vec1); 4206 jcc(Assembler::notZero, FALSE_LABEL); 4207 addptr(limit, 32); 4208 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4209 4210 testl(result, result); 4211 jcc(Assembler::zero, TRUE_LABEL); 4212 4213 vmovdqu(vec1, Address(ary1, result, Address::times_1, -32)); 4214 vmovdqu(vec2, Address(ary2, result, Address::times_1, -32)); 4215 vpxor(vec1, vec2); 4216 4217 vptest(vec1, vec1); 4218 jccb(Assembler::notZero, FALSE_LABEL); 4219 jmpb(TRUE_LABEL); 4220 4221 bind(COMPARE_TAIL); // limit is zero 4222 movl(limit, result); 4223 // Fallthru to tail compare 4224 } else if (UseSSE42Intrinsics) { 4225 // With SSE4.2, use double quad vector compare 4226 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 4227 4228 // Compare 16-byte vectors 4229 andl(result, 0x0000000f); // tail count (in bytes) 4230 andl(limit, 0xfffffff0); // vector count (in bytes) 4231 jcc(Assembler::zero, COMPARE_TAIL); 4232 4233 lea(ary1, Address(ary1, limit, Address::times_1)); 4234 lea(ary2, Address(ary2, limit, Address::times_1)); 4235 negptr(limit); 4236 4237 bind(COMPARE_WIDE_VECTORS); 4238 movdqu(vec1, Address(ary1, limit, Address::times_1)); 4239 movdqu(vec2, Address(ary2, limit, Address::times_1)); 4240 pxor(vec1, vec2); 4241 4242 ptest(vec1, vec1); 4243 jcc(Assembler::notZero, FALSE_LABEL); 4244 addptr(limit, 16); 4245 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4246 4247 testl(result, result); 4248 jcc(Assembler::zero, TRUE_LABEL); 4249 4250 movdqu(vec1, Address(ary1, result, Address::times_1, -16)); 4251 movdqu(vec2, Address(ary2, result, Address::times_1, -16)); 4252 pxor(vec1, vec2); 4253 4254 ptest(vec1, vec1); 4255 jccb(Assembler::notZero, FALSE_LABEL); 4256 jmpb(TRUE_LABEL); 4257 4258 bind(COMPARE_TAIL); // limit is zero 4259 movl(limit, result); 4260 // Fallthru to tail compare 4261 } 4262 4263 // Compare 4-byte vectors 4264 andl(limit, 0xfffffffc); // vector count (in bytes) 4265 jccb(Assembler::zero, COMPARE_CHAR); 4266 4267 lea(ary1, Address(ary1, limit, Address::times_1)); 4268 lea(ary2, Address(ary2, limit, Address::times_1)); 4269 negptr(limit); 4270 4271 bind(COMPARE_VECTORS); 4272 movl(chr, Address(ary1, limit, Address::times_1)); 4273 cmpl(chr, Address(ary2, limit, Address::times_1)); 4274 jccb(Assembler::notEqual, FALSE_LABEL); 4275 addptr(limit, 4); 4276 jcc(Assembler::notZero, COMPARE_VECTORS); 4277 4278 // Compare trailing char (final 2 bytes), if any 4279 bind(COMPARE_CHAR); 4280 testl(result, 0x2); // tail char 4281 jccb(Assembler::zero, COMPARE_BYTE); 4282 load_unsigned_short(chr, Address(ary1, 0)); 4283 load_unsigned_short(limit, Address(ary2, 0)); 4284 cmpl(chr, limit); 4285 jccb(Assembler::notEqual, FALSE_LABEL); 4286 4287 if (is_array_equ && is_char) { 4288 bind(COMPARE_BYTE); 4289 } else { 4290 lea(ary1, Address(ary1, 2)); 4291 lea(ary2, Address(ary2, 2)); 4292 4293 bind(COMPARE_BYTE); 4294 testl(result, 0x1); 
// tail byte 4295 jccb(Assembler::zero, TRUE_LABEL); 4296 load_unsigned_byte(chr, Address(ary1, 0)); 4297 load_unsigned_byte(limit, Address(ary2, 0)); 4298 cmpl(chr, limit); 4299 jccb(Assembler::notEqual, FALSE_LABEL); 4300 } 4301 bind(TRUE_LABEL); 4302 movl(result, 1); // return true 4303 jmpb(DONE); 4304 4305 bind(FALSE_LABEL); 4306 xorl(result, result); // return false 4307 4308 // That's it 4309 bind(DONE); 4310 if (UseAVX >= 2) { 4311 // clean upper bits of YMM registers 4312 vpxor(vec1, vec1); 4313 vpxor(vec2, vec2); 4314 } 4315 } 4316 4317 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4318 XMMRegister src1, int imm8, bool merge, int vlen_enc) { 4319 switch(ideal_opc) { 4320 case Op_LShiftVS: 4321 Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break; 4322 case Op_LShiftVI: 4323 Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break; 4324 case Op_LShiftVL: 4325 Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break; 4326 case Op_RShiftVS: 4327 Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break; 4328 case Op_RShiftVI: 4329 Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break; 4330 case Op_RShiftVL: 4331 Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break; 4332 case Op_URShiftVS: 4333 Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break; 4334 case Op_URShiftVI: 4335 Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break; 4336 case Op_URShiftVL: 4337 Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break; 4338 case Op_RotateRightV: 4339 evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4340 case Op_RotateLeftV: 4341 evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4342 default: 4343 fatal("Unsupported masked operation"); break; 4344 } 4345 } 4346 4347 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4348 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc, 4349 bool is_varshift) { 4350 switch (ideal_opc) { 4351 case Op_AddVB: 4352 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4353 case Op_AddVS: 4354 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4355 case Op_AddVI: 4356 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4357 case Op_AddVL: 4358 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4359 case Op_AddVF: 4360 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4361 case Op_AddVD: 4362 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4363 case Op_SubVB: 4364 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4365 case Op_SubVS: 4366 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4367 case Op_SubVI: 4368 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4369 case Op_SubVL: 4370 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4371 case Op_SubVF: 4372 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4373 case Op_SubVD: 4374 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4375 case Op_MulVS: 4376 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4377 case Op_MulVI: 4378 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4379 case Op_MulVL: 4380 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4381 case Op_MulVF: 4382 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4383 case Op_MulVD: 4384 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4385 case Op_DivVF: 4386 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 
4387 case Op_DivVD: 4388 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4389 case Op_SqrtVF: 4390 evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break; 4391 case Op_SqrtVD: 4392 evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break; 4393 case Op_AbsVB: 4394 evpabsb(dst, mask, src2, merge, vlen_enc); break; 4395 case Op_AbsVS: 4396 evpabsw(dst, mask, src2, merge, vlen_enc); break; 4397 case Op_AbsVI: 4398 evpabsd(dst, mask, src2, merge, vlen_enc); break; 4399 case Op_AbsVL: 4400 evpabsq(dst, mask, src2, merge, vlen_enc); break; 4401 case Op_FmaVF: 4402 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4403 case Op_FmaVD: 4404 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4405 case Op_VectorRearrange: 4406 evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break; 4407 case Op_LShiftVS: 4408 evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4409 case Op_LShiftVI: 4410 evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4411 case Op_LShiftVL: 4412 evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4413 case Op_RShiftVS: 4414 evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4415 case Op_RShiftVI: 4416 evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4417 case Op_RShiftVL: 4418 evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4419 case Op_URShiftVS: 4420 evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4421 case Op_URShiftVI: 4422 evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4423 case Op_URShiftVL: 4424 evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4425 case Op_RotateLeftV: 4426 evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4427 case Op_RotateRightV: 4428 evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4429 case Op_MaxV: 4430 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4431 case Op_MinV: 4432 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4433 case Op_XorV: 4434 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4435 case Op_OrV: 4436 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4437 case Op_AndV: 4438 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4439 default: 4440 fatal("Unsupported masked operation"); break; 4441 } 4442 } 4443 4444 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4445 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4446 switch (ideal_opc) { 4447 case Op_AddVB: 4448 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4449 case Op_AddVS: 4450 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4451 case Op_AddVI: 4452 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4453 case Op_AddVL: 4454 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4455 case Op_AddVF: 4456 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4457 case Op_AddVD: 4458 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4459 case Op_SubVB: 4460 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4461 case Op_SubVS: 4462 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4463 case Op_SubVI: 4464 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4465 case Op_SubVL: 4466 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4467 case Op_SubVF: 4468 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4469 case Op_SubVD: 4470 evsubpd(dst, mask, src1, src2, merge, vlen_enc); 
break; 4471 case Op_MulVS: 4472 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4473 case Op_MulVI: 4474 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4475 case Op_MulVL: 4476 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4477 case Op_MulVF: 4478 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4479 case Op_MulVD: 4480 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4481 case Op_DivVF: 4482 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4483 case Op_DivVD: 4484 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4485 case Op_FmaVF: 4486 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4487 case Op_FmaVD: 4488 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4489 case Op_MaxV: 4490 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4491 case Op_MinV: 4492 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4493 case Op_XorV: 4494 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4495 case Op_OrV: 4496 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4497 case Op_AndV: 4498 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4499 default: 4500 fatal("Unsupported masked operation"); break; 4501 } 4502 } 4503 4504 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst, 4505 KRegister src1, KRegister src2) { 4506 BasicType etype = T_ILLEGAL; 4507 switch(mask_len) { 4508 case 2: 4509 case 4: 4510 case 8: etype = T_BYTE; break; 4511 case 16: etype = T_SHORT; break; 4512 case 32: etype = T_INT; break; 4513 case 64: etype = T_LONG; break; 4514 default: fatal("Unsupported type"); break; 4515 } 4516 assert(etype != T_ILLEGAL, ""); 4517 switch(ideal_opc) { 4518 case Op_AndVMask: 4519 kand(etype, dst, src1, src2); break; 4520 case Op_OrVMask: 4521 kor(etype, dst, src1, src2); break; 4522 case Op_XorVMask: 4523 kxor(etype, dst, src1, src2); break; 4524 default: 4525 fatal("Unsupported masked operation"); break; 4526 } 4527 } 4528 4529 /* 4530 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 4531 * If src is NaN, the result is 0. 4532 * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE, 4533 * the result is equal to the value of Integer.MIN_VALUE. 4534 * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE, 4535 * the result is equal to the value of Integer.MAX_VALUE. 4536 */ 4537 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4538 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 4539 Register rscratch, AddressLiteral float_sign_flip, 4540 int vec_enc) { 4541 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4542 Label done; 4543 vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch); 4544 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 4545 vptest(xtmp2, xtmp2, vec_enc); 4546 jccb(Assembler::equal, done); 4547 4548 vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc); 4549 vpxor(xtmp1, xtmp1, xtmp4, vec_enc); 4550 4551 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 4552 vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc); 4553 vblendvps(dst, dst, xtmp4, xtmp3, vec_enc); 4554 4555 // Recompute the mask for remaining special value. 4556 vpxor(xtmp2, xtmp2, xtmp3, vec_enc); 4557 // Extract SRC values corresponding to TRUE mask lanes. 
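  // (xtmp2 now flags the special lanes that were not NaN, i.e. +/-Inf and
  // out-of-range values; the AND below keeps their original source bits so the
  // following XOR can tell positive from negative lanes by the sign bit.)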
4558 vpand(xtmp4, xtmp2, src, vec_enc); 4559 // Flip mask bits so that MSB bit of MASK lanes corresponding to +ve special 4560 // values are set. 4561 vpxor(xtmp3, xtmp2, xtmp4, vec_enc); 4562 4563 vblendvps(dst, dst, xtmp1, xtmp3, vec_enc); 4564 bind(done); 4565 } 4566 4567 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4568 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4569 Register rscratch, AddressLiteral float_sign_flip, 4570 int vec_enc) { 4571 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4572 Label done; 4573 evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 4574 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 4575 kortestwl(ktmp1, ktmp1); 4576 jccb(Assembler::equal, done); 4577 4578 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4579 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4580 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 4581 4582 kxorwl(ktmp1, ktmp1, ktmp2); 4583 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4584 vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4585 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 4586 bind(done); 4587 } 4588 4589 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4590 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4591 Register rscratch, AddressLiteral double_sign_flip, 4592 int vec_enc) { 4593 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing"); 4594 4595 Label done; 4596 evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch); 4597 Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc); 4598 kortestwl(ktmp1, ktmp1); 4599 jccb(Assembler::equal, done); 4600 4601 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4602 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4603 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc); 4604 4605 kxorwl(ktmp1, ktmp1, ktmp2); 4606 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4607 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4608 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc); 4609 bind(done); 4610 } 4611 4612 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4613 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4614 Register rscratch, AddressLiteral float_sign_flip, 4615 int vec_enc) { 4616 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4617 Label done; 4618 evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 4619 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 4620 kortestwl(ktmp1, ktmp1); 4621 jccb(Assembler::equal, done); 4622 4623 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4624 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4625 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 4626 4627 kxorwl(ktmp1, ktmp1, ktmp2); 4628 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4629 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4630 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 4631 bind(done); 4632 } 4633 4634 /* 4635 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 4636 * If src is NaN, the result is 0. 4637 * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE, 4638 * the result is equal to the value of Long.MIN_VALUE. 
4639 * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE, 4640 * the result is equal to the value of Long.MAX_VALUE. 4641 */ 4642 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4643 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4644 Register rscratch, AddressLiteral double_sign_flip, 4645 int vec_enc) { 4646 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing"); 4647 4648 Label done; 4649 evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch); 4650 evpcmpeqq(ktmp1, xtmp1, dst, vec_enc); 4651 kortestwl(ktmp1, ktmp1); 4652 jccb(Assembler::equal, done); 4653 4654 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4655 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4656 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc); 4657 4658 kxorwl(ktmp1, ktmp1, ktmp2); 4659 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4660 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4661 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc); 4662 bind(done); 4663 } 4664 4665 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero, 4666 XMMRegister xtmp, int index, int vec_enc) { 4667 assert(vec_enc < Assembler::AVX_512bit, ""); 4668 if (vec_enc == Assembler::AVX_256bit) { 4669 vextractf128_high(xtmp, src); 4670 vshufps(dst, src, xtmp, index, vec_enc); 4671 } else { 4672 vshufps(dst, src, zero, index, vec_enc); 4673 } 4674 } 4675 4676 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 4677 XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch, 4678 AddressLiteral float_sign_flip, int src_vec_enc) { 4679 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4680 4681 Label done; 4682 // Compare the destination lanes with float_sign_flip 4683 // value to get mask for all special values. 4684 movdqu(xtmp1, float_sign_flip, rscratch); 4685 vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit); 4686 ptest(xtmp2, xtmp2); 4687 jccb(Assembler::equal, done); 4688 4689 // Flip float_sign_flip to get max integer value. 4690 vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit); 4691 pxor(xtmp1, xtmp4); 4692 4693 // Set destination lanes corresponding to unordered source lanes as zero. 4694 vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc); 4695 vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc); 4696 4697 // Shuffle mask vector and pack the lower double word from each quadword lane. 4698 vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc); 4699 vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit); 4700 4701 // Recompute the mask for remaining special value. 4702 pxor(xtmp2, xtmp3); 4703 // Extract mask corresponding to non-negative source lanes. 4704 vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc); 4705 4706 // Shuffle mask vector and pack the lower double word from each quadword lane. 4707 vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc); 4708 pand(xtmp3, xtmp2); 4709 4710 // Replace destination lanes holding special value (0x80000000) with max int 4711 // if corresponding source lane holds a +ve value.
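// A minimal scalar sketch of the overall fix-up this routine performs (illustrative
// only, not emitted code): after vcvttpd2dq every special source lane holds
// 0x80000000 and is patched as
//
//   int fixup(double d, int raw) {                   // raw == result of the truncating cast
//     if (raw != (int)0x80000000) return raw;        // ordinary value, nothing to do
//     if (d != d)                 return 0;          // NaN
//     if (d >= 0.0)               return 0x7FFFFFFF; // +Inf or value >= Integer.MAX_VALUE
//     return (int)0x80000000;                        // -Inf or value <= Integer.MIN_VALUE
//   }
//
// The vblendvps below implements the MAX_VALUE replacement for the remaining
// special lanes whose source value is non-negative.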
4712 vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit); 4713 bind(done); 4714 } 4715 4716 4717 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero, 4718 XMMRegister xtmp, Register rscratch, int vec_enc) { 4719 switch(to_elem_bt) { 4720 case T_SHORT: 4721 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing"); 4722 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch); 4723 vpackusdw(dst, dst, zero, vec_enc); 4724 if (vec_enc == Assembler::AVX_256bit) { 4725 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc); 4726 } 4727 break; 4728 case T_BYTE: 4729 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing"); 4730 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch); 4731 vpackusdw(dst, dst, zero, vec_enc); 4732 if (vec_enc == Assembler::AVX_256bit) { 4733 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc); 4734 } 4735 vpackuswb(dst, dst, zero, vec_enc); 4736 break; 4737 default: assert(false, "%s", type2name(to_elem_bt)); 4738 } 4739 } 4740 4741 /* 4742 * Algorithm for vector D2L and F2I conversions:- 4743 * a) Perform vector D2L/F2I cast. 4744 * b) Choose fast path if none of the result vector lane contains 0x80000000 value. 4745 * It signifies that source value could be any of the special floating point 4746 * values(NaN,-Inf,Inf,Max,-Min). 4747 * c) Set destination to zero if source is NaN value. 4748 * d) Replace 0x80000000 with MaxInt if source lane contains a +ve value. 4749 */ 4750 4751 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4752 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 4753 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 4754 int to_elem_sz = type2aelembytes(to_elem_bt); 4755 assert(to_elem_sz <= 4, ""); 4756 vcvttps2dq(dst, src, vec_enc); 4757 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc); 4758 if (to_elem_sz < 4) { 4759 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 4760 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc); 4761 } 4762 } 4763 4764 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4765 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip, 4766 Register rscratch, int vec_enc) { 4767 int to_elem_sz = type2aelembytes(to_elem_bt); 4768 assert(to_elem_sz <= 4, ""); 4769 vcvttps2dq(dst, src, vec_enc); 4770 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc); 4771 switch(to_elem_bt) { 4772 case T_INT: 4773 break; 4774 case T_SHORT: 4775 evpmovdw(dst, dst, vec_enc); 4776 break; 4777 case T_BYTE: 4778 evpmovdb(dst, dst, vec_enc); 4779 break; 4780 default: assert(false, "%s", type2name(to_elem_bt)); 4781 } 4782 } 4783 4784 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 4785 KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip, 4786 Register rscratch, int vec_enc) { 4787 evcvttps2qq(dst, src, vec_enc); 4788 vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, 
vec_enc); 4789 } 4790 4791 // Handling for downcasting from double to integer or sub-word types on AVX2. 4792 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4793 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, 4794 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 4795 int to_elem_sz = type2aelembytes(to_elem_bt); 4796 assert(to_elem_sz < 8, ""); 4797 vcvttpd2dq(dst, src, vec_enc); 4798 vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch, 4799 float_sign_flip, vec_enc); 4800 if (to_elem_sz < 4) { 4801 // xtmp4 holds all zero lanes. 4802 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit); 4803 } 4804 } 4805 4806 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, 4807 XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, 4808 KRegister ktmp2, AddressLiteral sign_flip, 4809 Register rscratch, int vec_enc) { 4810 if (VM_Version::supports_avx512dq()) { 4811 evcvttpd2qq(dst, src, vec_enc); 4812 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 4813 switch(to_elem_bt) { 4814 case T_LONG: 4815 break; 4816 case T_INT: 4817 evpmovsqd(dst, dst, vec_enc); 4818 break; 4819 case T_SHORT: 4820 evpmovsqd(dst, dst, vec_enc); 4821 evpmovdw(dst, dst, vec_enc); 4822 break; 4823 case T_BYTE: 4824 evpmovsqd(dst, dst, vec_enc); 4825 evpmovdb(dst, dst, vec_enc); 4826 break; 4827 default: assert(false, "%s", type2name(to_elem_bt)); 4828 } 4829 } else { 4830 assert(type2aelembytes(to_elem_bt) <= 4, ""); 4831 vcvttpd2dq(dst, src, vec_enc); 4832 vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 4833 switch(to_elem_bt) { 4834 case T_INT: 4835 break; 4836 case T_SHORT: 4837 evpmovdw(dst, dst, vec_enc); 4838 break; 4839 case T_BYTE: 4840 evpmovdb(dst, dst, vec_enc); 4841 break; 4842 default: assert(false, "%s", type2name(to_elem_bt)); 4843 } 4844 } 4845 } 4846 4847 #ifdef _LP64 4848 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src, 4849 AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 4850 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 4851 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 4852 // and re-instantiate original MXCSR.RC mode after that. 4853 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 4854 4855 mov64(tmp, julong_cast(0.5L)); 4856 evpbroadcastq(xtmp1, tmp, vec_enc); 4857 vaddpd(xtmp1, src , xtmp1, vec_enc); 4858 evcvtpd2qq(dst, xtmp1, vec_enc); 4859 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 4860 double_sign_flip, vec_enc);; 4861 4862 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 4863 } 4864 4865 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src, 4866 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 4867 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 4868 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 4869 // and re-instantiate original MXCSR.RC mode after that. 
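// A minimal scalar sketch of the sequence emitted below (illustrative only),
// assuming new_mxcsr selects the round-toward-negative-infinity rounding mode;
// set_rounding_mode, convert_to_int and fix_special_lanes are hypothetical helpers
// named only to mirror the instructions:
//
//   jint round_float(float f) {
//     set_rounding_mode(ROUND_DOWN);         // ldmxcsr(new_mxcsr)
//     jint r = convert_to_int(f + 0.5f);     // vaddps + vcvtps2dq == floor(f + 0.5)
//     r = fix_special_lanes(r, f);           // NaN/Inf/out-of-range handling
//     set_rounding_mode(ROUND_TO_NEAREST);   // ldmxcsr(addr_mxcsr_std())
//     return r;
//   }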
4870 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 4871 4872 movl(tmp, jint_cast(0.5)); 4873 movq(xtmp1, tmp); 4874 vbroadcastss(xtmp1, xtmp1, vec_enc); 4875 vaddps(xtmp1, src , xtmp1, vec_enc); 4876 vcvtps2dq(dst, xtmp1, vec_enc); 4877 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 4878 float_sign_flip, vec_enc); 4879 4880 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 4881 } 4882 4883 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src, 4884 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 4885 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) { 4886 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 4887 // and re-instantiate original MXCSR.RC mode after that. 4888 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 4889 4890 movl(tmp, jint_cast(0.5)); 4891 movq(xtmp1, tmp); 4892 vbroadcastss(xtmp1, xtmp1, vec_enc); 4893 vaddps(xtmp1, src , xtmp1, vec_enc); 4894 vcvtps2dq(dst, xtmp1, vec_enc); 4895 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc); 4896 4897 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 4898 } 4899 #endif // _LP64 4900 4901 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 4902 BasicType from_elem_bt, BasicType to_elem_bt) { 4903 switch (from_elem_bt) { 4904 case T_BYTE: 4905 switch (to_elem_bt) { 4906 case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break; 4907 case T_INT: vpmovzxbd(dst, src, vlen_enc); break; 4908 case T_LONG: vpmovzxbq(dst, src, vlen_enc); break; 4909 default: ShouldNotReachHere(); 4910 } 4911 break; 4912 case T_SHORT: 4913 switch (to_elem_bt) { 4914 case T_INT: vpmovzxwd(dst, src, vlen_enc); break; 4915 case T_LONG: vpmovzxwq(dst, src, vlen_enc); break; 4916 default: ShouldNotReachHere(); 4917 } 4918 break; 4919 case T_INT: 4920 assert(to_elem_bt == T_LONG, ""); 4921 vpmovzxdq(dst, src, vlen_enc); 4922 break; 4923 default: 4924 ShouldNotReachHere(); 4925 } 4926 } 4927 4928 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 4929 BasicType from_elem_bt, BasicType to_elem_bt) { 4930 switch (from_elem_bt) { 4931 case T_BYTE: 4932 switch (to_elem_bt) { 4933 case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break; 4934 case T_INT: vpmovsxbd(dst, src, vlen_enc); break; 4935 case T_LONG: vpmovsxbq(dst, src, vlen_enc); break; 4936 default: ShouldNotReachHere(); 4937 } 4938 break; 4939 case T_SHORT: 4940 switch (to_elem_bt) { 4941 case T_INT: vpmovsxwd(dst, src, vlen_enc); break; 4942 case T_LONG: vpmovsxwq(dst, src, vlen_enc); break; 4943 default: ShouldNotReachHere(); 4944 } 4945 break; 4946 case T_INT: 4947 assert(to_elem_bt == T_LONG, ""); 4948 vpmovsxdq(dst, src, vlen_enc); 4949 break; 4950 default: 4951 ShouldNotReachHere(); 4952 } 4953 } 4954 4955 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src, 4956 BasicType dst_bt, BasicType src_bt, int vlen) { 4957 int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen); 4958 assert(vlen_enc != AVX_512bit, ""); 4959 4960 int dst_bt_size = type2aelembytes(dst_bt); 4961 int src_bt_size = type2aelembytes(src_bt); 4962 if (dst_bt_size > src_bt_size) { 4963 switch (dst_bt_size / src_bt_size) { 4964 case 2: vpmovsxbw(dst, src, vlen_enc); break; 4965 case 4: vpmovsxbd(dst, src, 
vlen_enc); break; 4966 case 8: vpmovsxbq(dst, src, vlen_enc); break; 4967 default: ShouldNotReachHere(); 4968 } 4969 } else { 4970 assert(dst_bt_size < src_bt_size, ""); 4971 switch (src_bt_size / dst_bt_size) { 4972 case 2: { 4973 if (vlen_enc == AVX_128bit) { 4974 vpacksswb(dst, src, src, vlen_enc); 4975 } else { 4976 vpacksswb(dst, src, src, vlen_enc); 4977 vpermq(dst, dst, 0x08, vlen_enc); 4978 } 4979 break; 4980 } 4981 case 4: { 4982 if (vlen_enc == AVX_128bit) { 4983 vpackssdw(dst, src, src, vlen_enc); 4984 vpacksswb(dst, dst, dst, vlen_enc); 4985 } else { 4986 vpackssdw(dst, src, src, vlen_enc); 4987 vpermq(dst, dst, 0x08, vlen_enc); 4988 vpacksswb(dst, dst, dst, AVX_128bit); 4989 } 4990 break; 4991 } 4992 case 8: { 4993 if (vlen_enc == AVX_128bit) { 4994 vpshufd(dst, src, 0x08, vlen_enc); 4995 vpackssdw(dst, dst, dst, vlen_enc); 4996 vpacksswb(dst, dst, dst, vlen_enc); 4997 } else { 4998 vpshufd(dst, src, 0x08, vlen_enc); 4999 vpermq(dst, dst, 0x08, vlen_enc); 5000 vpackssdw(dst, dst, dst, AVX_128bit); 5001 vpacksswb(dst, dst, dst, AVX_128bit); 5002 } 5003 break; 5004 } 5005 default: ShouldNotReachHere(); 5006 } 5007 } 5008 } 5009 5010 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3, 5011 bool merge, BasicType bt, int vlen_enc) { 5012 if (bt == T_INT) { 5013 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 5014 } else { 5015 assert(bt == T_LONG, ""); 5016 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5017 } 5018 } 5019 5020 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3, 5021 bool merge, BasicType bt, int vlen_enc) { 5022 if (bt == T_INT) { 5023 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 5024 } else { 5025 assert(bt == T_LONG, ""); 5026 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5027 } 5028 } 5029 5030 #ifdef _LP64 5031 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1, 5032 Register rtmp2, XMMRegister xtmp, int mask_len, 5033 int vec_enc) { 5034 int index = 0; 5035 int vindex = 0; 5036 mov64(rtmp1, 0x0101010101010101L); 5037 pdepq(rtmp1, src, rtmp1); 5038 if (mask_len > 8) { 5039 movq(rtmp2, src); 5040 vpxor(xtmp, xtmp, xtmp, vec_enc); 5041 movq(xtmp, rtmp1); 5042 } 5043 movq(dst, rtmp1); 5044 5045 mask_len -= 8; 5046 while (mask_len > 0) { 5047 assert ((mask_len & 0x7) == 0, "mask must be multiple of 8"); 5048 index++; 5049 if ((index % 2) == 0) { 5050 pxor(xtmp, xtmp); 5051 } 5052 mov64(rtmp1, 0x0101010101010101L); 5053 shrq(rtmp2, 8); 5054 pdepq(rtmp1, rtmp2, rtmp1); 5055 pinsrq(xtmp, rtmp1, index % 2); 5056 vindex = index / 2; 5057 if (vindex) { 5058 // Write entire 16 byte vector when both 64 bit 5059 // lanes are update to save redundant instructions. 
5060 if (index % 2) { 5061 vinsertf128(dst, dst, xtmp, vindex); 5062 } 5063 } else { 5064 vmovdqu(dst, xtmp); 5065 } 5066 mask_len -= 8; 5067 } 5068 } 5069 5070 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) { 5071 switch(opc) { 5072 case Op_VectorMaskTrueCount: 5073 popcntq(dst, tmp); 5074 break; 5075 case Op_VectorMaskLastTrue: 5076 if (VM_Version::supports_lzcnt()) { 5077 lzcntq(tmp, tmp); 5078 movl(dst, 63); 5079 subl(dst, tmp); 5080 } else { 5081 movl(dst, -1); 5082 bsrq(tmp, tmp); 5083 cmov32(Assembler::notZero, dst, tmp); 5084 } 5085 break; 5086 case Op_VectorMaskFirstTrue: 5087 if (VM_Version::supports_bmi1()) { 5088 if (masklen < 32) { 5089 orl(tmp, 1 << masklen); 5090 tzcntl(dst, tmp); 5091 } else if (masklen == 32) { 5092 tzcntl(dst, tmp); 5093 } else { 5094 assert(masklen == 64, ""); 5095 tzcntq(dst, tmp); 5096 } 5097 } else { 5098 if (masklen < 32) { 5099 orl(tmp, 1 << masklen); 5100 bsfl(dst, tmp); 5101 } else { 5102 assert(masklen == 32 || masklen == 64, ""); 5103 movl(dst, masklen); 5104 if (masklen == 32) { 5105 bsfl(tmp, tmp); 5106 } else { 5107 bsfq(tmp, tmp); 5108 } 5109 cmov32(Assembler::notZero, dst, tmp); 5110 } 5111 } 5112 break; 5113 case Op_VectorMaskToLong: 5114 assert(dst == tmp, "Dst and tmp should be the same for toLong operations"); 5115 break; 5116 default: assert(false, "Unhandled mask operation"); 5117 } 5118 } 5119 5120 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp, 5121 int masklen, int masksize, int vec_enc) { 5122 assert(VM_Version::supports_popcnt(), ""); 5123 5124 if(VM_Version::supports_avx512bw()) { 5125 kmovql(tmp, mask); 5126 } else { 5127 assert(masklen <= 16, ""); 5128 kmovwl(tmp, mask); 5129 } 5130 5131 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5132 // operations needs to be clipped. 5133 if (masksize < 16 && opc != Op_VectorMaskFirstTrue) { 5134 andq(tmp, (1 << masklen) - 1); 5135 } 5136 5137 vector_mask_operation_helper(opc, dst, tmp, masklen); 5138 } 5139 5140 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp, 5141 Register tmp, int masklen, BasicType bt, int vec_enc) { 5142 assert(vec_enc == AVX_128bit && VM_Version::supports_avx() || 5143 vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4), ""); 5144 assert(VM_Version::supports_popcnt(), ""); 5145 5146 bool need_clip = false; 5147 switch(bt) { 5148 case T_BOOLEAN: 5149 // While masks of other types contain 0, -1; boolean masks contain lane values of 0, 1 5150 vpxor(xtmp, xtmp, xtmp, vec_enc); 5151 vpsubb(xtmp, xtmp, mask, vec_enc); 5152 vpmovmskb(tmp, xtmp, vec_enc); 5153 need_clip = masklen < 16; 5154 break; 5155 case T_BYTE: 5156 vpmovmskb(tmp, mask, vec_enc); 5157 need_clip = masklen < 16; 5158 break; 5159 case T_SHORT: 5160 vpacksswb(xtmp, mask, mask, vec_enc); 5161 if (masklen >= 16) { 5162 vpermpd(xtmp, xtmp, 8, vec_enc); 5163 } 5164 vpmovmskb(tmp, xtmp, Assembler::AVX_128bit); 5165 need_clip = masklen < 16; 5166 break; 5167 case T_INT: 5168 case T_FLOAT: 5169 vmovmskps(tmp, mask, vec_enc); 5170 need_clip = masklen < 4; 5171 break; 5172 case T_LONG: 5173 case T_DOUBLE: 5174 vmovmskpd(tmp, mask, vec_enc); 5175 need_clip = masklen < 2; 5176 break; 5177 default: assert(false, "Unhandled type, %s", type2name(bt)); 5178 } 5179 5180 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5181 // operations needs to be clipped. 
5182 if (need_clip && opc != Op_VectorMaskFirstTrue) { 5183 // need_clip implies masklen < 32 5184 andq(tmp, (1 << masklen) - 1); 5185 } 5186 5187 vector_mask_operation_helper(opc, dst, tmp, masklen); 5188 } 5189 5190 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1, 5191 Register rtmp2, int mask_len) { 5192 kmov(rtmp1, src); 5193 andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len))); 5194 mov64(rtmp2, -1L); 5195 pextq(rtmp2, rtmp2, rtmp1); 5196 kmov(dst, rtmp2); 5197 } 5198 5199 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask, 5200 bool merge, BasicType bt, int vec_enc) { 5201 if (opcode == Op_CompressV) { 5202 switch(bt) { 5203 case T_BYTE: 5204 evpcompressb(dst, mask, src, merge, vec_enc); 5205 break; 5206 case T_CHAR: 5207 case T_SHORT: 5208 evpcompressw(dst, mask, src, merge, vec_enc); 5209 break; 5210 case T_INT: 5211 evpcompressd(dst, mask, src, merge, vec_enc); 5212 break; 5213 case T_FLOAT: 5214 evcompressps(dst, mask, src, merge, vec_enc); 5215 break; 5216 case T_LONG: 5217 evpcompressq(dst, mask, src, merge, vec_enc); 5218 break; 5219 case T_DOUBLE: 5220 evcompresspd(dst, mask, src, merge, vec_enc); 5221 break; 5222 default: 5223 fatal("Unsupported type %s", type2name(bt)); 5224 break; 5225 } 5226 } else { 5227 assert(opcode == Op_ExpandV, ""); 5228 switch(bt) { 5229 case T_BYTE: 5230 evpexpandb(dst, mask, src, merge, vec_enc); 5231 break; 5232 case T_CHAR: 5233 case T_SHORT: 5234 evpexpandw(dst, mask, src, merge, vec_enc); 5235 break; 5236 case T_INT: 5237 evpexpandd(dst, mask, src, merge, vec_enc); 5238 break; 5239 case T_FLOAT: 5240 evexpandps(dst, mask, src, merge, vec_enc); 5241 break; 5242 case T_LONG: 5243 evpexpandq(dst, mask, src, merge, vec_enc); 5244 break; 5245 case T_DOUBLE: 5246 evexpandpd(dst, mask, src, merge, vec_enc); 5247 break; 5248 default: 5249 fatal("Unsupported type %s", type2name(bt)); 5250 break; 5251 } 5252 } 5253 } 5254 #endif 5255 5256 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5257 KRegister ktmp1, int vec_enc) { 5258 if (opcode == Op_SignumVD) { 5259 vsubpd(dst, zero, one, vec_enc); 5260 // if src < 0 ? -1 : 1 5261 evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5262 evblendmpd(dst, ktmp1, one, dst, true, vec_enc); 5263 // if src == NaN, -0.0 or 0.0 return src. 5264 evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5265 evblendmpd(dst, ktmp1, dst, src, true, vec_enc); 5266 } else { 5267 assert(opcode == Op_SignumVF, ""); 5268 vsubps(dst, zero, one, vec_enc); 5269 // if src < 0 ? -1 : 1 5270 evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5271 evblendmps(dst, ktmp1, one, dst, true, vec_enc); 5272 // if src == NaN, -0.0 or 0.0 return src. 5273 evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5274 evblendmps(dst, ktmp1, dst, src, true, vec_enc); 5275 } 5276 } 5277 5278 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5279 XMMRegister xtmp1, int vec_enc) { 5280 if (opcode == Op_SignumVD) { 5281 vsubpd(dst, zero, one, vec_enc); 5282 // if src < 0 ? -1 : 1 5283 vblendvpd(dst, one, dst, src, vec_enc); 5284 // if src == NaN, -0.0 or 0.0 return src. 5285 vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc); 5286 vblendvpd(dst, dst, src, xtmp1, vec_enc); 5287 } else { 5288 assert(opcode == Op_SignumVF, ""); 5289 vsubps(dst, zero, one, vec_enc); 5290 // if src < 0 ? 
-1 : 1 5291 vblendvps(dst, one, dst, src, vec_enc); 5292 // if src == NaN, -0.0 or 0.0 return src. 5293 vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc); 5294 vblendvps(dst, dst, src, xtmp1, vec_enc); 5295 } 5296 } 5297 5298 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) { 5299 if (VM_Version::supports_avx512bw()) { 5300 if (mask_len > 32) { 5301 kmovql(dst, src); 5302 } else { 5303 kmovdl(dst, src); 5304 if (mask_len != 32) { 5305 kshiftrdl(dst, dst, 32 - mask_len); 5306 } 5307 } 5308 } else { 5309 assert(mask_len <= 16, ""); 5310 kmovwl(dst, src); 5311 if (mask_len != 16) { 5312 kshiftrwl(dst, dst, 16 - mask_len); 5313 } 5314 } 5315 } 5316 5317 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) { 5318 int lane_size = type2aelembytes(bt); 5319 bool is_LP64 = LP64_ONLY(true) NOT_LP64(false); 5320 if ((is_LP64 || lane_size < 8) && 5321 ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) || 5322 (is_subword_type(bt) && VM_Version::supports_avx512vlbw()))) { 5323 movptr(rtmp, imm32); 5324 switch(lane_size) { 5325 case 1 : evpbroadcastb(dst, rtmp, vec_enc); break; 5326 case 2 : evpbroadcastw(dst, rtmp, vec_enc); break; 5327 case 4 : evpbroadcastd(dst, rtmp, vec_enc); break; 5328 case 8 : evpbroadcastq(dst, rtmp, vec_enc); break; 5329 default : fatal("Unsupported lane size %d", lane_size); 5330 break; 5331 } 5332 } else { 5333 movptr(rtmp, imm32); 5334 LP64_ONLY(movq(dst, rtmp)) NOT_LP64(movdl(dst, rtmp)); 5335 switch(lane_size) { 5336 case 1 : vpbroadcastb(dst, dst, vec_enc); break; 5337 case 2 : vpbroadcastw(dst, dst, vec_enc); break; 5338 case 4 : vpbroadcastd(dst, dst, vec_enc); break; 5339 case 8 : vpbroadcastq(dst, dst, vec_enc); break; 5340 default : fatal("Unsupported lane size %d", lane_size); 5341 break; 5342 } 5343 } 5344 } 5345 5346 // 5347 // Following is lookup table based popcount computation algorithm:- 5348 // Index Bit set count 5349 // [ 0000 -> 0, 5350 // 0001 -> 1, 5351 // 0010 -> 1, 5352 // 0011 -> 2, 5353 // 0100 -> 1, 5354 // 0101 -> 2, 5355 // 0110 -> 2, 5356 // 0111 -> 3, 5357 // 1000 -> 1, 5358 // 1001 -> 2, 5359 // 1010 -> 2, 5360 // 1011 -> 3, 5361 // 1100 -> 2, 5362 // 1101 -> 3, 5363 // 1110 -> 3, 1111 -> 4 ] 5364 // a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as 5365 // shuffle indices for lookup table access. 5366 // b. Right shift each byte of vector lane by 4 positions. 5367 // c. Count the number of 1s in 4 MSB bits of each byte. These bits are used as 5368 // shuffle indices for lookup table access. 5369 // d. Add the bitset count of upper and lower 4 bits of each byte. 5370 // e. Unpack double words to quad words and compute sum of absolute difference of bitset 5371 // count of all the bytes of a quadword. 5372 // f. Perform step e. for upper 128bit vector lane. 5373 // g. Pack the bitset count of quadwords back to double word. 5374 // h. Unpacking and packing operations are not needed for 64bit vector lane.
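// A minimal scalar sketch of the steps above (illustrative only), assuming
// StubRoutines::x86::vector_popcount_lut() holds the 16-entry table shown as LUT;
// the vector code performs the same per-nibble lookup with vpshufb and sums the
// byte counts of a quadword with vpsadbw:
//
//   static const uint8_t LUT[16] = { 0, 1, 1, 2, 1, 2, 2, 3,
//                                    1, 2, 2, 3, 2, 3, 3, 4 };
//   uint8_t popcount_byte(uint8_t b) {
//     return LUT[b & 0x0F] + LUT[b >> 4];            // steps a-d
//   }
//   uint64_t popcount_quadword(uint64_t q) {         // steps e-g
//     uint64_t sum = 0;
//     for (int i = 0; i < 8; i++) {
//       sum += popcount_byte((uint8_t)(q >> (8 * i)));
//     }
//     return sum;
//   }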
5375 5376 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5377 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5378 assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), ""); 5379 vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc); 5380 vpsrlw(dst, src, 4, vec_enc); 5381 vpand(dst, dst, xtmp1, vec_enc); 5382 vpand(xtmp1, src, xtmp1, vec_enc); 5383 vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg); 5384 vpshufb(xtmp1, xtmp2, xtmp1, vec_enc); 5385 vpshufb(dst, xtmp2, dst, vec_enc); 5386 vpaddb(dst, dst, xtmp1, vec_enc); 5387 } 5388 5389 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5390 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5391 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5392 // Following code is as per steps e,f,g and h of above algorithm. 5393 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5394 vpunpckhdq(dst, xtmp1, xtmp2, vec_enc); 5395 vpsadbw(dst, dst, xtmp2, vec_enc); 5396 vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc); 5397 vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc); 5398 vpackuswb(dst, xtmp1, dst, vec_enc); 5399 } 5400 5401 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5402 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5403 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5404 // Add the popcount of upper and lower bytes of word. 5405 vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc); 5406 vpsrlw(dst, xtmp1, 8, vec_enc); 5407 vpand(xtmp1, xtmp1, xtmp2, vec_enc); 5408 vpaddw(dst, dst, xtmp1, vec_enc); 5409 } 5410 5411 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5412 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5413 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5414 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5415 vpsadbw(dst, xtmp1, xtmp2, vec_enc); 5416 } 5417 5418 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5419 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5420 switch(bt) { 5421 case T_LONG: 5422 vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5423 break; 5424 case T_INT: 5425 vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5426 break; 5427 case T_CHAR: 5428 case T_SHORT: 5429 vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5430 break; 5431 case T_BYTE: 5432 case T_BOOLEAN: 5433 vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5434 break; 5435 default: 5436 fatal("Unsupported type %s", type2name(bt)); 5437 break; 5438 } 5439 } 5440 5441 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src, 5442 KRegister mask, bool merge, int vec_enc) { 5443 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 5444 switch(bt) { 5445 case T_LONG: 5446 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 5447 evpopcntq(dst, mask, src, merge, vec_enc); 5448 break; 5449 case T_INT: 5450 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 5451 evpopcntd(dst, mask, src, merge, vec_enc); 5452 break; 5453 case T_CHAR: 5454 case T_SHORT: 5455 assert(VM_Version::supports_avx512_bitalg(), ""); 5456 evpopcntw(dst, mask, src, merge, vec_enc); 5457 break; 5458 case T_BYTE: 5459 case T_BOOLEAN: 5460 assert(VM_Version::supports_avx512_bitalg(), ""); 5461 evpopcntb(dst, mask, 
src, merge, vec_enc); 5462 break; 5463 default: 5464 fatal("Unsupported type %s", type2name(bt)); 5465 break; 5466 } 5467 } 5468 5469 #ifndef _LP64 5470 void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) { 5471 assert(VM_Version::supports_avx512bw(), ""); 5472 kmovdl(tmp, src); 5473 kunpckdql(dst, tmp, tmp); 5474 } 5475 #endif 5476 5477 // Bit reversal algorithm first reverses the bits of each byte followed by 5478 // a byte level reversal for multi-byte primitive types (short/int/long). 5479 // Algorithm performs a lookup table access to get reverse bit sequence 5480 // corresponding to a 4 bit value. Thus a reverse bit sequence for a byte 5481 // is obtained by swapping the reverse bit sequences of upper and lower 5482 // nibble of a byte. 5483 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5484 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5485 if (VM_Version::supports_avx512vlbw()) { 5486 5487 // Get the reverse bit sequence of lower nibble of each byte. 5488 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg); 5489 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 5490 evpandq(dst, xtmp2, src, vec_enc); 5491 vpshufb(dst, xtmp1, dst, vec_enc); 5492 vpsllq(dst, dst, 4, vec_enc); 5493 5494 // Get the reverse bit sequence of upper nibble of each byte. 5495 vpandn(xtmp2, xtmp2, src, vec_enc); 5496 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 5497 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 5498 5499 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 5500 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 5501 evporq(xtmp2, dst, xtmp2, vec_enc); 5502 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 5503 5504 } else if(vec_enc == Assembler::AVX_512bit) { 5505 // Shift based bit reversal. 5506 assert(bt == T_LONG || bt == T_INT, ""); 5507 5508 // Swap lower and upper nibble of each byte. 5509 vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc); 5510 5511 // Swap two least and most significant bits of each nibble. 5512 vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc); 5513 5514 // Swap adjacent pair of bits. 5515 evmovdqul(xtmp1, k0, dst, true, vec_enc); 5516 vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc); 5517 5518 evmovdqul(xtmp1, k0, dst, true, vec_enc); 5519 vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc); 5520 } else { 5521 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp); 5522 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 5523 5524 // Get the reverse bit sequence of lower nibble of each byte. 5525 vpand(dst, xtmp2, src, vec_enc); 5526 vpshufb(dst, xtmp1, dst, vec_enc); 5527 vpsllq(dst, dst, 4, vec_enc); 5528 5529 // Get the reverse bit sequence of upper nibble of each byte. 5530 vpandn(xtmp2, xtmp2, src, vec_enc); 5531 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 5532 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 5533 5534 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 5535 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 
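// A minimal scalar sketch of the per-byte reversal assembled above (illustrative
// only), assuming vector_reverse_bit_lut() holds the 4-bit reverse table shown as
// REV4; the vpor below is the final OR of the sketch:
//
//   static const uint8_t REV4[16] = { 0x0, 0x8, 0x4, 0xC, 0x2, 0xA, 0x6, 0xE,
//                                     0x1, 0x9, 0x5, 0xD, 0x3, 0xB, 0x7, 0xF };
//   uint8_t reverse_byte(uint8_t b) {
//     return (uint8_t)((REV4[b & 0x0F] << 4) | REV4[b >> 4]);
//   }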
5536 vpor(xtmp2, dst, xtmp2, vec_enc); 5537 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 5538 } 5539 } 5540 5541 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc, 5542 XMMRegister xtmp, Register rscratch) { 5543 assert(VM_Version::supports_gfni(), ""); 5544 assert(rscratch != noreg || always_reachable(mask), "missing"); 5545 5546 // Galois field instruction based bit reversal based on following algorithm. 5547 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 5548 vpbroadcastq(xtmp, mask, vec_enc, rscratch); 5549 vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc); 5550 vector_reverse_byte(bt, dst, xtmp, vec_enc); 5551 } 5552 5553 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src, 5554 XMMRegister xtmp1, Register rtmp, int vec_enc) { 5555 vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc); 5556 evpandq(dst, xtmp1, src, vec_enc); 5557 vpsllq(dst, dst, nbits, vec_enc); 5558 vpandn(xtmp1, xtmp1, src, vec_enc); 5559 vpsrlq(xtmp1, xtmp1, nbits, vec_enc); 5560 evporq(dst, dst, xtmp1, vec_enc); 5561 } 5562 5563 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5564 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5565 // Shift based bit reversal. 5566 assert(VM_Version::supports_evex(), ""); 5567 switch(bt) { 5568 case T_LONG: 5569 // Swap upper and lower double word of each quad word. 5570 evprorq(xtmp1, k0, src, 32, true, vec_enc); 5571 evprord(xtmp1, k0, xtmp1, 16, true, vec_enc); 5572 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 5573 break; 5574 case T_INT: 5575 // Swap upper and lower word of each double word. 5576 evprord(xtmp1, k0, src, 16, true, vec_enc); 5577 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 5578 break; 5579 case T_CHAR: 5580 case T_SHORT: 5581 // Swap upper and lower byte of each word. 5582 vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc); 5583 break; 5584 case T_BYTE: 5585 evmovdquq(dst, k0, src, true, vec_enc); 5586 break; 5587 default: 5588 fatal("Unsupported type %s", type2name(bt)); 5589 break; 5590 } 5591 } 5592 5593 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) { 5594 if (bt == T_BYTE) { 5595 if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) { 5596 evmovdquq(dst, k0, src, true, vec_enc); 5597 } else { 5598 vmovdqu(dst, src); 5599 } 5600 return; 5601 } 5602 // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using 5603 // pre-computed shuffle indices. 
5604 switch(bt) { 5605 case T_LONG: 5606 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg); 5607 break; 5608 case T_INT: 5609 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg); 5610 break; 5611 case T_CHAR: 5612 case T_SHORT: 5613 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg); 5614 break; 5615 default: 5616 fatal("Unsupported type %s", type2name(bt)); 5617 break; 5618 } 5619 vpshufb(dst, src, dst, vec_enc); 5620 } 5621 5622 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src, 5623 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 5624 KRegister ktmp, Register rtmp, bool merge, int vec_enc) { 5625 assert(is_integral_type(bt), ""); 5626 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 5627 assert(VM_Version::supports_avx512cd(), ""); 5628 switch(bt) { 5629 case T_LONG: 5630 evplzcntq(dst, ktmp, src, merge, vec_enc); 5631 break; 5632 case T_INT: 5633 evplzcntd(dst, ktmp, src, merge, vec_enc); 5634 break; 5635 case T_SHORT: 5636 vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc); 5637 vpunpcklwd(xtmp2, xtmp1, src, vec_enc); 5638 evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc); 5639 vpunpckhwd(dst, xtmp1, src, vec_enc); 5640 evplzcntd(dst, ktmp, dst, merge, vec_enc); 5641 vpackusdw(dst, xtmp2, dst, vec_enc); 5642 break; 5643 case T_BYTE: 5644 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 5645 // accessing the lookup table. 5646 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 5647 // accessing the lookup table. 5648 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 5649 assert(VM_Version::supports_avx512bw(), ""); 5650 evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp); 5651 vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc); 5652 vpand(xtmp2, dst, src, vec_enc); 5653 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 5654 vpsrlw(xtmp3, src, 4, vec_enc); 5655 vpand(xtmp3, dst, xtmp3, vec_enc); 5656 vpshufb(dst, xtmp1, xtmp3, vec_enc); 5657 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 5658 evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc); 5659 evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc); 5660 break; 5661 default: 5662 fatal("Unsupported type %s", type2name(bt)); 5663 break; 5664 } 5665 } 5666 5667 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5668 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 5669 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp); 5670 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 5671 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 5672 // accessing the lookup table. 5673 vpand(dst, xtmp2, src, vec_enc); 5674 vpshufb(dst, xtmp1, dst, vec_enc); 5675 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 5676 // accessing the lookup table. 5677 vpsrlw(xtmp3, src, 4, vec_enc); 5678 vpand(xtmp3, xtmp2, xtmp3, vec_enc); 5679 vpshufb(xtmp2, xtmp1, xtmp3, vec_enc); 5680 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 
5681 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 5682 vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc); 5683 vpaddb(dst, dst, xtmp2, vec_enc); 5684 vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc); 5685 } 5686 5687 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5688 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 5689 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 5690 // Add zero counts of lower byte and upper byte of a word if 5691 // upper byte holds a zero value. 5692 vpsrlw(xtmp3, src, 8, vec_enc); 5693 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx. 5694 vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc); 5695 vpsllw(xtmp2, dst, 8, vec_enc); 5696 vpaddw(xtmp2, xtmp2, dst, vec_enc); 5697 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 5698 vpsrlw(dst, dst, 8, vec_enc); 5699 } 5700 5701 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5702 XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) { 5703 // Since IEEE 754 floating point format represents mantissa in 1.0 format 5704 // hence biased exponent can be used to compute leading zero count as per 5705 // following formula:- 5706 // LZCNT = 32 - (biased_exp - 127) 5707 // Special handling has been introduced for Zero, Max_Int and -ve source values. 5708 5709 // Broadcast 0xFF 5710 vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc); 5711 vpsrld(xtmp1, xtmp1, 24, vec_enc); 5712 5713 // Extract biased exponent. 5714 vcvtdq2ps(dst, src, vec_enc); 5715 vpsrld(dst, dst, 23, vec_enc); 5716 vpand(dst, dst, xtmp1, vec_enc); 5717 5718 // Broadcast 127. 5719 vpsrld(xtmp1, xtmp1, 1, vec_enc); 5720 // Exponent = biased_exp - 127 5721 vpsubd(dst, dst, xtmp1, vec_enc); 5722 5723 // Exponent = Exponent + 1 5724 vpsrld(xtmp3, xtmp1, 6, vec_enc); 5725 vpaddd(dst, dst, xtmp3, vec_enc); 5726 5727 // Replace -ve exponent with zero, exponent is -ve when src 5728 // lane contains a zero value. 5729 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5730 vblendvps(dst, dst, xtmp2, dst, vec_enc); 5731 5732 // Rematerialize broadcast 32. 5733 vpslld(xtmp1, xtmp3, 5, vec_enc); 5734 // Exponent is 32 if corresponding source lane contains max_int value. 5735 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 5736 // LZCNT = 32 - exponent 5737 vpsubd(dst, xtmp1, dst, vec_enc); 5738 5739 // Replace LZCNT with a value 1 if corresponding source lane 5740 // contains max_int value. 5741 vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc); 5742 5743 // Replace biased_exp with 0 if source lane value is less than zero. 5744 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5745 vblendvps(dst, dst, xtmp2, src, vec_enc); 5746 } 5747 5748 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5749 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 5750 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 5751 // Add zero counts of lower word and upper word of a double word if 5752 // upper word holds a zero value. 5753 vpsrld(xtmp3, src, 16, vec_enc); 5754 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx. 5755 vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc); 5756 vpslld(xtmp2, dst, 16, vec_enc); 5757 vpaddd(xtmp2, xtmp2, dst, vec_enc); 5758 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 5759 vpsrld(dst, dst, 16, vec_enc); 5760 // Add zero counts of lower doubleword and upper doubleword of a 5761 // quadword if upper doubleword holds a zero value. 
5762 vpsrlq(xtmp3, src, 32, vec_enc); 5763 vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc); 5764 vpsllq(xtmp2, dst, 32, vec_enc); 5765 vpaddq(xtmp2, xtmp2, dst, vec_enc); 5766 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 5767 vpsrlq(dst, dst, 32, vec_enc); 5768 } 5769 5770 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, 5771 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 5772 Register rtmp, int vec_enc) { 5773 assert(is_integral_type(bt), "unexpected type"); 5774 assert(vec_enc < Assembler::AVX_512bit, ""); 5775 switch(bt) { 5776 case T_LONG: 5777 vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 5778 break; 5779 case T_INT: 5780 vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc); 5781 break; 5782 case T_SHORT: 5783 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 5784 break; 5785 case T_BYTE: 5786 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 5787 break; 5788 default: 5789 fatal("Unsupported type %s", type2name(bt)); 5790 break; 5791 } 5792 } 5793 5794 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) { 5795 switch(bt) { 5796 case T_BYTE: 5797 vpsubb(dst, src1, src2, vec_enc); 5798 break; 5799 case T_SHORT: 5800 vpsubw(dst, src1, src2, vec_enc); 5801 break; 5802 case T_INT: 5803 vpsubd(dst, src1, src2, vec_enc); 5804 break; 5805 case T_LONG: 5806 vpsubq(dst, src1, src2, vec_enc); 5807 break; 5808 default: 5809 fatal("Unsupported type %s", type2name(bt)); 5810 break; 5811 } 5812 } 5813 5814 // Trailing zero count computation is based on leading zero count operation as per 5815 // following equation. All AVX3 targets support AVX512CD feature which offers 5816 // direct vector instruction to compute leading zero count. 
5817 // CTZ = PRIM_TYPE_WIDHT - CLZ((x - 1) & ~x) 5818 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src, 5819 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 5820 XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) { 5821 assert(is_integral_type(bt), ""); 5822 // xtmp = -1 5823 vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc); 5824 // xtmp = xtmp + src 5825 vpadd(bt, xtmp4, xtmp4, src, vec_enc); 5826 // xtmp = xtmp & ~src 5827 vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc); 5828 vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc); 5829 vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc); 5830 vpsub(bt, dst, xtmp4, dst, vec_enc); 5831 } 5832 5833 // Trailing zero count computation for AVX2 targets is based on popcount operation as per following equation 5834 // CTZ = PRIM_TYPE_WIDHT - POPC(x | -x) 5835 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5836 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 5837 assert(is_integral_type(bt), ""); 5838 // xtmp = 0 5839 vpxor(xtmp3 , xtmp3, xtmp3, vec_enc); 5840 // xtmp = 0 - src 5841 vpsub(bt, xtmp3, xtmp3, src, vec_enc); 5842 // xtmp = xtmp | src 5843 vpor(xtmp3, xtmp3, src, vec_enc); 5844 vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc); 5845 vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc); 5846 vpsub(bt, dst, xtmp1, dst, vec_enc); 5847 } 5848 5849 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) { 5850 Label done; 5851 Label neg_divisor_fastpath; 5852 cmpl(divisor, 0); 5853 jccb(Assembler::less, neg_divisor_fastpath); 5854 xorl(rdx, rdx); 5855 divl(divisor); 5856 jmpb(done); 5857 bind(neg_divisor_fastpath); 5858 // Fastpath for divisor < 0: 5859 // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1) 5860 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned() 5861 movl(rdx, rax); 5862 subl(rdx, divisor); 5863 if (VM_Version::supports_bmi1()) { 5864 andnl(rax, rdx, rax); 5865 } else { 5866 notl(rdx); 5867 andl(rax, rdx); 5868 } 5869 shrl(rax, 31); 5870 bind(done); 5871 } 5872 5873 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) { 5874 Label done; 5875 Label neg_divisor_fastpath; 5876 cmpl(divisor, 0); 5877 jccb(Assembler::less, neg_divisor_fastpath); 5878 xorl(rdx, rdx); 5879 divl(divisor); 5880 jmpb(done); 5881 bind(neg_divisor_fastpath); 5882 // Fastpath when divisor < 0: 5883 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor) 5884 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned() 5885 movl(rdx, rax); 5886 subl(rax, divisor); 5887 if (VM_Version::supports_bmi1()) { 5888 andnl(rax, rax, rdx); 5889 } else { 5890 notl(rax); 5891 andl(rax, rdx); 5892 } 5893 sarl(rax, 31); 5894 andl(rax, divisor); 5895 subl(rdx, rax); 5896 bind(done); 5897 } 5898 5899 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) { 5900 Label done; 5901 Label neg_divisor_fastpath; 5902 5903 cmpl(divisor, 0); 5904 jccb(Assembler::less, neg_divisor_fastpath); 5905 xorl(rdx, rdx); 5906 divl(divisor); 5907 jmpb(done); 5908 bind(neg_divisor_fastpath); 5909 // Fastpath for divisor < 0: 5910 // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1) 5911 // 
remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor) 5912 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in 5913 // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned() 5914 movl(rdx, rax); 5915 subl(rax, divisor); 5916 if (VM_Version::supports_bmi1()) { 5917 andnl(rax, rax, rdx); 5918 } else { 5919 notl(rax); 5920 andl(rax, rdx); 5921 } 5922 movl(tmp, rax); 5923 shrl(rax, 31); // quotient 5924 sarl(tmp, 31); 5925 andl(tmp, divisor); 5926 subl(rdx, tmp); // remainder 5927 bind(done); 5928 } 5929 5930 #ifdef _LP64 5931 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1, 5932 XMMRegister xtmp2, Register rtmp) { 5933 if(VM_Version::supports_gfni()) { 5934 // Galois field instruction based bit reversal based on following algorithm. 5935 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 5936 mov64(rtmp, 0x8040201008040201L); 5937 movq(xtmp1, src); 5938 movq(xtmp2, rtmp); 5939 gf2p8affineqb(xtmp1, xtmp2, 0); 5940 movq(dst, xtmp1); 5941 } else { 5942 // Swap even and odd numbered bits. 5943 movl(rtmp, src); 5944 andl(rtmp, 0x55555555); 5945 shll(rtmp, 1); 5946 movl(dst, src); 5947 andl(dst, 0xAAAAAAAA); 5948 shrl(dst, 1); 5949 orl(dst, rtmp); 5950 5951 // Swap LSB and MSB 2 bits of each nibble. 5952 movl(rtmp, dst); 5953 andl(rtmp, 0x33333333); 5954 shll(rtmp, 2); 5955 andl(dst, 0xCCCCCCCC); 5956 shrl(dst, 2); 5957 orl(dst, rtmp); 5958 5959 // Swap LSB and MSB 4 bits of each byte. 5960 movl(rtmp, dst); 5961 andl(rtmp, 0x0F0F0F0F); 5962 shll(rtmp, 4); 5963 andl(dst, 0xF0F0F0F0); 5964 shrl(dst, 4); 5965 orl(dst, rtmp); 5966 } 5967 bswapl(dst); 5968 } 5969 5970 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1, 5971 XMMRegister xtmp2, Register rtmp1, Register rtmp2) { 5972 if(VM_Version::supports_gfni()) { 5973 // Galois field instruction based bit reversal based on following algorithm. 5974 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 5975 mov64(rtmp1, 0x8040201008040201L); 5976 movq(xtmp1, src); 5977 movq(xtmp2, rtmp1); 5978 gf2p8affineqb(xtmp1, xtmp2, 0); 5979 movq(dst, xtmp1); 5980 } else { 5981 // Swap even and odd numbered bits. 5982 movq(rtmp1, src); 5983 mov64(rtmp2, 0x5555555555555555L); 5984 andq(rtmp1, rtmp2); 5985 shlq(rtmp1, 1); 5986 movq(dst, src); 5987 notq(rtmp2); 5988 andq(dst, rtmp2); 5989 shrq(dst, 1); 5990 orq(dst, rtmp1); 5991 5992 // Swap LSB and MSB 2 bits of each nibble. 5993 movq(rtmp1, dst); 5994 mov64(rtmp2, 0x3333333333333333L); 5995 andq(rtmp1, rtmp2); 5996 shlq(rtmp1, 2); 5997 notq(rtmp2); 5998 andq(dst, rtmp2); 5999 shrq(dst, 2); 6000 orq(dst, rtmp1); 6001 6002 // Swap LSB and MSB 4 bits of each byte. 
6003 movq(rtmp1, dst); 6004 mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL); 6005 andq(rtmp1, rtmp2); 6006 shlq(rtmp1, 4); 6007 notq(rtmp2); 6008 andq(dst, rtmp2); 6009 shrq(dst, 4); 6010 orq(dst, rtmp1); 6011 } 6012 bswapq(dst); 6013 } 6014 6015 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) { 6016 Label done; 6017 Label neg_divisor_fastpath; 6018 cmpq(divisor, 0); 6019 jccb(Assembler::less, neg_divisor_fastpath); 6020 xorl(rdx, rdx); 6021 divq(divisor); 6022 jmpb(done); 6023 bind(neg_divisor_fastpath); 6024 // Fastpath for divisor < 0: 6025 // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1) 6026 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned() 6027 movq(rdx, rax); 6028 subq(rdx, divisor); 6029 if (VM_Version::supports_bmi1()) { 6030 andnq(rax, rdx, rax); 6031 } else { 6032 notq(rdx); 6033 andq(rax, rdx); 6034 } 6035 shrq(rax, 63); 6036 bind(done); 6037 } 6038 6039 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) { 6040 Label done; 6041 Label neg_divisor_fastpath; 6042 cmpq(divisor, 0); 6043 jccb(Assembler::less, neg_divisor_fastpath); 6044 xorq(rdx, rdx); 6045 divq(divisor); 6046 jmp(done); 6047 bind(neg_divisor_fastpath); 6048 // Fastpath when divisor < 0: 6049 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor) 6050 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned() 6051 movq(rdx, rax); 6052 subq(rax, divisor); 6053 if (VM_Version::supports_bmi1()) { 6054 andnq(rax, rax, rdx); 6055 } else { 6056 notq(rax); 6057 andq(rax, rdx); 6058 } 6059 sarq(rax, 63); 6060 andq(rax, divisor); 6061 subq(rdx, rax); 6062 bind(done); 6063 } 6064 6065 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) { 6066 Label done; 6067 Label neg_divisor_fastpath; 6068 cmpq(divisor, 0); 6069 jccb(Assembler::less, neg_divisor_fastpath); 6070 xorq(rdx, rdx); 6071 divq(divisor); 6072 jmp(done); 6073 bind(neg_divisor_fastpath); 6074 // Fastpath for divisor < 0: 6075 // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1) 6076 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor) 6077 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in 6078 // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned() 6079 movq(rdx, rax); 6080 subq(rax, divisor); 6081 if (VM_Version::supports_bmi1()) { 6082 andnq(rax, rax, rdx); 6083 } else { 6084 notq(rax); 6085 andq(rax, rdx); 6086 } 6087 movq(tmp, rax); 6088 shrq(rax, 63); // quotient 6089 sarq(tmp, 63); 6090 andq(tmp, divisor); 6091 subq(rdx, tmp); // remainder 6092 bind(done); 6093 } 6094 #endif 6095 6096 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1, 6097 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp, 6098 int vlen_enc) { 6099 assert(VM_Version::supports_avx512bw(), ""); 6100 // Byte shuffles are inlane operations and indices are determined using 6101 // lower 4 bit of each shuffle lane, thus all shuffle indices are 6102 // normalized to index range 0-15. This makes sure that all the multiples 6103 // of an index value are placed at same relative position in 128 bit 6104 // lane i.e. elements corresponding to shuffle indices 16, 32 and 64 6105 // will be 16th element in their respective 128 bit lanes. 
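// A minimal scalar sketch of the selection implemented below (illustrative only),
// viewing a 512-bit source as four 16-byte lanes:
//
//   uint8_t rearrange_byte(const uint8_t src[64], uint8_t idx) {
//     int lane = (idx >> 4) & 0x3;   // which 128-bit lane the shuffle index selects
//     int pos  = idx & 0x0F;         // in-lane position, i.e. what vpshufb consumes
//     return src[lane * 16 + pos];
//   }
//
// The code below visits the four lanes in turn: evshufi64x2 with 0x00/0x55/0xAA/0xFF
// broadcasts lane 0..3 across the whole vector, evpcmpb builds a mask of the shuffle
// indices that fall into that lane's index range, and the masked evpshufb merges only
// those bytes into dst.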
6106 movl(rtmp, 16); 6107 evpbroadcastb(xtmp1, rtmp, vlen_enc); 6108 6109 // Compute a mask for shuffle vector by comparing indices with expression INDEX < 16. 6110 // Broadcast first 128 bit lane across entire vector, shuffle the vector lanes using 6111 // original shuffle indices and move the shuffled lanes corresponding to true 6112 // mask to destination vector. 6113 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc); 6114 evshufi64x2(xtmp2, src, src, 0x0, vlen_enc); 6115 evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc); 6116 6117 // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32 6118 // and broadcasting second 128 bit lane. 6119 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc); 6120 vpsllq(xtmp2, xtmp1, 0x1, vlen_enc); 6121 evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc); 6122 evshufi64x2(xtmp3, src, src, 0x55, vlen_enc); 6123 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc); 6124 6125 // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48 6126 // and broadcasting third 128 bit lane. 6127 evpcmpb(ktmp, k0, shuffle, xtmp2, Assembler::nlt, true, vlen_enc); 6128 vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc); 6129 evpcmpb(ktmp, ktmp, shuffle, xtmp1, Assembler::lt, true, vlen_enc); 6130 evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc); 6131 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc); 6132 6133 // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64 6134 // and broadcasting fourth 128 bit lane. 6135 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc); 6136 vpsllq(xtmp2, xtmp2, 0x1, vlen_enc); 6137 evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc); 6138 evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc); 6139 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc); 6140 } 6141
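// A worked scalar sketch of the unsigned-division fastpath used by udivI/umodI/
// udivmodI and udivL/umodL/udivmodL above (illustrative only). When the divisor has
// its sign bit set, i.e. as an unsigned value it is >= 2^31 (resp. 2^63), the
// unsigned quotient can only be 0 or 1, so no divide instruction is needed:
//
//   uint32_t udiv_fastpath(uint32_t dividend, uint32_t divisor) {
//     // precondition: divisor >= 0x80000000u
//     return (dividend >= divisor) ? 1 : 0;
//   }
//   uint32_t umod_fastpath(uint32_t dividend, uint32_t divisor) {
//     return dividend - udiv_fastpath(dividend, divisor) * divisor;
//   }
//
// The branch-free form (dividend & ~(dividend - divisor)) >>> (SIZE - 1) used in the
// code above evaluates the same predicate; see Hacker's Delight (2nd ed), section 9.3.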