/*
 * Copyright (c) 2020, 2023, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "oops/methodData.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/output.hpp"
#include "opto/opcodes.hpp"
#include "opto/subnode.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/stubRoutines.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

// C2 compiled method's prolog code.
void C2_MacroAssembler::verified_entry(Compile* C, int sp_inc) {
  if (C->clinit_barrier_on_entry()) {
    assert(VM_Version::supports_fast_class_init_checks(), "sanity");
    assert(!C->method()->holder()->is_not_initialized(), "initialization should have been started");

    Label L_skip_barrier;
    Register klass = rscratch1;

    mov_metadata(klass, C->method()->holder()->constant_encoding());
    clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/);

    jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path

    bind(L_skip_barrier);
  }

  int framesize = C->output()->frame_size_in_bytes();
  int bangsize = C->output()->bang_size_in_bytes();
  bool fp_mode_24b = false;
  int stack_bang_size = C->output()->need_stack_bang(bangsize) ? bangsize : 0;

  // WARNING: Initial instruction MUST be 5 bytes or longer so that
  // NativeJump::patch_verified_entry will be able to patch out the entry
  // code safely. The push to verify stack depth is ok at 5 bytes,
  // the frame allocation can be either 3 or 6 bytes. So if we don't do
  // stack bang then we must use the 6 byte frame allocation even if
  // we have no frame. :-(
  assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");

  assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  // Remove word for return addr
  framesize -= wordSize;
  stack_bang_size -= wordSize;

  // Calls to C2R adapters often do not accept exceptional returns.
  // We require that their callers must bang for them.
  // But be careful, because some VM calls (such as call site linkage) can
  // use several kilobytes of stack. But the stack safety zone should
  // account for that.
  // See bugs 4446381, 4468289, 4497237.
  if (stack_bang_size > 0) {
    generate_stack_overflow_check(stack_bang_size);

    // We always push rbp, so that on return to interpreter rbp, will be
    // restored correctly and we can correct the stack.
    push(rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      mov(rbp, rsp);
    }
    // Remove word for ebp
    framesize -= wordSize;

    // Create frame
    if (framesize) {
      subptr(rsp, framesize);
    }
  } else {
    // Create frame (force generation of a 4 byte immediate value)
    subptr_imm32(rsp, framesize);

    // Save RBP register now.
    framesize -= wordSize;
    movptr(Address(rsp, framesize), rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      movptr(rbp, rsp);
      if (framesize > 0) {
        addptr(rbp, framesize);
      }
    }
  }

  if (C->needs_stack_repair()) {
    // Save stack increment just below the saved rbp (also account for fixed framesize and rbp)
    assert((sp_inc & (StackAlignmentInBytes-1)) == 0, "stack increment not aligned");
    movptr(Address(rsp, framesize - wordSize), sp_inc + framesize + wordSize);
  }

  if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
    framesize -= wordSize;
    movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
  }

#ifndef _LP64
  // If method sets FPU control word do it now
  if (fp_mode_24b) {
    fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24()));
  }
  if (UseSSE >= 2 && VerifyFPU) {
    verify_FPU(0, "FPU stack must be clean on entry");
  }
#endif

#ifdef ASSERT
  if (VerifyStackAtCalls) {
    Label L;
    push(rax);
    mov(rax, rsp);
    andptr(rax, StackAlignmentInBytes-1);
    cmpptr(rax, StackAlignmentInBytes-wordSize);
    pop(rax);
    jcc(Assembler::equal, L);
    STOP("Stack is not properly aligned!");
    bind(L);
  }
#endif
}

void C2_MacroAssembler::entry_barrier() {
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
#ifdef _LP64
  if (BarrierSet::barrier_set()->barrier_set_nmethod() != nullptr) {
    // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
    Label dummy_slow_path;
    Label dummy_continuation;
    Label* slow_path = &dummy_slow_path;
    Label* continuation = &dummy_continuation;
    if (!Compile::current()->output()->in_scratch_emit_size()) {
      // Use real labels from actual stub when not emitting code for the purpose of measuring its size
      C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
      Compile::current()->output()->add_stub(stub);
      slow_path = &stub->entry();
      continuation = &stub->continuation();
    }
    bs->nmethod_entry_barrier(this, slow_path, continuation);
  }
#else
  // Don't bother with out-of-line nmethod entry barrier stub for x86_32.
  bs->nmethod_entry_barrier(this, nullptr /* slow_path */, nullptr /* continuation */);
#endif
}

inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
  switch (vlen_in_bytes) {
    case  4: // fall-through
    case  8: // fall-through
    case 16: return Assembler::AVX_128bit;
    case 32: return Assembler::AVX_256bit;
    case 64: return Assembler::AVX_512bit;

    default: {
      ShouldNotReachHere();
      return Assembler::AVX_NoVec;
    }
  }
}

#if INCLUDE_RTM_OPT

// Update rtm_counters based on abort status
// input: abort_status
//        rtm_counters (RTMLockingCounters*)
// flags are killed
void C2_MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {

  atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
  if (PrintPreciseRTMLockingStatistics) {
    for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
      Label check_abort;
      testl(abort_status, (1<<i));
      jccb(Assembler::equal, check_abort);
      atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
      bind(check_abort);
    }
  }
}

// Branch if (random & (count-1) != 0), count is 2^n
// tmp, scr and flags are killed
void C2_MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
  assert(tmp == rax, "");
  assert(scr == rdx, "");
  rdtsc(); // modifies EDX:EAX
  andptr(tmp, count-1);
  jccb(Assembler::notZero, brLabel);
}

// Perform abort ratio calculation, set no_rtm bit if high ratio
// input: rtm_counters_Reg (RTMLockingCounters* address)
// tmpReg, rtm_counters_Reg and flags are killed
void C2_MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
                                                    Register rtm_counters_Reg,
                                                    RTMLockingCounters* rtm_counters,
                                                    Metadata* method_data) {
  Label L_done, L_check_always_rtm1, L_check_always_rtm2;

  if (RTMLockingCalculationDelay > 0) {
    // Delay calculation
    movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()));
    testptr(tmpReg, tmpReg);
    jccb(Assembler::equal, L_done);
  }
  // Abort ratio calculation only if abort_count > RTMAbortThreshold
  // Aborted transactions = abort_count * 100
  // All transactions = total_count * RTMTotalCountIncrRate
  // Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)

  movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
  cmpptr(tmpReg, RTMAbortThreshold);
  jccb(Assembler::below, L_check_always_rtm2);
  imulptr(tmpReg, tmpReg, 100);

  Register scrReg = rtm_counters_Reg;
  movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
  imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
  imulptr(scrReg, scrReg, RTMAbortRatio);
  cmpptr(tmpReg, scrReg);
  jccb(Assembler::below, L_check_always_rtm1);
  if (method_data != nullptr) {
    // set rtm_state to "no rtm" in MDO
    mov_metadata(tmpReg, method_data);
    lock();
    orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), NoRTM);
  }
  jmpb(L_done);
  bind(L_check_always_rtm1);
  // Reload RTMLockingCounters* address
  lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
  bind(L_check_always_rtm2);
  movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
  cmpptr(tmpReg,
         RTMLockingThreshold / RTMTotalCountIncrRate);
  jccb(Assembler::below, L_done);
  if (method_data != nullptr) {
    // set rtm_state to "always rtm" in MDO
    mov_metadata(tmpReg, method_data);
    lock();
    orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), UseRTM);
  }
  bind(L_done);
}

// Update counters and perform abort ratio calculation
// input: abort_status_Reg
// rtm_counters_Reg, flags are killed
void C2_MacroAssembler::rtm_profiling(Register abort_status_Reg,
                                      Register rtm_counters_Reg,
                                      RTMLockingCounters* rtm_counters,
                                      Metadata* method_data,
                                      bool profile_rtm) {

  assert(rtm_counters != nullptr, "should not be null when profiling RTM");
  // update rtm counters based on rax value at abort
  // reads abort_status_Reg, updates flags
  lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
  rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
  if (profile_rtm) {
    // Save abort status because abort_status_Reg is used by following code.
    if (RTMRetryCount > 0) {
      push(abort_status_Reg);
    }
    assert(rtm_counters != nullptr, "should not be null when profiling RTM");
    rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
    // restore abort status
    if (RTMRetryCount > 0) {
      pop(abort_status_Reg);
    }
  }
}

// Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
// inputs: retry_count_Reg
//       : abort_status_Reg
// output: retry_count_Reg decremented by 1
// flags are killed
void C2_MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
  Label doneRetry;
  assert(abort_status_Reg == rax, "");
  // The abort reason bits are in eax (see all states in rtmLocking.hpp)
  // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
  // if reason is in 0x6 and retry count != 0 then retry
  andptr(abort_status_Reg, 0x6);
  jccb(Assembler::zero, doneRetry);
  testl(retry_count_Reg, retry_count_Reg);
  jccb(Assembler::zero, doneRetry);
  pause();
  decrementl(retry_count_Reg);
  jmp(retryLabel);
  bind(doneRetry);
}

// Spin and retry if lock is busy,
// inputs: box_Reg (monitor address)
//       : retry_count_Reg
// output: retry_count_Reg decremented by 1
//       : clear z flag if retry count exceeded
// tmp_Reg, scr_Reg, flags are killed
void C2_MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
                                               Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
  Label SpinLoop, SpinExit, doneRetry;
  int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);

  testl(retry_count_Reg, retry_count_Reg);
  jccb(Assembler::zero, doneRetry);
  decrementl(retry_count_Reg);
  movptr(scr_Reg, RTMSpinLoopCount);

  bind(SpinLoop);
  pause();
  decrementl(scr_Reg);
  jccb(Assembler::lessEqual, SpinExit);
  movptr(tmp_Reg, Address(box_Reg, owner_offset));
  testptr(tmp_Reg, tmp_Reg);
  jccb(Assembler::notZero, SpinLoop);

  bind(SpinExit);
  jmp(retryLabel);
  bind(doneRetry);
  incrementl(retry_count_Reg); // clear z flag
}

// Use RTM for normal stack locks
// Input: objReg (object to lock)
void C2_MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
                                          Register retry_on_abort_count_Reg,
                                          RTMLockingCounters* stack_rtm_counters,
                                          Metadata*
                                          method_data, bool profile_rtm,
                                          Label& DONE_LABEL, Label& IsInflated) {
  assert(UseRTMForStackLocks, "why call this otherwise?");
  assert(tmpReg == rax, "");
  assert(scrReg == rdx, "");
  Label L_rtm_retry, L_decrement_retry, L_on_abort;

  if (RTMRetryCount > 0) {
    movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
    bind(L_rtm_retry);
  }
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
  testptr(tmpReg, markWord::monitor_value);  // inflated vs stack-locked|neutral
  jcc(Assembler::notZero, IsInflated);

  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    Label L_noincrement;
    if (RTMTotalCountIncrRate > 1) {
      // tmpReg, scrReg and flags are killed
      branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
    }
    assert(stack_rtm_counters != nullptr, "should not be null when profiling RTM");
    atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
    bind(L_noincrement);
  }
  xbegin(L_on_abort);
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));  // fetch markword
  andptr(tmpReg, markWord::lock_mask_in_place);                      // look at 2 lock bits
  cmpptr(tmpReg, markWord::unlocked_value);                          // bits = 01 unlocked
  jcc(Assembler::equal, DONE_LABEL);                                 // all done if unlocked

  Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
  if (UseRTMXendForLockBusy) {
    xend();
    movptr(abort_status_Reg, 0x2);  // Set the abort status to 2 (so we can retry)
    jmp(L_decrement_retry);
  }
  else {
    xabort(0);
  }
  bind(L_on_abort);
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
  }
  bind(L_decrement_retry);
  if (RTMRetryCount > 0) {
    // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
  }
}

// Use RTM for inflating locks
// inputs: objReg (object to lock)
//         boxReg (on-stack box address (displaced header location) - KILLED)
//         tmpReg (ObjectMonitor address + markWord::monitor_value)
void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
                                             Register scrReg, Register retry_on_busy_count_Reg,
                                             Register retry_on_abort_count_Reg,
                                             RTMLockingCounters* rtm_counters,
                                             Metadata* method_data, bool profile_rtm,
                                             Label& DONE_LABEL) {
  assert(UseRTMLocking, "why call this otherwise?");
  assert(tmpReg == rax, "");
  assert(scrReg == rdx, "");
  Label L_rtm_retry, L_decrement_retry, L_on_abort;
  int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);

  movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
  movptr(boxReg, tmpReg); // Save ObjectMonitor address

  if (RTMRetryCount > 0) {
    movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
    movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
    bind(L_rtm_retry);
  }
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    Label L_noincrement;
    if (RTMTotalCountIncrRate > 1) {
      // tmpReg, scrReg and flags are killed
      branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
    }
    assert(rtm_counters != nullptr, "should not be null when profiling RTM");
    atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
    bind(L_noincrement);
  }
  xbegin(L_on_abort);
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
  movptr(tmpReg, Address(tmpReg, owner_offset));
  testptr(tmpReg, tmpReg);
  jcc(Assembler::zero, DONE_LABEL);
  if (UseRTMXendForLockBusy) {
    xend();
    jmp(L_decrement_retry);
  }
  else {
    xabort(0);
  }
  bind(L_on_abort);
  Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
  }
  if (RTMRetryCount > 0) {
    // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
  }

  movptr(tmpReg, Address(boxReg, owner_offset));
  testptr(tmpReg, tmpReg);
  jccb(Assembler::notZero, L_decrement_retry);

  // Appears unlocked - try to swing _owner from null to non-null.
  // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
#ifdef _LP64
  Register threadReg = r15_thread;
#else
  get_thread(scrReg);
  Register threadReg = scrReg;
#endif
  lock();
  cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg

  if (RTMRetryCount > 0) {
    // success done else retry
    jccb(Assembler::equal, DONE_LABEL);
    bind(L_decrement_retry);
    // Spin and retry if lock is busy.
    rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
  }
  else {
    bind(L_decrement_retry);
  }
}

#endif // INCLUDE_RTM_OPT

// fast_lock and fast_unlock used by C2

// Because the transitions from emitted code to the runtime
// monitorenter/exit helper stubs are so slow it's critical that
// we inline both the stack-locking fast path and the inflated fast path.
//
// See also: cmpFastLock and cmpFastUnlock.
//
// What follows is a specialized inline transliteration of the code
// in enter() and exit(). If we're concerned about I$ bloat another
// option would be to emit TrySlowEnter and TrySlowExit methods
// at startup-time. These methods would accept arguments as
// (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
// indications in the icc.ZFlag. fast_lock and fast_unlock would simply
// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
// In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
//
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods. That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
// to park() or unpark() threads. We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) explicit barriers or fence operations.
//
// TODO:
//
// *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
//    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
//    Given TLAB allocation, Self is usually manifested in a register, so passing it into
//    the lock operators would typically be faster than reifying Self.
//
// *  Ideally I'd define the primitives as:
//       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
//       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
//    Unfortunately ADLC bugs prevent us from expressing the ideal form.
//    Instead, we're stuck with the rather awkward and brittle register assignments below.
//    Furthermore the register assignments are overconstrained, possibly resulting in
//    sub-optimal code near the synchronization site.
//
// *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
//    Alternately, use a better sp-proximity test.
//
// *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
//    Either one is sufficient to uniquely identify a thread.
//    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
//
// *  Intrinsify notify() and notifyAll() for the common cases where the
//    object is locked by the calling thread but the waitlist is empty.
//    Avoid the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
//
// *  Use jccb and jmpb instead of jcc and jmp to improve code density.
//    But beware of excessive branch density on AMD Opterons.
//
// *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
//    or failure of the fast path. If the fast path fails then we pass
//    control to the slow path, typically in C. In fast_lock and
//    fast_unlock we often branch to DONE_LABEL, just to find that C2
//    will emit a conditional branch immediately after the node.
//    So we have branches to branches and lots of ICC.ZF games.
//    Instead, it might be better to have C2 pass a "FailureLabel"
//    into fast_lock and fast_unlock. In the case of success, control
//    will drop through the node. ICC.ZF is undefined at exit.
//    In the case of failure, the node will branch directly to the
//    FailureLabel.


// obj: object to lock
// box: on-stack box address (displaced header location) - KILLED
// rax,: tmp -- KILLED
// scr: tmp -- KILLED
void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
                                  Register scrReg, Register cx1Reg, Register cx2Reg,
                                  RTMLockingCounters* rtm_counters,
                                  RTMLockingCounters* stack_rtm_counters,
                                  Metadata* method_data,
                                  bool use_rtm, bool profile_rtm) {
  // Ensure the register assignments are disjoint
  assert(tmpReg == rax, "");

  if (use_rtm) {
    assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
  } else {
    assert(cx1Reg == noreg, "");
    assert(cx2Reg == noreg, "");
    assert_different_registers(objReg, boxReg, tmpReg, scrReg);
  }

  // Possible cases that we'll encounter in fast_lock
  // ------------------------------------------------
  // * Inflated
  //    -- unlocked
  //    -- Locked
  //       = by self
  //       = by other
  // * neutral
  // * stack-locked
  //    -- by self
  //       = sp-proximity test hits
  //       = sp-proximity test generates false-negative
  //    -- by other
  //

  Label IsInflated, DONE_LABEL, NO_COUNT, COUNT;

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(tmpReg, objReg, scrReg);
    movl(tmpReg, Address(tmpReg, Klass::access_flags_offset()));
    testl(tmpReg, JVM_ACC_IS_VALUE_BASED_CLASS);
    jcc(Assembler::notZero, DONE_LABEL);
  }

#if INCLUDE_RTM_OPT
  if (UseRTMForStackLocks && use_rtm) {
    assert(!UseHeavyMonitors, "+UseHeavyMonitors and +UseRTMForStackLocks are mutually exclusive");
    rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
                      stack_rtm_counters, method_data, profile_rtm,
                      DONE_LABEL, IsInflated);
  }
#endif // INCLUDE_RTM_OPT

  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));  // [FETCH]
  testptr(tmpReg, markWord::monitor_value);  // inflated vs stack-locked|neutral
  jccb(Assembler::notZero, IsInflated);

  if (!UseHeavyMonitors) {
    // Attempt stack-locking ...
    orptr (tmpReg, markWord::unlocked_value);
    if (EnableValhalla) {
      // Mask inline_type bit such that we go to the slow path if object is an inline type
      andptr(tmpReg, ~((int) markWord::inline_type_bit_in_place));
    }
    movptr(Address(boxReg, 0), tmpReg);  // Anticipate successful CAS
    lock();
    cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));  // Updates tmpReg
    jcc(Assembler::equal, COUNT);        // Success

    // Recursive locking.
    // The object is stack-locked: markword contains stack pointer to BasicLock.
    // Locked by current thread if difference with current SP is less than one page.
    subptr(tmpReg, rsp);
    // Next instruction sets ZFlag == 1 (Success) if difference is less than one page.
    andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - (int)os::vm_page_size())) );
    movptr(Address(boxReg, 0), tmpReg);
  } else {
    // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
    testptr(objReg, objReg);
  }
  jmp(DONE_LABEL);

  bind(IsInflated);
  // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value

#if INCLUDE_RTM_OPT
  // Use the same RTM locking code in 32- and 64-bit VM.
  if (use_rtm) {
    rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
                         rtm_counters, method_data, profile_rtm, DONE_LABEL);
  } else {
#endif // INCLUDE_RTM_OPT

#ifndef _LP64
  // The object is inflated.

  // boxReg refers to the on-stack BasicLock in the current frame.
  // We'd like to write:
  //   set box->_displaced_header = markWord::unused_mark(). Any non-0 value suffices.
  // This is convenient but results in a ST-before-CAS penalty. The following CAS suffers
  // additional latency as we have another ST in the store buffer that must drain.

  // avoid ST-before-CAS
  // register juggle because we need tmpReg for cmpxchgptr below
  movptr(scrReg, boxReg);
  movptr(boxReg, tmpReg);  // consider: LEA box, [tmp-2]

  // Optimistic form: consider XORL tmpReg,tmpReg
  movptr(tmpReg, NULL_WORD);

  // Appears unlocked - try to swing _owner from null to non-null.
  // Ideally, I'd manifest "Self" with get_thread and then attempt
  // to CAS the register containing Self into m->Owner.
  // But we don't have enough registers, so instead we can either try to CAS
  // rsp or the address of the box (in scr) into &m->owner. If the CAS succeeds
  // we later store "Self" into m->Owner. Transiently storing a stack address
  // (rsp or the address of the box) into m->owner is harmless.
  // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
  lock();
  cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  movptr(Address(scrReg, 0), 3);  // box->_displaced_header = 3
  // If we weren't able to swing _owner from null to the BasicLock
  // then take the slow path.
  jccb (Assembler::notZero, NO_COUNT);
  // update _owner from BasicLock to thread
  get_thread (scrReg);  // beware: clobbers ICCs
  movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
  xorptr(boxReg, boxReg);  // set icc.ZFlag = 1 to indicate success

  // If the CAS fails we can either retry or pass control to the slow path.
  // We use the latter tactic.
  // Pass the CAS result in the icc.ZFlag into DONE_LABEL
  // If the CAS was successful ...
  //   Self has acquired the lock
  //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
  // Intentional fall-through into DONE_LABEL ...
#else // _LP64
  // It's inflated and we use scrReg for ObjectMonitor* in this section.
  movq(scrReg, tmpReg);
  xorq(tmpReg, tmpReg);
  lock();
  cmpxchgptr(r15_thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // Unconditionally set box->_displaced_header = markWord::unused_mark().
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
  // Propagate ICC.ZF from CAS above into DONE_LABEL.
  jccb(Assembler::equal, COUNT);        // CAS above succeeded; propagate ZF = 1 (success)

  cmpptr(r15_thread, rax);              // Check if we are already the owner (recursive lock)
  jccb(Assembler::notEqual, NO_COUNT);  // If not recursive, ZF = 0 at this point (fail)
  incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  xorq(rax, rax);  // Set ZF = 1 (success) for recursive lock, denoting locking success
#endif // _LP64
#if INCLUDE_RTM_OPT
  } // use_rtm()
#endif
  bind(DONE_LABEL);

  // ZFlag == 1 count in fast path
  // ZFlag == 0 count in slow path
  jccb(Assembler::notZero, NO_COUNT);  // jump if ZFlag == 0

  bind(COUNT);
  // Count monitors in fast path
#ifndef _LP64
  get_thread(tmpReg);
  incrementl(Address(tmpReg, JavaThread::held_monitor_count_offset()));
#else // _LP64
  incrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
#endif

  xorl(tmpReg, tmpReg);  // Set ZF == 1

  bind(NO_COUNT);

  // At NO_COUNT the icc ZFlag is set as follows ...
  // fast_unlock uses the same protocol.
  // ZFlag == 1 -> Success
  // ZFlag == 0 -> Failure - force control through the slow path
}

// obj: object to unlock
// box: box address (displaced header location), killed. Must be EAX.
// tmp: killed, cannot be obj nor box.
//
// Some commentary on balanced locking:
//
// fast_lock and fast_unlock are emitted only for provably balanced lock sites.
// Methods that don't have provably balanced locking are forced to run in the
// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
// The interpreter provides two properties:
// I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired in the current activation (frame). Recall that the
//      interpreter maintains an on-stack list of locks currently held by
//      a frame.
// I2:  If a method attempts to unlock an object that is not held by
//      the frame the interpreter throws IMSX.
//
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
// B() doesn't have provably balanced locking so it runs in the interpreter.
// Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O
// is still locked by A().
//
// The only other source of unbalanced locking would be JNI. The "Java Native Interface:
// Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
// should not be unlocked by "normal" java-level locking and vice-versa. The specification
// doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably given that the spec legislates the JNI case as undefined our implementation
// could reasonably *avoid* checking owner in fast_unlock().
// In the interest of performance we elide the m->Owner==Self check in unlock.
// A perfectly viable alternative is to elide the owner check except when
// Xcheck:jni is enabled.
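//
// Illustrative sketch only (not emitted code): the way a C2 call site is
// assumed to consume the ZF protocol described above. The real dispatch is
// generated from the cmpFastLock/cmpFastUnlock nodes in the .ad file; the
// label names below are hypothetical.
//
//   fast_lock(obj, box, rax, scr, ...);    // ZF == 1 on success, ZF == 0 on failure
//   jcc(Assembler::notZero, slow_enter);   // failure -> call runtime monitorenter
//   ...                                    // critical section
//   fast_unlock(obj, rax, tmp);            // same ZF convention as fast_lock
//   jcc(Assembler::notZero, slow_exit);    // failure -> call runtime monitorexit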

void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
  assert(boxReg == rax, "");
  assert_different_registers(objReg, boxReg, tmpReg);

  Label DONE_LABEL, Stacked, COUNT, NO_COUNT;

#if INCLUDE_RTM_OPT
  if (UseRTMForStackLocks && use_rtm) {
    assert(!UseHeavyMonitors, "+UseHeavyMonitors and +UseRTMForStackLocks are mutually exclusive");
    Label L_regular_unlock;
    movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));  // fetch markword
    andptr(tmpReg, markWord::lock_mask_in_place);                      // look at 2 lock bits
    cmpptr(tmpReg, markWord::unlocked_value);                          // bits = 01 unlocked
    jccb(Assembler::notEqual, L_regular_unlock);                       // if !HLE RegularLock
    xend();                                                            // otherwise end...
    jmp(DONE_LABEL);                                                   // ... and we're done
    bind(L_regular_unlock);
  }
#endif

  if (!UseHeavyMonitors) {
    cmpptr(Address(boxReg, 0), NULL_WORD);  // Examine the displaced header
    jcc (Assembler::zero, COUNT);           // 0 indicates recursive stack-lock
  }
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));  // Examine the object's markword
  if (!UseHeavyMonitors) {
    testptr(tmpReg, markWord::monitor_value);  // Inflated?
    jccb (Assembler::zero, Stacked);
  }

  // It's inflated.
#if INCLUDE_RTM_OPT
  if (use_rtm) {
    Label L_regular_inflated_unlock;
    int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
    movptr(boxReg, Address(tmpReg, owner_offset));
    testptr(boxReg, boxReg);
    jccb(Assembler::notZero, L_regular_inflated_unlock);
    xend();
    jmpb(DONE_LABEL);
    bind(L_regular_inflated_unlock);
  }
#endif

  // Despite our balanced locking property we still check that m->_owner == Self
  // as java routines or native JNI code called by this thread might
  // have released the lock.
  // Refer to the comments in synchronizer.cpp for how we might encode extra
  // state in _succ so we can avoid fetching EntryList|cxq.
  //
  // If there's no contention try a 1-0 exit. That is, exit without
  // a costly MEMBAR or CAS. See synchronizer.cpp for details on how
  // we detect and recover from the race that the 1-0 exit admits.
  //
  // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
  // before it STs null into _owner, releasing the lock. Updates
  // to data protected by the critical section must be visible before
  // we drop the lock (and thus before any other thread could acquire
  // the lock and observe the fields protected by the lock).
  // IA32's memory-model is SPO, so STs are ordered with respect to
  // each other and there's no need for an explicit barrier (fence).
  // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
#ifndef _LP64
  // Note that we could employ various encoding schemes to reduce
  // the number of loads below (currently 4) to just 2 or 3.
  // Refer to the comments in synchronizer.cpp.
  // In practice the chain of fetches doesn't seem to impact performance, however.
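  // Illustrative sketch only: roughly what the 32-bit instruction sequence
  // below implements for the uncontended 1-0 exit (the ZF success/failure
  // protocol is handled by the surrounding code):
  //
  //   if (m->_recursions != 0)            goto slow_path;  // still recursively held
  //   if ((m->_EntryList | m->_cxq) != 0) goto slow_path;  // contended; runtime picks a successor
  //   m->_owner = nullptr;                                 // 1-0 exit: plain store, no MEMBAR or CAS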
  xorptr(boxReg, boxReg);
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jccb  (Assembler::notZero, DONE_LABEL);
  movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
  jccb  (Assembler::notZero, DONE_LABEL);
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
  jmpb  (DONE_LABEL);
#else // _LP64
  // It's inflated
  Label CheckSucc, LNotRecursive, LSuccess, LGoSlowPath;

  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0);
  jccb(Assembler::equal, LNotRecursive);

  // Recursive inflated unlock
  decq(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jmpb(LSuccess);

  bind(LNotRecursive);
  movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
  jccb  (Assembler::notZero, CheckSucc);
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
  jmpb  (DONE_LABEL);

  // Try to avoid passing control into the slow_path ...
  bind  (CheckSucc);

  // The following optional optimization can be elided if necessary
  // Effectively: if (succ == null) goto slow path
  // The code reduces the window for a race, however,
  // and thus benefits performance.
  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
  jccb  (Assembler::zero, LGoSlowPath);

  xorptr(boxReg, boxReg);
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);

  // Memory barrier/fence
  // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
  // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
  // This is faster on Nehalem and AMD Shanghai/Barcelona.
  // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
  // We might also restructure (ST Owner=0;barrier;LD _Succ) to
  // (mov box,0; xchgq box, &m->Owner; LD _succ) .
  lock(); addl(Address(rsp, 0), 0);

  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
  jccb  (Assembler::notZero, LSuccess);

  // Rare inopportune interleaving - race.
  // The successor vanished in the small window above.
  // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
  // We need to ensure progress and succession.
  // Try to reacquire the lock.
  // If that fails then the new owner is responsible for succession and this
  // thread needs to take no further action and can exit via the fast path (success).
  // If the re-acquire succeeds then pass control into the slow path.
  // As implemented, this latter mode is horrible because we generate more
  // coherence traffic on the lock *and* artificially extend the critical section
  // length by virtue of passing control into the slow path.

  // box is really RAX -- the following CMPXCHG depends on that binding
  // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
  lock();
  cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // There's no successor so we tried to regrab the lock.
  // If that didn't work, then another thread grabbed the
  // lock so we're done (and exit was a success).
  jccb  (Assembler::notEqual, LSuccess);
  // Intentional fall-through into slow path

  bind  (LGoSlowPath);
  orl   (boxReg, 1);   // set ICC.ZF=0 to indicate failure
  jmpb  (DONE_LABEL);

  bind  (LSuccess);
  testl (boxReg, 0);   // set ICC.ZF=1 to indicate success
  jmpb  (DONE_LABEL);

#endif
  if (!UseHeavyMonitors) {
    bind  (Stacked);
    movptr(tmpReg, Address (boxReg, 0));  // re-fetch
    lock();
    cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));  // Uses RAX which is box
    // Intentional fall-thru into DONE_LABEL
  }
  bind(DONE_LABEL);

  // ZFlag == 1 count in fast path
  // ZFlag == 0 count in slow path
  jccb(Assembler::notZero, NO_COUNT);

  bind(COUNT);
  // Count monitors in fast path
#ifndef _LP64
  get_thread(tmpReg);
  decrementl(Address(tmpReg, JavaThread::held_monitor_count_offset()));
#else // _LP64
  decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
#endif

  xorl(tmpReg, tmpReg);  // Set ZF == 1

  bind(NO_COUNT);
}

//-------------------------------------------------------------------------------------------
// Generic instructions support for use in .ad files C2 code generation

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVD) {
    andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
  } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
  }
}

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
  if (opcode == Op_AbsVD) {
    vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
  } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
    vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVF) {
    andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
  } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
  if (opcode == Op_AbsVF) {
    vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
  } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
    vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
  }
}

void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");
  assert(tmp == xnoreg || elem_bt == T_LONG, "unused");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      pminsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pminsw(dst, src);
    } else if (elem_bt == T_INT) {
      pminsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      movdqu(xmm0, dst);
      pcmpgtq(xmm0, src);
      blendvpd(dst, src);  // xmm0 as mask
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      pmaxsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pmaxsw(dst, src);
    } else if (elem_bt == T_INT) {
      pmaxsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      assert_different_registers(dst, src, tmp);
      movdqu(xmm0, src);
      pcmpgtq(xmm0, dst);
      blendvpd(dst, src);  // xmm0 as mask
    }
  }
}

void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
                                 XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                 int vlen_enc) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      vpminsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpminsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpminsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpminsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src1, src2, dst, vlen_enc);
      }
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      vpmaxsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpmaxsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpmaxsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpmaxsq(dst, src1, src2, vlen_enc);
      } else {
        assert_different_registers(dst, src1, src2);
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src2, src1, dst, vlen_enc);
      }
    }
  }
}

// Float/Double min max

void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
                                   XMMRegister dst, XMMRegister a, XMMRegister b,
                                   XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                   int vlen_enc) {
  assert(UseAVX > 0, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(a, b, tmp, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);

  if (!is_double_word && is_min) {
    vblendvps(atmp, a, b, a, vlen_enc);
    vblendvps(btmp, b, a, a, vlen_enc);
    vminps(tmp, atmp, btmp, vlen_enc);
    vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvps(dst, tmp, atmp, btmp, vlen_enc);
  } else if (!is_double_word && !is_min) {
    vblendvps(btmp, b, a, b, vlen_enc);
    vblendvps(atmp, a, b, b, vlen_enc);
    vmaxps(tmp, atmp, btmp, vlen_enc);
    vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvps(dst, tmp, atmp, btmp, vlen_enc);
  } else if (is_double_word && is_min) {
    vblendvpd(atmp, a, b, a, vlen_enc);
    vblendvpd(btmp, b, a, a, vlen_enc);
    vminpd(tmp, atmp, btmp, vlen_enc);
    vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
  } else {
    assert(is_double_word && !is_min, "sanity");
    vblendvpd(btmp, b, a, b, vlen_enc);
    vblendvpd(atmp, a, b, b, vlen_enc);
    vmaxpd(tmp, atmp, btmp, vlen_enc);
    vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
  }
}

void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
                                    XMMRegister dst, XMMRegister a, XMMRegister b,
                                    KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
                                    int vlen_enc) {
  assert(UseAVX > 2, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");
  assert_different_registers(dst, a, b, atmp, btmp);

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);
  bool merge = true;

  if (!is_double_word && is_min) {
    evpmovd2m(ktmp, a, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vminps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (!is_double_word && !is_min) {
    evpmovd2m(ktmp, b, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (is_double_word && is_min) {
    evpmovq2m(ktmp, a, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vminpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  } else {
    assert(is_double_word && !is_min, "sanity");
    evpmovq2m(ktmp, b, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  }
}

// Float/Double signum
void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) {
  assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity");

  Label DONE_LABEL;

  if (opcode == Op_SignumF) {
    assert(UseSSE > 0, "required");
    ucomiss(dst, zero);
    jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
    jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
    movflt(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
  } else if (opcode == Op_SignumD) {
    assert(UseSSE > 1, "required");
    ucomisd(dst, zero);
    jcc(Assembler::equal, DONE_LABEL);    // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument
    jcc(Assembler::parity, DONE_LABEL);   // handle special case NaN, if argument NaN, return NaN
    movdbl(dst, one);
    jcc(Assembler::above, DONE_LABEL);
    xorpd(dst,
          ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
  }

  bind(DONE_LABEL);
}

void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
  if (sign) {
    pmovsxbw(dst, src);
  } else {
    pmovzxbw(dst, src);
  }
}

void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbw(dst, src, vector_len);
  } else {
    vpmovzxbw(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbd(dst, src, vector_len);
  } else {
    vpmovzxbd(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxwd(dst, src, vector_len);
  } else {
    vpmovzxwd(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
                                     int shift, int vector_len) {
  if (opcode == Op_RotateLeftV) {
    if (etype == T_INT) {
      evprold(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprolq(dst, src, shift, vector_len);
    }
  } else {
    assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
    if (etype == T_INT) {
      evprord(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprorq(dst, src, shift, vector_len);
    }
  }
}

void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
                                     XMMRegister shift, int vector_len) {
  if (opcode == Op_RotateLeftV) {
    if (etype == T_INT) {
      evprolvd(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprolvq(dst, src, shift, vector_len);
    }
  } else {
    assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
    if (etype == T_INT) {
      evprorvd(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprorvq(dst, src, shift, vector_len);
    }
  }
}

void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
  if (opcode == Op_RShiftVI) {
    psrad(dst, shift);
  } else if (opcode == Op_LShiftVI) {
    pslld(dst, shift);
  } else {
    assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
    psrld(dst, shift);
  }
}

void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVI:  psrad(dst, shift); break;
    case Op_LShiftVI:  pslld(dst, shift); break;
    case Op_URShiftVI: psrld(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
  if (opcode == Op_RShiftVI) {
    vpsrad(dst, nds, shift, vector_len);
  } else if (opcode == Op_LShiftVI) {
    vpslld(dst, nds, shift, vector_len);
  } else {
    assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
    vpsrld(dst, nds, shift, vector_len);
  }
}

void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch
  (opcode) {
    case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
    case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
    case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  psraw(dst, shift); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  psllw(dst, shift); break;

    case Op_URShiftVS: // fall-through
    case Op_URShiftVB: psrlw(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;

    case Op_URShiftVS: // fall-through
    case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
    case Op_LShiftVL:  psllq(dst, shift); break;
    case Op_URShiftVL: psrlq(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
  if (opcode == Op_RShiftVL) {
    psrlq(dst, shift);  // using srl to implement sra on pre-avx512 systems
  } else if (opcode == Op_LShiftVL) {
    psllq(dst, shift);
  } else {
    assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
    psrlq(dst, shift);
  }
}

void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVL:  evpsraq(dst, src, shift, vlen_enc); break;
    case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc);  break;
    case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc);  break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
  if (opcode == Op_RShiftVL) {
    evpsraq(dst, nds, shift, vector_len);
  } else if (opcode == Op_LShiftVL) {
    vpsllq(dst, nds, shift, vector_len);
  } else {
    assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
    vpsrlq(dst, nds, shift, vector_len);
  }
}

void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  // fall-through
    case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  // fall-through
    case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;

    case Op_URShiftVB: // fall-through
    case Op_URShiftVS: // fall-through
    case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;

    default: assert(false,
"%s", NodeClassNames[opcode]); 1404 } 1405 } 1406 1407 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1408 switch (opcode) { 1409 case Op_RShiftVB: // fall-through 1410 case Op_RShiftVS: evpsravw(dst, src, shift, vlen_enc); break; 1411 1412 case Op_LShiftVB: // fall-through 1413 case Op_LShiftVS: evpsllvw(dst, src, shift, vlen_enc); break; 1414 1415 case Op_URShiftVB: // fall-through 1416 case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break; 1417 1418 default: assert(false, "%s", NodeClassNames[opcode]); 1419 } 1420 } 1421 1422 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) { 1423 assert(UseAVX >= 2, "required"); 1424 switch (opcode) { 1425 case Op_RShiftVL: { 1426 if (UseAVX > 2) { 1427 assert(tmp == xnoreg, "not used"); 1428 if (!VM_Version::supports_avx512vl()) { 1429 vlen_enc = Assembler::AVX_512bit; 1430 } 1431 evpsravq(dst, src, shift, vlen_enc); 1432 } else { 1433 vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask())); 1434 vpsrlvq(dst, src, shift, vlen_enc); 1435 vpsrlvq(tmp, tmp, shift, vlen_enc); 1436 vpxor(dst, dst, tmp, vlen_enc); 1437 vpsubq(dst, dst, tmp, vlen_enc); 1438 } 1439 break; 1440 } 1441 case Op_LShiftVL: { 1442 assert(tmp == xnoreg, "not used"); 1443 vpsllvq(dst, src, shift, vlen_enc); 1444 break; 1445 } 1446 case Op_URShiftVL: { 1447 assert(tmp == xnoreg, "not used"); 1448 vpsrlvq(dst, src, shift, vlen_enc); 1449 break; 1450 } 1451 default: assert(false, "%s", NodeClassNames[opcode]); 1452 } 1453 } 1454 1455 // Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst 1456 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) { 1457 assert(opcode == Op_LShiftVB || 1458 opcode == Op_RShiftVB || 1459 opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]); 1460 bool sign = (opcode != Op_URShiftVB); 1461 assert(vector_len == 0, "required"); 1462 vextendbd(sign, dst, src, 1); 1463 vpmovzxbd(vtmp, shift, 1); 1464 varshiftd(opcode, dst, dst, vtmp, 1); 1465 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg); 1466 vextracti128_high(vtmp, dst); 1467 vpackusdw(dst, dst, vtmp, 0); 1468 } 1469 1470 // Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst 1471 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) { 1472 assert(opcode == Op_LShiftVB || 1473 opcode == Op_RShiftVB || 1474 opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]); 1475 bool sign = (opcode != Op_URShiftVB); 1476 int ext_vector_len = vector_len + 1; 1477 vextendbw(sign, dst, src, ext_vector_len); 1478 vpmovzxbw(vtmp, shift, ext_vector_len); 1479 varshiftw(opcode, dst, dst, vtmp, ext_vector_len); 1480 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg); 1481 if (vector_len == 0) { 1482 vextracti128_high(vtmp, dst); 1483 vpackuswb(dst, dst, vtmp, vector_len); 1484 } else { 1485 vextracti64x4_high(vtmp, dst); 1486 vpackuswb(dst, dst, vtmp, vector_len); 1487 vpermq(dst, dst, 0xD8, vector_len); 1488 } 1489 } 1490 1491 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) { 1492 switch(typ) { 1493 case T_BYTE: 1494 pinsrb(dst, val, idx); 1495 break; 1496 case T_SHORT: 1497 pinsrw(dst, val, idx); 1498 
break; 1499 case T_INT: 1500 pinsrd(dst, val, idx); 1501 break; 1502 case T_LONG: 1503 pinsrq(dst, val, idx); 1504 break; 1505 default: 1506 assert(false,"Should not reach here."); 1507 break; 1508 } 1509 } 1510 1511 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) { 1512 switch(typ) { 1513 case T_BYTE: 1514 vpinsrb(dst, src, val, idx); 1515 break; 1516 case T_SHORT: 1517 vpinsrw(dst, src, val, idx); 1518 break; 1519 case T_INT: 1520 vpinsrd(dst, src, val, idx); 1521 break; 1522 case T_LONG: 1523 vpinsrq(dst, src, val, idx); 1524 break; 1525 default: 1526 assert(false,"Should not reach here."); 1527 break; 1528 } 1529 } 1530 1531 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) { 1532 switch(typ) { 1533 case T_INT: 1534 vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len); 1535 break; 1536 case T_FLOAT: 1537 vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len); 1538 break; 1539 case T_LONG: 1540 vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len); 1541 break; 1542 case T_DOUBLE: 1543 vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len); 1544 break; 1545 default: 1546 assert(false,"Should not reach here."); 1547 break; 1548 } 1549 } 1550 1551 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) { 1552 switch(typ) { 1553 case T_INT: 1554 evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len); 1555 break; 1556 case T_FLOAT: 1557 evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len); 1558 break; 1559 case T_LONG: 1560 evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len); 1561 break; 1562 case T_DOUBLE: 1563 evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len); 1564 break; 1565 default: 1566 assert(false,"Should not reach here."); 1567 break; 1568 } 1569 } 1570 1571 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) { 1572 switch(typ) { 1573 case T_INT: 1574 evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len); 1575 break; 1576 case T_FLOAT: 1577 evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len); 1578 break; 1579 case T_LONG: 1580 evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len); 1581 break; 1582 case T_DOUBLE: 1583 evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len); 1584 break; 1585 default: 1586 assert(false,"Should not reach here."); 1587 break; 1588 } 1589 } 1590 1591 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) { 1592 if (vlen_in_bytes <= 16) { 1593 pxor (dst, dst); 1594 psubb(dst, src); 1595 switch (elem_bt) { 1596 case T_BYTE: /* nothing to do */ break; 1597 case T_SHORT: pmovsxbw(dst, dst); break; 1598 case T_INT: pmovsxbd(dst, dst); break; 1599 case T_FLOAT: pmovsxbd(dst, dst); break; 1600 case T_LONG: pmovsxbq(dst, dst); break; 1601 case T_DOUBLE: pmovsxbq(dst, dst); break; 1602 1603 default: assert(false, "%s", type2name(elem_bt)); 1604 } 1605 } else { 1606 assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, ""); 1607 int vlen_enc = vector_length_encoding(vlen_in_bytes); 1608 1609 vpxor (dst, dst, dst, vlen_enc); 1610 vpsubb(dst, dst, src, is_legacy ? 
AVX_256bit : vlen_enc); 1611 1612 switch (elem_bt) { 1613 case T_BYTE: /* nothing to do */ break; 1614 case T_SHORT: vpmovsxbw(dst, dst, vlen_enc); break; 1615 case T_INT: vpmovsxbd(dst, dst, vlen_enc); break; 1616 case T_FLOAT: vpmovsxbd(dst, dst, vlen_enc); break; 1617 case T_LONG: vpmovsxbq(dst, dst, vlen_enc); break; 1618 case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break; 1619 1620 default: assert(false, "%s", type2name(elem_bt)); 1621 } 1622 } 1623 } 1624 1625 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) { 1626 if (novlbwdq) { 1627 vpmovsxbd(xtmp, src, vlen_enc); 1628 evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()), 1629 Assembler::eq, true, vlen_enc, noreg); 1630 } else { 1631 vpxor(xtmp, xtmp, xtmp, vlen_enc); 1632 vpsubb(xtmp, xtmp, src, vlen_enc); 1633 evpmovb2m(dst, xtmp, vlen_enc); 1634 } 1635 } 1636 1637 void C2_MacroAssembler::load_vector(XMMRegister dst, Address src, int vlen_in_bytes) { 1638 switch (vlen_in_bytes) { 1639 case 4: movdl(dst, src); break; 1640 case 8: movq(dst, src); break; 1641 case 16: movdqu(dst, src); break; 1642 case 32: vmovdqu(dst, src); break; 1643 case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break; 1644 default: ShouldNotReachHere(); 1645 } 1646 } 1647 1648 void C2_MacroAssembler::load_vector(XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) { 1649 assert(rscratch != noreg || always_reachable(src), "missing"); 1650 1651 if (reachable(src)) { 1652 load_vector(dst, as_Address(src), vlen_in_bytes); 1653 } else { 1654 lea(rscratch, src); 1655 load_vector(dst, Address(rscratch, 0), vlen_in_bytes); 1656 } 1657 } 1658 1659 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) { 1660 int vlen_enc = vector_length_encoding(vlen); 1661 if (VM_Version::supports_avx()) { 1662 if (bt == T_LONG) { 1663 if (VM_Version::supports_avx2()) { 1664 vpbroadcastq(dst, src, vlen_enc); 1665 } else { 1666 vmovddup(dst, src, vlen_enc); 1667 } 1668 } else if (bt == T_DOUBLE) { 1669 if (vlen_enc != Assembler::AVX_128bit) { 1670 vbroadcastsd(dst, src, vlen_enc, noreg); 1671 } else { 1672 vmovddup(dst, src, vlen_enc); 1673 } 1674 } else { 1675 if (VM_Version::supports_avx2() && is_integral_type(bt)) { 1676 vpbroadcastd(dst, src, vlen_enc); 1677 } else { 1678 vbroadcastss(dst, src, vlen_enc); 1679 } 1680 } 1681 } else if (VM_Version::supports_sse3()) { 1682 movddup(dst, src); 1683 } else { 1684 movq(dst, src); 1685 if (vlen == 16) { 1686 punpcklqdq(dst, dst); 1687 } 1688 } 1689 } 1690 1691 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) { 1692 // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64. 1693 int offset = exact_log2(type2aelembytes(bt)) << 6; 1694 if (is_floating_point_type(bt)) { 1695 offset += 128; 1696 } 1697 ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset); 1698 load_vector(dst, addr, vlen_in_bytes); 1699 } 1700 1701 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles. 
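//
// The integer reductions below all share one shape: fold the upper half of the
// vector into the lower half with the reduction op, halving the live lane count
// until a single lane remains, then combine that lane with the scalar input src1.
// The floating point reductions (reduceF/reduceD) instead accumulate the lanes
// one at a time, in lane order, into dst. Illustrative scalar sketch of the
// integer case (a shape sketch only, not generated code; OP stands for the
// per-opcode operation):
//
//   int lane[8];                          // vector lanes of src2
//   for (int w = 4; w >= 1; w >>= 1) {    // 8 -> 4 -> 2 -> 1 live lanes
//     for (int i = 0; i < w; i++) {
//       lane[i] = OP(lane[i], lane[i + w]);
//     }
//   }
//   dst = OP(lane[0], src1);              // fold in the scalar input last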
1702 1703 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) { 1704 int vector_len = Assembler::AVX_128bit; 1705 1706 switch (opcode) { 1707 case Op_AndReductionV: pand(dst, src); break; 1708 case Op_OrReductionV: por (dst, src); break; 1709 case Op_XorReductionV: pxor(dst, src); break; 1710 case Op_MinReductionV: 1711 switch (typ) { 1712 case T_BYTE: pminsb(dst, src); break; 1713 case T_SHORT: pminsw(dst, src); break; 1714 case T_INT: pminsd(dst, src); break; 1715 case T_LONG: assert(UseAVX > 2, "required"); 1716 vpminsq(dst, dst, src, Assembler::AVX_128bit); break; 1717 default: assert(false, "wrong type"); 1718 } 1719 break; 1720 case Op_MaxReductionV: 1721 switch (typ) { 1722 case T_BYTE: pmaxsb(dst, src); break; 1723 case T_SHORT: pmaxsw(dst, src); break; 1724 case T_INT: pmaxsd(dst, src); break; 1725 case T_LONG: assert(UseAVX > 2, "required"); 1726 vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break; 1727 default: assert(false, "wrong type"); 1728 } 1729 break; 1730 case Op_AddReductionVF: addss(dst, src); break; 1731 case Op_AddReductionVD: addsd(dst, src); break; 1732 case Op_AddReductionVI: 1733 switch (typ) { 1734 case T_BYTE: paddb(dst, src); break; 1735 case T_SHORT: paddw(dst, src); break; 1736 case T_INT: paddd(dst, src); break; 1737 default: assert(false, "wrong type"); 1738 } 1739 break; 1740 case Op_AddReductionVL: paddq(dst, src); break; 1741 case Op_MulReductionVF: mulss(dst, src); break; 1742 case Op_MulReductionVD: mulsd(dst, src); break; 1743 case Op_MulReductionVI: 1744 switch (typ) { 1745 case T_SHORT: pmullw(dst, src); break; 1746 case T_INT: pmulld(dst, src); break; 1747 default: assert(false, "wrong type"); 1748 } 1749 break; 1750 case Op_MulReductionVL: assert(UseAVX > 2, "required"); 1751 evpmullq(dst, dst, src, vector_len); break; 1752 default: assert(false, "wrong opcode"); 1753 } 1754 } 1755 1756 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 1757 int vector_len = Assembler::AVX_256bit; 1758 1759 switch (opcode) { 1760 case Op_AndReductionV: vpand(dst, src1, src2, vector_len); break; 1761 case Op_OrReductionV: vpor (dst, src1, src2, vector_len); break; 1762 case Op_XorReductionV: vpxor(dst, src1, src2, vector_len); break; 1763 case Op_MinReductionV: 1764 switch (typ) { 1765 case T_BYTE: vpminsb(dst, src1, src2, vector_len); break; 1766 case T_SHORT: vpminsw(dst, src1, src2, vector_len); break; 1767 case T_INT: vpminsd(dst, src1, src2, vector_len); break; 1768 case T_LONG: assert(UseAVX > 2, "required"); 1769 vpminsq(dst, src1, src2, vector_len); break; 1770 default: assert(false, "wrong type"); 1771 } 1772 break; 1773 case Op_MaxReductionV: 1774 switch (typ) { 1775 case T_BYTE: vpmaxsb(dst, src1, src2, vector_len); break; 1776 case T_SHORT: vpmaxsw(dst, src1, src2, vector_len); break; 1777 case T_INT: vpmaxsd(dst, src1, src2, vector_len); break; 1778 case T_LONG: assert(UseAVX > 2, "required"); 1779 vpmaxsq(dst, src1, src2, vector_len); break; 1780 default: assert(false, "wrong type"); 1781 } 1782 break; 1783 case Op_AddReductionVI: 1784 switch (typ) { 1785 case T_BYTE: vpaddb(dst, src1, src2, vector_len); break; 1786 case T_SHORT: vpaddw(dst, src1, src2, vector_len); break; 1787 case T_INT: vpaddd(dst, src1, src2, vector_len); break; 1788 default: assert(false, "wrong type"); 1789 } 1790 break; 1791 case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break; 1792 case Op_MulReductionVI: 1793 switch (typ) { 1794 
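// No T_BYTE case here: byte multiply reductions are first widened to short
// (see mulreduce8B/16B/32B/64B below), and long ones come in as Op_MulReductionVL.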
case T_SHORT: vpmullw(dst, src1, src2, vector_len); break; 1795 case T_INT: vpmulld(dst, src1, src2, vector_len); break; 1796 default: assert(false, "wrong type"); 1797 } 1798 break; 1799 case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break; 1800 default: assert(false, "wrong opcode"); 1801 } 1802 } 1803 1804 void C2_MacroAssembler::reduce_fp(int opcode, int vlen, 1805 XMMRegister dst, XMMRegister src, 1806 XMMRegister vtmp1, XMMRegister vtmp2) { 1807 switch (opcode) { 1808 case Op_AddReductionVF: 1809 case Op_MulReductionVF: 1810 reduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 1811 break; 1812 1813 case Op_AddReductionVD: 1814 case Op_MulReductionVD: 1815 reduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 1816 break; 1817 1818 default: assert(false, "wrong opcode"); 1819 } 1820 } 1821 1822 void C2_MacroAssembler::reduceB(int opcode, int vlen, 1823 Register dst, Register src1, XMMRegister src2, 1824 XMMRegister vtmp1, XMMRegister vtmp2) { 1825 switch (vlen) { 1826 case 8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1827 case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1828 case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1829 case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1830 1831 default: assert(false, "wrong vector length"); 1832 } 1833 } 1834 1835 void C2_MacroAssembler::mulreduceB(int opcode, int vlen, 1836 Register dst, Register src1, XMMRegister src2, 1837 XMMRegister vtmp1, XMMRegister vtmp2) { 1838 switch (vlen) { 1839 case 8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1840 case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1841 case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1842 case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1843 1844 default: assert(false, "wrong vector length"); 1845 } 1846 } 1847 1848 void C2_MacroAssembler::reduceS(int opcode, int vlen, 1849 Register dst, Register src1, XMMRegister src2, 1850 XMMRegister vtmp1, XMMRegister vtmp2) { 1851 switch (vlen) { 1852 case 4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1853 case 8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1854 case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1855 case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1856 1857 default: assert(false, "wrong vector length"); 1858 } 1859 } 1860 1861 void C2_MacroAssembler::reduceI(int opcode, int vlen, 1862 Register dst, Register src1, XMMRegister src2, 1863 XMMRegister vtmp1, XMMRegister vtmp2) { 1864 switch (vlen) { 1865 case 2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1866 case 4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1867 case 8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1868 case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1869 1870 default: assert(false, "wrong vector length"); 1871 } 1872 } 1873 1874 #ifdef _LP64 1875 void C2_MacroAssembler::reduceL(int opcode, int vlen, 1876 Register dst, Register src1, XMMRegister src2, 1877 XMMRegister vtmp1, XMMRegister vtmp2) { 1878 switch (vlen) { 1879 case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1880 case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1881 case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1882 1883 default: assert(false, "wrong vector length"); 1884 } 1885 } 1886 #endif // _LP64 1887 1888 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister 
src, XMMRegister vtmp1, XMMRegister vtmp2) { 1889 switch (vlen) { 1890 case 2: 1891 assert(vtmp2 == xnoreg, ""); 1892 reduce2F(opcode, dst, src, vtmp1); 1893 break; 1894 case 4: 1895 assert(vtmp2 == xnoreg, ""); 1896 reduce4F(opcode, dst, src, vtmp1); 1897 break; 1898 case 8: 1899 reduce8F(opcode, dst, src, vtmp1, vtmp2); 1900 break; 1901 case 16: 1902 reduce16F(opcode, dst, src, vtmp1, vtmp2); 1903 break; 1904 default: assert(false, "wrong vector length"); 1905 } 1906 } 1907 1908 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 1909 switch (vlen) { 1910 case 2: 1911 assert(vtmp2 == xnoreg, ""); 1912 reduce2D(opcode, dst, src, vtmp1); 1913 break; 1914 case 4: 1915 reduce4D(opcode, dst, src, vtmp1, vtmp2); 1916 break; 1917 case 8: 1918 reduce8D(opcode, dst, src, vtmp1, vtmp2); 1919 break; 1920 default: assert(false, "wrong vector length"); 1921 } 1922 } 1923 1924 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1925 if (opcode == Op_AddReductionVI) { 1926 if (vtmp1 != src2) { 1927 movdqu(vtmp1, src2); 1928 } 1929 phaddd(vtmp1, vtmp1); 1930 } else { 1931 pshufd(vtmp1, src2, 0x1); 1932 reduce_operation_128(T_INT, opcode, vtmp1, src2); 1933 } 1934 movdl(vtmp2, src1); 1935 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 1936 movdl(dst, vtmp1); 1937 } 1938 1939 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1940 if (opcode == Op_AddReductionVI) { 1941 if (vtmp1 != src2) { 1942 movdqu(vtmp1, src2); 1943 } 1944 phaddd(vtmp1, src2); 1945 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1946 } else { 1947 pshufd(vtmp2, src2, 0xE); 1948 reduce_operation_128(T_INT, opcode, vtmp2, src2); 1949 reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1950 } 1951 } 1952 1953 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1954 if (opcode == Op_AddReductionVI) { 1955 vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit); 1956 vextracti128_high(vtmp2, vtmp1); 1957 vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit); 1958 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1959 } else { 1960 vextracti128_high(vtmp1, src2); 1961 reduce_operation_128(T_INT, opcode, vtmp1, src2); 1962 reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1963 } 1964 } 1965 1966 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1967 vextracti64x4_high(vtmp2, src2); 1968 reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2); 1969 reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1970 } 1971 1972 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1973 pshufd(vtmp2, src2, 0x1); 1974 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 1975 movdqu(vtmp1, vtmp2); 1976 psrldq(vtmp1, 2); 1977 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 1978 movdqu(vtmp2, vtmp1); 1979 psrldq(vtmp2, 1); 1980 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 1981 movdl(vtmp2, src1); 1982 pmovsxbd(vtmp1, vtmp1); 1983 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 1984 pextrb(dst, vtmp1, 0x0); 1985 movsbl(dst, dst); 1986 } 1987 1988 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1989 
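// Fold the upper 8 bytes of src2 onto the lower 8 with the reduction op,
// then finish with the 8-byte reduction.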
pshufd(vtmp1, src2, 0xE); 1990 reduce_operation_128(T_BYTE, opcode, vtmp1, src2); 1991 reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1992 } 1993 1994 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1995 vextracti128_high(vtmp2, src2); 1996 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 1997 reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1998 } 1999 2000 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2001 vextracti64x4_high(vtmp1, src2); 2002 reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2); 2003 reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2004 } 2005 2006 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2007 pmovsxbw(vtmp2, src2); 2008 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2009 } 2010 2011 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2012 if (UseAVX > 1) { 2013 int vector_len = Assembler::AVX_256bit; 2014 vpmovsxbw(vtmp1, src2, vector_len); 2015 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2016 } else { 2017 pmovsxbw(vtmp2, src2); 2018 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2019 pshufd(vtmp2, src2, 0x1); 2020 pmovsxbw(vtmp2, src2); 2021 reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2022 } 2023 } 2024 2025 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2026 if (UseAVX > 2 && VM_Version::supports_avx512bw()) { 2027 int vector_len = Assembler::AVX_512bit; 2028 vpmovsxbw(vtmp1, src2, vector_len); 2029 reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2030 } else { 2031 assert(UseAVX >= 2,"Should not reach here."); 2032 mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); 2033 vextracti128_high(vtmp2, src2); 2034 mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2035 } 2036 } 2037 2038 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2039 mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); 2040 vextracti64x4_high(vtmp2, src2); 2041 mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2042 } 2043 2044 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2045 if (opcode == Op_AddReductionVI) { 2046 if (vtmp1 != src2) { 2047 movdqu(vtmp1, src2); 2048 } 2049 phaddw(vtmp1, vtmp1); 2050 phaddw(vtmp1, vtmp1); 2051 } else { 2052 pshufd(vtmp2, src2, 0x1); 2053 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2054 movdqu(vtmp1, vtmp2); 2055 psrldq(vtmp1, 2); 2056 reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2); 2057 } 2058 movdl(vtmp2, src1); 2059 pmovsxwd(vtmp1, vtmp1); 2060 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2061 pextrw(dst, vtmp1, 0x0); 2062 movswl(dst, dst); 2063 } 2064 2065 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2066 if (opcode == Op_AddReductionVI) { 2067 if (vtmp1 != src2) { 2068 movdqu(vtmp1, src2); 2069 } 2070 phaddw(vtmp1, src2); 2071 } else { 2072 pshufd(vtmp1, src2, 0xE); 2073 reduce_operation_128(T_SHORT, opcode, vtmp1, src2); 2074 } 2075 reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2076 } 2077 2078 void 
C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2079 if (opcode == Op_AddReductionVI) { 2080 int vector_len = Assembler::AVX_256bit; 2081 vphaddw(vtmp2, src2, src2, vector_len); 2082 vpermq(vtmp2, vtmp2, 0xD8, vector_len); 2083 } else { 2084 vextracti128_high(vtmp2, src2); 2085 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2086 } 2087 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2088 } 2089 2090 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2091 int vector_len = Assembler::AVX_256bit; 2092 vextracti64x4_high(vtmp1, src2); 2093 reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2); 2094 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2095 } 2096 2097 #ifdef _LP64 2098 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2099 pshufd(vtmp2, src2, 0xE); 2100 reduce_operation_128(T_LONG, opcode, vtmp2, src2); 2101 movdq(vtmp1, src1); 2102 reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2); 2103 movdq(dst, vtmp1); 2104 } 2105 2106 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2107 vextracti128_high(vtmp1, src2); 2108 reduce_operation_128(T_LONG, opcode, vtmp1, src2); 2109 reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2110 } 2111 2112 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2113 vextracti64x4_high(vtmp2, src2); 2114 reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2); 2115 reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2116 } 2117 2118 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) { 2119 mov64(temp, -1L); 2120 bzhiq(temp, temp, len); 2121 kmovql(dst, temp); 2122 } 2123 #endif // _LP64 2124 2125 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2126 reduce_operation_128(T_FLOAT, opcode, dst, src); 2127 pshufd(vtmp, src, 0x1); 2128 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2129 } 2130 2131 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2132 reduce2F(opcode, dst, src, vtmp); 2133 pshufd(vtmp, src, 0x2); 2134 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2135 pshufd(vtmp, src, 0x3); 2136 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2137 } 2138 2139 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2140 reduce4F(opcode, dst, src, vtmp2); 2141 vextractf128_high(vtmp2, src); 2142 reduce4F(opcode, dst, vtmp2, vtmp1); 2143 } 2144 2145 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2146 reduce8F(opcode, dst, src, vtmp1, vtmp2); 2147 vextracti64x4_high(vtmp1, src); 2148 reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2); 2149 } 2150 2151 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2152 reduce_operation_128(T_DOUBLE, opcode, dst, src); 2153 pshufd(vtmp, src, 0xE); 2154 reduce_operation_128(T_DOUBLE, opcode, dst, vtmp); 2155 } 2156 2157 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2158 reduce2D(opcode, dst, src, vtmp2); 2159 vextractf128_high(vtmp2, src); 2160 
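// vtmp2 now holds the upper two doubles; accumulate them into dst as well.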
reduce2D(opcode, dst, vtmp2, vtmp1); 2161 } 2162 2163 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2164 reduce4D(opcode, dst, src, vtmp1, vtmp2); 2165 vextracti64x4_high(vtmp1, src); 2166 reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2); 2167 } 2168 2169 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) { 2170 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2171 } 2172 2173 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) { 2174 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2175 } 2176 2177 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask, 2178 int vec_enc) { 2179 switch(elem_bt) { 2180 case T_INT: 2181 case T_FLOAT: 2182 vmaskmovps(dst, src, mask, vec_enc); 2183 break; 2184 case T_LONG: 2185 case T_DOUBLE: 2186 vmaskmovpd(dst, src, mask, vec_enc); 2187 break; 2188 default: 2189 fatal("Unsupported type %s", type2name(elem_bt)); 2190 break; 2191 } 2192 } 2193 2194 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask, 2195 int vec_enc) { 2196 switch(elem_bt) { 2197 case T_INT: 2198 case T_FLOAT: 2199 vmaskmovps(dst, src, mask, vec_enc); 2200 break; 2201 case T_LONG: 2202 case T_DOUBLE: 2203 vmaskmovpd(dst, src, mask, vec_enc); 2204 break; 2205 default: 2206 fatal("Unsupported type %s", type2name(elem_bt)); 2207 break; 2208 } 2209 } 2210 2211 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid, 2212 XMMRegister dst, XMMRegister src, 2213 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2214 XMMRegister xmm_0, XMMRegister xmm_1) { 2215 const int permconst[] = {1, 14}; 2216 XMMRegister wsrc = src; 2217 XMMRegister wdst = xmm_0; 2218 XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1; 2219 2220 int vlen_enc = Assembler::AVX_128bit; 2221 if (vlen == 16) { 2222 vlen_enc = Assembler::AVX_256bit; 2223 } 2224 2225 for (int i = log2(vlen) - 1; i >=0; i--) { 2226 if (i == 0 && !is_dst_valid) { 2227 wdst = dst; 2228 } 2229 if (i == 3) { 2230 vextracti64x4_high(wtmp, wsrc); 2231 } else if (i == 2) { 2232 vextracti128_high(wtmp, wsrc); 2233 } else { // i = [0,1] 2234 vpermilps(wtmp, wsrc, permconst[i], vlen_enc); 2235 } 2236 vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2237 wsrc = wdst; 2238 vlen_enc = Assembler::AVX_128bit; 2239 } 2240 if (is_dst_valid) { 2241 vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2242 } 2243 } 2244 2245 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src, 2246 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2247 XMMRegister xmm_0, XMMRegister xmm_1) { 2248 XMMRegister wsrc = src; 2249 XMMRegister wdst = xmm_0; 2250 XMMRegister wtmp = (xmm_1 == xnoreg) ? 
xmm_0: xmm_1; 2251 int vlen_enc = Assembler::AVX_128bit; 2252 if (vlen == 8) { 2253 vlen_enc = Assembler::AVX_256bit; 2254 } 2255 for (int i = log2(vlen) - 1; i >=0; i--) { 2256 if (i == 0 && !is_dst_valid) { 2257 wdst = dst; 2258 } 2259 if (i == 1) { 2260 vextracti128_high(wtmp, wsrc); 2261 } else if (i == 2) { 2262 vextracti64x4_high(wtmp, wsrc); 2263 } else { 2264 assert(i == 0, "%d", i); 2265 vpermilpd(wtmp, wsrc, 1, vlen_enc); 2266 } 2267 vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2268 wsrc = wdst; 2269 vlen_enc = Assembler::AVX_128bit; 2270 } 2271 if (is_dst_valid) { 2272 vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2273 } 2274 } 2275 2276 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) { 2277 switch (bt) { 2278 case T_BYTE: pextrb(dst, src, idx); break; 2279 case T_SHORT: pextrw(dst, src, idx); break; 2280 case T_INT: pextrd(dst, src, idx); break; 2281 case T_LONG: pextrq(dst, src, idx); break; 2282 2283 default: 2284 assert(false,"Should not reach here."); 2285 break; 2286 } 2287 } 2288 2289 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) { 2290 int esize = type2aelembytes(typ); 2291 int elem_per_lane = 16/esize; 2292 int lane = elemindex / elem_per_lane; 2293 int eindex = elemindex % elem_per_lane; 2294 2295 if (lane >= 2) { 2296 assert(UseAVX > 2, "required"); 2297 vextractf32x4(dst, src, lane & 3); 2298 return dst; 2299 } else if (lane > 0) { 2300 assert(UseAVX > 0, "required"); 2301 vextractf128(dst, src, lane); 2302 return dst; 2303 } else { 2304 return src; 2305 } 2306 } 2307 2308 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) { 2309 if (typ == T_BYTE) { 2310 movsbl(dst, dst); 2311 } else if (typ == T_SHORT) { 2312 movswl(dst, dst); 2313 } 2314 } 2315 2316 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) { 2317 int esize = type2aelembytes(typ); 2318 int elem_per_lane = 16/esize; 2319 int eindex = elemindex % elem_per_lane; 2320 assert(is_integral_type(typ),"required"); 2321 2322 if (eindex == 0) { 2323 if (typ == T_LONG) { 2324 movq(dst, src); 2325 } else { 2326 movdl(dst, src); 2327 movsxl(typ, dst); 2328 } 2329 } else { 2330 extract(typ, dst, src, eindex); 2331 movsxl(typ, dst); 2332 } 2333 } 2334 2335 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) { 2336 int esize = type2aelembytes(typ); 2337 int elem_per_lane = 16/esize; 2338 int eindex = elemindex % elem_per_lane; 2339 assert((typ == T_FLOAT || typ == T_DOUBLE),"required"); 2340 2341 if (eindex == 0) { 2342 movq(dst, src); 2343 } else { 2344 if (typ == T_FLOAT) { 2345 if (UseAVX == 0) { 2346 movdqu(dst, src); 2347 shufps(dst, dst, eindex); 2348 } else { 2349 vshufps(dst, src, src, eindex, Assembler::AVX_128bit); 2350 } 2351 } else { 2352 if (UseAVX == 0) { 2353 movdqu(dst, src); 2354 psrldq(dst, eindex*esize); 2355 } else { 2356 vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit); 2357 } 2358 movq(dst, dst); 2359 } 2360 } 2361 // Zero upper bits 2362 if (typ == T_FLOAT) { 2363 if (UseAVX == 0) { 2364 assert(vtmp != xnoreg, "required."); 2365 movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg); 2366 pand(dst, vtmp); 2367 } else { 2368 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg); 2369 } 2370 } 2371 } 2372 2373 void 
C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) { 2374 switch(typ) { 2375 case T_BYTE: 2376 case T_BOOLEAN: 2377 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2378 break; 2379 case T_SHORT: 2380 case T_CHAR: 2381 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2382 break; 2383 case T_INT: 2384 case T_FLOAT: 2385 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2386 break; 2387 case T_LONG: 2388 case T_DOUBLE: 2389 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2390 break; 2391 default: 2392 assert(false,"Should not reach here."); 2393 break; 2394 } 2395 } 2396 2397 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) { 2398 assert(rscratch != noreg || always_reachable(src2), "missing"); 2399 2400 switch(typ) { 2401 case T_BOOLEAN: 2402 case T_BYTE: 2403 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2404 break; 2405 case T_CHAR: 2406 case T_SHORT: 2407 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2408 break; 2409 case T_INT: 2410 case T_FLOAT: 2411 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2412 break; 2413 case T_LONG: 2414 case T_DOUBLE: 2415 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2416 break; 2417 default: 2418 assert(false,"Should not reach here."); 2419 break; 2420 } 2421 } 2422 2423 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) { 2424 switch(typ) { 2425 case T_BYTE: 2426 evpblendmb(dst, kmask, src1, src2, merge, vector_len); 2427 break; 2428 case T_SHORT: 2429 evpblendmw(dst, kmask, src1, src2, merge, vector_len); 2430 break; 2431 case T_INT: 2432 case T_FLOAT: 2433 evpblendmd(dst, kmask, src1, src2, merge, vector_len); 2434 break; 2435 case T_LONG: 2436 case T_DOUBLE: 2437 evpblendmq(dst, kmask, src1, src2, merge, vector_len); 2438 break; 2439 default: 2440 assert(false,"Should not reach here."); 2441 break; 2442 } 2443 } 2444 2445 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) { 2446 assert(vlen_in_bytes <= 32, ""); 2447 int esize = type2aelembytes(bt); 2448 if (vlen_in_bytes == 32) { 2449 assert(vtmp == xnoreg, "required."); 2450 if (esize >= 4) { 2451 vtestps(src1, src2, AVX_256bit); 2452 } else { 2453 vptest(src1, src2, AVX_256bit); 2454 } 2455 return; 2456 } 2457 if (vlen_in_bytes < 16) { 2458 // Duplicate the lower part to fill the whole register, 2459 // Don't need to do so for src2 2460 assert(vtmp != xnoreg, "required"); 2461 int shuffle_imm = (vlen_in_bytes == 4) ? 
0x00 : 0x04; 2462 pshufd(vtmp, src1, shuffle_imm); 2463 } else { 2464 assert(vtmp == xnoreg, "required"); 2465 vtmp = src1; 2466 } 2467 if (esize >= 4 && VM_Version::supports_avx()) { 2468 vtestps(vtmp, src2, AVX_128bit); 2469 } else { 2470 ptest(vtmp, src2); 2471 } 2472 } 2473 2474 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) { 2475 assert(UseAVX >= 2, "required"); 2476 #ifdef ASSERT 2477 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2478 bool is_bw_supported = VM_Version::supports_avx512bw(); 2479 if (is_bw && !is_bw_supported) { 2480 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2481 assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16), 2482 "XMM register should be 0-15"); 2483 } 2484 #endif // ASSERT 2485 switch (elem_bt) { 2486 case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return; 2487 case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return; 2488 case T_INT: vpaddd(dst, src1, src2, vlen_enc); return; 2489 case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return; 2490 case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return; 2491 case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return; 2492 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2493 } 2494 } 2495 2496 #ifdef _LP64 2497 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) { 2498 assert(UseAVX >= 2, "required"); 2499 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2500 bool is_vl = vlen_enc != Assembler::AVX_512bit; 2501 if ((UseAVX > 2) && 2502 (!is_bw || VM_Version::supports_avx512bw()) && 2503 (!is_vl || VM_Version::supports_avx512vl())) { 2504 switch (elem_bt) { 2505 case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return; 2506 case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return; 2507 case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return; 2508 case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return; 2509 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2510 } 2511 } else { 2512 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2513 assert((dst->encoding() < 16),"XMM register should be 0-15"); 2514 switch (elem_bt) { 2515 case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return; 2516 case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return; 2517 case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return; 2518 case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return; 2519 case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return; 2520 case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return; 2521 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2522 } 2523 } 2524 } 2525 #endif 2526 2527 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) { 2528 switch (to_elem_bt) { 2529 case T_SHORT: 2530 vpmovsxbw(dst, src, vlen_enc); 2531 break; 2532 case T_INT: 2533 vpmovsxbd(dst, src, vlen_enc); 2534 break; 2535 case T_FLOAT: 2536 vpmovsxbd(dst, src, vlen_enc); 2537 vcvtdq2ps(dst, dst, vlen_enc); 2538 break; 2539 case T_LONG: 2540 vpmovsxbq(dst, src, vlen_enc); 2541 break; 2542 case T_DOUBLE: { 2543 int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? 
Assembler::AVX_256bit : Assembler::AVX_128bit; 2544 vpmovsxbd(dst, src, mid_vlen_enc); 2545 vcvtdq2pd(dst, dst, vlen_enc); 2546 break; 2547 } 2548 default: 2549 fatal("Unsupported type %s", type2name(to_elem_bt)); 2550 break; 2551 } 2552 } 2553 2554 //------------------------------------------------------------------------------------------- 2555 2556 // IndexOf for constant substrings with size >= 8 chars 2557 // which don't need to be loaded through stack. 2558 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2, 2559 Register cnt1, Register cnt2, 2560 int int_cnt2, Register result, 2561 XMMRegister vec, Register tmp, 2562 int ae) { 2563 ShortBranchVerifier sbv(this); 2564 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2565 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2566 2567 // This method uses the pcmpestri instruction with bound registers 2568 // inputs: 2569 // xmm - substring 2570 // rax - substring length (elements count) 2571 // mem - scanned string 2572 // rdx - string length (elements count) 2573 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2574 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2575 // outputs: 2576 // rcx - matched index in string 2577 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2578 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2579 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2580 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2581 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1; 2582 2583 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, 2584 RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR, 2585 MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE; 2586 2587 // Note, inline_string_indexOf() generates checks: 2588 // if (substr.count > string.count) return -1; 2589 // if (substr.count == 0) return 0; 2590 assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars"); 2591 2592 // Load substring. 2593 if (ae == StrIntrinsicNode::UL) { 2594 pmovzxbw(vec, Address(str2, 0)); 2595 } else { 2596 movdqu(vec, Address(str2, 0)); 2597 } 2598 movl(cnt2, int_cnt2); 2599 movptr(result, str1); // string addr 2600 2601 if (int_cnt2 > stride) { 2602 jmpb(SCAN_TO_SUBSTR); 2603 2604 // Reload substr for rescan, this code 2605 // is executed only for large substrings (> 8 chars) 2606 bind(RELOAD_SUBSTR); 2607 if (ae == StrIntrinsicNode::UL) { 2608 pmovzxbw(vec, Address(str2, 0)); 2609 } else { 2610 movdqu(vec, Address(str2, 0)); 2611 } 2612 negptr(cnt2); // Jumped here with negative cnt2, convert to positive 2613 2614 bind(RELOAD_STR); 2615 // We came here after the beginning of the substring was 2616 // matched but the rest of it was not so we need to search 2617 // again. Start from the next element after the previous match. 2618 2619 // cnt2 is number of substring reminding elements and 2620 // cnt1 is number of string reminding elements when cmp failed. 
2621 // Restored cnt1 = cnt1 - cnt2 + int_cnt2 2622 subl(cnt1, cnt2); 2623 addl(cnt1, int_cnt2); 2624 movl(cnt2, int_cnt2); // Now restore cnt2 2625 2626 decrementl(cnt1); // Shift to next element 2627 cmpl(cnt1, cnt2); 2628 jcc(Assembler::negative, RET_NOT_FOUND); // Left less than substring 2629 2630 addptr(result, (1<<scale1)); 2631 2632 } // (int_cnt2 > 8) 2633 2634 // Scan string for start of substr in 16-byte vectors 2635 bind(SCAN_TO_SUBSTR); 2636 pcmpestri(vec, Address(result, 0), mode); 2637 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 2638 subl(cnt1, stride); 2639 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 2640 cmpl(cnt1, cnt2); 2641 jccb(Assembler::negative, RET_NOT_FOUND); // Left less than substring 2642 addptr(result, 16); 2643 jmpb(SCAN_TO_SUBSTR); 2644 2645 // Found a potential substr 2646 bind(FOUND_CANDIDATE); 2647 // Matched whole vector if first element matched (tmp(rcx) == 0). 2648 if (int_cnt2 == stride) { 2649 jccb(Assembler::overflow, RET_FOUND); // OF == 1 2650 } else { // int_cnt2 > 8 2651 jccb(Assembler::overflow, FOUND_SUBSTR); 2652 } 2653 // After pcmpestri tmp(rcx) contains matched element index 2654 // Compute start addr of substr 2655 lea(result, Address(result, tmp, scale1)); 2656 2657 // Make sure string is still long enough 2658 subl(cnt1, tmp); 2659 cmpl(cnt1, cnt2); 2660 if (int_cnt2 == stride) { 2661 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 2662 } else { // int_cnt2 > 8 2663 jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD); 2664 } 2665 // Left less than substring. 2666 2667 bind(RET_NOT_FOUND); 2668 movl(result, -1); 2669 jmp(EXIT); 2670 2671 if (int_cnt2 > stride) { 2672 // This code is optimized for the case when the whole substring 2673 // is matched if its head is matched. 2674 bind(MATCH_SUBSTR_HEAD); 2675 pcmpestri(vec, Address(result, 0), mode); 2676 // Reload only the string if it does not match 2677 jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0 2678 2679 Label CONT_SCAN_SUBSTR; 2680 // Compare the rest of substring (> 8 chars). 2681 bind(FOUND_SUBSTR); 2682 // First 8 chars are already matched.
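// Below, cnt2 is turned into a negative count of substring elements still to
// compare; each SCAN_SUBSTR iteration steps it toward zero by 'stride' and uses
// it as a negative offset from the substring tail, i.e. (shape only):
//   cnt2 = -(cnt2 - stride);   // remaining elements, as a negative index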
2683 negptr(cnt2); 2684 addptr(cnt2, stride); 2685 2686 bind(SCAN_SUBSTR); 2687 subl(cnt1, stride); 2688 cmpl(cnt2, -stride); // Do not read beyond substring 2689 jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR); 2690 // Back-up strings to avoid reading beyond substring: 2691 // cnt1 = cnt1 - cnt2 + 8 2692 addl(cnt1, cnt2); // cnt2 is negative 2693 addl(cnt1, stride); 2694 movl(cnt2, stride); negptr(cnt2); 2695 bind(CONT_SCAN_SUBSTR); 2696 if (int_cnt2 < (int)G) { 2697 int tail_off1 = int_cnt2<<scale1; 2698 int tail_off2 = int_cnt2<<scale2; 2699 if (ae == StrIntrinsicNode::UL) { 2700 pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2)); 2701 } else { 2702 movdqu(vec, Address(str2, cnt2, scale2, tail_off2)); 2703 } 2704 pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode); 2705 } else { 2706 // calculate index in register to avoid integer overflow (int_cnt2*2) 2707 movl(tmp, int_cnt2); 2708 addptr(tmp, cnt2); 2709 if (ae == StrIntrinsicNode::UL) { 2710 pmovzxbw(vec, Address(str2, tmp, scale2, 0)); 2711 } else { 2712 movdqu(vec, Address(str2, tmp, scale2, 0)); 2713 } 2714 pcmpestri(vec, Address(result, tmp, scale1, 0), mode); 2715 } 2716 // Need to reload strings pointers if not matched whole vector 2717 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 2718 addptr(cnt2, stride); 2719 jcc(Assembler::negative, SCAN_SUBSTR); 2720 // Fall through if found full substring 2721 2722 } // (int_cnt2 > 8) 2723 2724 bind(RET_FOUND); 2725 // Found result if we matched full small substring. 2726 // Compute substr offset 2727 subptr(result, str1); 2728 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 2729 shrl(result, 1); // index 2730 } 2731 bind(EXIT); 2732 2733 } // string_indexofC8 2734 2735 // Small strings are loaded through stack if they cross page boundary. 2736 void C2_MacroAssembler::string_indexof(Register str1, Register str2, 2737 Register cnt1, Register cnt2, 2738 int int_cnt2, Register result, 2739 XMMRegister vec, Register tmp, 2740 int ae) { 2741 ShortBranchVerifier sbv(this); 2742 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2743 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2744 2745 // 2746 // int_cnt2 is length of small (< 8 chars) constant substring 2747 // or (-1) for non constant substring in which case its length 2748 // is in cnt2 register. 2749 // 2750 // Note, inline_string_indexOf() generates checks: 2751 // if (substr.count > string.count) return -1; 2752 // if (substr.count == 0) return 0; 2753 // 2754 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2755 assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0"); 2756 // This method uses the pcmpestri instruction with bound registers 2757 // inputs: 2758 // xmm - substring 2759 // rax - substring length (elements count) 2760 // mem - scanned string 2761 // rdx - string length (elements count) 2762 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2763 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2764 // outputs: 2765 // rcx - matched index in string 2766 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2767 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2768 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2769 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? 
Address::times_1 : scale1; 2770 2771 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR, 2772 RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR, 2773 FOUND_CANDIDATE; 2774 2775 { //======================================================== 2776 // We don't know where these strings are located 2777 // and we can't read beyond them. Load them through stack. 2778 Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR; 2779 2780 movptr(tmp, rsp); // save old SP 2781 2782 if (int_cnt2 > 0) { // small (< 8 chars) constant substring 2783 if (int_cnt2 == (1>>scale2)) { // One byte 2784 assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding"); 2785 load_unsigned_byte(result, Address(str2, 0)); 2786 movdl(vec, result); // move 32 bits 2787 } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) { // Three bytes 2788 // Not enough header space in 32-bit VM: 12+3 = 15. 2789 movl(result, Address(str2, -1)); 2790 shrl(result, 8); 2791 movdl(vec, result); // move 32 bits 2792 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) { // One char 2793 load_unsigned_short(result, Address(str2, 0)); 2794 movdl(vec, result); // move 32 bits 2795 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars 2796 movdl(vec, Address(str2, 0)); // move 32 bits 2797 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars 2798 movq(vec, Address(str2, 0)); // move 64 bits 2799 } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7}) 2800 // Array header size is 12 bytes in 32-bit VM 2801 // + 6 bytes for 3 chars == 18 bytes, 2802 // enough space to load vec and shift. 2803 assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity"); 2804 if (ae == StrIntrinsicNode::UL) { 2805 int tail_off = int_cnt2-8; 2806 pmovzxbw(vec, Address(str2, tail_off)); 2807 psrldq(vec, -2*tail_off); 2808 } 2809 else { 2810 int tail_off = int_cnt2*(1<<scale2); 2811 movdqu(vec, Address(str2, tail_off-16)); 2812 psrldq(vec, 16-tail_off); 2813 } 2814 } 2815 } else { // not constant substring 2816 cmpl(cnt2, stride); 2817 jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough 2818 2819 // We can read beyond string if srt+16 does not cross page boundary 2820 // since heaps are aligned and mapped by pages. 2821 assert(os::vm_page_size() < (int)G, "default page should be small"); 2822 movl(result, str2); // We need only low 32 bits 2823 andl(result, ((int)os::vm_page_size()-1)); 2824 cmpl(result, ((int)os::vm_page_size()-16)); 2825 jccb(Assembler::belowEqual, CHECK_STR); 2826 2827 // Move small strings to stack to allow load 16 bytes into vec. 2828 subptr(rsp, 16); 2829 int stk_offset = wordSize-(1<<scale2); 2830 push(cnt2); 2831 2832 bind(COPY_SUBSTR); 2833 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) { 2834 load_unsigned_byte(result, Address(str2, cnt2, scale2, -1)); 2835 movb(Address(rsp, cnt2, scale2, stk_offset), result); 2836 } else if (ae == StrIntrinsicNode::UU) { 2837 load_unsigned_short(result, Address(str2, cnt2, scale2, -2)); 2838 movw(Address(rsp, cnt2, scale2, stk_offset), result); 2839 } 2840 decrement(cnt2); 2841 jccb(Assembler::notZero, COPY_SUBSTR); 2842 2843 pop(cnt2); 2844 movptr(str2, rsp); // New substring address 2845 } // non constant 2846 2847 bind(CHECK_STR); 2848 cmpl(cnt1, stride); 2849 jccb(Assembler::aboveEqual, BIG_STRINGS); 2850 2851 // Check cross page boundary. 
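// As with the substring above, str1 can be read with whole 16-byte loads only if
//   (str1 & (page_size - 1)) <= page_size - 16
// so that the load cannot run onto the next page; otherwise copy the short
// string to the stack first.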
2852 movl(result, str1); // We need only low 32 bits 2853 andl(result, ((int)os::vm_page_size()-1)); 2854 cmpl(result, ((int)os::vm_page_size()-16)); 2855 jccb(Assembler::belowEqual, BIG_STRINGS); 2856 2857 subptr(rsp, 16); 2858 int stk_offset = -(1<<scale1); 2859 if (int_cnt2 < 0) { // not constant 2860 push(cnt2); 2861 stk_offset += wordSize; 2862 } 2863 movl(cnt2, cnt1); 2864 2865 bind(COPY_STR); 2866 if (ae == StrIntrinsicNode::LL) { 2867 load_unsigned_byte(result, Address(str1, cnt2, scale1, -1)); 2868 movb(Address(rsp, cnt2, scale1, stk_offset), result); 2869 } else { 2870 load_unsigned_short(result, Address(str1, cnt2, scale1, -2)); 2871 movw(Address(rsp, cnt2, scale1, stk_offset), result); 2872 } 2873 decrement(cnt2); 2874 jccb(Assembler::notZero, COPY_STR); 2875 2876 if (int_cnt2 < 0) { // not constant 2877 pop(cnt2); 2878 } 2879 movptr(str1, rsp); // New string address 2880 2881 bind(BIG_STRINGS); 2882 // Load substring. 2883 if (int_cnt2 < 0) { // -1 2884 if (ae == StrIntrinsicNode::UL) { 2885 pmovzxbw(vec, Address(str2, 0)); 2886 } else { 2887 movdqu(vec, Address(str2, 0)); 2888 } 2889 push(cnt2); // substr count 2890 push(str2); // substr addr 2891 push(str1); // string addr 2892 } else { 2893 // Small (< 8 chars) constant substrings are loaded already. 2894 movl(cnt2, int_cnt2); 2895 } 2896 push(tmp); // original SP 2897 2898 } // Finished loading 2899 2900 //======================================================== 2901 // Start search 2902 // 2903 2904 movptr(result, str1); // string addr 2905 2906 if (int_cnt2 < 0) { // Only for non constant substring 2907 jmpb(SCAN_TO_SUBSTR); 2908 2909 // SP saved at sp+0 2910 // String saved at sp+1*wordSize 2911 // Substr saved at sp+2*wordSize 2912 // Substr count saved at sp+3*wordSize 2913 2914 // Reload substr for rescan, this code 2915 // is executed only for large substrings (> 8 chars) 2916 bind(RELOAD_SUBSTR); 2917 movptr(str2, Address(rsp, 2*wordSize)); 2918 movl(cnt2, Address(rsp, 3*wordSize)); 2919 if (ae == StrIntrinsicNode::UL) { 2920 pmovzxbw(vec, Address(str2, 0)); 2921 } else { 2922 movdqu(vec, Address(str2, 0)); 2923 } 2924 // We came here after the beginning of the substring was 2925 // matched but the rest of it was not so we need to search 2926 // again. Start from the next element after the previous match. 2927 subptr(str1, result); // Restore counter 2928 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 2929 shrl(str1, 1); 2930 } 2931 addl(cnt1, str1); 2932 decrementl(cnt1); // Shift to next element 2933 cmpl(cnt1, cnt2); 2934 jcc(Assembler::negative, RET_NOT_FOUND); // Left less then substring 2935 2936 addptr(result, (1<<scale1)); 2937 } // non constant 2938 2939 // Scan string for start of substr in 16-byte vectors 2940 bind(SCAN_TO_SUBSTR); 2941 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2942 pcmpestri(vec, Address(result, 0), mode); 2943 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 2944 subl(cnt1, stride); 2945 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 2946 cmpl(cnt1, cnt2); 2947 jccb(Assembler::negative, RET_NOT_FOUND); // Left less then substring 2948 addptr(result, 16); 2949 2950 bind(ADJUST_STR); 2951 cmpl(cnt1, stride); // Do not read beyond string 2952 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 2953 // Back-up string to avoid reading beyond string. 
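// Fewer than 'stride' elements are left: rewind result so the next 16-byte load
// ends exactly at the end of the string, then rescan that final full vector.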
2954 lea(result, Address(result, cnt1, scale1, -16)); 2955 movl(cnt1, stride); 2956 jmpb(SCAN_TO_SUBSTR); 2957 2958 // Found a potential substr 2959 bind(FOUND_CANDIDATE); 2960 // After pcmpestri tmp(rcx) contains matched element index 2961 2962 // Make sure string is still long enough 2963 subl(cnt1, tmp); 2964 cmpl(cnt1, cnt2); 2965 jccb(Assembler::greaterEqual, FOUND_SUBSTR); 2966 // Left less then substring. 2967 2968 bind(RET_NOT_FOUND); 2969 movl(result, -1); 2970 jmp(CLEANUP); 2971 2972 bind(FOUND_SUBSTR); 2973 // Compute start addr of substr 2974 lea(result, Address(result, tmp, scale1)); 2975 if (int_cnt2 > 0) { // Constant substring 2976 // Repeat search for small substring (< 8 chars) 2977 // from new point without reloading substring. 2978 // Have to check that we don't read beyond string. 2979 cmpl(tmp, stride-int_cnt2); 2980 jccb(Assembler::greater, ADJUST_STR); 2981 // Fall through if matched whole substring. 2982 } else { // non constant 2983 assert(int_cnt2 == -1, "should be != 0"); 2984 2985 addl(tmp, cnt2); 2986 // Found result if we matched whole substring. 2987 cmpl(tmp, stride); 2988 jcc(Assembler::lessEqual, RET_FOUND); 2989 2990 // Repeat search for small substring (<= 8 chars) 2991 // from new point 'str1' without reloading substring. 2992 cmpl(cnt2, stride); 2993 // Have to check that we don't read beyond string. 2994 jccb(Assembler::lessEqual, ADJUST_STR); 2995 2996 Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG; 2997 // Compare the rest of substring (> 8 chars). 2998 movptr(str1, result); 2999 3000 cmpl(tmp, cnt2); 3001 // First 8 chars are already matched. 3002 jccb(Assembler::equal, CHECK_NEXT); 3003 3004 bind(SCAN_SUBSTR); 3005 pcmpestri(vec, Address(str1, 0), mode); 3006 // Need to reload strings pointers if not matched whole vector 3007 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 3008 3009 bind(CHECK_NEXT); 3010 subl(cnt2, stride); 3011 jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring 3012 addptr(str1, 16); 3013 if (ae == StrIntrinsicNode::UL) { 3014 addptr(str2, 8); 3015 } else { 3016 addptr(str2, 16); 3017 } 3018 subl(cnt1, stride); 3019 cmpl(cnt2, stride); // Do not read beyond substring 3020 jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR); 3021 // Back-up strings to avoid reading beyond substring. 
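// Fewer than 'stride' substring elements remain: rewind str2 (and str1 to match)
// so the final vector load (8 bytes of latin1 for UL, 16 bytes otherwise) ends
// exactly at the end of the substring, and fix up cnt1/cnt2 accordingly.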
3022 3023 if (ae == StrIntrinsicNode::UL) { 3024 lea(str2, Address(str2, cnt2, scale2, -8)); 3025 lea(str1, Address(str1, cnt2, scale1, -16)); 3026 } else { 3027 lea(str2, Address(str2, cnt2, scale2, -16)); 3028 lea(str1, Address(str1, cnt2, scale1, -16)); 3029 } 3030 subl(cnt1, cnt2); 3031 movl(cnt2, stride); 3032 addl(cnt1, stride); 3033 bind(CONT_SCAN_SUBSTR); 3034 if (ae == StrIntrinsicNode::UL) { 3035 pmovzxbw(vec, Address(str2, 0)); 3036 } else { 3037 movdqu(vec, Address(str2, 0)); 3038 } 3039 jmp(SCAN_SUBSTR); 3040 3041 bind(RET_FOUND_LONG); 3042 movptr(str1, Address(rsp, wordSize)); 3043 } // non constant 3044 3045 bind(RET_FOUND); 3046 // Compute substr offset 3047 subptr(result, str1); 3048 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3049 shrl(result, 1); // index 3050 } 3051 bind(CLEANUP); 3052 pop(rsp); // restore SP 3053 3054 } // string_indexof 3055 3056 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3057 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3058 ShortBranchVerifier sbv(this); 3059 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3060 3061 int stride = 8; 3062 3063 Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP, 3064 SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP, 3065 RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT, 3066 FOUND_SEQ_CHAR, DONE_LABEL; 3067 3068 movptr(result, str1); 3069 if (UseAVX >= 2) { 3070 cmpl(cnt1, stride); 3071 jcc(Assembler::less, SCAN_TO_CHAR); 3072 cmpl(cnt1, 2*stride); 3073 jcc(Assembler::less, SCAN_TO_8_CHAR_INIT); 3074 movdl(vec1, ch); 3075 vpbroadcastw(vec1, vec1, Assembler::AVX_256bit); 3076 vpxor(vec2, vec2); 3077 movl(tmp, cnt1); 3078 andl(tmp, 0xFFFFFFF0); //vector count (in chars) 3079 andl(cnt1,0x0000000F); //tail count (in chars) 3080 3081 bind(SCAN_TO_16_CHAR_LOOP); 3082 vmovdqu(vec3, Address(result, 0)); 3083 vpcmpeqw(vec3, vec3, vec1, 1); 3084 vptest(vec2, vec3); 3085 jcc(Assembler::carryClear, FOUND_CHAR); 3086 addptr(result, 32); 3087 subl(tmp, 2*stride); 3088 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); 3089 jmp(SCAN_TO_8_CHAR); 3090 bind(SCAN_TO_8_CHAR_INIT); 3091 movdl(vec1, ch); 3092 pshuflw(vec1, vec1, 0x00); 3093 pshufd(vec1, vec1, 0); 3094 pxor(vec2, vec2); 3095 } 3096 bind(SCAN_TO_8_CHAR); 3097 cmpl(cnt1, stride); 3098 jcc(Assembler::less, SCAN_TO_CHAR); 3099 if (UseAVX < 2) { 3100 movdl(vec1, ch); 3101 pshuflw(vec1, vec1, 0x00); 3102 pshufd(vec1, vec1, 0); 3103 pxor(vec2, vec2); 3104 } 3105 movl(tmp, cnt1); 3106 andl(tmp, 0xFFFFFFF8); //vector count (in chars) 3107 andl(cnt1,0x00000007); //tail count (in chars) 3108 3109 bind(SCAN_TO_8_CHAR_LOOP); 3110 movdqu(vec3, Address(result, 0)); 3111 pcmpeqw(vec3, vec1); 3112 ptest(vec2, vec3); 3113 jcc(Assembler::carryClear, FOUND_CHAR); 3114 addptr(result, 16); 3115 subl(tmp, stride); 3116 jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP); 3117 bind(SCAN_TO_CHAR); 3118 testl(cnt1, cnt1); 3119 jcc(Assembler::zero, RET_NOT_FOUND); 3120 bind(SCAN_TO_CHAR_LOOP); 3121 load_unsigned_short(tmp, Address(result, 0)); 3122 cmpl(ch, tmp); 3123 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3124 addptr(result, 2); 3125 subl(cnt1, 1); 3126 jccb(Assembler::zero, RET_NOT_FOUND); 3127 jmp(SCAN_TO_CHAR_LOOP); 3128 3129 bind(RET_NOT_FOUND); 3130 movl(result, -1); 3131 jmpb(DONE_LABEL); 3132 3133 bind(FOUND_CHAR); 3134 if (UseAVX >= 2) { 3135 vpmovmskb(tmp, vec3); 3136 } else { 3137 pmovmskb(tmp, vec3); 3138 } 3139 bsfl(ch, tmp); 3140 addptr(result, ch); 3141 3142 bind(FOUND_SEQ_CHAR); 3143 
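// result holds the address of the matching char; convert it to a char index
// relative to str1 (byte offset divided by 2 for UTF-16 data).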
subptr(result, str1); 3144 shrl(result, 1); 3145 3146 bind(DONE_LABEL); 3147 } // string_indexof_char 3148 3149 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3150 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3151 ShortBranchVerifier sbv(this); 3152 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3153 3154 int stride = 16; 3155 3156 Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP, 3157 SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP, 3158 RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT, 3159 FOUND_SEQ_CHAR, DONE_LABEL; 3160 3161 movptr(result, str1); 3162 if (UseAVX >= 2) { 3163 cmpl(cnt1, stride); 3164 jcc(Assembler::less, SCAN_TO_CHAR_INIT); 3165 cmpl(cnt1, stride*2); 3166 jcc(Assembler::less, SCAN_TO_16_CHAR_INIT); 3167 movdl(vec1, ch); 3168 vpbroadcastb(vec1, vec1, Assembler::AVX_256bit); 3169 vpxor(vec2, vec2); 3170 movl(tmp, cnt1); 3171 andl(tmp, 0xFFFFFFE0); //vector count (in chars) 3172 andl(cnt1,0x0000001F); //tail count (in chars) 3173 3174 bind(SCAN_TO_32_CHAR_LOOP); 3175 vmovdqu(vec3, Address(result, 0)); 3176 vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit); 3177 vptest(vec2, vec3); 3178 jcc(Assembler::carryClear, FOUND_CHAR); 3179 addptr(result, 32); 3180 subl(tmp, stride*2); 3181 jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP); 3182 jmp(SCAN_TO_16_CHAR); 3183 3184 bind(SCAN_TO_16_CHAR_INIT); 3185 movdl(vec1, ch); 3186 pxor(vec2, vec2); 3187 pshufb(vec1, vec2); 3188 } 3189 3190 bind(SCAN_TO_16_CHAR); 3191 cmpl(cnt1, stride); 3192 jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left 3193 if (UseAVX < 2) { 3194 movdl(vec1, ch); 3195 pxor(vec2, vec2); 3196 pshufb(vec1, vec2); 3197 } 3198 movl(tmp, cnt1); 3199 andl(tmp, 0xFFFFFFF0); //vector count (in bytes) 3200 andl(cnt1,0x0000000F); //tail count (in bytes) 3201 3202 bind(SCAN_TO_16_CHAR_LOOP); 3203 movdqu(vec3, Address(result, 0)); 3204 pcmpeqb(vec3, vec1); 3205 ptest(vec2, vec3); 3206 jcc(Assembler::carryClear, FOUND_CHAR); 3207 addptr(result, 16); 3208 subl(tmp, stride); 3209 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items... 
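// The code below is the scalar tail of the Latin-1 variant: fewer than 16 bytes
// remain, so they are compared one at a time. A rough scalar sketch of the
// overall computation (illustrative only; indexOfByte is not a real VM helper):
//   int indexOfByte(const jbyte* s, int len, int ch) {
//     for (int i = 0; i < len; i++) {
//       if ((s[i] & 0xff) == ch) return i;
//     }
//     return -1;
//   }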
3210 3211 bind(SCAN_TO_CHAR_INIT); 3212 testl(cnt1, cnt1); 3213 jcc(Assembler::zero, RET_NOT_FOUND); 3214 bind(SCAN_TO_CHAR_LOOP); 3215 load_unsigned_byte(tmp, Address(result, 0)); 3216 cmpl(ch, tmp); 3217 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3218 addptr(result, 1); 3219 subl(cnt1, 1); 3220 jccb(Assembler::zero, RET_NOT_FOUND); 3221 jmp(SCAN_TO_CHAR_LOOP); 3222 3223 bind(RET_NOT_FOUND); 3224 movl(result, -1); 3225 jmpb(DONE_LABEL); 3226 3227 bind(FOUND_CHAR); 3228 if (UseAVX >= 2) { 3229 vpmovmskb(tmp, vec3); 3230 } else { 3231 pmovmskb(tmp, vec3); 3232 } 3233 bsfl(ch, tmp); 3234 addptr(result, ch); 3235 3236 bind(FOUND_SEQ_CHAR); 3237 subptr(result, str1); 3238 3239 bind(DONE_LABEL); 3240 } // stringL_indexof_char 3241 3242 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) { 3243 switch (eltype) { 3244 case T_BOOLEAN: return sizeof(jboolean); 3245 case T_BYTE: return sizeof(jbyte); 3246 case T_SHORT: return sizeof(jshort); 3247 case T_CHAR: return sizeof(jchar); 3248 case T_INT: return sizeof(jint); 3249 default: 3250 ShouldNotReachHere(); 3251 return -1; 3252 } 3253 } 3254 3255 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) { 3256 switch (eltype) { 3257 // T_BOOLEAN used as surrogate for unsigned byte 3258 case T_BOOLEAN: movzbl(dst, src); break; 3259 case T_BYTE: movsbl(dst, src); break; 3260 case T_SHORT: movswl(dst, src); break; 3261 case T_CHAR: movzwl(dst, src); break; 3262 case T_INT: movl(dst, src); break; 3263 default: 3264 ShouldNotReachHere(); 3265 } 3266 } 3267 3268 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) { 3269 load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8); 3270 } 3271 3272 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) { 3273 load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8); 3274 } 3275 3276 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) { 3277 const int vlen = Assembler::AVX_256bit; 3278 switch (eltype) { 3279 case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3280 case T_BYTE: vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3281 case T_SHORT: vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3282 case T_CHAR: vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3283 case T_INT: 3284 // do nothing 3285 break; 3286 default: 3287 ShouldNotReachHere(); 3288 } 3289 } 3290 3291 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result, 3292 Register index, Register tmp2, Register tmp3, XMMRegister vnext, 3293 XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3, 3294 XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3, 3295 XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3, 3296 BasicType eltype) { 3297 ShortBranchVerifier sbv(this); 3298 assert(UseAVX >= 2, "AVX2 intrinsics are required"); 3299 assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3); 3300 assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3); 3301 3302 Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN, 3303 SHORT_UNROLLED_LOOP_EXIT, 3304 UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME, 3305 UNROLLED_VECTOR_LOOP_BEGIN, 3306 END; 3307 switch (eltype) { 3308 case T_BOOLEAN: 
BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break; 3309 case T_CHAR: BLOCK_COMMENT("arrays_hashcode(char) {"); break; 3310 case T_BYTE: BLOCK_COMMENT("arrays_hashcode(byte) {"); break; 3311 case T_SHORT: BLOCK_COMMENT("arrays_hashcode(short) {"); break; 3312 case T_INT: BLOCK_COMMENT("arrays_hashcode(int) {"); break; 3313 default: BLOCK_COMMENT("arrays_hashcode {"); break; 3314 } 3315 3316 // For "renaming" for readibility of the code 3317 const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 }, 3318 vresult[] = { vresult0, vresult1, vresult2, vresult3 }, 3319 vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 }; 3320 3321 const int elsize = arrays_hashcode_elsize(eltype); 3322 3323 /* 3324 if (cnt1 >= 2) { 3325 if (cnt1 >= 32) { 3326 UNROLLED VECTOR LOOP 3327 } 3328 UNROLLED SCALAR LOOP 3329 } 3330 SINGLE SCALAR 3331 */ 3332 3333 cmpl(cnt1, 32); 3334 jcc(Assembler::less, SHORT_UNROLLED_BEGIN); 3335 3336 // cnt1 >= 32 && generate_vectorized_loop 3337 xorl(index, index); 3338 3339 // vresult = IntVector.zero(I256); 3340 for (int idx = 0; idx < 4; idx++) { 3341 vpxor(vresult[idx], vresult[idx]); 3342 } 3343 // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]); 3344 Register bound = tmp2; 3345 Register next = tmp3; 3346 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint)))); 3347 movl(next, Address(tmp2, 0)); 3348 movdl(vnext, next); 3349 vpbroadcastd(vnext, vnext, Assembler::AVX_256bit); 3350 3351 // index = 0; 3352 // bound = cnt1 & ~(32 - 1); 3353 movl(bound, cnt1); 3354 andl(bound, ~(32 - 1)); 3355 // for (; index < bound; index += 32) { 3356 bind(UNROLLED_VECTOR_LOOP_BEGIN); 3357 // result *= next; 3358 imull(result, next); 3359 // loop fission to upfront the cost of fetching from memory, OOO execution 3360 // can then hopefully do a better job of prefetching 3361 for (int idx = 0; idx < 4; idx++) { 3362 arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype); 3363 } 3364 // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7]; 3365 for (int idx = 0; idx < 4; idx++) { 3366 vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit); 3367 arrays_hashcode_elvcast(vtmp[idx], eltype); 3368 vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit); 3369 } 3370 // index += 32; 3371 addl(index, 32); 3372 // index < bound; 3373 cmpl(index, bound); 3374 jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN); 3375 // } 3376 3377 lea(ary1, Address(ary1, bound, Address::times(elsize))); 3378 subl(cnt1, bound); 3379 // release bound 3380 3381 // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1); 3382 for (int idx = 0; idx < 4; idx++) { 3383 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint)))); 3384 arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT); 3385 vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit); 3386 } 3387 // result += vresult.reduceLanes(ADD); 3388 for (int idx = 0; idx < 4; idx++) { 3389 reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]); 3390 } 3391 3392 // } else if (cnt1 < 32) { 3393 3394 bind(SHORT_UNROLLED_BEGIN); 3395 // int i = 1; 3396 movl(index, 1); 3397 cmpl(index, cnt1); 3398 jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT); 3399 3400 // for (; i < cnt1 ; i += 2) { 3401 bind(SHORT_UNROLLED_LOOP_BEGIN); 3402 movl(tmp3, 961); 3403 imull(result, tmp3); 3404 
arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3405 movl(tmp3, tmp2); 3406 shll(tmp3, 5); 3407 subl(tmp3, tmp2); 3408 addl(result, tmp3); 3409 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype); 3410 addl(result, tmp3); 3411 addl(index, 2); 3412 cmpl(index, cnt1); 3413 jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN); 3414 3415 // } 3416 // if (i >= cnt1) { 3417 bind(SHORT_UNROLLED_LOOP_EXIT); 3418 jccb(Assembler::greater, END); 3419 movl(tmp2, result); 3420 shll(result, 5); 3421 subl(result, tmp2); 3422 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3423 addl(result, tmp3); 3424 // } 3425 bind(END); 3426 3427 BLOCK_COMMENT("} // arrays_hashcode"); 3428 3429 } // arrays_hashcode 3430 3431 // helper function for string_compare 3432 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2, 3433 Address::ScaleFactor scale, Address::ScaleFactor scale1, 3434 Address::ScaleFactor scale2, Register index, int ae) { 3435 if (ae == StrIntrinsicNode::LL) { 3436 load_unsigned_byte(elem1, Address(str1, index, scale, 0)); 3437 load_unsigned_byte(elem2, Address(str2, index, scale, 0)); 3438 } else if (ae == StrIntrinsicNode::UU) { 3439 load_unsigned_short(elem1, Address(str1, index, scale, 0)); 3440 load_unsigned_short(elem2, Address(str2, index, scale, 0)); 3441 } else { 3442 load_unsigned_byte(elem1, Address(str1, index, scale1, 0)); 3443 load_unsigned_short(elem2, Address(str2, index, scale2, 0)); 3444 } 3445 } 3446 3447 // Compare strings, used for char[] and byte[]. 3448 void C2_MacroAssembler::string_compare(Register str1, Register str2, 3449 Register cnt1, Register cnt2, Register result, 3450 XMMRegister vec1, int ae, KRegister mask) { 3451 ShortBranchVerifier sbv(this); 3452 Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL; 3453 Label COMPARE_WIDE_VECTORS_LOOP_FAILED; // used only _LP64 && AVX3 3454 int stride, stride2, adr_stride, adr_stride1, adr_stride2; 3455 int stride2x2 = 0x40; 3456 Address::ScaleFactor scale = Address::no_scale; 3457 Address::ScaleFactor scale1 = Address::no_scale; 3458 Address::ScaleFactor scale2 = Address::no_scale; 3459 3460 if (ae != StrIntrinsicNode::LL) { 3461 stride2x2 = 0x20; 3462 } 3463 3464 if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) { 3465 shrl(cnt2, 1); 3466 } 3467 // Compute the minimum of the string lengths and the 3468 // difference of the string lengths (stack). 3469 // Do the conditional move stuff 3470 movl(result, cnt1); 3471 subl(cnt1, cnt2); 3472 push(cnt1); 3473 cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2) 3474 3475 // Is the minimum length zero? 
3476 testl(cnt2, cnt2); 3477 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3478 if (ae == StrIntrinsicNode::LL) { 3479 // Load first bytes 3480 load_unsigned_byte(result, Address(str1, 0)); // result = str1[0] 3481 load_unsigned_byte(cnt1, Address(str2, 0)); // cnt1 = str2[0] 3482 } else if (ae == StrIntrinsicNode::UU) { 3483 // Load first characters 3484 load_unsigned_short(result, Address(str1, 0)); 3485 load_unsigned_short(cnt1, Address(str2, 0)); 3486 } else { 3487 load_unsigned_byte(result, Address(str1, 0)); 3488 load_unsigned_short(cnt1, Address(str2, 0)); 3489 } 3490 subl(result, cnt1); 3491 jcc(Assembler::notZero, POP_LABEL); 3492 3493 if (ae == StrIntrinsicNode::UU) { 3494 // Divide length by 2 to get number of chars 3495 shrl(cnt2, 1); 3496 } 3497 cmpl(cnt2, 1); 3498 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3499 3500 // Check if the strings start at the same location and setup scale and stride 3501 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3502 cmpptr(str1, str2); 3503 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3504 if (ae == StrIntrinsicNode::LL) { 3505 scale = Address::times_1; 3506 stride = 16; 3507 } else { 3508 scale = Address::times_2; 3509 stride = 8; 3510 } 3511 } else { 3512 scale1 = Address::times_1; 3513 scale2 = Address::times_2; 3514 // scale not used 3515 stride = 8; 3516 } 3517 3518 if (UseAVX >= 2 && UseSSE42Intrinsics) { 3519 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR; 3520 Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR; 3521 Label COMPARE_WIDE_VECTORS_LOOP_AVX2; 3522 Label COMPARE_TAIL_LONG; 3523 Label COMPARE_WIDE_VECTORS_LOOP_AVX3; // used only _LP64 && AVX3 3524 3525 int pcmpmask = 0x19; 3526 if (ae == StrIntrinsicNode::LL) { 3527 pcmpmask &= ~0x01; 3528 } 3529 3530 // Setup to compare 16-chars (32-bytes) vectors, 3531 // start from first character again because it has aligned address. 3532 if (ae == StrIntrinsicNode::LL) { 3533 stride2 = 32; 3534 } else { 3535 stride2 = 16; 3536 } 3537 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3538 adr_stride = stride << scale; 3539 } else { 3540 adr_stride1 = 8; //stride << scale1; 3541 adr_stride2 = 16; //stride << scale2; 3542 } 3543 3544 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3545 // rax and rdx are used by pcmpestri as elements counters 3546 movl(result, cnt2); 3547 andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count 3548 jcc(Assembler::zero, COMPARE_TAIL_LONG); 3549 3550 // fast path : compare first 2 8-char vectors. 
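// The pcmpestri immediate 0x19 is 0b11000 (string compare with negated result)
// + 0b01 (unsigned shorts); for the LL case bit 0 was cleared above so that
// unsigned bytes are compared instead. The register conventions for pcmpestri
// are summarized in a comment in the SSE4.2-only path further below.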
3551 bind(COMPARE_16_CHARS); 3552 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3553 movdqu(vec1, Address(str1, 0)); 3554 } else { 3555 pmovzxbw(vec1, Address(str1, 0)); 3556 } 3557 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3558 jccb(Assembler::below, COMPARE_INDEX_CHAR); 3559 3560 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3561 movdqu(vec1, Address(str1, adr_stride)); 3562 pcmpestri(vec1, Address(str2, adr_stride), pcmpmask); 3563 } else { 3564 pmovzxbw(vec1, Address(str1, adr_stride1)); 3565 pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask); 3566 } 3567 jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS); 3568 addl(cnt1, stride); 3569 3570 // Compare the characters at index in cnt1 3571 bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character 3572 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 3573 subl(result, cnt2); 3574 jmp(POP_LABEL); 3575 3576 // Setup the registers to start vector comparison loop 3577 bind(COMPARE_WIDE_VECTORS); 3578 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3579 lea(str1, Address(str1, result, scale)); 3580 lea(str2, Address(str2, result, scale)); 3581 } else { 3582 lea(str1, Address(str1, result, scale1)); 3583 lea(str2, Address(str2, result, scale2)); 3584 } 3585 subl(result, stride2); 3586 subl(cnt2, stride2); 3587 jcc(Assembler::zero, COMPARE_WIDE_TAIL); 3588 negptr(result); 3589 3590 // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest) 3591 bind(COMPARE_WIDE_VECTORS_LOOP); 3592 3593 #ifdef _LP64 3594 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 3595 cmpl(cnt2, stride2x2); 3596 jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2); 3597 testl(cnt2, stride2x2-1); // cnt2 holds the vector count 3598 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2); // means we cannot subtract by 0x40 3599 3600 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 3601 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3602 evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit); 3603 evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3604 } else { 3605 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit); 3606 evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3607 } 3608 kortestql(mask, mask); 3609 jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED); // miscompare 3610 addptr(result, stride2x2); // update since we already compared at this addr 3611 subl(cnt2, stride2x2); // and sub the size too 3612 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3); 3613 3614 vpxor(vec1, vec1); 3615 jmpb(COMPARE_WIDE_TAIL); 3616 }//if (VM_Version::supports_avx512vlbw()) 3617 #endif // _LP64 3618 3619 3620 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3621 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3622 vmovdqu(vec1, Address(str1, result, scale)); 3623 vpxor(vec1, Address(str2, result, scale)); 3624 } else { 3625 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit); 3626 vpxor(vec1, Address(str2, result, scale2)); 3627 } 3628 vptest(vec1, vec1); 3629 jcc(Assembler::notZero, VECTOR_NOT_EQUAL); 3630 addptr(result, stride2); 3631 subl(cnt2, stride2); 3632 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP); 3633 // clean upper bits of YMM registers 
3634 vpxor(vec1, vec1); 3635 3636 // compare wide vectors tail 3637 bind(COMPARE_WIDE_TAIL); 3638 testptr(result, result); 3639 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3640 3641 movl(result, stride2); 3642 movl(cnt2, result); 3643 negptr(result); 3644 jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3645 3646 // Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors. 3647 bind(VECTOR_NOT_EQUAL); 3648 // clean upper bits of YMM registers 3649 vpxor(vec1, vec1); 3650 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3651 lea(str1, Address(str1, result, scale)); 3652 lea(str2, Address(str2, result, scale)); 3653 } else { 3654 lea(str1, Address(str1, result, scale1)); 3655 lea(str2, Address(str2, result, scale2)); 3656 } 3657 jmp(COMPARE_16_CHARS); 3658 3659 // Compare tail chars, length between 1 to 15 chars 3660 bind(COMPARE_TAIL_LONG); 3661 movl(cnt2, result); 3662 cmpl(cnt2, stride); 3663 jcc(Assembler::less, COMPARE_SMALL_STR); 3664 3665 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3666 movdqu(vec1, Address(str1, 0)); 3667 } else { 3668 pmovzxbw(vec1, Address(str1, 0)); 3669 } 3670 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3671 jcc(Assembler::below, COMPARE_INDEX_CHAR); 3672 subptr(cnt2, stride); 3673 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3674 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3675 lea(str1, Address(str1, result, scale)); 3676 lea(str2, Address(str2, result, scale)); 3677 } else { 3678 lea(str1, Address(str1, result, scale1)); 3679 lea(str2, Address(str2, result, scale2)); 3680 } 3681 negptr(cnt2); 3682 jmpb(WHILE_HEAD_LABEL); 3683 3684 bind(COMPARE_SMALL_STR); 3685 } else if (UseSSE42Intrinsics) { 3686 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL; 3687 int pcmpmask = 0x19; 3688 // Setup to compare 8-char (16-byte) vectors, 3689 // start from first character again because it has aligned address. 
3690 movl(result, cnt2); 3691 andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count 3692 if (ae == StrIntrinsicNode::LL) { 3693 pcmpmask &= ~0x01; 3694 } 3695 jcc(Assembler::zero, COMPARE_TAIL); 3696 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3697 lea(str1, Address(str1, result, scale)); 3698 lea(str2, Address(str2, result, scale)); 3699 } else { 3700 lea(str1, Address(str1, result, scale1)); 3701 lea(str2, Address(str2, result, scale2)); 3702 } 3703 negptr(result); 3704 3705 // pcmpestri 3706 // inputs: 3707 // vec1- substring 3708 // rax - negative string length (elements count) 3709 // mem - scanned string 3710 // rdx - string length (elements count) 3711 // pcmpmask - cmp mode: 11000 (string compare with negated result) 3712 // + 00 (unsigned bytes) or + 01 (unsigned shorts) 3713 // outputs: 3714 // rcx - first mismatched element index 3715 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3716 3717 bind(COMPARE_WIDE_VECTORS); 3718 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3719 movdqu(vec1, Address(str1, result, scale)); 3720 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 3721 } else { 3722 pmovzxbw(vec1, Address(str1, result, scale1)); 3723 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 3724 } 3725 // After pcmpestri cnt1(rcx) contains mismatched element index 3726 3727 jccb(Assembler::below, VECTOR_NOT_EQUAL); // CF==1 3728 addptr(result, stride); 3729 subptr(cnt2, stride); 3730 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 3731 3732 // compare wide vectors tail 3733 testptr(result, result); 3734 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3735 3736 movl(cnt2, stride); 3737 movl(result, stride); 3738 negptr(result); 3739 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3740 movdqu(vec1, Address(str1, result, scale)); 3741 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 3742 } else { 3743 pmovzxbw(vec1, Address(str1, result, scale1)); 3744 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 3745 } 3746 jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL); 3747 3748 // Mismatched characters in the vectors 3749 bind(VECTOR_NOT_EQUAL); 3750 addptr(cnt1, result); 3751 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 3752 subl(result, cnt2); 3753 jmpb(POP_LABEL); 3754 3755 bind(COMPARE_TAIL); // limit is zero 3756 movl(cnt2, result); 3757 // Fallthru to tail compare 3758 } 3759 // Shift str2 and str1 to the end of the arrays, negate min 3760 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3761 lea(str1, Address(str1, cnt2, scale)); 3762 lea(str2, Address(str2, cnt2, scale)); 3763 } else { 3764 lea(str1, Address(str1, cnt2, scale1)); 3765 lea(str2, Address(str2, cnt2, scale2)); 3766 } 3767 decrementl(cnt2); // first character was compared already 3768 negptr(cnt2); 3769 3770 // Compare the rest of the elements 3771 bind(WHILE_HEAD_LABEL); 3772 load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae); 3773 subl(result, cnt1); 3774 jccb(Assembler::notZero, POP_LABEL); 3775 increment(cnt2); 3776 jccb(Assembler::notZero, WHILE_HEAD_LABEL); 3777 3778 // Strings are equal up to min length. Return the length difference. 
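// For reference, the value produced by this intrinsic corresponds to the
// scalar comparison sketched below (illustrative pseudocode, not the actual
// library code):
//   int compare(s1, len1, s2, len2) {
//     int min = min(len1, len2);
//     for (int i = 0; i < min; i++) {
//       if (s1[i] != s2[i]) return s1[i] - s2[i];   // elements read unsigned
//     }
//     return len1 - len2;
//   }
// LENGTH_DIFF_LABEL below handles the "return len1 - len2" case using the
// length difference pushed onto the stack at the start of the routine.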
3779 bind(LENGTH_DIFF_LABEL); 3780 pop(result); 3781 if (ae == StrIntrinsicNode::UU) { 3782 // Divide diff by 2 to get number of chars 3783 sarl(result, 1); 3784 } 3785 jmpb(DONE_LABEL); 3786 3787 #ifdef _LP64 3788 if (VM_Version::supports_avx512vlbw()) { 3789 3790 bind(COMPARE_WIDE_VECTORS_LOOP_FAILED); 3791 3792 kmovql(cnt1, mask); 3793 notq(cnt1); 3794 bsfq(cnt2, cnt1); 3795 if (ae != StrIntrinsicNode::LL) { 3796 // Divide diff by 2 to get number of chars 3797 sarl(cnt2, 1); 3798 } 3799 addq(result, cnt2); 3800 if (ae == StrIntrinsicNode::LL) { 3801 load_unsigned_byte(cnt1, Address(str2, result)); 3802 load_unsigned_byte(result, Address(str1, result)); 3803 } else if (ae == StrIntrinsicNode::UU) { 3804 load_unsigned_short(cnt1, Address(str2, result, scale)); 3805 load_unsigned_short(result, Address(str1, result, scale)); 3806 } else { 3807 load_unsigned_short(cnt1, Address(str2, result, scale2)); 3808 load_unsigned_byte(result, Address(str1, result, scale1)); 3809 } 3810 subl(result, cnt1); 3811 jmpb(POP_LABEL); 3812 }//if (VM_Version::supports_avx512vlbw()) 3813 #endif // _LP64 3814 3815 // Discard the stored length difference 3816 bind(POP_LABEL); 3817 pop(cnt1); 3818 3819 // That's it 3820 bind(DONE_LABEL); 3821 if(ae == StrIntrinsicNode::UL) { 3822 negl(result); 3823 } 3824 3825 } 3826 3827 // Search for Non-ASCII character (Negative byte value) in a byte array, 3828 // return the index of the first such character, otherwise the length 3829 // of the array segment searched. 3830 // ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java 3831 // @IntrinsicCandidate 3832 // public static int countPositives(byte[] ba, int off, int len) { 3833 // for (int i = off; i < off + len; i++) { 3834 // if (ba[i] < 0) { 3835 // return i - off; 3836 // } 3837 // } 3838 // return len; 3839 // } 3840 void C2_MacroAssembler::count_positives(Register ary1, Register len, 3841 Register result, Register tmp1, 3842 XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) { 3843 // rsi: byte array 3844 // rcx: len 3845 // rax: result 3846 ShortBranchVerifier sbv(this); 3847 assert_different_registers(ary1, len, result, tmp1); 3848 assert_different_registers(vec1, vec2); 3849 Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE; 3850 3851 movl(result, len); // copy 3852 // len == 0 3853 testl(len, len); 3854 jcc(Assembler::zero, DONE); 3855 3856 if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512 3857 VM_Version::supports_avx512vlbw() && 3858 VM_Version::supports_bmi2()) { 3859 3860 Label test_64_loop, test_tail, BREAK_LOOP; 3861 Register tmp3_aliased = len; 3862 3863 movl(tmp1, len); 3864 vpxor(vec2, vec2, vec2, Assembler::AVX_512bit); 3865 3866 andl(tmp1, 64 - 1); // tail count (in chars) 0x3F 3867 andl(len, ~(64 - 1)); // vector count (in chars) 3868 jccb(Assembler::zero, test_tail); 3869 3870 lea(ary1, Address(ary1, len, Address::times_1)); 3871 negptr(len); 3872 3873 bind(test_64_loop); 3874 // Check whether our 64 elements of size byte contain negatives 3875 evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit); 3876 kortestql(mask1, mask1); 3877 jcc(Assembler::notZero, BREAK_LOOP); 3878 3879 addptr(len, 64); 3880 jccb(Assembler::notZero, test_64_loop); 3881 3882 bind(test_tail); 3883 // bail out when there is nothing to be done 3884 testl(tmp1, -1); 3885 jcc(Assembler::zero, DONE); 3886 3887 // ~(~0 << len) applied up to two times (for 32-bit scenario) 3888 #ifdef _LP64 3889 mov64(tmp3_aliased, 
0xFFFFFFFFFFFFFFFF);
3890 shlxq(tmp3_aliased, tmp3_aliased, tmp1);
3891 notq(tmp3_aliased);
3892 kmovql(mask2, tmp3_aliased);
3893 #else
3894 Label k_init;
3895 jmp(k_init);
3896
3897 // We cannot read 64 bits from a general purpose register here, so we place
3898 // the data required to compose 64 1's in the instruction stream instead.
3899 // We emit a 64-byte wide series of elements from 0..63 which is later on
3900 // used as compare targets with the tail count contained in the tmp1 register.
3901 // The result is a k register having tmp1 consecutive 1's set, counting from
3902 // the least significant bit.
3903 address tmp = pc();
3904 emit_int64(0x0706050403020100);
3905 emit_int64(0x0F0E0D0C0B0A0908);
3906 emit_int64(0x1716151413121110);
3907 emit_int64(0x1F1E1D1C1B1A1918);
3908 emit_int64(0x2726252423222120);
3909 emit_int64(0x2F2E2D2C2B2A2928);
3910 emit_int64(0x3736353433323130);
3911 emit_int64(0x3F3E3D3C3B3A3938);
3912
3913 bind(k_init);
3914 lea(len, InternalAddress(tmp));
3915 // create mask to test for negative byte inside a vector
3916 evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
3917 evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit);
3918
3919 #endif
3920 evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
3921 ktestq(mask1, mask2);
3922 jcc(Assembler::zero, DONE);
3923
3924 bind(BREAK_LOOP);
3925 // At least one byte in the last 64 bytes is negative.
3926 // Set up to look at the last 64 bytes as if they were a tail
3927 lea(ary1, Address(ary1, len, Address::times_1));
3928 addptr(result, len);
3929 // Ignore the very last byte: if all others are positive,
3930 // it must be negative, so we can skip right to the 2+1 byte
3931 // end comparison at this point
3932 orl(result, 63);
3933 movl(len, 63);
3934 // Fallthru to tail compare
3935 } else {
3936
3937 if (UseAVX >= 2 && UseSSE >= 2) {
3938 // With AVX2, use 32-byte vector compare
3939 Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
3940
3941 // Compare 32-byte vectors
3942 testl(len, 0xffffffe0); // vector count (in bytes)
3943 jccb(Assembler::zero, TAIL_START);
3944
3945 andl(len, 0xffffffe0);
3946 lea(ary1, Address(ary1, len, Address::times_1));
3947 negptr(len);
3948
3949 movl(tmp1, 0x80808080); // create mask to test for negative bytes in vector
3950 movdl(vec2, tmp1);
3951 vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
3952
3953 bind(COMPARE_WIDE_VECTORS);
3954 vmovdqu(vec1, Address(ary1, len, Address::times_1));
3955 vptest(vec1, vec2);
3956 jccb(Assembler::notZero, BREAK_LOOP);
3957 addptr(len, 32);
3958 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3959
3960 testl(result, 0x0000001f); // any bytes remaining?
3961 jcc(Assembler::zero, DONE);
3962
3963 // Quick test using the already prepared vector mask
3964 movl(len, result);
3965 andl(len, 0x0000001f);
3966 vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
3967 vptest(vec1, vec2);
3968 jcc(Assembler::zero, DONE);
3969 // There are negative bytes; jump to the tail to determine exactly where
3970 jmpb(TAIL_START);
3971
3972 bind(BREAK_LOOP);
3973 // At least one byte in the last 32-byte vector is negative.
3974 // Set up to look at the last 32 bytes as if they were a tail 3975 lea(ary1, Address(ary1, len, Address::times_1)); 3976 addptr(result, len); 3977 // Ignore the very last byte: if all others are positive, 3978 // it must be negative, so we can skip right to the 2+1 byte 3979 // end comparison at this point 3980 orl(result, 31); 3981 movl(len, 31); 3982 // Fallthru to tail compare 3983 } else if (UseSSE42Intrinsics) { 3984 // With SSE4.2, use double quad vector compare 3985 Label COMPARE_WIDE_VECTORS, BREAK_LOOP; 3986 3987 // Compare 16-byte vectors 3988 testl(len, 0xfffffff0); // vector count (in bytes) 3989 jcc(Assembler::zero, TAIL_START); 3990 3991 andl(len, 0xfffffff0); 3992 lea(ary1, Address(ary1, len, Address::times_1)); 3993 negptr(len); 3994 3995 movl(tmp1, 0x80808080); 3996 movdl(vec2, tmp1); 3997 pshufd(vec2, vec2, 0); 3998 3999 bind(COMPARE_WIDE_VECTORS); 4000 movdqu(vec1, Address(ary1, len, Address::times_1)); 4001 ptest(vec1, vec2); 4002 jccb(Assembler::notZero, BREAK_LOOP); 4003 addptr(len, 16); 4004 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 4005 4006 testl(result, 0x0000000f); // len is zero, any bytes remaining? 4007 jcc(Assembler::zero, DONE); 4008 4009 // Quick test using the already prepared vector mask 4010 movl(len, result); 4011 andl(len, 0x0000000f); // tail count (in bytes) 4012 movdqu(vec1, Address(ary1, len, Address::times_1, -16)); 4013 ptest(vec1, vec2); 4014 jcc(Assembler::zero, DONE); 4015 jmpb(TAIL_START); 4016 4017 bind(BREAK_LOOP); 4018 // At least one byte in the last 16-byte vector is negative. 4019 // Set up and look at the last 16 bytes as if they were a tail 4020 lea(ary1, Address(ary1, len, Address::times_1)); 4021 addptr(result, len); 4022 // Ignore the very last byte: if all others are positive, 4023 // it must be negative, so we can skip right to the 2+1 byte 4024 // end comparison at this point 4025 orl(result, 15); 4026 movl(len, 15); 4027 // Fallthru to tail compare 4028 } 4029 } 4030 4031 bind(TAIL_START); 4032 // Compare 4-byte vectors 4033 andl(len, 0xfffffffc); // vector count (in bytes) 4034 jccb(Assembler::zero, COMPARE_CHAR); 4035 4036 lea(ary1, Address(ary1, len, Address::times_1)); 4037 negptr(len); 4038 4039 bind(COMPARE_VECTORS); 4040 movl(tmp1, Address(ary1, len, Address::times_1)); 4041 andl(tmp1, 0x80808080); 4042 jccb(Assembler::notZero, TAIL_ADJUST); 4043 addptr(len, 4); 4044 jccb(Assembler::notZero, COMPARE_VECTORS); 4045 4046 // Compare trailing char (final 2-3 bytes), if any 4047 bind(COMPARE_CHAR); 4048 4049 testl(result, 0x2); // tail char 4050 jccb(Assembler::zero, COMPARE_BYTE); 4051 load_unsigned_short(tmp1, Address(ary1, 0)); 4052 andl(tmp1, 0x00008080); 4053 jccb(Assembler::notZero, CHAR_ADJUST); 4054 lea(ary1, Address(ary1, 2)); 4055 4056 bind(COMPARE_BYTE); 4057 testl(result, 0x1); // tail byte 4058 jccb(Assembler::zero, DONE); 4059 load_unsigned_byte(tmp1, Address(ary1, 0)); 4060 testl(tmp1, 0x00000080); 4061 jccb(Assembler::zero, DONE); 4062 subptr(result, 1); 4063 jmpb(DONE); 4064 4065 bind(TAIL_ADJUST); 4066 // there are negative bits in the last 4 byte block. 4067 // Adjust result and check the next three bytes 4068 addptr(result, len); 4069 orl(result, 3); 4070 lea(ary1, Address(ary1, len, Address::times_1)); 4071 jmpb(COMPARE_CHAR); 4072 4073 bind(CHAR_ADJUST); 4074 // We are looking at a char + optional byte tail, and found that one 4075 // of the bytes in the char is negative. Adjust the result, check the 4076 // first byte and readjust if needed. 
4077 andl(result, 0xfffffffc); 4078 testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first 4079 jccb(Assembler::notZero, DONE); 4080 addptr(result, 1); 4081 4082 // That's it 4083 bind(DONE); 4084 if (UseAVX >= 2 && UseSSE >= 2) { 4085 // clean upper bits of YMM registers 4086 vpxor(vec1, vec1); 4087 vpxor(vec2, vec2); 4088 } 4089 } 4090 4091 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings. 4092 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2, 4093 Register limit, Register result, Register chr, 4094 XMMRegister vec1, XMMRegister vec2, bool is_char, KRegister mask) { 4095 ShortBranchVerifier sbv(this); 4096 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE; 4097 4098 int length_offset = arrayOopDesc::length_offset_in_bytes(); 4099 int base_offset = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE); 4100 4101 if (is_array_equ) { 4102 // Check the input args 4103 cmpoop(ary1, ary2); 4104 jcc(Assembler::equal, TRUE_LABEL); 4105 4106 // Need additional checks for arrays_equals. 4107 testptr(ary1, ary1); 4108 jcc(Assembler::zero, FALSE_LABEL); 4109 testptr(ary2, ary2); 4110 jcc(Assembler::zero, FALSE_LABEL); 4111 4112 // Check the lengths 4113 movl(limit, Address(ary1, length_offset)); 4114 cmpl(limit, Address(ary2, length_offset)); 4115 jcc(Assembler::notEqual, FALSE_LABEL); 4116 } 4117 4118 // count == 0 4119 testl(limit, limit); 4120 jcc(Assembler::zero, TRUE_LABEL); 4121 4122 if (is_array_equ) { 4123 // Load array address 4124 lea(ary1, Address(ary1, base_offset)); 4125 lea(ary2, Address(ary2, base_offset)); 4126 } 4127 4128 if (is_array_equ && is_char) { 4129 // arrays_equals when used for char[]. 4130 shll(limit, 1); // byte count != 0 4131 } 4132 movl(result, limit); // copy 4133 4134 if (UseAVX >= 2) { 4135 // With AVX2, use 32-byte vector compare 4136 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 4137 4138 // Compare 32-byte vectors 4139 andl(result, 0x0000001f); // tail count (in bytes) 4140 andl(limit, 0xffffffe0); // vector count (in bytes) 4141 jcc(Assembler::zero, COMPARE_TAIL); 4142 4143 lea(ary1, Address(ary1, limit, Address::times_1)); 4144 lea(ary2, Address(ary2, limit, Address::times_1)); 4145 negptr(limit); 4146 4147 #ifdef _LP64 4148 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 4149 Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3; 4150 4151 cmpl(limit, -64); 4152 jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2); 4153 4154 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 4155 4156 evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit); 4157 evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit); 4158 kortestql(mask, mask); 4159 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4160 addptr(limit, 64); // update since we already compared at this addr 4161 cmpl(limit, -64); 4162 jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3); 4163 4164 // At this point we may still need to compare -limit+result bytes. 4165 // We could execute the next two instruction and just continue via non-wide path: 4166 // cmpl(limit, 0); 4167 // jcc(Assembler::equal, COMPARE_TAIL); // true 4168 // But since we stopped at the points ary{1,2}+limit which are 4169 // not farther than 64 bytes from the ends of arrays ary{1,2}+result 4170 // (|limit| <= 32 and result < 32), 4171 // we may just compare the last 64 bytes. 
4172 // 4173 addptr(result, -64); // it is safe, bc we just came from this area 4174 evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit); 4175 evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit); 4176 kortestql(mask, mask); 4177 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4178 4179 jmp(TRUE_LABEL); 4180 4181 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 4182 4183 }//if (VM_Version::supports_avx512vlbw()) 4184 #endif //_LP64 4185 bind(COMPARE_WIDE_VECTORS); 4186 vmovdqu(vec1, Address(ary1, limit, Address::times_1)); 4187 vmovdqu(vec2, Address(ary2, limit, Address::times_1)); 4188 vpxor(vec1, vec2); 4189 4190 vptest(vec1, vec1); 4191 jcc(Assembler::notZero, FALSE_LABEL); 4192 addptr(limit, 32); 4193 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4194 4195 testl(result, result); 4196 jcc(Assembler::zero, TRUE_LABEL); 4197 4198 vmovdqu(vec1, Address(ary1, result, Address::times_1, -32)); 4199 vmovdqu(vec2, Address(ary2, result, Address::times_1, -32)); 4200 vpxor(vec1, vec2); 4201 4202 vptest(vec1, vec1); 4203 jccb(Assembler::notZero, FALSE_LABEL); 4204 jmpb(TRUE_LABEL); 4205 4206 bind(COMPARE_TAIL); // limit is zero 4207 movl(limit, result); 4208 // Fallthru to tail compare 4209 } else if (UseSSE42Intrinsics) { 4210 // With SSE4.2, use double quad vector compare 4211 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 4212 4213 // Compare 16-byte vectors 4214 andl(result, 0x0000000f); // tail count (in bytes) 4215 andl(limit, 0xfffffff0); // vector count (in bytes) 4216 jcc(Assembler::zero, COMPARE_TAIL); 4217 4218 lea(ary1, Address(ary1, limit, Address::times_1)); 4219 lea(ary2, Address(ary2, limit, Address::times_1)); 4220 negptr(limit); 4221 4222 bind(COMPARE_WIDE_VECTORS); 4223 movdqu(vec1, Address(ary1, limit, Address::times_1)); 4224 movdqu(vec2, Address(ary2, limit, Address::times_1)); 4225 pxor(vec1, vec2); 4226 4227 ptest(vec1, vec1); 4228 jcc(Assembler::notZero, FALSE_LABEL); 4229 addptr(limit, 16); 4230 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4231 4232 testl(result, result); 4233 jcc(Assembler::zero, TRUE_LABEL); 4234 4235 movdqu(vec1, Address(ary1, result, Address::times_1, -16)); 4236 movdqu(vec2, Address(ary2, result, Address::times_1, -16)); 4237 pxor(vec1, vec2); 4238 4239 ptest(vec1, vec1); 4240 jccb(Assembler::notZero, FALSE_LABEL); 4241 jmpb(TRUE_LABEL); 4242 4243 bind(COMPARE_TAIL); // limit is zero 4244 movl(limit, result); 4245 // Fallthru to tail compare 4246 } 4247 4248 // Compare 4-byte vectors 4249 andl(limit, 0xfffffffc); // vector count (in bytes) 4250 jccb(Assembler::zero, COMPARE_CHAR); 4251 4252 lea(ary1, Address(ary1, limit, Address::times_1)); 4253 lea(ary2, Address(ary2, limit, Address::times_1)); 4254 negptr(limit); 4255 4256 bind(COMPARE_VECTORS); 4257 movl(chr, Address(ary1, limit, Address::times_1)); 4258 cmpl(chr, Address(ary2, limit, Address::times_1)); 4259 jccb(Assembler::notEqual, FALSE_LABEL); 4260 addptr(limit, 4); 4261 jcc(Assembler::notZero, COMPARE_VECTORS); 4262 4263 // Compare trailing char (final 2 bytes), if any 4264 bind(COMPARE_CHAR); 4265 testl(result, 0x2); // tail char 4266 jccb(Assembler::zero, COMPARE_BYTE); 4267 load_unsigned_short(chr, Address(ary1, 0)); 4268 load_unsigned_short(limit, Address(ary2, 0)); 4269 cmpl(chr, limit); 4270 jccb(Assembler::notEqual, FALSE_LABEL); 4271 4272 if (is_array_equ && is_char) { 4273 bind(COMPARE_BYTE); 4274 } else { 4275 lea(ary1, Address(ary1, 2)); 4276 lea(ary2, Address(ary2, 2)); 4277 4278 bind(COMPARE_BYTE); 4279 testl(result, 0x1); 
// tail byte 4280 jccb(Assembler::zero, TRUE_LABEL); 4281 load_unsigned_byte(chr, Address(ary1, 0)); 4282 load_unsigned_byte(limit, Address(ary2, 0)); 4283 cmpl(chr, limit); 4284 jccb(Assembler::notEqual, FALSE_LABEL); 4285 } 4286 bind(TRUE_LABEL); 4287 movl(result, 1); // return true 4288 jmpb(DONE); 4289 4290 bind(FALSE_LABEL); 4291 xorl(result, result); // return false 4292 4293 // That's it 4294 bind(DONE); 4295 if (UseAVX >= 2) { 4296 // clean upper bits of YMM registers 4297 vpxor(vec1, vec1); 4298 vpxor(vec2, vec2); 4299 } 4300 } 4301 4302 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4303 XMMRegister src1, int imm8, bool merge, int vlen_enc) { 4304 switch(ideal_opc) { 4305 case Op_LShiftVS: 4306 Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break; 4307 case Op_LShiftVI: 4308 Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break; 4309 case Op_LShiftVL: 4310 Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break; 4311 case Op_RShiftVS: 4312 Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break; 4313 case Op_RShiftVI: 4314 Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break; 4315 case Op_RShiftVL: 4316 Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break; 4317 case Op_URShiftVS: 4318 Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break; 4319 case Op_URShiftVI: 4320 Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break; 4321 case Op_URShiftVL: 4322 Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break; 4323 case Op_RotateRightV: 4324 evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4325 case Op_RotateLeftV: 4326 evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4327 default: 4328 fatal("Unsupported masked operation"); break; 4329 } 4330 } 4331 4332 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4333 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc, 4334 bool is_varshift) { 4335 switch (ideal_opc) { 4336 case Op_AddVB: 4337 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4338 case Op_AddVS: 4339 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4340 case Op_AddVI: 4341 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4342 case Op_AddVL: 4343 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4344 case Op_AddVF: 4345 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4346 case Op_AddVD: 4347 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4348 case Op_SubVB: 4349 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4350 case Op_SubVS: 4351 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4352 case Op_SubVI: 4353 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4354 case Op_SubVL: 4355 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4356 case Op_SubVF: 4357 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4358 case Op_SubVD: 4359 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4360 case Op_MulVS: 4361 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4362 case Op_MulVI: 4363 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4364 case Op_MulVL: 4365 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4366 case Op_MulVF: 4367 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4368 case Op_MulVD: 4369 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4370 case Op_DivVF: 4371 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 
4372 case Op_DivVD: 4373 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4374 case Op_SqrtVF: 4375 evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break; 4376 case Op_SqrtVD: 4377 evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break; 4378 case Op_AbsVB: 4379 evpabsb(dst, mask, src2, merge, vlen_enc); break; 4380 case Op_AbsVS: 4381 evpabsw(dst, mask, src2, merge, vlen_enc); break; 4382 case Op_AbsVI: 4383 evpabsd(dst, mask, src2, merge, vlen_enc); break; 4384 case Op_AbsVL: 4385 evpabsq(dst, mask, src2, merge, vlen_enc); break; 4386 case Op_FmaVF: 4387 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4388 case Op_FmaVD: 4389 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4390 case Op_VectorRearrange: 4391 evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break; 4392 case Op_LShiftVS: 4393 evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4394 case Op_LShiftVI: 4395 evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4396 case Op_LShiftVL: 4397 evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4398 case Op_RShiftVS: 4399 evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4400 case Op_RShiftVI: 4401 evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4402 case Op_RShiftVL: 4403 evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4404 case Op_URShiftVS: 4405 evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4406 case Op_URShiftVI: 4407 evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4408 case Op_URShiftVL: 4409 evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4410 case Op_RotateLeftV: 4411 evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4412 case Op_RotateRightV: 4413 evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4414 case Op_MaxV: 4415 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4416 case Op_MinV: 4417 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4418 case Op_XorV: 4419 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4420 case Op_OrV: 4421 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4422 case Op_AndV: 4423 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4424 default: 4425 fatal("Unsupported masked operation"); break; 4426 } 4427 } 4428 4429 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4430 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4431 switch (ideal_opc) { 4432 case Op_AddVB: 4433 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4434 case Op_AddVS: 4435 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4436 case Op_AddVI: 4437 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4438 case Op_AddVL: 4439 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4440 case Op_AddVF: 4441 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4442 case Op_AddVD: 4443 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4444 case Op_SubVB: 4445 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4446 case Op_SubVS: 4447 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4448 case Op_SubVI: 4449 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4450 case Op_SubVL: 4451 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4452 case Op_SubVF: 4453 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4454 case Op_SubVD: 4455 evsubpd(dst, mask, src1, src2, merge, vlen_enc); 
break; 4456 case Op_MulVS: 4457 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4458 case Op_MulVI: 4459 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4460 case Op_MulVL: 4461 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4462 case Op_MulVF: 4463 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4464 case Op_MulVD: 4465 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4466 case Op_DivVF: 4467 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4468 case Op_DivVD: 4469 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4470 case Op_FmaVF: 4471 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4472 case Op_FmaVD: 4473 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4474 case Op_MaxV: 4475 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4476 case Op_MinV: 4477 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4478 case Op_XorV: 4479 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4480 case Op_OrV: 4481 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4482 case Op_AndV: 4483 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4484 default: 4485 fatal("Unsupported masked operation"); break; 4486 } 4487 } 4488 4489 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst, 4490 KRegister src1, KRegister src2) { 4491 BasicType etype = T_ILLEGAL; 4492 switch(mask_len) { 4493 case 2: 4494 case 4: 4495 case 8: etype = T_BYTE; break; 4496 case 16: etype = T_SHORT; break; 4497 case 32: etype = T_INT; break; 4498 case 64: etype = T_LONG; break; 4499 default: fatal("Unsupported type"); break; 4500 } 4501 assert(etype != T_ILLEGAL, ""); 4502 switch(ideal_opc) { 4503 case Op_AndVMask: 4504 kand(etype, dst, src1, src2); break; 4505 case Op_OrVMask: 4506 kor(etype, dst, src1, src2); break; 4507 case Op_XorVMask: 4508 kxor(etype, dst, src1, src2); break; 4509 default: 4510 fatal("Unsupported masked operation"); break; 4511 } 4512 } 4513 4514 /* 4515 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 4516 * If src is NaN, the result is 0. 4517 * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE, 4518 * the result is equal to the value of Integer.MIN_VALUE. 4519 * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE, 4520 * the result is equal to the value of Integer.MAX_VALUE. 4521 */ 4522 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4523 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 4524 Register rscratch, AddressLiteral float_sign_flip, 4525 int vec_enc) { 4526 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4527 Label done; 4528 vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch); 4529 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 4530 vptest(xtmp2, xtmp2, vec_enc); 4531 jccb(Assembler::equal, done); 4532 4533 vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc); 4534 vpxor(xtmp1, xtmp1, xtmp4, vec_enc); 4535 4536 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 4537 vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc); 4538 vblendvps(dst, dst, xtmp4, xtmp3, vec_enc); 4539 4540 // Recompute the mask for remaining special value. 4541 vpxor(xtmp2, xtmp2, xtmp3, vec_enc); 4542 // Extract SRC values corresponding to TRUE mask lanes. 
4543 vpand(xtmp4, xtmp2, src, vec_enc); 4544 // Flip mask bits so that MSB bit of MASK lanes corresponding to +ve special 4545 // values are set. 4546 vpxor(xtmp3, xtmp2, xtmp4, vec_enc); 4547 4548 vblendvps(dst, dst, xtmp1, xtmp3, vec_enc); 4549 bind(done); 4550 } 4551 4552 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4553 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4554 Register rscratch, AddressLiteral float_sign_flip, 4555 int vec_enc) { 4556 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4557 Label done; 4558 evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 4559 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 4560 kortestwl(ktmp1, ktmp1); 4561 jccb(Assembler::equal, done); 4562 4563 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4564 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4565 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 4566 4567 kxorwl(ktmp1, ktmp1, ktmp2); 4568 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4569 vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4570 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 4571 bind(done); 4572 } 4573 4574 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4575 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4576 Register rscratch, AddressLiteral double_sign_flip, 4577 int vec_enc) { 4578 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing"); 4579 4580 Label done; 4581 evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch); 4582 Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc); 4583 kortestwl(ktmp1, ktmp1); 4584 jccb(Assembler::equal, done); 4585 4586 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4587 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4588 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc); 4589 4590 kxorwl(ktmp1, ktmp1, ktmp2); 4591 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4592 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4593 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc); 4594 bind(done); 4595 } 4596 4597 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4598 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4599 Register rscratch, AddressLiteral float_sign_flip, 4600 int vec_enc) { 4601 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4602 Label done; 4603 evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 4604 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 4605 kortestwl(ktmp1, ktmp1); 4606 jccb(Assembler::equal, done); 4607 4608 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4609 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4610 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 4611 4612 kxorwl(ktmp1, ktmp1, ktmp2); 4613 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4614 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4615 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 4616 bind(done); 4617 } 4618 4619 /* 4620 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 4621 * If src is NaN, the result is 0. 4622 * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE, 4623 * the result is equal to the value of Long.MIN_VALUE. 
4624 * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
4625 * the result is equal to the value of Long.MAX_VALUE.
4626 */
4627 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4628 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4629 Register rscratch, AddressLiteral double_sign_flip,
4630 int vec_enc) {
4631 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4632
4633 Label done;
4634 evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
4635 evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
4636 kortestwl(ktmp1, ktmp1);
4637 jccb(Assembler::equal, done);
4638
4639 vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4640 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4641 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
4642
4643 kxorwl(ktmp1, ktmp1, ktmp2);
4644 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4645 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4646 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
4647 bind(done);
4648 }
4649
4650 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
4651 XMMRegister xtmp, int index, int vec_enc) {
4652 assert(vec_enc < Assembler::AVX_512bit, "");
4653 if (vec_enc == Assembler::AVX_256bit) {
4654 vextractf128_high(xtmp, src);
4655 vshufps(dst, src, xtmp, index, vec_enc);
4656 } else {
4657 vshufps(dst, src, zero, index, vec_enc);
4658 }
4659 }
4660
4661 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
4662 XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
4663 AddressLiteral float_sign_flip, int src_vec_enc) {
4664 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4665
4666 Label done;
4667 // Compare the destination lanes with float_sign_flip
4668 // value to get mask for all special values.
4669 movdqu(xtmp1, float_sign_flip, rscratch);
4670 vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
4671 ptest(xtmp2, xtmp2);
4672 jccb(Assembler::equal, done);
4673
4674 // Flip float_sign_flip to get max integer value.
4675 vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
4676 pxor(xtmp1, xtmp4);
4677
4678 // Set destination lanes corresponding to unordered source lanes to zero.
4679 vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
4680 vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);
4681
4682 // Shuffle the mask vector and pack the lower double word from each quadword lane.
4683 vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
4684 vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);
4685
4686 // Recompute the mask for the remaining special values.
4687 pxor(xtmp2, xtmp3);
4688 // Extract mask corresponding to non-negative source lanes.
4689 vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);
4690
4691 // Shuffle the mask vector and pack the lower double word from each quadword lane.
4692 vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
4693 pand(xtmp3, xtmp2);
4694
4695 // Replace destination lanes holding the special value (0x80000000) with max int
4696 // if the corresponding source lane holds a +ve value.
4697 vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit); 4698 bind(done); 4699 } 4700 4701 4702 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero, 4703 XMMRegister xtmp, Register rscratch, int vec_enc) { 4704 switch(to_elem_bt) { 4705 case T_SHORT: 4706 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing"); 4707 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch); 4708 vpackusdw(dst, dst, zero, vec_enc); 4709 if (vec_enc == Assembler::AVX_256bit) { 4710 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc); 4711 } 4712 break; 4713 case T_BYTE: 4714 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing"); 4715 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch); 4716 vpackusdw(dst, dst, zero, vec_enc); 4717 if (vec_enc == Assembler::AVX_256bit) { 4718 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc); 4719 } 4720 vpackuswb(dst, dst, zero, vec_enc); 4721 break; 4722 default: assert(false, "%s", type2name(to_elem_bt)); 4723 } 4724 } 4725 4726 /* 4727 * Algorithm for vector D2L and F2I conversions:- 4728 * a) Perform vector D2L/F2I cast. 4729 * b) Choose fast path if none of the result vector lane contains 0x80000000 value. 4730 * It signifies that source value could be any of the special floating point 4731 * values(NaN,-Inf,Inf,Max,-Min). 4732 * c) Set destination to zero if source is NaN value. 4733 * d) Replace 0x80000000 with MaxInt if source lane contains a +ve value. 4734 */ 4735 4736 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4737 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 4738 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 4739 int to_elem_sz = type2aelembytes(to_elem_bt); 4740 assert(to_elem_sz <= 4, ""); 4741 vcvttps2dq(dst, src, vec_enc); 4742 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc); 4743 if (to_elem_sz < 4) { 4744 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 4745 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc); 4746 } 4747 } 4748 4749 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4750 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip, 4751 Register rscratch, int vec_enc) { 4752 int to_elem_sz = type2aelembytes(to_elem_bt); 4753 assert(to_elem_sz <= 4, ""); 4754 vcvttps2dq(dst, src, vec_enc); 4755 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc); 4756 switch(to_elem_bt) { 4757 case T_INT: 4758 break; 4759 case T_SHORT: 4760 evpmovdw(dst, dst, vec_enc); 4761 break; 4762 case T_BYTE: 4763 evpmovdb(dst, dst, vec_enc); 4764 break; 4765 default: assert(false, "%s", type2name(to_elem_bt)); 4766 } 4767 } 4768 4769 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 4770 KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip, 4771 Register rscratch, int vec_enc) { 4772 evcvttps2qq(dst, src, vec_enc); 4773 vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, 
vec_enc); 4774 } 4775 4776 // Handling for downcasting from double to integer or sub-word types on AVX2. 4777 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4778 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, 4779 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 4780 int to_elem_sz = type2aelembytes(to_elem_bt); 4781 assert(to_elem_sz < 8, ""); 4782 vcvttpd2dq(dst, src, vec_enc); 4783 vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch, 4784 float_sign_flip, vec_enc); 4785 if (to_elem_sz < 4) { 4786 // xtmp4 holds all zero lanes. 4787 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit); 4788 } 4789 } 4790 4791 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, 4792 XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, 4793 KRegister ktmp2, AddressLiteral sign_flip, 4794 Register rscratch, int vec_enc) { 4795 if (VM_Version::supports_avx512dq()) { 4796 evcvttpd2qq(dst, src, vec_enc); 4797 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 4798 switch(to_elem_bt) { 4799 case T_LONG: 4800 break; 4801 case T_INT: 4802 evpmovsqd(dst, dst, vec_enc); 4803 break; 4804 case T_SHORT: 4805 evpmovsqd(dst, dst, vec_enc); 4806 evpmovdw(dst, dst, vec_enc); 4807 break; 4808 case T_BYTE: 4809 evpmovsqd(dst, dst, vec_enc); 4810 evpmovdb(dst, dst, vec_enc); 4811 break; 4812 default: assert(false, "%s", type2name(to_elem_bt)); 4813 } 4814 } else { 4815 assert(type2aelembytes(to_elem_bt) <= 4, ""); 4816 vcvttpd2dq(dst, src, vec_enc); 4817 vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 4818 switch(to_elem_bt) { 4819 case T_INT: 4820 break; 4821 case T_SHORT: 4822 evpmovdw(dst, dst, vec_enc); 4823 break; 4824 case T_BYTE: 4825 evpmovdb(dst, dst, vec_enc); 4826 break; 4827 default: assert(false, "%s", type2name(to_elem_bt)); 4828 } 4829 } 4830 } 4831 4832 #ifdef _LP64 4833 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src, 4834 AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 4835 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 4836 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 4837 // and re-instantiate original MXCSR.RC mode after that. 4838 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 4839 4840 mov64(tmp, julong_cast(0.5L)); 4841 evpbroadcastq(xtmp1, tmp, vec_enc); 4842 vaddpd(xtmp1, src , xtmp1, vec_enc); 4843 evcvtpd2qq(dst, xtmp1, vec_enc); 4844 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 4845 double_sign_flip, vec_enc);; 4846 4847 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 4848 } 4849 4850 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src, 4851 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 4852 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 4853 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 4854 // and re-instantiate original MXCSR.RC mode after that. 
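  // (Illustration only, kept in a comment so it is not compiled.) A scalar
  // sketch of the same rounding step using standard <cfenv>; the helper name
  // is ours, and NaN/overflow lanes are patched up separately afterwards:
  //
  //   #include <cfenv>
  //   #include <cmath>
  //   static int round_half_up(float x) {
  //     const int old_rc = std::fegetround();
  //     std::fesetround(FE_DOWNWARD);            // RC = round toward -inf
  //     int r = (int) std::lrintf(x + 0.5f);     // i.e. (int) floor(x + 0.5f)
  //     std::fesetround(old_rc);                 // restore the caller's mode
  //     return r;
  //   }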
4855 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 4856 4857 movl(tmp, jint_cast(0.5)); 4858 movq(xtmp1, tmp); 4859 vbroadcastss(xtmp1, xtmp1, vec_enc); 4860 vaddps(xtmp1, src , xtmp1, vec_enc); 4861 vcvtps2dq(dst, xtmp1, vec_enc); 4862 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 4863 float_sign_flip, vec_enc); 4864 4865 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 4866 } 4867 4868 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src, 4869 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 4870 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) { 4871 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 4872 // and re-instantiate original MXCSR.RC mode after that. 4873 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 4874 4875 movl(tmp, jint_cast(0.5)); 4876 movq(xtmp1, tmp); 4877 vbroadcastss(xtmp1, xtmp1, vec_enc); 4878 vaddps(xtmp1, src , xtmp1, vec_enc); 4879 vcvtps2dq(dst, xtmp1, vec_enc); 4880 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc); 4881 4882 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 4883 } 4884 #endif // _LP64 4885 4886 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 4887 BasicType from_elem_bt, BasicType to_elem_bt) { 4888 switch (from_elem_bt) { 4889 case T_BYTE: 4890 switch (to_elem_bt) { 4891 case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break; 4892 case T_INT: vpmovzxbd(dst, src, vlen_enc); break; 4893 case T_LONG: vpmovzxbq(dst, src, vlen_enc); break; 4894 default: ShouldNotReachHere(); 4895 } 4896 break; 4897 case T_SHORT: 4898 switch (to_elem_bt) { 4899 case T_INT: vpmovzxwd(dst, src, vlen_enc); break; 4900 case T_LONG: vpmovzxwq(dst, src, vlen_enc); break; 4901 default: ShouldNotReachHere(); 4902 } 4903 break; 4904 case T_INT: 4905 assert(to_elem_bt == T_LONG, ""); 4906 vpmovzxdq(dst, src, vlen_enc); 4907 break; 4908 default: 4909 ShouldNotReachHere(); 4910 } 4911 } 4912 4913 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 4914 BasicType from_elem_bt, BasicType to_elem_bt) { 4915 switch (from_elem_bt) { 4916 case T_BYTE: 4917 switch (to_elem_bt) { 4918 case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break; 4919 case T_INT: vpmovsxbd(dst, src, vlen_enc); break; 4920 case T_LONG: vpmovsxbq(dst, src, vlen_enc); break; 4921 default: ShouldNotReachHere(); 4922 } 4923 break; 4924 case T_SHORT: 4925 switch (to_elem_bt) { 4926 case T_INT: vpmovsxwd(dst, src, vlen_enc); break; 4927 case T_LONG: vpmovsxwq(dst, src, vlen_enc); break; 4928 default: ShouldNotReachHere(); 4929 } 4930 break; 4931 case T_INT: 4932 assert(to_elem_bt == T_LONG, ""); 4933 vpmovsxdq(dst, src, vlen_enc); 4934 break; 4935 default: 4936 ShouldNotReachHere(); 4937 } 4938 } 4939 4940 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src, 4941 BasicType dst_bt, BasicType src_bt, int vlen) { 4942 int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen); 4943 assert(vlen_enc != AVX_512bit, ""); 4944 4945 int dst_bt_size = type2aelembytes(dst_bt); 4946 int src_bt_size = type2aelembytes(src_bt); 4947 if (dst_bt_size > src_bt_size) { 4948 switch (dst_bt_size / src_bt_size) { 4949 case 2: vpmovsxbw(dst, src, vlen_enc); break; 4950 case 4: vpmovsxbd(dst, src, 
vlen_enc); break; 4951 case 8: vpmovsxbq(dst, src, vlen_enc); break; 4952 default: ShouldNotReachHere(); 4953 } 4954 } else { 4955 assert(dst_bt_size < src_bt_size, ""); 4956 switch (src_bt_size / dst_bt_size) { 4957 case 2: { 4958 if (vlen_enc == AVX_128bit) { 4959 vpacksswb(dst, src, src, vlen_enc); 4960 } else { 4961 vpacksswb(dst, src, src, vlen_enc); 4962 vpermq(dst, dst, 0x08, vlen_enc); 4963 } 4964 break; 4965 } 4966 case 4: { 4967 if (vlen_enc == AVX_128bit) { 4968 vpackssdw(dst, src, src, vlen_enc); 4969 vpacksswb(dst, dst, dst, vlen_enc); 4970 } else { 4971 vpackssdw(dst, src, src, vlen_enc); 4972 vpermq(dst, dst, 0x08, vlen_enc); 4973 vpacksswb(dst, dst, dst, AVX_128bit); 4974 } 4975 break; 4976 } 4977 case 8: { 4978 if (vlen_enc == AVX_128bit) { 4979 vpshufd(dst, src, 0x08, vlen_enc); 4980 vpackssdw(dst, dst, dst, vlen_enc); 4981 vpacksswb(dst, dst, dst, vlen_enc); 4982 } else { 4983 vpshufd(dst, src, 0x08, vlen_enc); 4984 vpermq(dst, dst, 0x08, vlen_enc); 4985 vpackssdw(dst, dst, dst, AVX_128bit); 4986 vpacksswb(dst, dst, dst, AVX_128bit); 4987 } 4988 break; 4989 } 4990 default: ShouldNotReachHere(); 4991 } 4992 } 4993 } 4994 4995 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3, 4996 bool merge, BasicType bt, int vlen_enc) { 4997 if (bt == T_INT) { 4998 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 4999 } else { 5000 assert(bt == T_LONG, ""); 5001 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5002 } 5003 } 5004 5005 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3, 5006 bool merge, BasicType bt, int vlen_enc) { 5007 if (bt == T_INT) { 5008 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 5009 } else { 5010 assert(bt == T_LONG, ""); 5011 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5012 } 5013 } 5014 5015 #ifdef _LP64 5016 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1, 5017 Register rtmp2, XMMRegister xtmp, int mask_len, 5018 int vec_enc) { 5019 int index = 0; 5020 int vindex = 0; 5021 mov64(rtmp1, 0x0101010101010101L); 5022 pdepq(rtmp1, src, rtmp1); 5023 if (mask_len > 8) { 5024 movq(rtmp2, src); 5025 vpxor(xtmp, xtmp, xtmp, vec_enc); 5026 movq(xtmp, rtmp1); 5027 } 5028 movq(dst, rtmp1); 5029 5030 mask_len -= 8; 5031 while (mask_len > 0) { 5032 assert ((mask_len & 0x7) == 0, "mask must be multiple of 8"); 5033 index++; 5034 if ((index % 2) == 0) { 5035 pxor(xtmp, xtmp); 5036 } 5037 mov64(rtmp1, 0x0101010101010101L); 5038 shrq(rtmp2, 8); 5039 pdepq(rtmp1, rtmp2, rtmp1); 5040 pinsrq(xtmp, rtmp1, index % 2); 5041 vindex = index / 2; 5042 if (vindex) { 5043 // Write entire 16 byte vector when both 64 bit 5044 // lanes are update to save redundant instructions. 
5045 if (index % 2) { 5046 vinsertf128(dst, dst, xtmp, vindex); 5047 } 5048 } else { 5049 vmovdqu(dst, xtmp); 5050 } 5051 mask_len -= 8; 5052 } 5053 } 5054 5055 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) { 5056 switch(opc) { 5057 case Op_VectorMaskTrueCount: 5058 popcntq(dst, tmp); 5059 break; 5060 case Op_VectorMaskLastTrue: 5061 if (VM_Version::supports_lzcnt()) { 5062 lzcntq(tmp, tmp); 5063 movl(dst, 63); 5064 subl(dst, tmp); 5065 } else { 5066 movl(dst, -1); 5067 bsrq(tmp, tmp); 5068 cmov32(Assembler::notZero, dst, tmp); 5069 } 5070 break; 5071 case Op_VectorMaskFirstTrue: 5072 if (VM_Version::supports_bmi1()) { 5073 if (masklen < 32) { 5074 orl(tmp, 1 << masklen); 5075 tzcntl(dst, tmp); 5076 } else if (masklen == 32) { 5077 tzcntl(dst, tmp); 5078 } else { 5079 assert(masklen == 64, ""); 5080 tzcntq(dst, tmp); 5081 } 5082 } else { 5083 if (masklen < 32) { 5084 orl(tmp, 1 << masklen); 5085 bsfl(dst, tmp); 5086 } else { 5087 assert(masklen == 32 || masklen == 64, ""); 5088 movl(dst, masklen); 5089 if (masklen == 32) { 5090 bsfl(tmp, tmp); 5091 } else { 5092 bsfq(tmp, tmp); 5093 } 5094 cmov32(Assembler::notZero, dst, tmp); 5095 } 5096 } 5097 break; 5098 case Op_VectorMaskToLong: 5099 assert(dst == tmp, "Dst and tmp should be the same for toLong operations"); 5100 break; 5101 default: assert(false, "Unhandled mask operation"); 5102 } 5103 } 5104 5105 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp, 5106 int masklen, int masksize, int vec_enc) { 5107 assert(VM_Version::supports_popcnt(), ""); 5108 5109 if(VM_Version::supports_avx512bw()) { 5110 kmovql(tmp, mask); 5111 } else { 5112 assert(masklen <= 16, ""); 5113 kmovwl(tmp, mask); 5114 } 5115 5116 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5117 // operations needs to be clipped. 5118 if (masksize < 16 && opc != Op_VectorMaskFirstTrue) { 5119 andq(tmp, (1 << masklen) - 1); 5120 } 5121 5122 vector_mask_operation_helper(opc, dst, tmp, masklen); 5123 } 5124 5125 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp, 5126 Register tmp, int masklen, BasicType bt, int vec_enc) { 5127 assert(vec_enc == AVX_128bit && VM_Version::supports_avx() || 5128 vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4), ""); 5129 assert(VM_Version::supports_popcnt(), ""); 5130 5131 bool need_clip = false; 5132 switch(bt) { 5133 case T_BOOLEAN: 5134 // While masks of other types contain 0, -1; boolean masks contain lane values of 0, 1 5135 vpxor(xtmp, xtmp, xtmp, vec_enc); 5136 vpsubb(xtmp, xtmp, mask, vec_enc); 5137 vpmovmskb(tmp, xtmp, vec_enc); 5138 need_clip = masklen < 16; 5139 break; 5140 case T_BYTE: 5141 vpmovmskb(tmp, mask, vec_enc); 5142 need_clip = masklen < 16; 5143 break; 5144 case T_SHORT: 5145 vpacksswb(xtmp, mask, mask, vec_enc); 5146 if (masklen >= 16) { 5147 vpermpd(xtmp, xtmp, 8, vec_enc); 5148 } 5149 vpmovmskb(tmp, xtmp, Assembler::AVX_128bit); 5150 need_clip = masklen < 16; 5151 break; 5152 case T_INT: 5153 case T_FLOAT: 5154 vmovmskps(tmp, mask, vec_enc); 5155 need_clip = masklen < 4; 5156 break; 5157 case T_LONG: 5158 case T_DOUBLE: 5159 vmovmskpd(tmp, mask, vec_enc); 5160 need_clip = masklen < 2; 5161 break; 5162 default: assert(false, "Unhandled type, %s", type2name(bt)); 5163 } 5164 5165 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5166 // operations needs to be clipped. 
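  // (Example) For a T_INT/T_FLOAT mask of length 4 produced by vmovmskps above,
  // only bits [3:0] of tmp are meaningful, so the clip below amounts to
  // tmp &= (1 << 4) - 1, i.e. tmp &= 0xF.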
5167 if (need_clip && opc != Op_VectorMaskFirstTrue) { 5168 // need_clip implies masklen < 32 5169 andq(tmp, (1 << masklen) - 1); 5170 } 5171 5172 vector_mask_operation_helper(opc, dst, tmp, masklen); 5173 } 5174 5175 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1, 5176 Register rtmp2, int mask_len) { 5177 kmov(rtmp1, src); 5178 andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len))); 5179 mov64(rtmp2, -1L); 5180 pextq(rtmp2, rtmp2, rtmp1); 5181 kmov(dst, rtmp2); 5182 } 5183 5184 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask, 5185 bool merge, BasicType bt, int vec_enc) { 5186 if (opcode == Op_CompressV) { 5187 switch(bt) { 5188 case T_BYTE: 5189 evpcompressb(dst, mask, src, merge, vec_enc); 5190 break; 5191 case T_CHAR: 5192 case T_SHORT: 5193 evpcompressw(dst, mask, src, merge, vec_enc); 5194 break; 5195 case T_INT: 5196 evpcompressd(dst, mask, src, merge, vec_enc); 5197 break; 5198 case T_FLOAT: 5199 evcompressps(dst, mask, src, merge, vec_enc); 5200 break; 5201 case T_LONG: 5202 evpcompressq(dst, mask, src, merge, vec_enc); 5203 break; 5204 case T_DOUBLE: 5205 evcompresspd(dst, mask, src, merge, vec_enc); 5206 break; 5207 default: 5208 fatal("Unsupported type %s", type2name(bt)); 5209 break; 5210 } 5211 } else { 5212 assert(opcode == Op_ExpandV, ""); 5213 switch(bt) { 5214 case T_BYTE: 5215 evpexpandb(dst, mask, src, merge, vec_enc); 5216 break; 5217 case T_CHAR: 5218 case T_SHORT: 5219 evpexpandw(dst, mask, src, merge, vec_enc); 5220 break; 5221 case T_INT: 5222 evpexpandd(dst, mask, src, merge, vec_enc); 5223 break; 5224 case T_FLOAT: 5225 evexpandps(dst, mask, src, merge, vec_enc); 5226 break; 5227 case T_LONG: 5228 evpexpandq(dst, mask, src, merge, vec_enc); 5229 break; 5230 case T_DOUBLE: 5231 evexpandpd(dst, mask, src, merge, vec_enc); 5232 break; 5233 default: 5234 fatal("Unsupported type %s", type2name(bt)); 5235 break; 5236 } 5237 } 5238 } 5239 #endif 5240 5241 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5242 KRegister ktmp1, int vec_enc) { 5243 if (opcode == Op_SignumVD) { 5244 vsubpd(dst, zero, one, vec_enc); 5245 // if src < 0 ? -1 : 1 5246 evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5247 evblendmpd(dst, ktmp1, one, dst, true, vec_enc); 5248 // if src == NaN, -0.0 or 0.0 return src. 5249 evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5250 evblendmpd(dst, ktmp1, dst, src, true, vec_enc); 5251 } else { 5252 assert(opcode == Op_SignumVF, ""); 5253 vsubps(dst, zero, one, vec_enc); 5254 // if src < 0 ? -1 : 1 5255 evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5256 evblendmps(dst, ktmp1, one, dst, true, vec_enc); 5257 // if src == NaN, -0.0 or 0.0 return src. 5258 evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5259 evblendmps(dst, ktmp1, dst, src, true, vec_enc); 5260 } 5261 } 5262 5263 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5264 XMMRegister xtmp1, int vec_enc) { 5265 if (opcode == Op_SignumVD) { 5266 vsubpd(dst, zero, one, vec_enc); 5267 // if src < 0 ? -1 : 1 5268 vblendvpd(dst, one, dst, src, vec_enc); 5269 // if src == NaN, -0.0 or 0.0 return src. 5270 vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc); 5271 vblendvpd(dst, dst, src, xtmp1, vec_enc); 5272 } else { 5273 assert(opcode == Op_SignumVF, ""); 5274 vsubps(dst, zero, one, vec_enc); 5275 // if src < 0 ? 
-1 : 1 5276 vblendvps(dst, one, dst, src, vec_enc); 5277 // if src == NaN, -0.0 or 0.0 return src. 5278 vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc); 5279 vblendvps(dst, dst, src, xtmp1, vec_enc); 5280 } 5281 } 5282 5283 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) { 5284 if (VM_Version::supports_avx512bw()) { 5285 if (mask_len > 32) { 5286 kmovql(dst, src); 5287 } else { 5288 kmovdl(dst, src); 5289 if (mask_len != 32) { 5290 kshiftrdl(dst, dst, 32 - mask_len); 5291 } 5292 } 5293 } else { 5294 assert(mask_len <= 16, ""); 5295 kmovwl(dst, src); 5296 if (mask_len != 16) { 5297 kshiftrwl(dst, dst, 16 - mask_len); 5298 } 5299 } 5300 } 5301 5302 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) { 5303 int lane_size = type2aelembytes(bt); 5304 bool is_LP64 = LP64_ONLY(true) NOT_LP64(false); 5305 if ((is_LP64 || lane_size < 8) && 5306 ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) || 5307 (is_subword_type(bt) && VM_Version::supports_avx512vlbw()))) { 5308 movptr(rtmp, imm32); 5309 switch(lane_size) { 5310 case 1 : evpbroadcastb(dst, rtmp, vec_enc); break; 5311 case 2 : evpbroadcastw(dst, rtmp, vec_enc); break; 5312 case 4 : evpbroadcastd(dst, rtmp, vec_enc); break; 5313 case 8 : evpbroadcastq(dst, rtmp, vec_enc); break; 5314 fatal("Unsupported lane size %d", lane_size); 5315 break; 5316 } 5317 } else { 5318 movptr(rtmp, imm32); 5319 LP64_ONLY(movq(dst, rtmp)) NOT_LP64(movdl(dst, rtmp)); 5320 switch(lane_size) { 5321 case 1 : vpbroadcastb(dst, dst, vec_enc); break; 5322 case 2 : vpbroadcastw(dst, dst, vec_enc); break; 5323 case 4 : vpbroadcastd(dst, dst, vec_enc); break; 5324 case 8 : vpbroadcastq(dst, dst, vec_enc); break; 5325 fatal("Unsupported lane size %d", lane_size); 5326 break; 5327 } 5328 } 5329 } 5330 5331 // 5332 // Following is lookup table based popcount computation algorithm:- 5333 // Index Bit set count 5334 // [ 0000 -> 0, 5335 // 0001 -> 1, 5336 // 0010 -> 1, 5337 // 0011 -> 2, 5338 // 0100 -> 1, 5339 // 0101 -> 2, 5340 // 0110 -> 2, 5341 // 0111 -> 3, 5342 // 1000 -> 1, 5343 // 1001 -> 2, 5344 // 1010 -> 3, 5345 // 1011 -> 3, 5346 // 1100 -> 2, 5347 // 1101 -> 3, 5348 // 1111 -> 4 ] 5349 // a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as 5350 // shuffle indices for lookup table access. 5351 // b. Right shift each byte of vector lane by 4 positions. 5352 // c. Count the number of 1s in 4 MSB bits each byte. These bits are used as 5353 // shuffle indices for lookup table access. 5354 // d. Add the bitset count of upper and lower 4 bits of each byte. 5355 // e. Unpack double words to quad words and compute sum of absolute difference of bitset 5356 // count of all the bytes of a quadword. 5357 // f. Perform step e. for upper 128bit vector lane. 5358 // g. Pack the bitset count of quadwords back to double word. 5359 // h. Unpacking and packing operations are not needed for 64bit vector lane. 
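// A minimal scalar sketch of steps a. to d. above (illustration only, kept in a
// comment so it is not compiled; the helper name is ours, not part of this file):
//
//   static inline int popcount_byte_lut(uint8_t b) {
//     static const uint8_t lut[16] = { 0,1,1,2, 1,2,2,3, 1,2,2,3, 2,3,3,4 };
//     return lut[b & 0x0F] + lut[b >> 4];
//   }
//
// vector_popcount_byte below performs the same 16-entry lookup per byte with
// vpshufb, and the int/long variants then apply steps e. to h. using vpsadbw.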
5360 5361 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5362 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5363 assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), ""); 5364 vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc); 5365 vpsrlw(dst, src, 4, vec_enc); 5366 vpand(dst, dst, xtmp1, vec_enc); 5367 vpand(xtmp1, src, xtmp1, vec_enc); 5368 vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg); 5369 vpshufb(xtmp1, xtmp2, xtmp1, vec_enc); 5370 vpshufb(dst, xtmp2, dst, vec_enc); 5371 vpaddb(dst, dst, xtmp1, vec_enc); 5372 } 5373 5374 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5375 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5376 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5377 // Following code is as per steps e,f,g and h of above algorithm. 5378 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5379 vpunpckhdq(dst, xtmp1, xtmp2, vec_enc); 5380 vpsadbw(dst, dst, xtmp2, vec_enc); 5381 vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc); 5382 vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc); 5383 vpackuswb(dst, xtmp1, dst, vec_enc); 5384 } 5385 5386 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5387 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5388 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5389 // Add the popcount of upper and lower bytes of word. 5390 vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc); 5391 vpsrlw(dst, xtmp1, 8, vec_enc); 5392 vpand(xtmp1, xtmp1, xtmp2, vec_enc); 5393 vpaddw(dst, dst, xtmp1, vec_enc); 5394 } 5395 5396 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5397 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5398 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5399 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5400 vpsadbw(dst, xtmp1, xtmp2, vec_enc); 5401 } 5402 5403 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5404 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5405 switch(bt) { 5406 case T_LONG: 5407 vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5408 break; 5409 case T_INT: 5410 vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5411 break; 5412 case T_CHAR: 5413 case T_SHORT: 5414 vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5415 break; 5416 case T_BYTE: 5417 case T_BOOLEAN: 5418 vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5419 break; 5420 default: 5421 fatal("Unsupported type %s", type2name(bt)); 5422 break; 5423 } 5424 } 5425 5426 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src, 5427 KRegister mask, bool merge, int vec_enc) { 5428 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 5429 switch(bt) { 5430 case T_LONG: 5431 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 5432 evpopcntq(dst, mask, src, merge, vec_enc); 5433 break; 5434 case T_INT: 5435 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 5436 evpopcntd(dst, mask, src, merge, vec_enc); 5437 break; 5438 case T_CHAR: 5439 case T_SHORT: 5440 assert(VM_Version::supports_avx512_bitalg(), ""); 5441 evpopcntw(dst, mask, src, merge, vec_enc); 5442 break; 5443 case T_BYTE: 5444 case T_BOOLEAN: 5445 assert(VM_Version::supports_avx512_bitalg(), ""); 5446 evpopcntb(dst, mask, 
src, merge, vec_enc); 5447 break; 5448 default: 5449 fatal("Unsupported type %s", type2name(bt)); 5450 break; 5451 } 5452 } 5453 5454 #ifndef _LP64 5455 void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) { 5456 assert(VM_Version::supports_avx512bw(), ""); 5457 kmovdl(tmp, src); 5458 kunpckdql(dst, tmp, tmp); 5459 } 5460 #endif 5461 5462 // Bit reversal algorithm first reverses the bits of each byte followed by 5463 // a byte level reversal for multi-byte primitive types (short/int/long). 5464 // Algorithm performs a lookup table access to get reverse bit sequence 5465 // corresponding to a 4 bit value. Thus a reverse bit sequence for a byte 5466 // is obtained by swapping the reverse bit sequences of upper and lower 5467 // nibble of a byte. 5468 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5469 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5470 if (VM_Version::supports_avx512vlbw()) { 5471 5472 // Get the reverse bit sequence of lower nibble of each byte. 5473 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg); 5474 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 5475 evpandq(dst, xtmp2, src, vec_enc); 5476 vpshufb(dst, xtmp1, dst, vec_enc); 5477 vpsllq(dst, dst, 4, vec_enc); 5478 5479 // Get the reverse bit sequence of upper nibble of each byte. 5480 vpandn(xtmp2, xtmp2, src, vec_enc); 5481 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 5482 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 5483 5484 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 5485 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 5486 evporq(xtmp2, dst, xtmp2, vec_enc); 5487 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 5488 5489 } else if(vec_enc == Assembler::AVX_512bit) { 5490 // Shift based bit reversal. 5491 assert(bt == T_LONG || bt == T_INT, ""); 5492 5493 // Swap lower and upper nibble of each byte. 5494 vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc); 5495 5496 // Swap two least and most significant bits of each nibble. 5497 vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc); 5498 5499 // Swap adjacent pair of bits. 5500 evmovdqul(xtmp1, k0, dst, true, vec_enc); 5501 vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc); 5502 5503 evmovdqul(xtmp1, k0, dst, true, vec_enc); 5504 vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc); 5505 } else { 5506 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp); 5507 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 5508 5509 // Get the reverse bit sequence of lower nibble of each byte. 5510 vpand(dst, xtmp2, src, vec_enc); 5511 vpshufb(dst, xtmp1, dst, vec_enc); 5512 vpsllq(dst, dst, 4, vec_enc); 5513 5514 // Get the reverse bit sequence of upper nibble of each byte. 5515 vpandn(xtmp2, xtmp2, src, vec_enc); 5516 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 5517 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 5518 5519 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 5520 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 
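    // (Illustration only, kept in a comment.) Scalar equivalent of the two
    // nibble lookups plus the OR below, one byte at a time; the helper name is ours:
    //
    //   static inline uint8_t reverse_bits_in_byte(uint8_t b) {
    //     // rev4[i] is the 4-bit value i with its bits reversed.
    //     static const uint8_t rev4[16] = { 0x0,0x8,0x4,0xC, 0x2,0xA,0x6,0xE,
    //                                       0x1,0x9,0x5,0xD, 0x3,0xB,0x7,0xF };
    //     return (uint8_t)((rev4[b & 0x0F] << 4) | rev4[b >> 4]);
    //   }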
5521 vpor(xtmp2, dst, xtmp2, vec_enc); 5522 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 5523 } 5524 } 5525 5526 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc, 5527 XMMRegister xtmp, Register rscratch) { 5528 assert(VM_Version::supports_gfni(), ""); 5529 assert(rscratch != noreg || always_reachable(mask), "missing"); 5530 5531 // Galois field instruction based bit reversal based on following algorithm. 5532 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 5533 vpbroadcastq(xtmp, mask, vec_enc, rscratch); 5534 vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc); 5535 vector_reverse_byte(bt, dst, xtmp, vec_enc); 5536 } 5537 5538 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src, 5539 XMMRegister xtmp1, Register rtmp, int vec_enc) { 5540 vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc); 5541 evpandq(dst, xtmp1, src, vec_enc); 5542 vpsllq(dst, dst, nbits, vec_enc); 5543 vpandn(xtmp1, xtmp1, src, vec_enc); 5544 vpsrlq(xtmp1, xtmp1, nbits, vec_enc); 5545 evporq(dst, dst, xtmp1, vec_enc); 5546 } 5547 5548 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5549 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5550 // Shift based bit reversal. 5551 assert(VM_Version::supports_evex(), ""); 5552 switch(bt) { 5553 case T_LONG: 5554 // Swap upper and lower double word of each quad word. 5555 evprorq(xtmp1, k0, src, 32, true, vec_enc); 5556 evprord(xtmp1, k0, xtmp1, 16, true, vec_enc); 5557 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 5558 break; 5559 case T_INT: 5560 // Swap upper and lower word of each double word. 5561 evprord(xtmp1, k0, src, 16, true, vec_enc); 5562 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 5563 break; 5564 case T_CHAR: 5565 case T_SHORT: 5566 // Swap upper and lower byte of each word. 5567 vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc); 5568 break; 5569 case T_BYTE: 5570 evmovdquq(dst, k0, src, true, vec_enc); 5571 break; 5572 default: 5573 fatal("Unsupported type %s", type2name(bt)); 5574 break; 5575 } 5576 } 5577 5578 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) { 5579 if (bt == T_BYTE) { 5580 if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) { 5581 evmovdquq(dst, k0, src, true, vec_enc); 5582 } else { 5583 vmovdqu(dst, src); 5584 } 5585 return; 5586 } 5587 // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using 5588 // pre-computed shuffle indices. 
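  // (Example, assuming the usual byte-swap style permutation masks.) For T_INT
  // the shuffle indices within each 128-bit lane are conceptually
  //   { 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12 }
  // so the four bytes of every int are written back in reverse order; the
  // T_LONG and T_SHORT masks follow the same pattern at 8- and 2-byte granularity.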
5589 switch(bt) { 5590 case T_LONG: 5591 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg); 5592 break; 5593 case T_INT: 5594 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg); 5595 break; 5596 case T_CHAR: 5597 case T_SHORT: 5598 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg); 5599 break; 5600 default: 5601 fatal("Unsupported type %s", type2name(bt)); 5602 break; 5603 } 5604 vpshufb(dst, src, dst, vec_enc); 5605 } 5606 5607 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src, 5608 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 5609 KRegister ktmp, Register rtmp, bool merge, int vec_enc) { 5610 assert(is_integral_type(bt), ""); 5611 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 5612 assert(VM_Version::supports_avx512cd(), ""); 5613 switch(bt) { 5614 case T_LONG: 5615 evplzcntq(dst, ktmp, src, merge, vec_enc); 5616 break; 5617 case T_INT: 5618 evplzcntd(dst, ktmp, src, merge, vec_enc); 5619 break; 5620 case T_SHORT: 5621 vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc); 5622 vpunpcklwd(xtmp2, xtmp1, src, vec_enc); 5623 evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc); 5624 vpunpckhwd(dst, xtmp1, src, vec_enc); 5625 evplzcntd(dst, ktmp, dst, merge, vec_enc); 5626 vpackusdw(dst, xtmp2, dst, vec_enc); 5627 break; 5628 case T_BYTE: 5629 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 5630 // accessing the lookup table. 5631 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 5632 // accessing the lookup table. 5633 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 5634 assert(VM_Version::supports_avx512bw(), ""); 5635 evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp); 5636 vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc); 5637 vpand(xtmp2, dst, src, vec_enc); 5638 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 5639 vpsrlw(xtmp3, src, 4, vec_enc); 5640 vpand(xtmp3, dst, xtmp3, vec_enc); 5641 vpshufb(dst, xtmp1, xtmp3, vec_enc); 5642 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 5643 evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc); 5644 evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc); 5645 break; 5646 default: 5647 fatal("Unsupported type %s", type2name(bt)); 5648 break; 5649 } 5650 } 5651 5652 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5653 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 5654 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp); 5655 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 5656 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 5657 // accessing the lookup table. 5658 vpand(dst, xtmp2, src, vec_enc); 5659 vpshufb(dst, xtmp1, dst, vec_enc); 5660 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 5661 // accessing the lookup table. 5662 vpsrlw(xtmp3, src, 4, vec_enc); 5663 vpand(xtmp3, xtmp2, xtmp3, vec_enc); 5664 vpshufb(xtmp2, xtmp1, xtmp3, vec_enc); 5665 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 
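  // (Illustration only, kept in a comment.) Scalar form of the per-byte count
  // produced by the two lookups plus the blend below; the helper name is ours:
  //
  //   static inline int clz_byte_lut(uint8_t b) {
  //     // lut[i] = leading zeros of the 4-bit value i (so lut[0] == 4).
  //     static const uint8_t lut[16] = { 4,3,2,2, 1,1,1,1, 0,0,0,0, 0,0,0,0 };
  //     uint8_t hi = b >> 4, lo = b & 0x0F;
  //     return (hi == 0) ? lut[hi] + lut[lo]   // == 4 + lut[lo]
  //                      : lut[hi];
  //   }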
5666 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 5667 vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc); 5668 vpaddb(dst, dst, xtmp2, vec_enc); 5669 vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc); 5670 } 5671 5672 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5673 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 5674 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 5675 // Add zero counts of lower byte and upper byte of a word if 5676 // upper byte holds a zero value. 5677 vpsrlw(xtmp3, src, 8, vec_enc); 5678 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx. 5679 vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc); 5680 vpsllw(xtmp2, dst, 8, vec_enc); 5681 vpaddw(xtmp2, xtmp2, dst, vec_enc); 5682 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 5683 vpsrlw(dst, dst, 8, vec_enc); 5684 } 5685 5686 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5687 XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) { 5688 // Since IEEE 754 floating point format represents mantissa in 1.0 format 5689 // hence biased exponent can be used to compute leading zero count as per 5690 // following formula:- 5691 // LZCNT = 32 - (biased_exp - 127) 5692 // Special handling has been introduced for Zero, Max_Int and -ve source values. 5693 5694 // Broadcast 0xFF 5695 vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc); 5696 vpsrld(xtmp1, xtmp1, 24, vec_enc); 5697 5698 // Extract biased exponent. 5699 vcvtdq2ps(dst, src, vec_enc); 5700 vpsrld(dst, dst, 23, vec_enc); 5701 vpand(dst, dst, xtmp1, vec_enc); 5702 5703 // Broadcast 127. 5704 vpsrld(xtmp1, xtmp1, 1, vec_enc); 5705 // Exponent = biased_exp - 127 5706 vpsubd(dst, dst, xtmp1, vec_enc); 5707 5708 // Exponent = Exponent + 1 5709 vpsrld(xtmp3, xtmp1, 6, vec_enc); 5710 vpaddd(dst, dst, xtmp3, vec_enc); 5711 5712 // Replace -ve exponent with zero, exponent is -ve when src 5713 // lane contains a zero value. 5714 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5715 vblendvps(dst, dst, xtmp2, dst, vec_enc); 5716 5717 // Rematerialize broadcast 32. 5718 vpslld(xtmp1, xtmp3, 5, vec_enc); 5719 // Exponent is 32 if corresponding source lane contains max_int value. 5720 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 5721 // LZCNT = 32 - exponent 5722 vpsubd(dst, xtmp1, dst, vec_enc); 5723 5724 // Replace LZCNT with a value 1 if corresponding source lane 5725 // contains max_int value. 5726 vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc); 5727 5728 // Replace biased_exp with 0 if source lane value is less than zero. 5729 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5730 vblendvps(dst, dst, xtmp2, src, vec_enc); 5731 } 5732 5733 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5734 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 5735 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 5736 // Add zero counts of lower word and upper word of a double word if 5737 // upper word holds a zero value. 5738 vpsrld(xtmp3, src, 16, vec_enc); 5739 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx. 5740 vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc); 5741 vpslld(xtmp2, dst, 16, vec_enc); 5742 vpaddd(xtmp2, xtmp2, dst, vec_enc); 5743 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 5744 vpsrld(dst, dst, 16, vec_enc); 5745 // Add zero counts of lower doubleword and upper doubleword of a 5746 // quadword if upper doubleword holds a zero value. 
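  // (Illustration) Scalar equivalent of this combine step, with clz32 denoting
  // the per-doubleword counts already sitting in dst:
  //   clz64(x) = (hi32(x) == 0) ? 32 + clz32(lo32(x)) : clz32(hi32(x))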
5747 vpsrlq(xtmp3, src, 32, vec_enc); 5748 vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc); 5749 vpsllq(xtmp2, dst, 32, vec_enc); 5750 vpaddq(xtmp2, xtmp2, dst, vec_enc); 5751 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 5752 vpsrlq(dst, dst, 32, vec_enc); 5753 } 5754 5755 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, 5756 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 5757 Register rtmp, int vec_enc) { 5758 assert(is_integral_type(bt), "unexpected type"); 5759 assert(vec_enc < Assembler::AVX_512bit, ""); 5760 switch(bt) { 5761 case T_LONG: 5762 vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 5763 break; 5764 case T_INT: 5765 vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc); 5766 break; 5767 case T_SHORT: 5768 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 5769 break; 5770 case T_BYTE: 5771 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 5772 break; 5773 default: 5774 fatal("Unsupported type %s", type2name(bt)); 5775 break; 5776 } 5777 } 5778 5779 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) { 5780 switch(bt) { 5781 case T_BYTE: 5782 vpsubb(dst, src1, src2, vec_enc); 5783 break; 5784 case T_SHORT: 5785 vpsubw(dst, src1, src2, vec_enc); 5786 break; 5787 case T_INT: 5788 vpsubd(dst, src1, src2, vec_enc); 5789 break; 5790 case T_LONG: 5791 vpsubq(dst, src1, src2, vec_enc); 5792 break; 5793 default: 5794 fatal("Unsupported type %s", type2name(bt)); 5795 break; 5796 } 5797 } 5798 5799 // Trailing zero count computation is based on leading zero count operation as per 5800 // following equation. All AVX3 targets support AVX512CD feature which offers 5801 // direct vector instruction to compute leading zero count. 
5802 // CTZ = PRIM_TYPE_WIDHT - CLZ((x - 1) & ~x) 5803 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src, 5804 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 5805 XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) { 5806 assert(is_integral_type(bt), ""); 5807 // xtmp = -1 5808 vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc); 5809 // xtmp = xtmp + src 5810 vpadd(bt, xtmp4, xtmp4, src, vec_enc); 5811 // xtmp = xtmp & ~src 5812 vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc); 5813 vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc); 5814 vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc); 5815 vpsub(bt, dst, xtmp4, dst, vec_enc); 5816 } 5817 5818 // Trailing zero count computation for AVX2 targets is based on popcount operation as per following equation 5819 // CTZ = PRIM_TYPE_WIDHT - POPC(x | -x) 5820 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5821 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 5822 assert(is_integral_type(bt), ""); 5823 // xtmp = 0 5824 vpxor(xtmp3 , xtmp3, xtmp3, vec_enc); 5825 // xtmp = 0 - src 5826 vpsub(bt, xtmp3, xtmp3, src, vec_enc); 5827 // xtmp = xtmp | src 5828 vpor(xtmp3, xtmp3, src, vec_enc); 5829 vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc); 5830 vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc); 5831 vpsub(bt, dst, xtmp1, dst, vec_enc); 5832 } 5833 5834 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) { 5835 Label done; 5836 Label neg_divisor_fastpath; 5837 cmpl(divisor, 0); 5838 jccb(Assembler::less, neg_divisor_fastpath); 5839 xorl(rdx, rdx); 5840 divl(divisor); 5841 jmpb(done); 5842 bind(neg_divisor_fastpath); 5843 // Fastpath for divisor < 0: 5844 // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1) 5845 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned() 5846 movl(rdx, rax); 5847 subl(rdx, divisor); 5848 if (VM_Version::supports_bmi1()) { 5849 andnl(rax, rdx, rax); 5850 } else { 5851 notl(rdx); 5852 andl(rax, rdx); 5853 } 5854 shrl(rax, 31); 5855 bind(done); 5856 } 5857 5858 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) { 5859 Label done; 5860 Label neg_divisor_fastpath; 5861 cmpl(divisor, 0); 5862 jccb(Assembler::less, neg_divisor_fastpath); 5863 xorl(rdx, rdx); 5864 divl(divisor); 5865 jmpb(done); 5866 bind(neg_divisor_fastpath); 5867 // Fastpath when divisor < 0: 5868 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor) 5869 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned() 5870 movl(rdx, rax); 5871 subl(rax, divisor); 5872 if (VM_Version::supports_bmi1()) { 5873 andnl(rax, rax, rdx); 5874 } else { 5875 notl(rax); 5876 andl(rax, rdx); 5877 } 5878 sarl(rax, 31); 5879 andl(rax, divisor); 5880 subl(rdx, rax); 5881 bind(done); 5882 } 5883 5884 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) { 5885 Label done; 5886 Label neg_divisor_fastpath; 5887 5888 cmpl(divisor, 0); 5889 jccb(Assembler::less, neg_divisor_fastpath); 5890 xorl(rdx, rdx); 5891 divl(divisor); 5892 jmpb(done); 5893 bind(neg_divisor_fastpath); 5894 // Fastpath for divisor < 0: 5895 // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1) 5896 // 
remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor) 5897 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in 5898 // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned() 5899 movl(rdx, rax); 5900 subl(rax, divisor); 5901 if (VM_Version::supports_bmi1()) { 5902 andnl(rax, rax, rdx); 5903 } else { 5904 notl(rax); 5905 andl(rax, rdx); 5906 } 5907 movl(tmp, rax); 5908 shrl(rax, 31); // quotient 5909 sarl(tmp, 31); 5910 andl(tmp, divisor); 5911 subl(rdx, tmp); // remainder 5912 bind(done); 5913 } 5914 5915 #ifdef _LP64 5916 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1, 5917 XMMRegister xtmp2, Register rtmp) { 5918 if(VM_Version::supports_gfni()) { 5919 // Galois field instruction based bit reversal based on following algorithm. 5920 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 5921 mov64(rtmp, 0x8040201008040201L); 5922 movq(xtmp1, src); 5923 movq(xtmp2, rtmp); 5924 gf2p8affineqb(xtmp1, xtmp2, 0); 5925 movq(dst, xtmp1); 5926 } else { 5927 // Swap even and odd numbered bits. 5928 movl(rtmp, src); 5929 andl(rtmp, 0x55555555); 5930 shll(rtmp, 1); 5931 movl(dst, src); 5932 andl(dst, 0xAAAAAAAA); 5933 shrl(dst, 1); 5934 orl(dst, rtmp); 5935 5936 // Swap LSB and MSB 2 bits of each nibble. 5937 movl(rtmp, dst); 5938 andl(rtmp, 0x33333333); 5939 shll(rtmp, 2); 5940 andl(dst, 0xCCCCCCCC); 5941 shrl(dst, 2); 5942 orl(dst, rtmp); 5943 5944 // Swap LSB and MSB 4 bits of each byte. 5945 movl(rtmp, dst); 5946 andl(rtmp, 0x0F0F0F0F); 5947 shll(rtmp, 4); 5948 andl(dst, 0xF0F0F0F0); 5949 shrl(dst, 4); 5950 orl(dst, rtmp); 5951 } 5952 bswapl(dst); 5953 } 5954 5955 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1, 5956 XMMRegister xtmp2, Register rtmp1, Register rtmp2) { 5957 if(VM_Version::supports_gfni()) { 5958 // Galois field instruction based bit reversal based on following algorithm. 5959 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 5960 mov64(rtmp1, 0x8040201008040201L); 5961 movq(xtmp1, src); 5962 movq(xtmp2, rtmp1); 5963 gf2p8affineqb(xtmp1, xtmp2, 0); 5964 movq(dst, xtmp1); 5965 } else { 5966 // Swap even and odd numbered bits. 5967 movq(rtmp1, src); 5968 mov64(rtmp2, 0x5555555555555555L); 5969 andq(rtmp1, rtmp2); 5970 shlq(rtmp1, 1); 5971 movq(dst, src); 5972 notq(rtmp2); 5973 andq(dst, rtmp2); 5974 shrq(dst, 1); 5975 orq(dst, rtmp1); 5976 5977 // Swap LSB and MSB 2 bits of each nibble. 5978 movq(rtmp1, dst); 5979 mov64(rtmp2, 0x3333333333333333L); 5980 andq(rtmp1, rtmp2); 5981 shlq(rtmp1, 2); 5982 notq(rtmp2); 5983 andq(dst, rtmp2); 5984 shrq(dst, 2); 5985 orq(dst, rtmp1); 5986 5987 // Swap LSB and MSB 4 bits of each byte. 
5988 movq(rtmp1, dst); 5989 mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL); 5990 andq(rtmp1, rtmp2); 5991 shlq(rtmp1, 4); 5992 notq(rtmp2); 5993 andq(dst, rtmp2); 5994 shrq(dst, 4); 5995 orq(dst, rtmp1); 5996 } 5997 bswapq(dst); 5998 } 5999 6000 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) { 6001 Label done; 6002 Label neg_divisor_fastpath; 6003 cmpq(divisor, 0); 6004 jccb(Assembler::less, neg_divisor_fastpath); 6005 xorl(rdx, rdx); 6006 divq(divisor); 6007 jmpb(done); 6008 bind(neg_divisor_fastpath); 6009 // Fastpath for divisor < 0: 6010 // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1) 6011 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned() 6012 movq(rdx, rax); 6013 subq(rdx, divisor); 6014 if (VM_Version::supports_bmi1()) { 6015 andnq(rax, rdx, rax); 6016 } else { 6017 notq(rdx); 6018 andq(rax, rdx); 6019 } 6020 shrq(rax, 63); 6021 bind(done); 6022 } 6023 6024 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) { 6025 Label done; 6026 Label neg_divisor_fastpath; 6027 cmpq(divisor, 0); 6028 jccb(Assembler::less, neg_divisor_fastpath); 6029 xorq(rdx, rdx); 6030 divq(divisor); 6031 jmp(done); 6032 bind(neg_divisor_fastpath); 6033 // Fastpath when divisor < 0: 6034 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor) 6035 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned() 6036 movq(rdx, rax); 6037 subq(rax, divisor); 6038 if (VM_Version::supports_bmi1()) { 6039 andnq(rax, rax, rdx); 6040 } else { 6041 notq(rax); 6042 andq(rax, rdx); 6043 } 6044 sarq(rax, 63); 6045 andq(rax, divisor); 6046 subq(rdx, rax); 6047 bind(done); 6048 } 6049 6050 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) { 6051 Label done; 6052 Label neg_divisor_fastpath; 6053 cmpq(divisor, 0); 6054 jccb(Assembler::less, neg_divisor_fastpath); 6055 xorq(rdx, rdx); 6056 divq(divisor); 6057 jmp(done); 6058 bind(neg_divisor_fastpath); 6059 // Fastpath for divisor < 0: 6060 // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1) 6061 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor) 6062 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in 6063 // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned() 6064 movq(rdx, rax); 6065 subq(rax, divisor); 6066 if (VM_Version::supports_bmi1()) { 6067 andnq(rax, rax, rdx); 6068 } else { 6069 notq(rax); 6070 andq(rax, rdx); 6071 } 6072 movq(tmp, rax); 6073 shrq(rax, 63); // quotient 6074 sarq(tmp, 63); 6075 andq(tmp, divisor); 6076 subq(rdx, tmp); // remainder 6077 bind(done); 6078 } 6079 #endif 6080 6081 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1, 6082 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp, 6083 int vlen_enc) { 6084 assert(VM_Version::supports_avx512bw(), ""); 6085 // Byte shuffles are inlane operations and indices are determined using 6086 // lower 4 bit of each shuffle lane, thus all shuffle indices are 6087 // normalized to index range 0-15. This makes sure that all the multiples 6088 // of an index value are placed at same relative position in 128 bit 6089 // lane i.e. elements corresponding to shuffle indices 16, 32 and 64 6090 // will be 16th element in their respective 128 bit lanes. 
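  // (Illustration only, kept in a comment.) Scalar semantics of the masked
  // shuffles below for a 64-byte vector, assuming all shuffle indices are in
  // the range [0, 64):
  //
  //   for (int i = 0; i < 64; i++) {
  //     dst[i] = src[shuffle[i]];
  //   }
  //
  // Each of the four steps below broadcasts one 128-bit source lane and merges,
  // under a mask selecting indices in [0,16), [16,32), [32,48) or [48,64), only
  // the destination bytes whose index falls into that lane.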
6091 movl(rtmp, 16); 6092 evpbroadcastb(xtmp1, rtmp, vlen_enc); 6093 6094 // Compute a mask for shuffle vector by comparing indices with expression INDEX < 16, 6095 // Broadcast first 128 bit lane across entire vector, shuffle the vector lanes using 6096 // original shuffle indices and move the shuffled lanes corresponding to true 6097 // mask to destination vector. 6098 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc); 6099 evshufi64x2(xtmp2, src, src, 0x0, vlen_enc); 6100 evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc); 6101 6102 // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32 6103 // and broadcasting second 128 bit lane. 6104 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc); 6105 vpsllq(xtmp2, xtmp1, 0x1, vlen_enc); 6106 evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc); 6107 evshufi64x2(xtmp3, src, src, 0x55, vlen_enc); 6108 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc); 6109 6110 // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48 6111 // and broadcasting third 128 bit lane. 6112 evpcmpb(ktmp, k0, shuffle, xtmp2, Assembler::nlt, true, vlen_enc); 6113 vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc); 6114 evpcmpb(ktmp, ktmp, shuffle, xtmp1, Assembler::lt, true, vlen_enc); 6115 evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc); 6116 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc); 6117 6118 // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64 6119 // and broadcasting third 128 bit lane. 6120 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc); 6121 vpsllq(xtmp2, xtmp2, 0x1, vlen_enc); 6122 evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc); 6123 evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc); 6124 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc); 6125 } 6126 6127 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst, 6128 XMMRegister shuffle, XMMRegister src, int vlen_enc) { 6129 if (vlen_enc == AVX_128bit) { 6130 vpermilps(dst, src, shuffle, vlen_enc); 6131 } else if (bt == T_INT) { 6132 vpermd(dst, shuffle, src, vlen_enc); 6133 } else { 6134 assert(bt == T_FLOAT, ""); 6135 vpermps(dst, shuffle, src, vlen_enc); 6136 } 6137 }
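// (Illustration only, kept in a comment; not part of the assembler.) Scalar
// reference for the "divisor < 0" fast path used by udivI/umodI/udivL/umodL
// above, following the formulas quoted there from Hacker's Delight (2nd ed),
// section 9.3; the helper names are ours:
//
//   static inline uint64_t udiv_fast_neg_divisor(uint64_t dividend, uint64_t divisor) {
//     // Valid only when the divisor has its sign bit set; the quotient is then 0 or 1.
//     return (dividend & ~(dividend - divisor)) >> 63;
//   }
//   static inline uint64_t umod_fast_neg_divisor(uint64_t dividend, uint64_t divisor) {
//     uint64_t mask = (uint64_t)((int64_t)(dividend & ~(dividend - divisor)) >> 63);
//     return dividend - (mask & divisor);
//   }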