1 /* 2 * Copyright (c) 2020, 2022, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 22 * 23 */ 24 25 #include "precompiled.hpp" 26 #include "asm/assembler.hpp" 27 #include "asm/assembler.inline.hpp" 28 #include "gc/shared/barrierSet.hpp" 29 #include "gc/shared/barrierSetAssembler.hpp" 30 #include "oops/methodData.hpp" 31 #include "opto/c2_MacroAssembler.hpp" 32 #include "opto/intrinsicnode.hpp" 33 #include "opto/output.hpp" 34 #include "opto/opcodes.hpp" 35 #include "opto/subnode.hpp" 36 #include "runtime/objectMonitor.hpp" 37 #include "runtime/stubRoutines.hpp" 38 39 #ifdef PRODUCT 40 #define BLOCK_COMMENT(str) /* nothing */ 41 #define STOP(error) stop(error) 42 #else 43 #define BLOCK_COMMENT(str) block_comment(str) 44 #define STOP(error) block_comment(error); stop(error) 45 #endif 46 47 // C2 compiled method's prolog code. 48 void C2_MacroAssembler::verified_entry(Compile* C, int sp_inc) { 49 if (C->clinit_barrier_on_entry()) { 50 assert(VM_Version::supports_fast_class_init_checks(), "sanity"); 51 assert(!C->method()->holder()->is_not_initialized(), "initialization should have been started"); 52 53 Label L_skip_barrier; 54 Register klass = rscratch1; 55 56 mov_metadata(klass, C->method()->holder()->constant_encoding()); 57 clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/); 58 59 jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path 60 61 bind(L_skip_barrier); 62 } 63 64 int framesize = C->output()->frame_size_in_bytes(); 65 int bangsize = C->output()->bang_size_in_bytes(); 66 bool fp_mode_24b = false; 67 int stack_bang_size = C->output()->need_stack_bang(bangsize) ? bangsize : 0; 68 69 // WARNING: Initial instruction MUST be 5 bytes or longer so that 70 // NativeJump::patch_verified_entry will be able to patch out the entry 71 // code safely. The push to verify stack depth is ok at 5 bytes, 72 // the frame allocation can be either 3 or 6 bytes. So if we don't do 73 // stack bang then we must use the 6 byte frame allocation even if 74 // we have no frame. :-( 75 assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect"); 76 77 assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned"); 78 // Remove word for return addr 79 framesize -= wordSize; 80 stack_bang_size -= wordSize; 81 82 // Calls to C2R adapters often do not accept exceptional returns. 83 // We require that their callers must bang for them. 
But be careful, because 84 // some VM calls (such as call site linkage) can use several kilobytes of 85 // stack. But the stack safety zone should account for that. 86 // See bugs 4446381, 4468289, 4497237. 87 if (stack_bang_size > 0) { 88 generate_stack_overflow_check(stack_bang_size); 89 90 // We always push rbp, so that on return to interpreter rbp, will be 91 // restored correctly and we can correct the stack. 92 push(rbp); 93 // Save caller's stack pointer into RBP if the frame pointer is preserved. 94 if (PreserveFramePointer) { 95 mov(rbp, rsp); 96 } 97 // Remove word for ebp 98 framesize -= wordSize; 99 100 // Create frame 101 if (framesize) { 102 subptr(rsp, framesize); 103 } 104 } else { 105 // Create frame (force generation of a 4 byte immediate value) 106 subptr_imm32(rsp, framesize); 107 108 // Save RBP register now. 109 framesize -= wordSize; 110 movptr(Address(rsp, framesize), rbp); 111 // Save caller's stack pointer into RBP if the frame pointer is preserved. 112 if (PreserveFramePointer) { 113 movptr(rbp, rsp); 114 if (framesize > 0) { 115 addptr(rbp, framesize); 116 } 117 } 118 } 119 120 if (C->needs_stack_repair()) { 121 // Save stack increment just below the saved rbp (also account for fixed framesize and rbp) 122 assert((sp_inc & (StackAlignmentInBytes-1)) == 0, "stack increment not aligned"); 123 movptr(Address(rsp, framesize - wordSize), sp_inc + framesize + wordSize); 124 } 125 126 if (VerifyStackAtCalls) { // Majik cookie to verify stack depth 127 framesize -= wordSize; 128 movptr(Address(rsp, framesize), (int32_t)0xbadb100d); 129 } 130 131 #ifndef _LP64 132 // If method sets FPU control word do it now 133 if (fp_mode_24b) { 134 fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24())); 135 } 136 if (UseSSE >= 2 && VerifyFPU) { 137 verify_FPU(0, "FPU stack must be clean on entry"); 138 } 139 #endif 140 141 #ifdef ASSERT 142 if (VerifyStackAtCalls) { 143 Label L; 144 push(rax); 145 mov(rax, rsp); 146 andptr(rax, StackAlignmentInBytes-1); 147 cmpptr(rax, StackAlignmentInBytes-wordSize); 148 pop(rax); 149 jcc(Assembler::equal, L); 150 STOP("Stack is not properly aligned!"); 151 bind(L); 152 } 153 #endif 154 } 155 156 void C2_MacroAssembler::entry_barrier() { 157 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 158 #ifdef _LP64 159 if (BarrierSet::barrier_set()->barrier_set_nmethod() != NULL) { 160 // We put the non-hot code of the nmethod entry barrier out-of-line in a stub. 161 Label dummy_slow_path; 162 Label dummy_continuation; 163 Label* slow_path = &dummy_slow_path; 164 Label* continuation = &dummy_continuation; 165 if (!Compile::current()->output()->in_scratch_emit_size()) { 166 // Use real labels from actual stub when not emitting code for the purpose of measuring its size 167 C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub(); 168 Compile::current()->output()->add_stub(stub); 169 slow_path = &stub->entry(); continuation = &stub->continuation(); 170 } 171 bs->nmethod_entry_barrier(this, slow_path, continuation); 172 } 173 #else 174 // Don't bother with out-of-line nmethod entry barrier stub for x86_32. 
175 bs->nmethod_entry_barrier(this, NULL /* slow_path */, NULL /* continuation */); 176 #endif 177 } 178 179 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) { 180 switch (vlen_in_bytes) { 181 case 4: // fall-through 182 case 8: // fall-through 183 case 16: return Assembler::AVX_128bit; 184 case 32: return Assembler::AVX_256bit; 185 case 64: return Assembler::AVX_512bit; 186 187 default: { 188 ShouldNotReachHere(); 189 return Assembler::AVX_NoVec; 190 } 191 } 192 } 193 194 #if INCLUDE_RTM_OPT 195 196 // Update rtm_counters based on abort status 197 // input: abort_status 198 // rtm_counters (RTMLockingCounters*) 199 // flags are killed 200 void C2_MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) { 201 202 atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset())); 203 if (PrintPreciseRTMLockingStatistics) { 204 for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) { 205 Label check_abort; 206 testl(abort_status, (1<<i)); 207 jccb(Assembler::equal, check_abort); 208 atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx)))); 209 bind(check_abort); 210 } 211 } 212 } 213 214 // Branch if (random & (count-1) != 0), count is 2^n 215 // tmp, scr and flags are killed 216 void C2_MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) { 217 assert(tmp == rax, ""); 218 assert(scr == rdx, ""); 219 rdtsc(); // modifies EDX:EAX 220 andptr(tmp, count-1); 221 jccb(Assembler::notZero, brLabel); 222 } 223 224 // Perform abort ratio calculation, set no_rtm bit if high ratio 225 // input: rtm_counters_Reg (RTMLockingCounters* address) 226 // tmpReg, rtm_counters_Reg and flags are killed 227 void C2_MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg, 228 Register rtm_counters_Reg, 229 RTMLockingCounters* rtm_counters, 230 Metadata* method_data) { 231 Label L_done, L_check_always_rtm1, L_check_always_rtm2; 232 233 if (RTMLockingCalculationDelay > 0) { 234 // Delay calculation 235 movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr())); 236 testptr(tmpReg, tmpReg); 237 jccb(Assembler::equal, L_done); 238 } 239 // Abort ratio calculation only if abort_count > RTMAbortThreshold 240 // Aborted transactions = abort_count * 100 241 // All transactions = total_count * RTMTotalCountIncrRate 242 // Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio) 243 244 movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset())); 245 cmpptr(tmpReg, RTMAbortThreshold); 246 jccb(Assembler::below, L_check_always_rtm2); 247 imulptr(tmpReg, tmpReg, 100); 248 249 Register scrReg = rtm_counters_Reg; 250 movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset())); 251 imulptr(scrReg, scrReg, RTMTotalCountIncrRate); 252 imulptr(scrReg, scrReg, RTMAbortRatio); 253 cmpptr(tmpReg, scrReg); 254 jccb(Assembler::below, L_check_always_rtm1); 255 if (method_data != NULL) { 256 // set rtm_state to "no rtm" in MDO 257 mov_metadata(tmpReg, method_data); 258 lock(); 259 orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), NoRTM); 260 } 261 jmpb(L_done); 262 bind(L_check_always_rtm1); 263 // Reload RTMLockingCounters* address 264 lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters)); 265 bind(L_check_always_rtm2); 266 movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset())); 267 cmpptr(tmpReg, 
RTMLockingThreshold / RTMTotalCountIncrRate); 268 jccb(Assembler::below, L_done); 269 if (method_data != NULL) { 270 // set rtm_state to "always rtm" in MDO 271 mov_metadata(tmpReg, method_data); 272 lock(); 273 orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), UseRTM); 274 } 275 bind(L_done); 276 } 277 278 // Update counters and perform abort ratio calculation 279 // input: abort_status_Reg 280 // rtm_counters_Reg, flags are killed 281 void C2_MacroAssembler::rtm_profiling(Register abort_status_Reg, 282 Register rtm_counters_Reg, 283 RTMLockingCounters* rtm_counters, 284 Metadata* method_data, 285 bool profile_rtm) { 286 287 assert(rtm_counters != NULL, "should not be NULL when profiling RTM"); 288 // update rtm counters based on rax value at abort 289 // reads abort_status_Reg, updates flags 290 lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters)); 291 rtm_counters_update(abort_status_Reg, rtm_counters_Reg); 292 if (profile_rtm) { 293 // Save abort status because abort_status_Reg is used by following code. 294 if (RTMRetryCount > 0) { 295 push(abort_status_Reg); 296 } 297 assert(rtm_counters != NULL, "should not be NULL when profiling RTM"); 298 rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data); 299 // restore abort status 300 if (RTMRetryCount > 0) { 301 pop(abort_status_Reg); 302 } 303 } 304 } 305 306 // Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4) 307 // inputs: retry_count_Reg 308 // : abort_status_Reg 309 // output: retry_count_Reg decremented by 1 310 // flags are killed 311 void C2_MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) { 312 Label doneRetry; 313 assert(abort_status_Reg == rax, ""); 314 // The abort reason bits are in eax (see all states in rtmLocking.hpp) 315 // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4) 316 // if reason is in 0x6 and retry count != 0 then retry 317 andptr(abort_status_Reg, 0x6); 318 jccb(Assembler::zero, doneRetry); 319 testl(retry_count_Reg, retry_count_Reg); 320 jccb(Assembler::zero, doneRetry); 321 pause(); 322 decrementl(retry_count_Reg); 323 jmp(retryLabel); 324 bind(doneRetry); 325 } 326 327 // Spin and retry if lock is busy, 328 // inputs: box_Reg (monitor address) 329 // : retry_count_Reg 330 // output: retry_count_Reg decremented by 1 331 // : clear z flag if retry count exceeded 332 // tmp_Reg, scr_Reg, flags are killed 333 void C2_MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg, 334 Register tmp_Reg, Register scr_Reg, Label& retryLabel) { 335 Label SpinLoop, SpinExit, doneRetry; 336 int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner); 337 338 testl(retry_count_Reg, retry_count_Reg); 339 jccb(Assembler::zero, doneRetry); 340 decrementl(retry_count_Reg); 341 movptr(scr_Reg, RTMSpinLoopCount); 342 343 bind(SpinLoop); 344 pause(); 345 decrementl(scr_Reg); 346 jccb(Assembler::lessEqual, SpinExit); 347 movptr(tmp_Reg, Address(box_Reg, owner_offset)); 348 testptr(tmp_Reg, tmp_Reg); 349 jccb(Assembler::notZero, SpinLoop); 350 351 bind(SpinExit); 352 jmp(retryLabel); 353 bind(doneRetry); 354 incrementl(retry_count_Reg); // clear z flag 355 } 356 357 // Use RTM for normal stack locks 358 // Input: objReg (object to lock) 359 void C2_MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg, 360 Register retry_on_abort_count_Reg, 361 RTMLockingCounters* stack_rtm_counters, 362 Metadata* method_data, bool 
profile_rtm, 363 Label& DONE_LABEL, Label& IsInflated) { 364 assert(UseRTMForStackLocks, "why call this otherwise?"); 365 assert(tmpReg == rax, ""); 366 assert(scrReg == rdx, ""); 367 Label L_rtm_retry, L_decrement_retry, L_on_abort; 368 369 if (RTMRetryCount > 0) { 370 movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort 371 bind(L_rtm_retry); 372 } 373 movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); 374 testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral 375 jcc(Assembler::notZero, IsInflated); 376 377 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 378 Label L_noincrement; 379 if (RTMTotalCountIncrRate > 1) { 380 // tmpReg, scrReg and flags are killed 381 branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement); 382 } 383 assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM"); 384 atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg); 385 bind(L_noincrement); 386 } 387 xbegin(L_on_abort); 388 movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword 389 andptr(tmpReg, markWord::lock_mask_in_place); // look at 2 lock bits 390 cmpptr(tmpReg, markWord::unlocked_value); // bits = 01 unlocked 391 jcc(Assembler::equal, DONE_LABEL); // all done if unlocked 392 393 Register abort_status_Reg = tmpReg; // status of abort is stored in RAX 394 if (UseRTMXendForLockBusy) { 395 xend(); 396 movptr(abort_status_Reg, 0x2); // Set the abort status to 2 (so we can retry) 397 jmp(L_decrement_retry); 398 } 399 else { 400 xabort(0); 401 } 402 bind(L_on_abort); 403 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 404 rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm); 405 } 406 bind(L_decrement_retry); 407 if (RTMRetryCount > 0) { 408 // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4) 409 rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry); 410 } 411 } 412 413 // Use RTM for inflating locks 414 // inputs: objReg (object to lock) 415 // boxReg (on-stack box address (displaced header location) - KILLED) 416 // tmpReg (ObjectMonitor address + markWord::monitor_value) 417 void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg, 418 Register scrReg, Register retry_on_busy_count_Reg, 419 Register retry_on_abort_count_Reg, 420 RTMLockingCounters* rtm_counters, 421 Metadata* method_data, bool profile_rtm, 422 Label& DONE_LABEL) { 423 assert(UseRTMLocking, "why call this otherwise?"); 424 assert(tmpReg == rax, ""); 425 assert(scrReg == rdx, ""); 426 Label L_rtm_retry, L_decrement_retry, L_on_abort; 427 int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner); 428 429 movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value())); 430 movptr(boxReg, tmpReg); // Save ObjectMonitor address 431 432 if (RTMRetryCount > 0) { 433 movl(retry_on_busy_count_Reg, RTMRetryCount); // Retry on lock busy 434 movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort 435 bind(L_rtm_retry); 436 } 437 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 438 Label L_noincrement; 439 if (RTMTotalCountIncrRate > 1) { 440 // tmpReg, scrReg and flags are killed 441 branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement); 442 } 443 assert(rtm_counters != NULL, "should not be NULL when profiling RTM"); 444 atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), 
scrReg); 445 bind(L_noincrement); 446 } 447 xbegin(L_on_abort); 448 movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); 449 movptr(tmpReg, Address(tmpReg, owner_offset)); 450 testptr(tmpReg, tmpReg); 451 jcc(Assembler::zero, DONE_LABEL); 452 if (UseRTMXendForLockBusy) { 453 xend(); 454 jmp(L_decrement_retry); 455 } 456 else { 457 xabort(0); 458 } 459 bind(L_on_abort); 460 Register abort_status_Reg = tmpReg; // status of abort is stored in RAX 461 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 462 rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm); 463 } 464 if (RTMRetryCount > 0) { 465 // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4) 466 rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry); 467 } 468 469 movptr(tmpReg, Address(boxReg, owner_offset)) ; 470 testptr(tmpReg, tmpReg) ; 471 jccb(Assembler::notZero, L_decrement_retry) ; 472 473 // Appears unlocked - try to swing _owner from null to non-null. 474 // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand. 475 #ifdef _LP64 476 Register threadReg = r15_thread; 477 #else 478 get_thread(scrReg); 479 Register threadReg = scrReg; 480 #endif 481 lock(); 482 cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg 483 484 if (RTMRetryCount > 0) { 485 // success done else retry 486 jccb(Assembler::equal, DONE_LABEL) ; 487 bind(L_decrement_retry); 488 // Spin and retry if lock is busy. 489 rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry); 490 } 491 else { 492 bind(L_decrement_retry); 493 } 494 } 495 496 #endif // INCLUDE_RTM_OPT 497 498 // fast_lock and fast_unlock used by C2 499 500 // Because the transitions from emitted code to the runtime 501 // monitorenter/exit helper stubs are so slow it's critical that 502 // we inline both the stack-locking fast path and the inflated fast path. 503 // 504 // See also: cmpFastLock and cmpFastUnlock. 505 // 506 // What follows is a specialized inline transliteration of the code 507 // in enter() and exit(). If we're concerned about I$ bloat another 508 // option would be to emit TrySlowEnter and TrySlowExit methods 509 // at startup-time. These methods would accept arguments as 510 // (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure 511 // indications in the icc.ZFlag. fast_lock and fast_unlock would simply 512 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit. 513 // In practice, however, the # of lock sites is bounded and is usually small. 514 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer 515 // if the processor uses simple bimodal branch predictors keyed by EIP 516 // Since the helper routines would be called from multiple synchronization 517 // sites. 518 // 519 // An even better approach would be write "MonitorEnter()" and "MonitorExit()" 520 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites 521 // to those specialized methods. That'd give us a mostly platform-independent 522 // implementation that the JITs could optimize and inline at their pleasure. 523 // Done correctly, the only time we'd need to cross to native could would be 524 // to park() or unpark() threads. We'd also need a few more unsafe operators 525 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and 526 // (b) explicit barriers or fence operations. 
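//
// For orientation, a hedged sketch (comments only; the helper name and shape
// are illustrative, not actual HotSpot code) of the fast path that fast_lock
// below inlines; fast_unlock is the symmetric exit:
//
//   try_enter(obj, box):                          // box = on-stack BasicLock
//     mark = obj->mark()
//     if (!mark.has_monitor()) {                  // neutral or stack-locked
//       box->displaced = mark | unlocked_value    // anticipate success
//       if (CAS(&obj->mark(), box->displaced, box)) return Success  // stack-lock
//       if ((mark - rsp) < page_size)             // sp-proximity: recursive self-lock
//         { box->displaced = 0; return Success }
//     } else {                                    // inflated: ObjectMonitor
//       if (CAS(&monitor->_owner, NULL, Self)) return Success
//       if (monitor->_owner == Self) { monitor->_recursions++; return Success }
//     }
//     return Failure                              // ZF == 0 -> runtime monitorenter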
//
// TODO:
//
// * Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
//   This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
//   Given TLAB allocation, Self is usually manifested in a register, so passing it into
//   the lock operators would typically be faster than reifying Self.
//
// * Ideally I'd define the primitives as:
//   fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
//   fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED.
//   Unfortunately ADLC bugs prevent us from expressing the ideal form.
//   Instead, we're stuck with the rather awkward and brittle register assignments below.
//   Furthermore the register assignments are overconstrained, possibly resulting in
//   sub-optimal code near the synchronization site.
//
// * Eliminate the sp-proximity tests and just use "== Self" tests instead.
//   Alternately, use a better sp-proximity test.
//
// * Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
//   Either one is sufficient to uniquely identify a thread.
//   TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
//
// * Intrinsify notify() and notifyAll() for the common cases where the
//   object is locked by the calling thread but the waitlist is empty.
//   Avoid the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
//
// * Use jccb and jmpb instead of jcc and jmp to improve code density.
//   But beware of excessive branch density on AMD Opterons.
//
// * Both fast_lock and fast_unlock set the ICC.ZF to indicate success
//   or failure of the fast path.  If the fast path fails then we pass
//   control to the slow path, typically in C.  In fast_lock and
//   fast_unlock we often branch to DONE_LABEL, just to find that C2
//   will emit a conditional branch immediately after the node.
//   So we have branches to branches and lots of ICC.ZF games.
//   Instead, it might be better to have C2 pass a "FailureLabel"
//   into fast_lock and fast_unlock.  In the case of success, control
//   will drop through the node.  ICC.ZF is undefined at exit.
566 // In the case of failure, the node will branch directly to the 567 // FailureLabel 568 569 570 // obj: object to lock 571 // box: on-stack box address (displaced header location) - KILLED 572 // rax,: tmp -- KILLED 573 // scr: tmp -- KILLED 574 void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg, 575 Register scrReg, Register cx1Reg, Register cx2Reg, 576 RTMLockingCounters* rtm_counters, 577 RTMLockingCounters* stack_rtm_counters, 578 Metadata* method_data, 579 bool use_rtm, bool profile_rtm) { 580 // Ensure the register assignments are disjoint 581 assert(tmpReg == rax, ""); 582 583 if (use_rtm) { 584 assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg); 585 } else { 586 assert(cx2Reg == noreg, ""); 587 assert_different_registers(objReg, boxReg, tmpReg, scrReg); 588 } 589 590 // Possible cases that we'll encounter in fast_lock 591 // ------------------------------------------------ 592 // * Inflated 593 // -- unlocked 594 // -- Locked 595 // = by self 596 // = by other 597 // * neutral 598 // * stack-locked 599 // -- by self 600 // = sp-proximity test hits 601 // = sp-proximity test generates false-negative 602 // -- by other 603 // 604 605 Label IsInflated, DONE_LABEL, NO_COUNT, COUNT; 606 607 if (DiagnoseSyncOnValueBasedClasses != 0) { 608 load_klass(tmpReg, objReg, cx1Reg); 609 movl(tmpReg, Address(tmpReg, Klass::access_flags_offset())); 610 testl(tmpReg, JVM_ACC_IS_VALUE_BASED_CLASS); 611 jcc(Assembler::notZero, DONE_LABEL); 612 } 613 614 #if INCLUDE_RTM_OPT 615 if (UseRTMForStackLocks && use_rtm) { 616 assert(!UseHeavyMonitors, "+UseHeavyMonitors and +UseRTMForStackLocks are mutually exclusive"); 617 rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg, 618 stack_rtm_counters, method_data, profile_rtm, 619 DONE_LABEL, IsInflated); 620 } 621 #endif // INCLUDE_RTM_OPT 622 623 movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // [FETCH] 624 testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral 625 jccb(Assembler::notZero, IsInflated); 626 627 if (!UseHeavyMonitors) { 628 // Attempt stack-locking ... 629 orptr (tmpReg, markWord::unlocked_value); 630 if (EnableValhalla) { 631 // Mask inline_type bit such that we go to the slow path if object is an inline type 632 andptr(tmpReg, ~((int) markWord::inline_type_bit_in_place)); 633 } 634 movptr(Address(boxReg, 0), tmpReg); // Anticipate successful CAS 635 lock(); 636 cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Updates tmpReg 637 jcc(Assembler::equal, COUNT); // Success 638 639 // Recursive locking. 640 // The object is stack-locked: markword contains stack pointer to BasicLock. 641 // Locked by current thread if difference with current SP is less than one page. 642 subptr(tmpReg, rsp); 643 // Next instruction set ZFlag == 1 (Success) if difference is less then one page. 644 andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) ); 645 movptr(Address(boxReg, 0), tmpReg); 646 } else { 647 // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0. 648 testptr(objReg, objReg); 649 } 650 jmp(DONE_LABEL); 651 652 bind(IsInflated); 653 // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value 654 655 #if INCLUDE_RTM_OPT 656 // Use the same RTM locking code in 32- and 64-bit VM. 
657 if (use_rtm) { 658 rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg, 659 rtm_counters, method_data, profile_rtm, DONE_LABEL); 660 } else { 661 #endif // INCLUDE_RTM_OPT 662 663 #ifndef _LP64 664 // The object is inflated. 665 666 // boxReg refers to the on-stack BasicLock in the current frame. 667 // We'd like to write: 668 // set box->_displaced_header = markWord::unused_mark(). Any non-0 value suffices. 669 // This is convenient but results a ST-before-CAS penalty. The following CAS suffers 670 // additional latency as we have another ST in the store buffer that must drain. 671 672 // avoid ST-before-CAS 673 // register juggle because we need tmpReg for cmpxchgptr below 674 movptr(scrReg, boxReg); 675 movptr(boxReg, tmpReg); // consider: LEA box, [tmp-2] 676 677 // Optimistic form: consider XORL tmpReg,tmpReg 678 movptr(tmpReg, NULL_WORD); 679 680 // Appears unlocked - try to swing _owner from null to non-null. 681 // Ideally, I'd manifest "Self" with get_thread and then attempt 682 // to CAS the register containing Self into m->Owner. 683 // But we don't have enough registers, so instead we can either try to CAS 684 // rsp or the address of the box (in scr) into &m->owner. If the CAS succeeds 685 // we later store "Self" into m->Owner. Transiently storing a stack address 686 // (rsp or the address of the box) into m->owner is harmless. 687 // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand. 688 lock(); 689 cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner))); 690 movptr(Address(scrReg, 0), 3); // box->_displaced_header = 3 691 // If we weren't able to swing _owner from NULL to the BasicLock 692 // then take the slow path. 693 jccb (Assembler::notZero, NO_COUNT); 694 // update _owner from BasicLock to thread 695 get_thread (scrReg); // beware: clobbers ICCs 696 movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg); 697 xorptr(boxReg, boxReg); // set icc.ZFlag = 1 to indicate success 698 699 // If the CAS fails we can either retry or pass control to the slow path. 700 // We use the latter tactic. 701 // Pass the CAS result in the icc.ZFlag into DONE_LABEL 702 // If the CAS was successful ... 703 // Self has acquired the lock 704 // Invariant: m->_recursions should already be 0, so we don't need to explicitly set it. 705 // Intentional fall-through into DONE_LABEL ... 706 #else // _LP64 707 // It's inflated and we use scrReg for ObjectMonitor* in this section. 708 movq(scrReg, tmpReg); 709 xorq(tmpReg, tmpReg); 710 lock(); 711 cmpxchgptr(r15_thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner))); 712 // Unconditionally set box->_displaced_header = markWord::unused_mark(). 713 // Without cast to int32_t this style of movptr will destroy r10 which is typically obj. 714 movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value())); 715 // Propagate ICC.ZF from CAS above into DONE_LABEL. 
  jccb(Assembler::equal, COUNT);        // CAS above succeeded; propagate ZF = 1 (success)

  cmpptr(r15_thread, rax);              // Check if we are already the owner (recursive lock)
  jccb(Assembler::notEqual, NO_COUNT);  // If not recursive, ZF = 0 at this point (fail)
  incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
#endif // _LP64
#if INCLUDE_RTM_OPT
  } // use_rtm()
#endif
  bind(DONE_LABEL);

  // ZFlag == 1 count in fast path
  // ZFlag == 0 count in slow path
  jccb(Assembler::notZero, NO_COUNT); // jump if ZFlag == 0

  bind(COUNT);
  // Count monitors in fast path
#ifndef _LP64
  get_thread(tmpReg);
  incrementl(Address(tmpReg, JavaThread::held_monitor_count_offset()));
#else // _LP64
  incrementq(Address(r15_thread, JavaThread::held_monitor_count_offset()));
#endif

  xorl(tmpReg, tmpReg); // Set ZF == 1

  bind(NO_COUNT);

  // At NO_COUNT the icc ZFlag is set as follows ...
  // fast_unlock uses the same protocol.
  // ZFlag == 1 -> Success
  // ZFlag == 0 -> Failure - force control through the slow path
}

// obj: object to unlock
// box: box address (displaced header location), killed.  Must be EAX.
// tmp: killed, cannot be obj nor box.
//
// Some commentary on balanced locking:
//
// fast_lock and fast_unlock are emitted only for provably balanced lock sites.
// Methods that don't have provably balanced locking are forced to run in the
// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
// The interpreter provides two properties:
// I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired by the current activation (frame).  Recall that the
//      interpreter maintains an on-stack list of locks currently held by
//      a frame.
// I2:  If a method attempts to unlock an object that is not held by the
//      frame the interpreter throws IMSX.
//
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
// B() doesn't have provably balanced locking so it runs in the interpreter.
// Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
// is still locked by A().
//
// The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
// Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
// should not be unlocked by "normal" java-level locking and vice-versa.  The specification
// doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably, given that the spec legislates the JNI case as undefined, our implementation
// could reasonably *avoid* checking owner in fast_unlock().
// In the interest of performance we elide the m->Owner == Self check in unlock.
// A perfectly viable alternative is to elide the owner check except when
// Xcheck:jni is enabled.
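//
// For reference, a hedged sketch of how the ZF success/failure protocol above
// is consumed by the emitted code (jump targets and surrounding code are
// illustrative, not the literal output of the matching .ad rules):
//
//   fast_lock(obj, box, rax, scr, ...);        // ZF == 1 on fast-path success
//   jcc(Assembler::notZero, L_slow_enter);     // ZF == 0 -> runtime monitorenter
//   ...                                        // critical section
//   fast_unlock(obj, rax, tmp);                // ZF == 1 on fast-path success
//   jcc(Assembler::notZero, L_slow_exit);      // ZF == 0 -> runtime monitorexit
//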
782 783 void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) { 784 assert(boxReg == rax, ""); 785 assert_different_registers(objReg, boxReg, tmpReg); 786 787 Label DONE_LABEL, Stacked, COUNT, NO_COUNT; 788 789 #if INCLUDE_RTM_OPT 790 if (UseRTMForStackLocks && use_rtm) { 791 assert(!UseHeavyMonitors, "+UseHeavyMonitors and +UseRTMForStackLocks are mutually exclusive"); 792 Label L_regular_unlock; 793 movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword 794 andptr(tmpReg, markWord::lock_mask_in_place); // look at 2 lock bits 795 cmpptr(tmpReg, markWord::unlocked_value); // bits = 01 unlocked 796 jccb(Assembler::notEqual, L_regular_unlock); // if !HLE RegularLock 797 xend(); // otherwise end... 798 jmp(DONE_LABEL); // ... and we're done 799 bind(L_regular_unlock); 800 } 801 #endif 802 803 if (!UseHeavyMonitors) { 804 cmpptr(Address(boxReg, 0), NULL_WORD); // Examine the displaced header 805 jcc (Assembler::zero, COUNT); // 0 indicates recursive stack-lock 806 } 807 movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Examine the object's markword 808 if (!UseHeavyMonitors) { 809 testptr(tmpReg, markWord::monitor_value); // Inflated? 810 jccb (Assembler::zero, Stacked); 811 } 812 813 // It's inflated. 814 #if INCLUDE_RTM_OPT 815 if (use_rtm) { 816 Label L_regular_inflated_unlock; 817 int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner); 818 movptr(boxReg, Address(tmpReg, owner_offset)); 819 testptr(boxReg, boxReg); 820 jccb(Assembler::notZero, L_regular_inflated_unlock); 821 xend(); 822 jmpb(DONE_LABEL); 823 bind(L_regular_inflated_unlock); 824 } 825 #endif 826 827 // Despite our balanced locking property we still check that m->_owner == Self 828 // as java routines or native JNI code called by this thread might 829 // have released the lock. 830 // Refer to the comments in synchronizer.cpp for how we might encode extra 831 // state in _succ so we can avoid fetching EntryList|cxq. 832 // 833 // If there's no contention try a 1-0 exit. That is, exit without 834 // a costly MEMBAR or CAS. See synchronizer.cpp for details on how 835 // we detect and recover from the race that the 1-0 exit admits. 836 // 837 // Conceptually fast_unlock() must execute a STST|LDST "release" barrier 838 // before it STs null into _owner, releasing the lock. Updates 839 // to data protected by the critical section must be visible before 840 // we drop the lock (and thus before any other thread could acquire 841 // the lock and observe the fields protected by the lock). 842 // IA32's memory-model is SPO, so STs are ordered with respect to 843 // each other and there's no need for an explicit barrier (fence). 844 // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html. 845 #ifndef _LP64 846 // Note that we could employ various encoding schemes to reduce 847 // the number of loads below (currently 4) to just 2 or 3. 848 // Refer to the comments in synchronizer.cpp. 849 // In practice the chain of fetches doesn't seem to impact performance, however. 
850 xorptr(boxReg, boxReg); 851 orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions))); 852 jccb (Assembler::notZero, DONE_LABEL); 853 movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList))); 854 orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq))); 855 jccb (Assembler::notZero, DONE_LABEL); 856 movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD); 857 jmpb (DONE_LABEL); 858 #else // _LP64 859 // It's inflated 860 Label CheckSucc, LNotRecursive, LSuccess, LGoSlowPath; 861 862 cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0); 863 jccb(Assembler::equal, LNotRecursive); 864 865 // Recursive inflated unlock 866 decq(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions))); 867 jmpb(LSuccess); 868 869 bind(LNotRecursive); 870 movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq))); 871 orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList))); 872 jccb (Assembler::notZero, CheckSucc); 873 // Without cast to int32_t this style of movptr will destroy r10 which is typically obj. 874 movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD); 875 jmpb (DONE_LABEL); 876 877 // Try to avoid passing control into the slow_path ... 878 bind (CheckSucc); 879 880 // The following optional optimization can be elided if necessary 881 // Effectively: if (succ == null) goto slow path 882 // The code reduces the window for a race, however, 883 // and thus benefits performance. 884 cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD); 885 jccb (Assembler::zero, LGoSlowPath); 886 887 xorptr(boxReg, boxReg); 888 // Without cast to int32_t this style of movptr will destroy r10 which is typically obj. 889 movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD); 890 891 // Memory barrier/fence 892 // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ 893 // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack. 894 // This is faster on Nehalem and AMD Shanghai/Barcelona. 895 // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences 896 // We might also restructure (ST Owner=0;barrier;LD _Succ) to 897 // (mov box,0; xchgq box, &m->Owner; LD _succ) . 898 lock(); addl(Address(rsp, 0), 0); 899 900 cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD); 901 jccb (Assembler::notZero, LSuccess); 902 903 // Rare inopportune interleaving - race. 904 // The successor vanished in the small window above. 905 // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor. 906 // We need to ensure progress and succession. 907 // Try to reacquire the lock. 908 // If that fails then the new owner is responsible for succession and this 909 // thread needs to take no further action and can exit via the fast path (success). 910 // If the re-acquire succeeds then pass control into the slow path. 911 // As implemented, this latter mode is horrible because we generated more 912 // coherence traffic on the lock *and* artificially extended the critical section 913 // length while by virtue of passing control into the slow path. 914 915 // box is really RAX -- the following CMPXCHG depends on that binding 916 // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R) 917 lock(); 918 cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner))); 919 // There's no successor so we tried to regrab the lock. 
920 // If that didn't work, then another thread grabbed the 921 // lock so we're done (and exit was a success). 922 jccb (Assembler::notEqual, LSuccess); 923 // Intentional fall-through into slow path 924 925 bind (LGoSlowPath); 926 orl (boxReg, 1); // set ICC.ZF=0 to indicate failure 927 jmpb (DONE_LABEL); 928 929 bind (LSuccess); 930 testl (boxReg, 0); // set ICC.ZF=1 to indicate success 931 jmpb (DONE_LABEL); 932 933 #endif 934 if (!UseHeavyMonitors) { 935 bind (Stacked); 936 movptr(tmpReg, Address (boxReg, 0)); // re-fetch 937 lock(); 938 cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box 939 // Intentional fall-thru into DONE_LABEL 940 } 941 bind(DONE_LABEL); 942 943 // ZFlag == 1 count in fast path 944 // ZFlag == 0 count in slow path 945 jccb(Assembler::notZero, NO_COUNT); 946 947 bind(COUNT); 948 // Count monitors in fast path 949 #ifndef _LP64 950 get_thread(tmpReg); 951 decrementl(Address(tmpReg, JavaThread::held_monitor_count_offset())); 952 #else // _LP64 953 decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset())); 954 #endif 955 956 xorl(tmpReg, tmpReg); // Set ZF == 1 957 958 bind(NO_COUNT); 959 } 960 961 //------------------------------------------------------------------------------------------- 962 // Generic instructions support for use in .ad files C2 code generation 963 964 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) { 965 if (dst != src) { 966 movdqu(dst, src); 967 } 968 if (opcode == Op_AbsVD) { 969 andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg); 970 } else { 971 assert((opcode == Op_NegVD),"opcode should be Op_NegD"); 972 xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg); 973 } 974 } 975 976 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) { 977 if (opcode == Op_AbsVD) { 978 vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg); 979 } else { 980 assert((opcode == Op_NegVD),"opcode should be Op_NegD"); 981 vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg); 982 } 983 } 984 985 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) { 986 if (dst != src) { 987 movdqu(dst, src); 988 } 989 if (opcode == Op_AbsVF) { 990 andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg); 991 } else { 992 assert((opcode == Op_NegVF),"opcode should be Op_NegF"); 993 xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg); 994 } 995 } 996 997 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) { 998 if (opcode == Op_AbsVF) { 999 vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg); 1000 } else { 1001 assert((opcode == Op_NegVF),"opcode should be Op_NegF"); 1002 vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg); 1003 } 1004 } 1005 1006 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) { 1007 assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity"); 1008 assert(tmp == xnoreg || elem_bt == T_LONG, "unused"); 1009 1010 if (opcode == Op_MinV) { 1011 if (elem_bt == T_BYTE) { 1012 pminsb(dst, src); 1013 } else if (elem_bt == T_SHORT) { 1014 pminsw(dst, src); 1015 } else if (elem_bt == T_INT) { 1016 pminsd(dst, src); 1017 } else { 
1018 assert(elem_bt == T_LONG, "required"); 1019 assert(tmp == xmm0, "required"); 1020 assert_different_registers(dst, src, tmp); 1021 movdqu(xmm0, dst); 1022 pcmpgtq(xmm0, src); 1023 blendvpd(dst, src); // xmm0 as mask 1024 } 1025 } else { // opcode == Op_MaxV 1026 if (elem_bt == T_BYTE) { 1027 pmaxsb(dst, src); 1028 } else if (elem_bt == T_SHORT) { 1029 pmaxsw(dst, src); 1030 } else if (elem_bt == T_INT) { 1031 pmaxsd(dst, src); 1032 } else { 1033 assert(elem_bt == T_LONG, "required"); 1034 assert(tmp == xmm0, "required"); 1035 assert_different_registers(dst, src, tmp); 1036 movdqu(xmm0, src); 1037 pcmpgtq(xmm0, dst); 1038 blendvpd(dst, src); // xmm0 as mask 1039 } 1040 } 1041 } 1042 1043 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt, 1044 XMMRegister dst, XMMRegister src1, XMMRegister src2, 1045 int vlen_enc) { 1046 assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity"); 1047 1048 if (opcode == Op_MinV) { 1049 if (elem_bt == T_BYTE) { 1050 vpminsb(dst, src1, src2, vlen_enc); 1051 } else if (elem_bt == T_SHORT) { 1052 vpminsw(dst, src1, src2, vlen_enc); 1053 } else if (elem_bt == T_INT) { 1054 vpminsd(dst, src1, src2, vlen_enc); 1055 } else { 1056 assert(elem_bt == T_LONG, "required"); 1057 if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) { 1058 vpminsq(dst, src1, src2, vlen_enc); 1059 } else { 1060 assert_different_registers(dst, src1, src2); 1061 vpcmpgtq(dst, src1, src2, vlen_enc); 1062 vblendvpd(dst, src1, src2, dst, vlen_enc); 1063 } 1064 } 1065 } else { // opcode == Op_MaxV 1066 if (elem_bt == T_BYTE) { 1067 vpmaxsb(dst, src1, src2, vlen_enc); 1068 } else if (elem_bt == T_SHORT) { 1069 vpmaxsw(dst, src1, src2, vlen_enc); 1070 } else if (elem_bt == T_INT) { 1071 vpmaxsd(dst, src1, src2, vlen_enc); 1072 } else { 1073 assert(elem_bt == T_LONG, "required"); 1074 if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) { 1075 vpmaxsq(dst, src1, src2, vlen_enc); 1076 } else { 1077 assert_different_registers(dst, src1, src2); 1078 vpcmpgtq(dst, src1, src2, vlen_enc); 1079 vblendvpd(dst, src2, src1, dst, vlen_enc); 1080 } 1081 } 1082 } 1083 } 1084 1085 // Float/Double min max 1086 1087 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt, 1088 XMMRegister dst, XMMRegister a, XMMRegister b, 1089 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 1090 int vlen_enc) { 1091 assert(UseAVX > 0, "required"); 1092 assert(opcode == Op_MinV || opcode == Op_MinReductionV || 1093 opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity"); 1094 assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity"); 1095 assert_different_registers(a, b, tmp, atmp, btmp); 1096 1097 bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV); 1098 bool is_double_word = is_double_word_type(elem_bt); 1099 1100 if (!is_double_word && is_min) { 1101 vblendvps(atmp, a, b, a, vlen_enc); 1102 vblendvps(btmp, b, a, a, vlen_enc); 1103 vminps(tmp, atmp, btmp, vlen_enc); 1104 vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1105 vblendvps(dst, tmp, atmp, btmp, vlen_enc); 1106 } else if (!is_double_word && !is_min) { 1107 vblendvps(btmp, b, a, b, vlen_enc); 1108 vblendvps(atmp, a, b, b, vlen_enc); 1109 vmaxps(tmp, atmp, btmp, vlen_enc); 1110 vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1111 vblendvps(dst, tmp, atmp, btmp, vlen_enc); 1112 } else if (is_double_word && is_min) { 1113 vblendvpd(atmp, a, b, a, vlen_enc); 1114 vblendvpd(btmp, b, a, a, vlen_enc); 1115 vminpd(tmp, atmp, btmp, vlen_enc); 1116 
vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1117 vblendvpd(dst, tmp, atmp, btmp, vlen_enc); 1118 } else { 1119 assert(is_double_word && !is_min, "sanity"); 1120 vblendvpd(btmp, b, a, b, vlen_enc); 1121 vblendvpd(atmp, a, b, b, vlen_enc); 1122 vmaxpd(tmp, atmp, btmp, vlen_enc); 1123 vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1124 vblendvpd(dst, tmp, atmp, btmp, vlen_enc); 1125 } 1126 } 1127 1128 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt, 1129 XMMRegister dst, XMMRegister a, XMMRegister b, 1130 KRegister ktmp, XMMRegister atmp, XMMRegister btmp, 1131 int vlen_enc) { 1132 assert(UseAVX > 2, "required"); 1133 assert(opcode == Op_MinV || opcode == Op_MinReductionV || 1134 opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity"); 1135 assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity"); 1136 assert_different_registers(dst, a, b, atmp, btmp); 1137 1138 bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV); 1139 bool is_double_word = is_double_word_type(elem_bt); 1140 bool merge = true; 1141 1142 if (!is_double_word && is_min) { 1143 evpmovd2m(ktmp, a, vlen_enc); 1144 evblendmps(atmp, ktmp, a, b, merge, vlen_enc); 1145 evblendmps(btmp, ktmp, b, a, merge, vlen_enc); 1146 vminps(dst, atmp, btmp, vlen_enc); 1147 evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1148 evmovdqul(dst, ktmp, atmp, merge, vlen_enc); 1149 } else if (!is_double_word && !is_min) { 1150 evpmovd2m(ktmp, b, vlen_enc); 1151 evblendmps(atmp, ktmp, a, b, merge, vlen_enc); 1152 evblendmps(btmp, ktmp, b, a, merge, vlen_enc); 1153 vmaxps(dst, atmp, btmp, vlen_enc); 1154 evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1155 evmovdqul(dst, ktmp, atmp, merge, vlen_enc); 1156 } else if (is_double_word && is_min) { 1157 evpmovq2m(ktmp, a, vlen_enc); 1158 evblendmpd(atmp, ktmp, a, b, merge, vlen_enc); 1159 evblendmpd(btmp, ktmp, b, a, merge, vlen_enc); 1160 vminpd(dst, atmp, btmp, vlen_enc); 1161 evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1162 evmovdquq(dst, ktmp, atmp, merge, vlen_enc); 1163 } else { 1164 assert(is_double_word && !is_min, "sanity"); 1165 evpmovq2m(ktmp, b, vlen_enc); 1166 evblendmpd(atmp, ktmp, a, b, merge, vlen_enc); 1167 evblendmpd(btmp, ktmp, b, a, merge, vlen_enc); 1168 vmaxpd(dst, atmp, btmp, vlen_enc); 1169 evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1170 evmovdquq(dst, ktmp, atmp, merge, vlen_enc); 1171 } 1172 } 1173 1174 // Float/Double signum 1175 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) { 1176 assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity"); 1177 1178 Label DONE_LABEL; 1179 1180 if (opcode == Op_SignumF) { 1181 assert(UseSSE > 0, "required"); 1182 ucomiss(dst, zero); 1183 jcc(Assembler::equal, DONE_LABEL); // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument 1184 jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN 1185 movflt(dst, one); 1186 jcc(Assembler::above, DONE_LABEL); 1187 xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg); 1188 } else if (opcode == Op_SignumD) { 1189 assert(UseSSE > 1, "required"); 1190 ucomisd(dst, zero); 1191 jcc(Assembler::equal, DONE_LABEL); // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument 1192 jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN 1193 movdbl(dst, one); 1194 jcc(Assembler::above, DONE_LABEL); 1195 xorpd(dst, 
ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg); 1196 } 1197 1198 bind(DONE_LABEL); 1199 } 1200 1201 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) { 1202 if (sign) { 1203 pmovsxbw(dst, src); 1204 } else { 1205 pmovzxbw(dst, src); 1206 } 1207 } 1208 1209 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1210 if (sign) { 1211 vpmovsxbw(dst, src, vector_len); 1212 } else { 1213 vpmovzxbw(dst, src, vector_len); 1214 } 1215 } 1216 1217 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1218 if (sign) { 1219 vpmovsxbd(dst, src, vector_len); 1220 } else { 1221 vpmovzxbd(dst, src, vector_len); 1222 } 1223 } 1224 1225 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1226 if (sign) { 1227 vpmovsxwd(dst, src, vector_len); 1228 } else { 1229 vpmovzxwd(dst, src, vector_len); 1230 } 1231 } 1232 1233 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src, 1234 int shift, int vector_len) { 1235 if (opcode == Op_RotateLeftV) { 1236 if (etype == T_INT) { 1237 evprold(dst, src, shift, vector_len); 1238 } else { 1239 assert(etype == T_LONG, "expected type T_LONG"); 1240 evprolq(dst, src, shift, vector_len); 1241 } 1242 } else { 1243 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV"); 1244 if (etype == T_INT) { 1245 evprord(dst, src, shift, vector_len); 1246 } else { 1247 assert(etype == T_LONG, "expected type T_LONG"); 1248 evprorq(dst, src, shift, vector_len); 1249 } 1250 } 1251 } 1252 1253 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src, 1254 XMMRegister shift, int vector_len) { 1255 if (opcode == Op_RotateLeftV) { 1256 if (etype == T_INT) { 1257 evprolvd(dst, src, shift, vector_len); 1258 } else { 1259 assert(etype == T_LONG, "expected type T_LONG"); 1260 evprolvq(dst, src, shift, vector_len); 1261 } 1262 } else { 1263 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV"); 1264 if (etype == T_INT) { 1265 evprorvd(dst, src, shift, vector_len); 1266 } else { 1267 assert(etype == T_LONG, "expected type T_LONG"); 1268 evprorvq(dst, src, shift, vector_len); 1269 } 1270 } 1271 } 1272 1273 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) { 1274 if (opcode == Op_RShiftVI) { 1275 psrad(dst, shift); 1276 } else if (opcode == Op_LShiftVI) { 1277 pslld(dst, shift); 1278 } else { 1279 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI"); 1280 psrld(dst, shift); 1281 } 1282 } 1283 1284 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) { 1285 switch (opcode) { 1286 case Op_RShiftVI: psrad(dst, shift); break; 1287 case Op_LShiftVI: pslld(dst, shift); break; 1288 case Op_URShiftVI: psrld(dst, shift); break; 1289 1290 default: assert(false, "%s", NodeClassNames[opcode]); 1291 } 1292 } 1293 1294 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 1295 if (opcode == Op_RShiftVI) { 1296 vpsrad(dst, nds, shift, vector_len); 1297 } else if (opcode == Op_LShiftVI) { 1298 vpslld(dst, nds, shift, vector_len); 1299 } else { 1300 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI"); 1301 vpsrld(dst, nds, shift, vector_len); 1302 } 1303 } 1304 1305 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1306 switch 
(opcode) { 1307 case Op_RShiftVI: vpsrad(dst, src, shift, vlen_enc); break; 1308 case Op_LShiftVI: vpslld(dst, src, shift, vlen_enc); break; 1309 case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break; 1310 1311 default: assert(false, "%s", NodeClassNames[opcode]); 1312 } 1313 } 1314 1315 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) { 1316 switch (opcode) { 1317 case Op_RShiftVB: // fall-through 1318 case Op_RShiftVS: psraw(dst, shift); break; 1319 1320 case Op_LShiftVB: // fall-through 1321 case Op_LShiftVS: psllw(dst, shift); break; 1322 1323 case Op_URShiftVS: // fall-through 1324 case Op_URShiftVB: psrlw(dst, shift); break; 1325 1326 default: assert(false, "%s", NodeClassNames[opcode]); 1327 } 1328 } 1329 1330 void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1331 switch (opcode) { 1332 case Op_RShiftVB: // fall-through 1333 case Op_RShiftVS: vpsraw(dst, src, shift, vlen_enc); break; 1334 1335 case Op_LShiftVB: // fall-through 1336 case Op_LShiftVS: vpsllw(dst, src, shift, vlen_enc); break; 1337 1338 case Op_URShiftVS: // fall-through 1339 case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break; 1340 1341 default: assert(false, "%s", NodeClassNames[opcode]); 1342 } 1343 } 1344 1345 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) { 1346 switch (opcode) { 1347 case Op_RShiftVL: psrlq(dst, shift); break; // using srl to implement sra on pre-avs512 systems 1348 case Op_LShiftVL: psllq(dst, shift); break; 1349 case Op_URShiftVL: psrlq(dst, shift); break; 1350 1351 default: assert(false, "%s", NodeClassNames[opcode]); 1352 } 1353 } 1354 1355 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) { 1356 if (opcode == Op_RShiftVL) { 1357 psrlq(dst, shift); // using srl to implement sra on pre-avs512 systems 1358 } else if (opcode == Op_LShiftVL) { 1359 psllq(dst, shift); 1360 } else { 1361 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL"); 1362 psrlq(dst, shift); 1363 } 1364 } 1365 1366 void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1367 switch (opcode) { 1368 case Op_RShiftVL: evpsraq(dst, src, shift, vlen_enc); break; 1369 case Op_LShiftVL: vpsllq(dst, src, shift, vlen_enc); break; 1370 case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break; 1371 1372 default: assert(false, "%s", NodeClassNames[opcode]); 1373 } 1374 } 1375 1376 void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 1377 if (opcode == Op_RShiftVL) { 1378 evpsraq(dst, nds, shift, vector_len); 1379 } else if (opcode == Op_LShiftVL) { 1380 vpsllq(dst, nds, shift, vector_len); 1381 } else { 1382 assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL"); 1383 vpsrlq(dst, nds, shift, vector_len); 1384 } 1385 } 1386 1387 void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1388 switch (opcode) { 1389 case Op_RShiftVB: // fall-through 1390 case Op_RShiftVS: // fall-through 1391 case Op_RShiftVI: vpsravd(dst, src, shift, vlen_enc); break; 1392 1393 case Op_LShiftVB: // fall-through 1394 case Op_LShiftVS: // fall-through 1395 case Op_LShiftVI: vpsllvd(dst, src, shift, vlen_enc); break; 1396 1397 case Op_URShiftVB: // fall-through 1398 case Op_URShiftVS: // fall-through 1399 case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break; 1400 1401 default: assert(false, 
"%s", NodeClassNames[opcode]); 1402 } 1403 } 1404 1405 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1406 switch (opcode) { 1407 case Op_RShiftVB: // fall-through 1408 case Op_RShiftVS: evpsravw(dst, src, shift, vlen_enc); break; 1409 1410 case Op_LShiftVB: // fall-through 1411 case Op_LShiftVS: evpsllvw(dst, src, shift, vlen_enc); break; 1412 1413 case Op_URShiftVB: // fall-through 1414 case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break; 1415 1416 default: assert(false, "%s", NodeClassNames[opcode]); 1417 } 1418 } 1419 1420 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) { 1421 assert(UseAVX >= 2, "required"); 1422 switch (opcode) { 1423 case Op_RShiftVL: { 1424 if (UseAVX > 2) { 1425 assert(tmp == xnoreg, "not used"); 1426 if (!VM_Version::supports_avx512vl()) { 1427 vlen_enc = Assembler::AVX_512bit; 1428 } 1429 evpsravq(dst, src, shift, vlen_enc); 1430 } else { 1431 vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask())); 1432 vpsrlvq(dst, src, shift, vlen_enc); 1433 vpsrlvq(tmp, tmp, shift, vlen_enc); 1434 vpxor(dst, dst, tmp, vlen_enc); 1435 vpsubq(dst, dst, tmp, vlen_enc); 1436 } 1437 break; 1438 } 1439 case Op_LShiftVL: { 1440 assert(tmp == xnoreg, "not used"); 1441 vpsllvq(dst, src, shift, vlen_enc); 1442 break; 1443 } 1444 case Op_URShiftVL: { 1445 assert(tmp == xnoreg, "not used"); 1446 vpsrlvq(dst, src, shift, vlen_enc); 1447 break; 1448 } 1449 default: assert(false, "%s", NodeClassNames[opcode]); 1450 } 1451 } 1452 1453 // Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst 1454 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) { 1455 assert(opcode == Op_LShiftVB || 1456 opcode == Op_RShiftVB || 1457 opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]); 1458 bool sign = (opcode != Op_URShiftVB); 1459 assert(vector_len == 0, "required"); 1460 vextendbd(sign, dst, src, 1); 1461 vpmovzxbd(vtmp, shift, 1); 1462 varshiftd(opcode, dst, dst, vtmp, 1); 1463 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg); 1464 vextracti128_high(vtmp, dst); 1465 vpackusdw(dst, dst, vtmp, 0); 1466 } 1467 1468 // Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst 1469 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) { 1470 assert(opcode == Op_LShiftVB || 1471 opcode == Op_RShiftVB || 1472 opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]); 1473 bool sign = (opcode != Op_URShiftVB); 1474 int ext_vector_len = vector_len + 1; 1475 vextendbw(sign, dst, src, ext_vector_len); 1476 vpmovzxbw(vtmp, shift, ext_vector_len); 1477 varshiftw(opcode, dst, dst, vtmp, ext_vector_len); 1478 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg); 1479 if (vector_len == 0) { 1480 vextracti128_high(vtmp, dst); 1481 vpackuswb(dst, dst, vtmp, vector_len); 1482 } else { 1483 vextracti64x4_high(vtmp, dst); 1484 vpackuswb(dst, dst, vtmp, vector_len); 1485 vpermq(dst, dst, 0xD8, vector_len); 1486 } 1487 } 1488 1489 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) { 1490 switch(typ) { 1491 case T_BYTE: 1492 pinsrb(dst, val, idx); 1493 break; 1494 case T_SHORT: 1495 pinsrw(dst, val, idx); 1496 
break; 1497 case T_INT: 1498 pinsrd(dst, val, idx); 1499 break; 1500 case T_LONG: 1501 pinsrq(dst, val, idx); 1502 break; 1503 default: 1504 assert(false,"Should not reach here."); 1505 break; 1506 } 1507 } 1508 1509 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) { 1510 switch(typ) { 1511 case T_BYTE: 1512 vpinsrb(dst, src, val, idx); 1513 break; 1514 case T_SHORT: 1515 vpinsrw(dst, src, val, idx); 1516 break; 1517 case T_INT: 1518 vpinsrd(dst, src, val, idx); 1519 break; 1520 case T_LONG: 1521 vpinsrq(dst, src, val, idx); 1522 break; 1523 default: 1524 assert(false,"Should not reach here."); 1525 break; 1526 } 1527 } 1528 1529 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) { 1530 switch(typ) { 1531 case T_INT: 1532 vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len); 1533 break; 1534 case T_FLOAT: 1535 vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len); 1536 break; 1537 case T_LONG: 1538 vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len); 1539 break; 1540 case T_DOUBLE: 1541 vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len); 1542 break; 1543 default: 1544 assert(false,"Should not reach here."); 1545 break; 1546 } 1547 } 1548 1549 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) { 1550 switch(typ) { 1551 case T_INT: 1552 evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len); 1553 break; 1554 case T_FLOAT: 1555 evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len); 1556 break; 1557 case T_LONG: 1558 evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len); 1559 break; 1560 case T_DOUBLE: 1561 evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len); 1562 break; 1563 default: 1564 assert(false,"Should not reach here."); 1565 break; 1566 } 1567 } 1568 1569 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) { 1570 switch(typ) { 1571 case T_INT: 1572 evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len); 1573 break; 1574 case T_FLOAT: 1575 evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len); 1576 break; 1577 case T_LONG: 1578 evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len); 1579 break; 1580 case T_DOUBLE: 1581 evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len); 1582 break; 1583 default: 1584 assert(false,"Should not reach here."); 1585 break; 1586 } 1587 } 1588 1589 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) { 1590 if (vlen_in_bytes <= 16) { 1591 pxor (dst, dst); 1592 psubb(dst, src); 1593 switch (elem_bt) { 1594 case T_BYTE: /* nothing to do */ break; 1595 case T_SHORT: pmovsxbw(dst, dst); break; 1596 case T_INT: pmovsxbd(dst, dst); break; 1597 case T_FLOAT: pmovsxbd(dst, dst); break; 1598 case T_LONG: pmovsxbq(dst, dst); break; 1599 case T_DOUBLE: pmovsxbq(dst, dst); break; 1600 1601 default: assert(false, "%s", type2name(elem_bt)); 1602 } 1603 } else { 1604 assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, ""); 1605 int vlen_enc = vector_length_encoding(vlen_in_bytes); 1606 1607 vpxor (dst, dst, dst, vlen_enc); 1608 vpsubb(dst, dst, src, is_legacy ? 
AVX_256bit : vlen_enc); 1609 1610 switch (elem_bt) { 1611 case T_BYTE: /* nothing to do */ break; 1612 case T_SHORT: vpmovsxbw(dst, dst, vlen_enc); break; 1613 case T_INT: vpmovsxbd(dst, dst, vlen_enc); break; 1614 case T_FLOAT: vpmovsxbd(dst, dst, vlen_enc); break; 1615 case T_LONG: vpmovsxbq(dst, dst, vlen_enc); break; 1616 case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break; 1617 1618 default: assert(false, "%s", type2name(elem_bt)); 1619 } 1620 } 1621 } 1622 1623 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) { 1624 if (novlbwdq) { 1625 vpmovsxbd(xtmp, src, vlen_enc); 1626 evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()), 1627 Assembler::eq, true, vlen_enc, noreg); 1628 } else { 1629 vpxor(xtmp, xtmp, xtmp, vlen_enc); 1630 vpsubb(xtmp, xtmp, src, vlen_enc); 1631 evpmovb2m(dst, xtmp, vlen_enc); 1632 } 1633 } 1634 1635 void C2_MacroAssembler::load_vector(XMMRegister dst, Address src, int vlen_in_bytes) { 1636 switch (vlen_in_bytes) { 1637 case 4: movdl(dst, src); break; 1638 case 8: movq(dst, src); break; 1639 case 16: movdqu(dst, src); break; 1640 case 32: vmovdqu(dst, src); break; 1641 case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break; 1642 default: ShouldNotReachHere(); 1643 } 1644 } 1645 1646 void C2_MacroAssembler::load_vector(XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) { 1647 assert(rscratch != noreg || always_reachable(src), "missing"); 1648 1649 if (reachable(src)) { 1650 load_vector(dst, as_Address(src), vlen_in_bytes); 1651 } else { 1652 lea(rscratch, src); 1653 load_vector(dst, Address(rscratch, 0), vlen_in_bytes); 1654 } 1655 } 1656 1657 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) { 1658 int vlen_enc = vector_length_encoding(vlen); 1659 if (VM_Version::supports_avx()) { 1660 if (bt == T_LONG) { 1661 if (VM_Version::supports_avx2()) { 1662 vpbroadcastq(dst, src, vlen_enc); 1663 } else { 1664 vmovddup(dst, src, vlen_enc); 1665 } 1666 } else if (bt == T_DOUBLE) { 1667 if (vlen_enc != Assembler::AVX_128bit) { 1668 vbroadcastsd(dst, src, vlen_enc, noreg); 1669 } else { 1670 vmovddup(dst, src, vlen_enc); 1671 } 1672 } else { 1673 if (VM_Version::supports_avx2() && is_integral_type(bt)) { 1674 vpbroadcastd(dst, src, vlen_enc); 1675 } else { 1676 vbroadcastss(dst, src, vlen_enc); 1677 } 1678 } 1679 } else if (VM_Version::supports_sse3()) { 1680 movddup(dst, src); 1681 } else { 1682 movq(dst, src); 1683 if (vlen == 16) { 1684 punpcklqdq(dst, dst); 1685 } 1686 } 1687 } 1688 1689 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) { 1690 // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64. 1691 int offset = exact_log2(type2aelembytes(bt)) << 6; 1692 if (is_floating_point_type(bt)) { 1693 offset += 128; 1694 } 1695 ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset); 1696 load_vector(dst, addr, vlen_in_bytes); 1697 } 1698 1699 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles. 
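//
// The reduction helpers below all share one folding scheme: the upper half of
// the vector is extracted and combined element-wise with the lower half,
// halving the width until a single lane remains, which is then combined with
// the scalar input (src1) where one is supplied.  Roughly (an illustrative
// sketch of the generated sequence, not literal code):
//
//   while (lanes > 1) {
//     high  = extract_upper_half(vec);
//     vec   = op(vec, high);          // reduce_operation_128/256
//     lanes = lanes / 2;
//   }
//   dst = op(vec[0], src1);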
1700 1701 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) { 1702 int vector_len = Assembler::AVX_128bit; 1703 1704 switch (opcode) { 1705 case Op_AndReductionV: pand(dst, src); break; 1706 case Op_OrReductionV: por (dst, src); break; 1707 case Op_XorReductionV: pxor(dst, src); break; 1708 case Op_MinReductionV: 1709 switch (typ) { 1710 case T_BYTE: pminsb(dst, src); break; 1711 case T_SHORT: pminsw(dst, src); break; 1712 case T_INT: pminsd(dst, src); break; 1713 case T_LONG: assert(UseAVX > 2, "required"); 1714 vpminsq(dst, dst, src, Assembler::AVX_128bit); break; 1715 default: assert(false, "wrong type"); 1716 } 1717 break; 1718 case Op_MaxReductionV: 1719 switch (typ) { 1720 case T_BYTE: pmaxsb(dst, src); break; 1721 case T_SHORT: pmaxsw(dst, src); break; 1722 case T_INT: pmaxsd(dst, src); break; 1723 case T_LONG: assert(UseAVX > 2, "required"); 1724 vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break; 1725 default: assert(false, "wrong type"); 1726 } 1727 break; 1728 case Op_AddReductionVF: addss(dst, src); break; 1729 case Op_AddReductionVD: addsd(dst, src); break; 1730 case Op_AddReductionVI: 1731 switch (typ) { 1732 case T_BYTE: paddb(dst, src); break; 1733 case T_SHORT: paddw(dst, src); break; 1734 case T_INT: paddd(dst, src); break; 1735 default: assert(false, "wrong type"); 1736 } 1737 break; 1738 case Op_AddReductionVL: paddq(dst, src); break; 1739 case Op_MulReductionVF: mulss(dst, src); break; 1740 case Op_MulReductionVD: mulsd(dst, src); break; 1741 case Op_MulReductionVI: 1742 switch (typ) { 1743 case T_SHORT: pmullw(dst, src); break; 1744 case T_INT: pmulld(dst, src); break; 1745 default: assert(false, "wrong type"); 1746 } 1747 break; 1748 case Op_MulReductionVL: assert(UseAVX > 2, "required"); 1749 evpmullq(dst, dst, src, vector_len); break; 1750 default: assert(false, "wrong opcode"); 1751 } 1752 } 1753 1754 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 1755 int vector_len = Assembler::AVX_256bit; 1756 1757 switch (opcode) { 1758 case Op_AndReductionV: vpand(dst, src1, src2, vector_len); break; 1759 case Op_OrReductionV: vpor (dst, src1, src2, vector_len); break; 1760 case Op_XorReductionV: vpxor(dst, src1, src2, vector_len); break; 1761 case Op_MinReductionV: 1762 switch (typ) { 1763 case T_BYTE: vpminsb(dst, src1, src2, vector_len); break; 1764 case T_SHORT: vpminsw(dst, src1, src2, vector_len); break; 1765 case T_INT: vpminsd(dst, src1, src2, vector_len); break; 1766 case T_LONG: assert(UseAVX > 2, "required"); 1767 vpminsq(dst, src1, src2, vector_len); break; 1768 default: assert(false, "wrong type"); 1769 } 1770 break; 1771 case Op_MaxReductionV: 1772 switch (typ) { 1773 case T_BYTE: vpmaxsb(dst, src1, src2, vector_len); break; 1774 case T_SHORT: vpmaxsw(dst, src1, src2, vector_len); break; 1775 case T_INT: vpmaxsd(dst, src1, src2, vector_len); break; 1776 case T_LONG: assert(UseAVX > 2, "required"); 1777 vpmaxsq(dst, src1, src2, vector_len); break; 1778 default: assert(false, "wrong type"); 1779 } 1780 break; 1781 case Op_AddReductionVI: 1782 switch (typ) { 1783 case T_BYTE: vpaddb(dst, src1, src2, vector_len); break; 1784 case T_SHORT: vpaddw(dst, src1, src2, vector_len); break; 1785 case T_INT: vpaddd(dst, src1, src2, vector_len); break; 1786 default: assert(false, "wrong type"); 1787 } 1788 break; 1789 case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break; 1790 case Op_MulReductionVI: 1791 switch (typ) { 1792 
case T_SHORT: vpmullw(dst, src1, src2, vector_len); break; 1793 case T_INT: vpmulld(dst, src1, src2, vector_len); break; 1794 default: assert(false, "wrong type"); 1795 } 1796 break; 1797 case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break; 1798 default: assert(false, "wrong opcode"); 1799 } 1800 } 1801 1802 void C2_MacroAssembler::reduce_fp(int opcode, int vlen, 1803 XMMRegister dst, XMMRegister src, 1804 XMMRegister vtmp1, XMMRegister vtmp2) { 1805 switch (opcode) { 1806 case Op_AddReductionVF: 1807 case Op_MulReductionVF: 1808 reduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 1809 break; 1810 1811 case Op_AddReductionVD: 1812 case Op_MulReductionVD: 1813 reduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 1814 break; 1815 1816 default: assert(false, "wrong opcode"); 1817 } 1818 } 1819 1820 void C2_MacroAssembler::reduceB(int opcode, int vlen, 1821 Register dst, Register src1, XMMRegister src2, 1822 XMMRegister vtmp1, XMMRegister vtmp2) { 1823 switch (vlen) { 1824 case 8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1825 case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1826 case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1827 case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1828 1829 default: assert(false, "wrong vector length"); 1830 } 1831 } 1832 1833 void C2_MacroAssembler::mulreduceB(int opcode, int vlen, 1834 Register dst, Register src1, XMMRegister src2, 1835 XMMRegister vtmp1, XMMRegister vtmp2) { 1836 switch (vlen) { 1837 case 8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1838 case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1839 case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1840 case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1841 1842 default: assert(false, "wrong vector length"); 1843 } 1844 } 1845 1846 void C2_MacroAssembler::reduceS(int opcode, int vlen, 1847 Register dst, Register src1, XMMRegister src2, 1848 XMMRegister vtmp1, XMMRegister vtmp2) { 1849 switch (vlen) { 1850 case 4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1851 case 8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1852 case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1853 case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1854 1855 default: assert(false, "wrong vector length"); 1856 } 1857 } 1858 1859 void C2_MacroAssembler::reduceI(int opcode, int vlen, 1860 Register dst, Register src1, XMMRegister src2, 1861 XMMRegister vtmp1, XMMRegister vtmp2) { 1862 switch (vlen) { 1863 case 2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1864 case 4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1865 case 8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1866 case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1867 1868 default: assert(false, "wrong vector length"); 1869 } 1870 } 1871 1872 #ifdef _LP64 1873 void C2_MacroAssembler::reduceL(int opcode, int vlen, 1874 Register dst, Register src1, XMMRegister src2, 1875 XMMRegister vtmp1, XMMRegister vtmp2) { 1876 switch (vlen) { 1877 case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1878 case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1879 case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1880 1881 default: assert(false, "wrong vector length"); 1882 } 1883 } 1884 #endif // _LP64 1885 1886 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister 
src, XMMRegister vtmp1, XMMRegister vtmp2) { 1887 switch (vlen) { 1888 case 2: 1889 assert(vtmp2 == xnoreg, ""); 1890 reduce2F(opcode, dst, src, vtmp1); 1891 break; 1892 case 4: 1893 assert(vtmp2 == xnoreg, ""); 1894 reduce4F(opcode, dst, src, vtmp1); 1895 break; 1896 case 8: 1897 reduce8F(opcode, dst, src, vtmp1, vtmp2); 1898 break; 1899 case 16: 1900 reduce16F(opcode, dst, src, vtmp1, vtmp2); 1901 break; 1902 default: assert(false, "wrong vector length"); 1903 } 1904 } 1905 1906 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 1907 switch (vlen) { 1908 case 2: 1909 assert(vtmp2 == xnoreg, ""); 1910 reduce2D(opcode, dst, src, vtmp1); 1911 break; 1912 case 4: 1913 reduce4D(opcode, dst, src, vtmp1, vtmp2); 1914 break; 1915 case 8: 1916 reduce8D(opcode, dst, src, vtmp1, vtmp2); 1917 break; 1918 default: assert(false, "wrong vector length"); 1919 } 1920 } 1921 1922 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1923 if (opcode == Op_AddReductionVI) { 1924 if (vtmp1 != src2) { 1925 movdqu(vtmp1, src2); 1926 } 1927 phaddd(vtmp1, vtmp1); 1928 } else { 1929 pshufd(vtmp1, src2, 0x1); 1930 reduce_operation_128(T_INT, opcode, vtmp1, src2); 1931 } 1932 movdl(vtmp2, src1); 1933 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 1934 movdl(dst, vtmp1); 1935 } 1936 1937 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1938 if (opcode == Op_AddReductionVI) { 1939 if (vtmp1 != src2) { 1940 movdqu(vtmp1, src2); 1941 } 1942 phaddd(vtmp1, src2); 1943 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1944 } else { 1945 pshufd(vtmp2, src2, 0xE); 1946 reduce_operation_128(T_INT, opcode, vtmp2, src2); 1947 reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1948 } 1949 } 1950 1951 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1952 if (opcode == Op_AddReductionVI) { 1953 vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit); 1954 vextracti128_high(vtmp2, vtmp1); 1955 vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit); 1956 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1957 } else { 1958 vextracti128_high(vtmp1, src2); 1959 reduce_operation_128(T_INT, opcode, vtmp1, src2); 1960 reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1961 } 1962 } 1963 1964 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1965 vextracti64x4_high(vtmp2, src2); 1966 reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2); 1967 reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1968 } 1969 1970 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1971 pshufd(vtmp2, src2, 0x1); 1972 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 1973 movdqu(vtmp1, vtmp2); 1974 psrldq(vtmp1, 2); 1975 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 1976 movdqu(vtmp2, vtmp1); 1977 psrldq(vtmp2, 1); 1978 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 1979 movdl(vtmp2, src1); 1980 pmovsxbd(vtmp1, vtmp1); 1981 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 1982 pextrb(dst, vtmp1, 0x0); 1983 movsbl(dst, dst); 1984 } 1985 1986 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1987 
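  // Bring the upper 64 bits of src2 down (pshufd 0xE), combine them with the
  // lower half, then finish the remaining 8 byte lanes via reduce8B.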
pshufd(vtmp1, src2, 0xE); 1988 reduce_operation_128(T_BYTE, opcode, vtmp1, src2); 1989 reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1990 } 1991 1992 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1993 vextracti128_high(vtmp2, src2); 1994 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 1995 reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1996 } 1997 1998 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1999 vextracti64x4_high(vtmp1, src2); 2000 reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2); 2001 reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2002 } 2003 2004 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2005 pmovsxbw(vtmp2, src2); 2006 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2007 } 2008 2009 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2010 if (UseAVX > 1) { 2011 int vector_len = Assembler::AVX_256bit; 2012 vpmovsxbw(vtmp1, src2, vector_len); 2013 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2014 } else { 2015 pmovsxbw(vtmp2, src2); 2016 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2017 pshufd(vtmp2, src2, 0x1); 2018 pmovsxbw(vtmp2, src2); 2019 reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2020 } 2021 } 2022 2023 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2024 if (UseAVX > 2 && VM_Version::supports_avx512bw()) { 2025 int vector_len = Assembler::AVX_512bit; 2026 vpmovsxbw(vtmp1, src2, vector_len); 2027 reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2028 } else { 2029 assert(UseAVX >= 2,"Should not reach here."); 2030 mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); 2031 vextracti128_high(vtmp2, src2); 2032 mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2033 } 2034 } 2035 2036 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2037 mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); 2038 vextracti64x4_high(vtmp2, src2); 2039 mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2040 } 2041 2042 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2043 if (opcode == Op_AddReductionVI) { 2044 if (vtmp1 != src2) { 2045 movdqu(vtmp1, src2); 2046 } 2047 phaddw(vtmp1, vtmp1); 2048 phaddw(vtmp1, vtmp1); 2049 } else { 2050 pshufd(vtmp2, src2, 0x1); 2051 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2052 movdqu(vtmp1, vtmp2); 2053 psrldq(vtmp1, 2); 2054 reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2); 2055 } 2056 movdl(vtmp2, src1); 2057 pmovsxwd(vtmp1, vtmp1); 2058 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2059 pextrw(dst, vtmp1, 0x0); 2060 movswl(dst, dst); 2061 } 2062 2063 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2064 if (opcode == Op_AddReductionVI) { 2065 if (vtmp1 != src2) { 2066 movdqu(vtmp1, src2); 2067 } 2068 phaddw(vtmp1, src2); 2069 } else { 2070 pshufd(vtmp1, src2, 0xE); 2071 reduce_operation_128(T_SHORT, opcode, vtmp1, src2); 2072 } 2073 reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2074 } 2075 2076 void 
C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2077 if (opcode == Op_AddReductionVI) { 2078 int vector_len = Assembler::AVX_256bit; 2079 vphaddw(vtmp2, src2, src2, vector_len); 2080 vpermq(vtmp2, vtmp2, 0xD8, vector_len); 2081 } else { 2082 vextracti128_high(vtmp2, src2); 2083 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2084 } 2085 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2086 } 2087 2088 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2089 int vector_len = Assembler::AVX_256bit; 2090 vextracti64x4_high(vtmp1, src2); 2091 reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2); 2092 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2093 } 2094 2095 #ifdef _LP64 2096 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2097 pshufd(vtmp2, src2, 0xE); 2098 reduce_operation_128(T_LONG, opcode, vtmp2, src2); 2099 movdq(vtmp1, src1); 2100 reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2); 2101 movdq(dst, vtmp1); 2102 } 2103 2104 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2105 vextracti128_high(vtmp1, src2); 2106 reduce_operation_128(T_LONG, opcode, vtmp1, src2); 2107 reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2108 } 2109 2110 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2111 vextracti64x4_high(vtmp2, src2); 2112 reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2); 2113 reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2114 } 2115 2116 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) { 2117 mov64(temp, -1L); 2118 bzhiq(temp, temp, len); 2119 kmovql(dst, temp); 2120 } 2121 #endif // _LP64 2122 2123 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2124 reduce_operation_128(T_FLOAT, opcode, dst, src); 2125 pshufd(vtmp, src, 0x1); 2126 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2127 } 2128 2129 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2130 reduce2F(opcode, dst, src, vtmp); 2131 pshufd(vtmp, src, 0x2); 2132 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2133 pshufd(vtmp, src, 0x3); 2134 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2135 } 2136 2137 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2138 reduce4F(opcode, dst, src, vtmp2); 2139 vextractf128_high(vtmp2, src); 2140 reduce4F(opcode, dst, vtmp2, vtmp1); 2141 } 2142 2143 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2144 reduce8F(opcode, dst, src, vtmp1, vtmp2); 2145 vextracti64x4_high(vtmp1, src); 2146 reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2); 2147 } 2148 2149 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2150 reduce_operation_128(T_DOUBLE, opcode, dst, src); 2151 pshufd(vtmp, src, 0xE); 2152 reduce_operation_128(T_DOUBLE, opcode, dst, vtmp); 2153 } 2154 2155 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2156 reduce2D(opcode, dst, src, vtmp2); 2157 vextractf128_high(vtmp2, src); 2158 
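  // vtmp2 now holds the upper 128-bit lane of src; fold it into dst as well.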
reduce2D(opcode, dst, vtmp2, vtmp1); 2159 } 2160 2161 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2162 reduce4D(opcode, dst, src, vtmp1, vtmp2); 2163 vextracti64x4_high(vtmp1, src); 2164 reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2); 2165 } 2166 2167 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) { 2168 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2169 } 2170 2171 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) { 2172 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2173 } 2174 2175 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask, 2176 int vec_enc) { 2177 switch(elem_bt) { 2178 case T_INT: 2179 case T_FLOAT: 2180 vmaskmovps(dst, src, mask, vec_enc); 2181 break; 2182 case T_LONG: 2183 case T_DOUBLE: 2184 vmaskmovpd(dst, src, mask, vec_enc); 2185 break; 2186 default: 2187 fatal("Unsupported type %s", type2name(elem_bt)); 2188 break; 2189 } 2190 } 2191 2192 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask, 2193 int vec_enc) { 2194 switch(elem_bt) { 2195 case T_INT: 2196 case T_FLOAT: 2197 vmaskmovps(dst, src, mask, vec_enc); 2198 break; 2199 case T_LONG: 2200 case T_DOUBLE: 2201 vmaskmovpd(dst, src, mask, vec_enc); 2202 break; 2203 default: 2204 fatal("Unsupported type %s", type2name(elem_bt)); 2205 break; 2206 } 2207 } 2208 2209 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid, 2210 XMMRegister dst, XMMRegister src, 2211 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2212 XMMRegister xmm_0, XMMRegister xmm_1) { 2213 int permconst[] = {1, 14}; 2214 XMMRegister wsrc = src; 2215 XMMRegister wdst = xmm_0; 2216 XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1; 2217 2218 int vlen_enc = Assembler::AVX_128bit; 2219 if (vlen == 16) { 2220 vlen_enc = Assembler::AVX_256bit; 2221 } 2222 2223 for (int i = log2(vlen) - 1; i >=0; i--) { 2224 if (i == 0 && !is_dst_valid) { 2225 wdst = dst; 2226 } 2227 if (i == 3) { 2228 vextracti64x4_high(wtmp, wsrc); 2229 } else if (i == 2) { 2230 vextracti128_high(wtmp, wsrc); 2231 } else { // i = [0,1] 2232 vpermilps(wtmp, wsrc, permconst[i], vlen_enc); 2233 } 2234 vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2235 wsrc = wdst; 2236 vlen_enc = Assembler::AVX_128bit; 2237 } 2238 if (is_dst_valid) { 2239 vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2240 } 2241 } 2242 2243 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src, 2244 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2245 XMMRegister xmm_0, XMMRegister xmm_1) { 2246 XMMRegister wsrc = src; 2247 XMMRegister wdst = xmm_0; 2248 XMMRegister wtmp = (xmm_1 == xnoreg) ? 
xmm_0: xmm_1; 2249 int vlen_enc = Assembler::AVX_128bit; 2250 if (vlen == 8) { 2251 vlen_enc = Assembler::AVX_256bit; 2252 } 2253 for (int i = log2(vlen) - 1; i >=0; i--) { 2254 if (i == 0 && !is_dst_valid) { 2255 wdst = dst; 2256 } 2257 if (i == 1) { 2258 vextracti128_high(wtmp, wsrc); 2259 } else if (i == 2) { 2260 vextracti64x4_high(wtmp, wsrc); 2261 } else { 2262 assert(i == 0, "%d", i); 2263 vpermilpd(wtmp, wsrc, 1, vlen_enc); 2264 } 2265 vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2266 wsrc = wdst; 2267 vlen_enc = Assembler::AVX_128bit; 2268 } 2269 if (is_dst_valid) { 2270 vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2271 } 2272 } 2273 2274 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) { 2275 switch (bt) { 2276 case T_BYTE: pextrb(dst, src, idx); break; 2277 case T_SHORT: pextrw(dst, src, idx); break; 2278 case T_INT: pextrd(dst, src, idx); break; 2279 case T_LONG: pextrq(dst, src, idx); break; 2280 2281 default: 2282 assert(false,"Should not reach here."); 2283 break; 2284 } 2285 } 2286 2287 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) { 2288 int esize = type2aelembytes(typ); 2289 int elem_per_lane = 16/esize; 2290 int lane = elemindex / elem_per_lane; 2291 int eindex = elemindex % elem_per_lane; 2292 2293 if (lane >= 2) { 2294 assert(UseAVX > 2, "required"); 2295 vextractf32x4(dst, src, lane & 3); 2296 return dst; 2297 } else if (lane > 0) { 2298 assert(UseAVX > 0, "required"); 2299 vextractf128(dst, src, lane); 2300 return dst; 2301 } else { 2302 return src; 2303 } 2304 } 2305 2306 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) { 2307 int esize = type2aelembytes(typ); 2308 int elem_per_lane = 16/esize; 2309 int eindex = elemindex % elem_per_lane; 2310 assert(is_integral_type(typ),"required"); 2311 2312 if (eindex == 0) { 2313 if (typ == T_LONG) { 2314 movq(dst, src); 2315 } else { 2316 movdl(dst, src); 2317 if (typ == T_BYTE) 2318 movsbl(dst, dst); 2319 else if (typ == T_SHORT) 2320 movswl(dst, dst); 2321 } 2322 } else { 2323 extract(typ, dst, src, eindex); 2324 } 2325 } 2326 2327 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) { 2328 int esize = type2aelembytes(typ); 2329 int elem_per_lane = 16/esize; 2330 int eindex = elemindex % elem_per_lane; 2331 assert((typ == T_FLOAT || typ == T_DOUBLE),"required"); 2332 2333 if (eindex == 0) { 2334 movq(dst, src); 2335 } else { 2336 if (typ == T_FLOAT) { 2337 if (UseAVX == 0) { 2338 movdqu(dst, src); 2339 shufps(dst, dst, eindex); 2340 } else { 2341 vshufps(dst, src, src, eindex, Assembler::AVX_128bit); 2342 } 2343 } else { 2344 if (UseAVX == 0) { 2345 movdqu(dst, src); 2346 psrldq(dst, eindex*esize); 2347 } else { 2348 vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit); 2349 } 2350 movq(dst, dst); 2351 } 2352 } 2353 // Zero upper bits 2354 if (typ == T_FLOAT) { 2355 if (UseAVX == 0) { 2356 assert(vtmp != xnoreg, "required."); 2357 movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg); 2358 pand(dst, vtmp); 2359 } else { 2360 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg); 2361 } 2362 } 2363 } 2364 2365 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) { 2366 switch(typ) { 2367 
case T_BYTE: 2368 case T_BOOLEAN: 2369 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2370 break; 2371 case T_SHORT: 2372 case T_CHAR: 2373 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2374 break; 2375 case T_INT: 2376 case T_FLOAT: 2377 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2378 break; 2379 case T_LONG: 2380 case T_DOUBLE: 2381 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2382 break; 2383 default: 2384 assert(false,"Should not reach here."); 2385 break; 2386 } 2387 } 2388 2389 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) { 2390 assert(rscratch != noreg || always_reachable(src2), "missing"); 2391 2392 switch(typ) { 2393 case T_BOOLEAN: 2394 case T_BYTE: 2395 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2396 break; 2397 case T_CHAR: 2398 case T_SHORT: 2399 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2400 break; 2401 case T_INT: 2402 case T_FLOAT: 2403 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2404 break; 2405 case T_LONG: 2406 case T_DOUBLE: 2407 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2408 break; 2409 default: 2410 assert(false,"Should not reach here."); 2411 break; 2412 } 2413 } 2414 2415 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) { 2416 switch(typ) { 2417 case T_BYTE: 2418 evpblendmb(dst, kmask, src1, src2, merge, vector_len); 2419 break; 2420 case T_SHORT: 2421 evpblendmw(dst, kmask, src1, src2, merge, vector_len); 2422 break; 2423 case T_INT: 2424 case T_FLOAT: 2425 evpblendmd(dst, kmask, src1, src2, merge, vector_len); 2426 break; 2427 case T_LONG: 2428 case T_DOUBLE: 2429 evpblendmq(dst, kmask, src1, src2, merge, vector_len); 2430 break; 2431 default: 2432 assert(false,"Should not reach here."); 2433 break; 2434 } 2435 } 2436 2437 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) { 2438 assert(vlen_in_bytes <= 32, ""); 2439 int esize = type2aelembytes(bt); 2440 if (vlen_in_bytes == 32) { 2441 assert(vtmp == xnoreg, "required."); 2442 if (esize >= 4) { 2443 vtestps(src1, src2, AVX_256bit); 2444 } else { 2445 vptest(src1, src2, AVX_256bit); 2446 } 2447 return; 2448 } 2449 if (vlen_in_bytes < 16) { 2450 // Duplicate the lower part to fill the whole register, 2451 // Don't need to do so for src2 2452 assert(vtmp != xnoreg, "required"); 2453 int shuffle_imm = (vlen_in_bytes == 4) ? 
0x00 : 0x04; 2454 pshufd(vtmp, src1, shuffle_imm); 2455 } else { 2456 assert(vtmp == xnoreg, "required"); 2457 vtmp = src1; 2458 } 2459 if (esize >= 4 && VM_Version::supports_avx()) { 2460 vtestps(vtmp, src2, AVX_128bit); 2461 } else { 2462 ptest(vtmp, src2); 2463 } 2464 } 2465 2466 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) { 2467 assert(UseAVX >= 2, "required"); 2468 #ifdef ASSERT 2469 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2470 bool is_bw_supported = VM_Version::supports_avx512bw(); 2471 if (is_bw && !is_bw_supported) { 2472 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2473 assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16), 2474 "XMM register should be 0-15"); 2475 } 2476 #endif // ASSERT 2477 switch (elem_bt) { 2478 case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return; 2479 case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return; 2480 case T_INT: vpaddd(dst, src1, src2, vlen_enc); return; 2481 case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return; 2482 case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return; 2483 case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return; 2484 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2485 } 2486 } 2487 2488 #ifdef _LP64 2489 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) { 2490 assert(UseAVX >= 2, "required"); 2491 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2492 bool is_vl = vlen_enc != Assembler::AVX_512bit; 2493 if ((UseAVX > 2) && 2494 (!is_bw || VM_Version::supports_avx512bw()) && 2495 (!is_vl || VM_Version::supports_avx512vl())) { 2496 switch (elem_bt) { 2497 case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return; 2498 case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return; 2499 case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return; 2500 case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return; 2501 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2502 } 2503 } else { 2504 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2505 assert((dst->encoding() < 16),"XMM register should be 0-15"); 2506 switch (elem_bt) { 2507 case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return; 2508 case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return; 2509 case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return; 2510 case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return; 2511 case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return; 2512 case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return; 2513 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2514 } 2515 } 2516 } 2517 #endif 2518 2519 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) { 2520 switch (to_elem_bt) { 2521 case T_SHORT: 2522 vpmovsxbw(dst, src, vlen_enc); 2523 break; 2524 case T_INT: 2525 vpmovsxbd(dst, src, vlen_enc); 2526 break; 2527 case T_FLOAT: 2528 vpmovsxbd(dst, src, vlen_enc); 2529 vcvtdq2ps(dst, dst, vlen_enc); 2530 break; 2531 case T_LONG: 2532 vpmovsxbq(dst, src, vlen_enc); 2533 break; 2534 case T_DOUBLE: { 2535 int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? 
                        Assembler::AVX_256bit : Assembler::AVX_128bit;
      vpmovsxbd(dst, src, mid_vlen_enc);
      vcvtdq2pd(dst, dst, vlen_enc);
      break;
    }
    default:
      fatal("Unsupported type %s", type2name(to_elem_bt));
      break;
  }
}

//-------------------------------------------------------------------------------------------

// IndexOf for constant substrings with size >= 8 chars
// which don't need to be loaded through stack.
void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
                                         Register cnt1, Register cnt2,
                                         int int_cnt2, Register result,
                                         XMMRegister vec, Register tmp,
                                         int ae) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");

  // This method uses the pcmpestri instruction with bound registers
  //   inputs:
  //     xmm - substring
  //     rax - substring length (elements count)
  //     mem - scanned string
  //     rdx - string length (elements count)
  //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
  //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
  //   outputs:
  //     rcx - matched index in string
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
  int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
  int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
  Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
  Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;

  Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
        RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
        MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;
  assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");

  // Load substring.
  if (ae == StrIntrinsicNode::UL) {
    pmovzxbw(vec, Address(str2, 0));
  } else {
    movdqu(vec, Address(str2, 0));
  }
  movl(cnt2, int_cnt2);
  movptr(result, str1); // string addr

  if (int_cnt2 > stride) {
    jmpb(SCAN_TO_SUBSTR);

    // Reload substr for rescan; this code
    // is executed only for large substrings (> 8 chars).
    bind(RELOAD_SUBSTR);
    if (ae == StrIntrinsicNode::UL) {
      pmovzxbw(vec, Address(str2, 0));
    } else {
      movdqu(vec, Address(str2, 0));
    }
    negptr(cnt2); // Jumped here with negative cnt2, convert to positive

    bind(RELOAD_STR);
    // We came here after the beginning of the substring was
    // matched but the rest of it was not, so we need to search
    // again. Start from the next element after the previous match.

    // cnt2 is the number of remaining substring elements and
    // cnt1 is the number of remaining string elements when the compare failed.
    // Restored cnt1 = cnt1 - cnt2 + int_cnt2
    subl(cnt1, cnt2);
    addl(cnt1, int_cnt2);
    movl(cnt2, int_cnt2); // Now restore cnt2

    decrementl(cnt1);     // Shift to next element
    cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND);  // Less left than substring

    addptr(result, (1<<scale1));

  } // (int_cnt2 > 8)

  // Scan string for start of substr in 16-byte vectors
  bind(SCAN_TO_SUBSTR);
  pcmpestri(vec, Address(result, 0), mode);
  jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
  subl(cnt1, stride);
  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
  cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Less left than substring
  addptr(result, 16);
  jmpb(SCAN_TO_SUBSTR);

  // Found a potential substr
  bind(FOUND_CANDIDATE);
  // Matched whole vector if first element matched (tmp(rcx) == 0).
  if (int_cnt2 == stride) {
    jccb(Assembler::overflow, RET_FOUND);    // OF == 1
  } else { // int_cnt2 > 8
    jccb(Assembler::overflow, FOUND_SUBSTR);
  }
  // After pcmpestri tmp(rcx) contains matched element index
  // Compute start addr of substr
  lea(result, Address(result, tmp, scale1));

  // Make sure string is still long enough
  subl(cnt1, tmp);
  cmpl(cnt1, cnt2);
  if (int_cnt2 == stride) {
    jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
  } else { // int_cnt2 > 8
    jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
  }
  // Less left than the substring.

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmp(EXIT);

  if (int_cnt2 > stride) {
    // This code is optimized for the case when the whole substring
    // is matched if its head is matched.
    bind(MATCH_SUBSTR_HEAD);
    pcmpestri(vec, Address(result, 0), mode);
    // Reload only the string if it does not match
    jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0

    Label CONT_SCAN_SUBSTR;
    // Compare the rest of substring (> 8 chars).
    bind(FOUND_SUBSTR);
    // First 8 chars are already matched.
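    // Turn cnt2 into a negative running offset (stride - cnt2) so the
    // SCAN_SUBSTR loop below can step it back toward zero by 'stride'
    // elements per iteration.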
2675 negptr(cnt2); 2676 addptr(cnt2, stride); 2677 2678 bind(SCAN_SUBSTR); 2679 subl(cnt1, stride); 2680 cmpl(cnt2, -stride); // Do not read beyond substring 2681 jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR); 2682 // Back-up strings to avoid reading beyond substring: 2683 // cnt1 = cnt1 - cnt2 + 8 2684 addl(cnt1, cnt2); // cnt2 is negative 2685 addl(cnt1, stride); 2686 movl(cnt2, stride); negptr(cnt2); 2687 bind(CONT_SCAN_SUBSTR); 2688 if (int_cnt2 < (int)G) { 2689 int tail_off1 = int_cnt2<<scale1; 2690 int tail_off2 = int_cnt2<<scale2; 2691 if (ae == StrIntrinsicNode::UL) { 2692 pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2)); 2693 } else { 2694 movdqu(vec, Address(str2, cnt2, scale2, tail_off2)); 2695 } 2696 pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode); 2697 } else { 2698 // calculate index in register to avoid integer overflow (int_cnt2*2) 2699 movl(tmp, int_cnt2); 2700 addptr(tmp, cnt2); 2701 if (ae == StrIntrinsicNode::UL) { 2702 pmovzxbw(vec, Address(str2, tmp, scale2, 0)); 2703 } else { 2704 movdqu(vec, Address(str2, tmp, scale2, 0)); 2705 } 2706 pcmpestri(vec, Address(result, tmp, scale1, 0), mode); 2707 } 2708 // Need to reload strings pointers if not matched whole vector 2709 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 2710 addptr(cnt2, stride); 2711 jcc(Assembler::negative, SCAN_SUBSTR); 2712 // Fall through if found full substring 2713 2714 } // (int_cnt2 > 8) 2715 2716 bind(RET_FOUND); 2717 // Found result if we matched full small substring. 2718 // Compute substr offset 2719 subptr(result, str1); 2720 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 2721 shrl(result, 1); // index 2722 } 2723 bind(EXIT); 2724 2725 } // string_indexofC8 2726 2727 // Small strings are loaded through stack if they cross page boundary. 2728 void C2_MacroAssembler::string_indexof(Register str1, Register str2, 2729 Register cnt1, Register cnt2, 2730 int int_cnt2, Register result, 2731 XMMRegister vec, Register tmp, 2732 int ae) { 2733 ShortBranchVerifier sbv(this); 2734 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2735 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2736 2737 // 2738 // int_cnt2 is length of small (< 8 chars) constant substring 2739 // or (-1) for non constant substring in which case its length 2740 // is in cnt2 register. 2741 // 2742 // Note, inline_string_indexOf() generates checks: 2743 // if (substr.count > string.count) return -1; 2744 // if (substr.count == 0) return 0; 2745 // 2746 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2747 assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0"); 2748 // This method uses the pcmpestri instruction with bound registers 2749 // inputs: 2750 // xmm - substring 2751 // rax - substring length (elements count) 2752 // mem - scanned string 2753 // rdx - string length (elements count) 2754 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2755 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2756 // outputs: 2757 // rcx - matched index in string 2758 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2759 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2760 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2761 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? 
Address::times_1 : scale1; 2762 2763 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR, 2764 RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR, 2765 FOUND_CANDIDATE; 2766 2767 { //======================================================== 2768 // We don't know where these strings are located 2769 // and we can't read beyond them. Load them through stack. 2770 Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR; 2771 2772 movptr(tmp, rsp); // save old SP 2773 2774 if (int_cnt2 > 0) { // small (< 8 chars) constant substring 2775 if (int_cnt2 == (1>>scale2)) { // One byte 2776 assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding"); 2777 load_unsigned_byte(result, Address(str2, 0)); 2778 movdl(vec, result); // move 32 bits 2779 } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) { // Three bytes 2780 // Not enough header space in 32-bit VM: 12+3 = 15. 2781 movl(result, Address(str2, -1)); 2782 shrl(result, 8); 2783 movdl(vec, result); // move 32 bits 2784 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) { // One char 2785 load_unsigned_short(result, Address(str2, 0)); 2786 movdl(vec, result); // move 32 bits 2787 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars 2788 movdl(vec, Address(str2, 0)); // move 32 bits 2789 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars 2790 movq(vec, Address(str2, 0)); // move 64 bits 2791 } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7}) 2792 // Array header size is 12 bytes in 32-bit VM 2793 // + 6 bytes for 3 chars == 18 bytes, 2794 // enough space to load vec and shift. 2795 assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity"); 2796 if (ae == StrIntrinsicNode::UL) { 2797 int tail_off = int_cnt2-8; 2798 pmovzxbw(vec, Address(str2, tail_off)); 2799 psrldq(vec, -2*tail_off); 2800 } 2801 else { 2802 int tail_off = int_cnt2*(1<<scale2); 2803 movdqu(vec, Address(str2, tail_off-16)); 2804 psrldq(vec, 16-tail_off); 2805 } 2806 } 2807 } else { // not constant substring 2808 cmpl(cnt2, stride); 2809 jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough 2810 2811 // We can read beyond string if srt+16 does not cross page boundary 2812 // since heaps are aligned and mapped by pages. 2813 assert(os::vm_page_size() < (int)G, "default page should be small"); 2814 movl(result, str2); // We need only low 32 bits 2815 andl(result, (os::vm_page_size()-1)); 2816 cmpl(result, (os::vm_page_size()-16)); 2817 jccb(Assembler::belowEqual, CHECK_STR); 2818 2819 // Move small strings to stack to allow load 16 bytes into vec. 2820 subptr(rsp, 16); 2821 int stk_offset = wordSize-(1<<scale2); 2822 push(cnt2); 2823 2824 bind(COPY_SUBSTR); 2825 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) { 2826 load_unsigned_byte(result, Address(str2, cnt2, scale2, -1)); 2827 movb(Address(rsp, cnt2, scale2, stk_offset), result); 2828 } else if (ae == StrIntrinsicNode::UU) { 2829 load_unsigned_short(result, Address(str2, cnt2, scale2, -2)); 2830 movw(Address(rsp, cnt2, scale2, stk_offset), result); 2831 } 2832 decrement(cnt2); 2833 jccb(Assembler::notZero, COPY_SUBSTR); 2834 2835 pop(cnt2); 2836 movptr(str2, rsp); // New substring address 2837 } // non constant 2838 2839 bind(CHECK_STR); 2840 cmpl(cnt1, stride); 2841 jccb(Assembler::aboveEqual, BIG_STRINGS); 2842 2843 // Check cross page boundary. 
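    // (Same idea as for the substring above: a 16-byte load from str1 is safe
    //  as long as its in-page offset does not exceed page_size - 16; in that
    //  case the string is read in place instead of being copied to the stack.)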
2844 movl(result, str1); // We need only low 32 bits 2845 andl(result, (os::vm_page_size()-1)); 2846 cmpl(result, (os::vm_page_size()-16)); 2847 jccb(Assembler::belowEqual, BIG_STRINGS); 2848 2849 subptr(rsp, 16); 2850 int stk_offset = -(1<<scale1); 2851 if (int_cnt2 < 0) { // not constant 2852 push(cnt2); 2853 stk_offset += wordSize; 2854 } 2855 movl(cnt2, cnt1); 2856 2857 bind(COPY_STR); 2858 if (ae == StrIntrinsicNode::LL) { 2859 load_unsigned_byte(result, Address(str1, cnt2, scale1, -1)); 2860 movb(Address(rsp, cnt2, scale1, stk_offset), result); 2861 } else { 2862 load_unsigned_short(result, Address(str1, cnt2, scale1, -2)); 2863 movw(Address(rsp, cnt2, scale1, stk_offset), result); 2864 } 2865 decrement(cnt2); 2866 jccb(Assembler::notZero, COPY_STR); 2867 2868 if (int_cnt2 < 0) { // not constant 2869 pop(cnt2); 2870 } 2871 movptr(str1, rsp); // New string address 2872 2873 bind(BIG_STRINGS); 2874 // Load substring. 2875 if (int_cnt2 < 0) { // -1 2876 if (ae == StrIntrinsicNode::UL) { 2877 pmovzxbw(vec, Address(str2, 0)); 2878 } else { 2879 movdqu(vec, Address(str2, 0)); 2880 } 2881 push(cnt2); // substr count 2882 push(str2); // substr addr 2883 push(str1); // string addr 2884 } else { 2885 // Small (< 8 chars) constant substrings are loaded already. 2886 movl(cnt2, int_cnt2); 2887 } 2888 push(tmp); // original SP 2889 2890 } // Finished loading 2891 2892 //======================================================== 2893 // Start search 2894 // 2895 2896 movptr(result, str1); // string addr 2897 2898 if (int_cnt2 < 0) { // Only for non constant substring 2899 jmpb(SCAN_TO_SUBSTR); 2900 2901 // SP saved at sp+0 2902 // String saved at sp+1*wordSize 2903 // Substr saved at sp+2*wordSize 2904 // Substr count saved at sp+3*wordSize 2905 2906 // Reload substr for rescan, this code 2907 // is executed only for large substrings (> 8 chars) 2908 bind(RELOAD_SUBSTR); 2909 movptr(str2, Address(rsp, 2*wordSize)); 2910 movl(cnt2, Address(rsp, 3*wordSize)); 2911 if (ae == StrIntrinsicNode::UL) { 2912 pmovzxbw(vec, Address(str2, 0)); 2913 } else { 2914 movdqu(vec, Address(str2, 0)); 2915 } 2916 // We came here after the beginning of the substring was 2917 // matched but the rest of it was not so we need to search 2918 // again. Start from the next element after the previous match. 2919 subptr(str1, result); // Restore counter 2920 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 2921 shrl(str1, 1); 2922 } 2923 addl(cnt1, str1); 2924 decrementl(cnt1); // Shift to next element 2925 cmpl(cnt1, cnt2); 2926 jcc(Assembler::negative, RET_NOT_FOUND); // Left less then substring 2927 2928 addptr(result, (1<<scale1)); 2929 } // non constant 2930 2931 // Scan string for start of substr in 16-byte vectors 2932 bind(SCAN_TO_SUBSTR); 2933 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2934 pcmpestri(vec, Address(result, 0), mode); 2935 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 2936 subl(cnt1, stride); 2937 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 2938 cmpl(cnt1, cnt2); 2939 jccb(Assembler::negative, RET_NOT_FOUND); // Left less then substring 2940 addptr(result, 16); 2941 2942 bind(ADJUST_STR); 2943 cmpl(cnt1, stride); // Do not read beyond string 2944 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 2945 // Back-up string to avoid reading beyond string. 
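  // Re-position 'result' so the final 16-byte chunk ends exactly at the end
  // of the string, then rescan just that chunk.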
2946 lea(result, Address(result, cnt1, scale1, -16)); 2947 movl(cnt1, stride); 2948 jmpb(SCAN_TO_SUBSTR); 2949 2950 // Found a potential substr 2951 bind(FOUND_CANDIDATE); 2952 // After pcmpestri tmp(rcx) contains matched element index 2953 2954 // Make sure string is still long enough 2955 subl(cnt1, tmp); 2956 cmpl(cnt1, cnt2); 2957 jccb(Assembler::greaterEqual, FOUND_SUBSTR); 2958 // Left less then substring. 2959 2960 bind(RET_NOT_FOUND); 2961 movl(result, -1); 2962 jmp(CLEANUP); 2963 2964 bind(FOUND_SUBSTR); 2965 // Compute start addr of substr 2966 lea(result, Address(result, tmp, scale1)); 2967 if (int_cnt2 > 0) { // Constant substring 2968 // Repeat search for small substring (< 8 chars) 2969 // from new point without reloading substring. 2970 // Have to check that we don't read beyond string. 2971 cmpl(tmp, stride-int_cnt2); 2972 jccb(Assembler::greater, ADJUST_STR); 2973 // Fall through if matched whole substring. 2974 } else { // non constant 2975 assert(int_cnt2 == -1, "should be != 0"); 2976 2977 addl(tmp, cnt2); 2978 // Found result if we matched whole substring. 2979 cmpl(tmp, stride); 2980 jcc(Assembler::lessEqual, RET_FOUND); 2981 2982 // Repeat search for small substring (<= 8 chars) 2983 // from new point 'str1' without reloading substring. 2984 cmpl(cnt2, stride); 2985 // Have to check that we don't read beyond string. 2986 jccb(Assembler::lessEqual, ADJUST_STR); 2987 2988 Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG; 2989 // Compare the rest of substring (> 8 chars). 2990 movptr(str1, result); 2991 2992 cmpl(tmp, cnt2); 2993 // First 8 chars are already matched. 2994 jccb(Assembler::equal, CHECK_NEXT); 2995 2996 bind(SCAN_SUBSTR); 2997 pcmpestri(vec, Address(str1, 0), mode); 2998 // Need to reload strings pointers if not matched whole vector 2999 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 3000 3001 bind(CHECK_NEXT); 3002 subl(cnt2, stride); 3003 jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring 3004 addptr(str1, 16); 3005 if (ae == StrIntrinsicNode::UL) { 3006 addptr(str2, 8); 3007 } else { 3008 addptr(str2, 16); 3009 } 3010 subl(cnt1, stride); 3011 cmpl(cnt2, stride); // Do not read beyond substring 3012 jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR); 3013 // Back-up strings to avoid reading beyond substring. 
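    // Effectively cnt1 = cnt1 - cnt2 + stride and cnt2 = stride, with the
    // str1/str2 pointers moved back so the next vector load covers the tail
    // of the remaining substring without reading past its end.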
3014 3015 if (ae == StrIntrinsicNode::UL) { 3016 lea(str2, Address(str2, cnt2, scale2, -8)); 3017 lea(str1, Address(str1, cnt2, scale1, -16)); 3018 } else { 3019 lea(str2, Address(str2, cnt2, scale2, -16)); 3020 lea(str1, Address(str1, cnt2, scale1, -16)); 3021 } 3022 subl(cnt1, cnt2); 3023 movl(cnt2, stride); 3024 addl(cnt1, stride); 3025 bind(CONT_SCAN_SUBSTR); 3026 if (ae == StrIntrinsicNode::UL) { 3027 pmovzxbw(vec, Address(str2, 0)); 3028 } else { 3029 movdqu(vec, Address(str2, 0)); 3030 } 3031 jmp(SCAN_SUBSTR); 3032 3033 bind(RET_FOUND_LONG); 3034 movptr(str1, Address(rsp, wordSize)); 3035 } // non constant 3036 3037 bind(RET_FOUND); 3038 // Compute substr offset 3039 subptr(result, str1); 3040 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3041 shrl(result, 1); // index 3042 } 3043 bind(CLEANUP); 3044 pop(rsp); // restore SP 3045 3046 } // string_indexof 3047 3048 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3049 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3050 ShortBranchVerifier sbv(this); 3051 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3052 3053 int stride = 8; 3054 3055 Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP, 3056 SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP, 3057 RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT, 3058 FOUND_SEQ_CHAR, DONE_LABEL; 3059 3060 movptr(result, str1); 3061 if (UseAVX >= 2) { 3062 cmpl(cnt1, stride); 3063 jcc(Assembler::less, SCAN_TO_CHAR); 3064 cmpl(cnt1, 2*stride); 3065 jcc(Assembler::less, SCAN_TO_8_CHAR_INIT); 3066 movdl(vec1, ch); 3067 vpbroadcastw(vec1, vec1, Assembler::AVX_256bit); 3068 vpxor(vec2, vec2); 3069 movl(tmp, cnt1); 3070 andl(tmp, 0xFFFFFFF0); //vector count (in chars) 3071 andl(cnt1,0x0000000F); //tail count (in chars) 3072 3073 bind(SCAN_TO_16_CHAR_LOOP); 3074 vmovdqu(vec3, Address(result, 0)); 3075 vpcmpeqw(vec3, vec3, vec1, 1); 3076 vptest(vec2, vec3); 3077 jcc(Assembler::carryClear, FOUND_CHAR); 3078 addptr(result, 32); 3079 subl(tmp, 2*stride); 3080 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); 3081 jmp(SCAN_TO_8_CHAR); 3082 bind(SCAN_TO_8_CHAR_INIT); 3083 movdl(vec1, ch); 3084 pshuflw(vec1, vec1, 0x00); 3085 pshufd(vec1, vec1, 0); 3086 pxor(vec2, vec2); 3087 } 3088 bind(SCAN_TO_8_CHAR); 3089 cmpl(cnt1, stride); 3090 jcc(Assembler::less, SCAN_TO_CHAR); 3091 if (UseAVX < 2) { 3092 movdl(vec1, ch); 3093 pshuflw(vec1, vec1, 0x00); 3094 pshufd(vec1, vec1, 0); 3095 pxor(vec2, vec2); 3096 } 3097 movl(tmp, cnt1); 3098 andl(tmp, 0xFFFFFFF8); //vector count (in chars) 3099 andl(cnt1,0x00000007); //tail count (in chars) 3100 3101 bind(SCAN_TO_8_CHAR_LOOP); 3102 movdqu(vec3, Address(result, 0)); 3103 pcmpeqw(vec3, vec1); 3104 ptest(vec2, vec3); 3105 jcc(Assembler::carryClear, FOUND_CHAR); 3106 addptr(result, 16); 3107 subl(tmp, stride); 3108 jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP); 3109 bind(SCAN_TO_CHAR); 3110 testl(cnt1, cnt1); 3111 jcc(Assembler::zero, RET_NOT_FOUND); 3112 bind(SCAN_TO_CHAR_LOOP); 3113 load_unsigned_short(tmp, Address(result, 0)); 3114 cmpl(ch, tmp); 3115 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3116 addptr(result, 2); 3117 subl(cnt1, 1); 3118 jccb(Assembler::zero, RET_NOT_FOUND); 3119 jmp(SCAN_TO_CHAR_LOOP); 3120 3121 bind(RET_NOT_FOUND); 3122 movl(result, -1); 3123 jmpb(DONE_LABEL); 3124 3125 bind(FOUND_CHAR); 3126 if (UseAVX >= 2) { 3127 vpmovmskb(tmp, vec3); 3128 } else { 3129 pmovmskb(tmp, vec3); 3130 } 3131 bsfl(ch, tmp); 3132 addptr(result, ch); 3133 3134 bind(FOUND_SEQ_CHAR); 3135 
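  // 'result' points at the matching char; convert it into a char index
  // relative to str1 (byte offset divided by two for UTF-16 data).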
subptr(result, str1); 3136 shrl(result, 1); 3137 3138 bind(DONE_LABEL); 3139 } // string_indexof_char 3140 3141 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3142 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3143 ShortBranchVerifier sbv(this); 3144 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3145 3146 int stride = 16; 3147 3148 Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP, 3149 SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP, 3150 RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT, 3151 FOUND_SEQ_CHAR, DONE_LABEL; 3152 3153 movptr(result, str1); 3154 if (UseAVX >= 2) { 3155 cmpl(cnt1, stride); 3156 jcc(Assembler::less, SCAN_TO_CHAR_INIT); 3157 cmpl(cnt1, stride*2); 3158 jcc(Assembler::less, SCAN_TO_16_CHAR_INIT); 3159 movdl(vec1, ch); 3160 vpbroadcastb(vec1, vec1, Assembler::AVX_256bit); 3161 vpxor(vec2, vec2); 3162 movl(tmp, cnt1); 3163 andl(tmp, 0xFFFFFFE0); //vector count (in chars) 3164 andl(cnt1,0x0000001F); //tail count (in chars) 3165 3166 bind(SCAN_TO_32_CHAR_LOOP); 3167 vmovdqu(vec3, Address(result, 0)); 3168 vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit); 3169 vptest(vec2, vec3); 3170 jcc(Assembler::carryClear, FOUND_CHAR); 3171 addptr(result, 32); 3172 subl(tmp, stride*2); 3173 jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP); 3174 jmp(SCAN_TO_16_CHAR); 3175 3176 bind(SCAN_TO_16_CHAR_INIT); 3177 movdl(vec1, ch); 3178 pxor(vec2, vec2); 3179 pshufb(vec1, vec2); 3180 } 3181 3182 bind(SCAN_TO_16_CHAR); 3183 cmpl(cnt1, stride); 3184 jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left 3185 if (UseAVX < 2) { 3186 movdl(vec1, ch); 3187 pxor(vec2, vec2); 3188 pshufb(vec1, vec2); 3189 } 3190 movl(tmp, cnt1); 3191 andl(tmp, 0xFFFFFFF0); //vector count (in bytes) 3192 andl(cnt1,0x0000000F); //tail count (in bytes) 3193 3194 bind(SCAN_TO_16_CHAR_LOOP); 3195 movdqu(vec3, Address(result, 0)); 3196 pcmpeqb(vec3, vec1); 3197 ptest(vec2, vec3); 3198 jcc(Assembler::carryClear, FOUND_CHAR); 3199 addptr(result, 16); 3200 subl(tmp, stride); 3201 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items... 
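// Scalar tail of stringL_indexof_char: at most 15 bytes remain in cnt1, so they
// are compared one byte at a time in the loop below.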
3202 3203 bind(SCAN_TO_CHAR_INIT); 3204 testl(cnt1, cnt1); 3205 jcc(Assembler::zero, RET_NOT_FOUND); 3206 bind(SCAN_TO_CHAR_LOOP); 3207 load_unsigned_byte(tmp, Address(result, 0)); 3208 cmpl(ch, tmp); 3209 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3210 addptr(result, 1); 3211 subl(cnt1, 1); 3212 jccb(Assembler::zero, RET_NOT_FOUND); 3213 jmp(SCAN_TO_CHAR_LOOP); 3214 3215 bind(RET_NOT_FOUND); 3216 movl(result, -1); 3217 jmpb(DONE_LABEL); 3218 3219 bind(FOUND_CHAR); 3220 if (UseAVX >= 2) { 3221 vpmovmskb(tmp, vec3); 3222 } else { 3223 pmovmskb(tmp, vec3); 3224 } 3225 bsfl(ch, tmp); 3226 addptr(result, ch); 3227 3228 bind(FOUND_SEQ_CHAR); 3229 subptr(result, str1); 3230 3231 bind(DONE_LABEL); 3232 } // stringL_indexof_char 3233 3234 // helper function for string_compare 3235 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2, 3236 Address::ScaleFactor scale, Address::ScaleFactor scale1, 3237 Address::ScaleFactor scale2, Register index, int ae) { 3238 if (ae == StrIntrinsicNode::LL) { 3239 load_unsigned_byte(elem1, Address(str1, index, scale, 0)); 3240 load_unsigned_byte(elem2, Address(str2, index, scale, 0)); 3241 } else if (ae == StrIntrinsicNode::UU) { 3242 load_unsigned_short(elem1, Address(str1, index, scale, 0)); 3243 load_unsigned_short(elem2, Address(str2, index, scale, 0)); 3244 } else { 3245 load_unsigned_byte(elem1, Address(str1, index, scale1, 0)); 3246 load_unsigned_short(elem2, Address(str2, index, scale2, 0)); 3247 } 3248 } 3249 3250 // Compare strings, used for char[] and byte[]. 3251 void C2_MacroAssembler::string_compare(Register str1, Register str2, 3252 Register cnt1, Register cnt2, Register result, 3253 XMMRegister vec1, int ae, KRegister mask) { 3254 ShortBranchVerifier sbv(this); 3255 Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL; 3256 Label COMPARE_WIDE_VECTORS_LOOP_FAILED; // used only _LP64 && AVX3 3257 int stride, stride2, adr_stride, adr_stride1, adr_stride2; 3258 int stride2x2 = 0x40; 3259 Address::ScaleFactor scale = Address::no_scale; 3260 Address::ScaleFactor scale1 = Address::no_scale; 3261 Address::ScaleFactor scale2 = Address::no_scale; 3262 3263 if (ae != StrIntrinsicNode::LL) { 3264 stride2x2 = 0x20; 3265 } 3266 3267 if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) { 3268 shrl(cnt2, 1); 3269 } 3270 // Compute the minimum of the string lengths and the 3271 // difference of the string lengths (stack). 3272 // Do the conditional move stuff 3273 movl(result, cnt1); 3274 subl(cnt1, cnt2); 3275 push(cnt1); 3276 cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2) 3277 3278 // Is the minimum length zero? 
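// (cnt2 now holds min(cnt1, cnt2); the signed length difference computed above was
// pushed on the stack and is popped again at LENGTH_DIFF_LABEL / POP_LABEL.)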
3279 testl(cnt2, cnt2); 3280 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3281 if (ae == StrIntrinsicNode::LL) { 3282 // Load first bytes 3283 load_unsigned_byte(result, Address(str1, 0)); // result = str1[0] 3284 load_unsigned_byte(cnt1, Address(str2, 0)); // cnt1 = str2[0] 3285 } else if (ae == StrIntrinsicNode::UU) { 3286 // Load first characters 3287 load_unsigned_short(result, Address(str1, 0)); 3288 load_unsigned_short(cnt1, Address(str2, 0)); 3289 } else { 3290 load_unsigned_byte(result, Address(str1, 0)); 3291 load_unsigned_short(cnt1, Address(str2, 0)); 3292 } 3293 subl(result, cnt1); 3294 jcc(Assembler::notZero, POP_LABEL); 3295 3296 if (ae == StrIntrinsicNode::UU) { 3297 // Divide length by 2 to get number of chars 3298 shrl(cnt2, 1); 3299 } 3300 cmpl(cnt2, 1); 3301 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3302 3303 // Check if the strings start at the same location and setup scale and stride 3304 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3305 cmpptr(str1, str2); 3306 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3307 if (ae == StrIntrinsicNode::LL) { 3308 scale = Address::times_1; 3309 stride = 16; 3310 } else { 3311 scale = Address::times_2; 3312 stride = 8; 3313 } 3314 } else { 3315 scale1 = Address::times_1; 3316 scale2 = Address::times_2; 3317 // scale not used 3318 stride = 8; 3319 } 3320 3321 if (UseAVX >= 2 && UseSSE42Intrinsics) { 3322 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR; 3323 Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR; 3324 Label COMPARE_WIDE_VECTORS_LOOP_AVX2; 3325 Label COMPARE_TAIL_LONG; 3326 Label COMPARE_WIDE_VECTORS_LOOP_AVX3; // used only _LP64 && AVX3 3327 3328 int pcmpmask = 0x19; 3329 if (ae == StrIntrinsicNode::LL) { 3330 pcmpmask &= ~0x01; 3331 } 3332 3333 // Setup to compare 16-chars (32-bytes) vectors, 3334 // start from first character again because it has aligned address. 3335 if (ae == StrIntrinsicNode::LL) { 3336 stride2 = 32; 3337 } else { 3338 stride2 = 16; 3339 } 3340 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3341 adr_stride = stride << scale; 3342 } else { 3343 adr_stride1 = 8; //stride << scale1; 3344 adr_stride2 = 16; //stride << scale2; 3345 } 3346 3347 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3348 // rax and rdx are used by pcmpestri as elements counters 3349 movl(result, cnt2); 3350 andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count 3351 jcc(Assembler::zero, COMPARE_TAIL_LONG); 3352 3353 // fast path : compare first 2 8-char vectors. 
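// Note on pcmpestri in the fast path below: rax/rdx (result/cnt2) serve as the
// element counters of the two operands; if CF is set (branch 'below'), a mismatch
// was found and rcx (cnt1) holds the index of the first mismatching element, which
// COMPARE_INDEX_CHAR uses to reload and subtract the two differing elements.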
3354 bind(COMPARE_16_CHARS); 3355 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3356 movdqu(vec1, Address(str1, 0)); 3357 } else { 3358 pmovzxbw(vec1, Address(str1, 0)); 3359 } 3360 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3361 jccb(Assembler::below, COMPARE_INDEX_CHAR); 3362 3363 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3364 movdqu(vec1, Address(str1, adr_stride)); 3365 pcmpestri(vec1, Address(str2, adr_stride), pcmpmask); 3366 } else { 3367 pmovzxbw(vec1, Address(str1, adr_stride1)); 3368 pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask); 3369 } 3370 jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS); 3371 addl(cnt1, stride); 3372 3373 // Compare the characters at index in cnt1 3374 bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character 3375 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 3376 subl(result, cnt2); 3377 jmp(POP_LABEL); 3378 3379 // Setup the registers to start vector comparison loop 3380 bind(COMPARE_WIDE_VECTORS); 3381 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3382 lea(str1, Address(str1, result, scale)); 3383 lea(str2, Address(str2, result, scale)); 3384 } else { 3385 lea(str1, Address(str1, result, scale1)); 3386 lea(str2, Address(str2, result, scale2)); 3387 } 3388 subl(result, stride2); 3389 subl(cnt2, stride2); 3390 jcc(Assembler::zero, COMPARE_WIDE_TAIL); 3391 negptr(result); 3392 3393 // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest) 3394 bind(COMPARE_WIDE_VECTORS_LOOP); 3395 3396 #ifdef _LP64 3397 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 3398 cmpl(cnt2, stride2x2); 3399 jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2); 3400 testl(cnt2, stride2x2-1); // cnt2 holds the vector count 3401 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2); // means we cannot subtract by 0x40 3402 3403 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 3404 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3405 evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit); 3406 evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3407 } else { 3408 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit); 3409 evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3410 } 3411 kortestql(mask, mask); 3412 jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED); // miscompare 3413 addptr(result, stride2x2); // update since we already compared at this addr 3414 subl(cnt2, stride2x2); // and sub the size too 3415 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3); 3416 3417 vpxor(vec1, vec1); 3418 jmpb(COMPARE_WIDE_TAIL); 3419 }//if (VM_Version::supports_avx512vlbw()) 3420 #endif // _LP64 3421 3422 3423 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3424 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3425 vmovdqu(vec1, Address(str1, result, scale)); 3426 vpxor(vec1, Address(str2, result, scale)); 3427 } else { 3428 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit); 3429 vpxor(vec1, Address(str2, result, scale2)); 3430 } 3431 vptest(vec1, vec1); 3432 jcc(Assembler::notZero, VECTOR_NOT_EQUAL); 3433 addptr(result, stride2); 3434 subl(cnt2, stride2); 3435 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP); 3436 // clean upper bits of YMM registers 
3437 vpxor(vec1, vec1); 3438 3439 // compare wide vectors tail 3440 bind(COMPARE_WIDE_TAIL); 3441 testptr(result, result); 3442 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3443 3444 movl(result, stride2); 3445 movl(cnt2, result); 3446 negptr(result); 3447 jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3448 3449 // Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors. 3450 bind(VECTOR_NOT_EQUAL); 3451 // clean upper bits of YMM registers 3452 vpxor(vec1, vec1); 3453 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3454 lea(str1, Address(str1, result, scale)); 3455 lea(str2, Address(str2, result, scale)); 3456 } else { 3457 lea(str1, Address(str1, result, scale1)); 3458 lea(str2, Address(str2, result, scale2)); 3459 } 3460 jmp(COMPARE_16_CHARS); 3461 3462 // Compare tail chars, length between 1 to 15 chars 3463 bind(COMPARE_TAIL_LONG); 3464 movl(cnt2, result); 3465 cmpl(cnt2, stride); 3466 jcc(Assembler::less, COMPARE_SMALL_STR); 3467 3468 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3469 movdqu(vec1, Address(str1, 0)); 3470 } else { 3471 pmovzxbw(vec1, Address(str1, 0)); 3472 } 3473 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3474 jcc(Assembler::below, COMPARE_INDEX_CHAR); 3475 subptr(cnt2, stride); 3476 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3477 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3478 lea(str1, Address(str1, result, scale)); 3479 lea(str2, Address(str2, result, scale)); 3480 } else { 3481 lea(str1, Address(str1, result, scale1)); 3482 lea(str2, Address(str2, result, scale2)); 3483 } 3484 negptr(cnt2); 3485 jmpb(WHILE_HEAD_LABEL); 3486 3487 bind(COMPARE_SMALL_STR); 3488 } else if (UseSSE42Intrinsics) { 3489 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL; 3490 int pcmpmask = 0x19; 3491 // Setup to compare 8-char (16-byte) vectors, 3492 // start from first character again because it has aligned address. 
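// SSE4.2-only path (no AVX2): the loop below compares one xmm-width chunk per
// iteration with pcmpestri, relying on the fixed register assignment asserted
// further down (result == rax, cnt2 == rdx, cnt1 == rcx).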
3493 movl(result, cnt2); 3494 andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count 3495 if (ae == StrIntrinsicNode::LL) { 3496 pcmpmask &= ~0x01; 3497 } 3498 jcc(Assembler::zero, COMPARE_TAIL); 3499 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3500 lea(str1, Address(str1, result, scale)); 3501 lea(str2, Address(str2, result, scale)); 3502 } else { 3503 lea(str1, Address(str1, result, scale1)); 3504 lea(str2, Address(str2, result, scale2)); 3505 } 3506 negptr(result); 3507 3508 // pcmpestri 3509 // inputs: 3510 // vec1- substring 3511 // rax - negative string length (elements count) 3512 // mem - scanned string 3513 // rdx - string length (elements count) 3514 // pcmpmask - cmp mode: 11000 (string compare with negated result) 3515 // + 00 (unsigned bytes) or + 01 (unsigned shorts) 3516 // outputs: 3517 // rcx - first mismatched element index 3518 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3519 3520 bind(COMPARE_WIDE_VECTORS); 3521 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3522 movdqu(vec1, Address(str1, result, scale)); 3523 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 3524 } else { 3525 pmovzxbw(vec1, Address(str1, result, scale1)); 3526 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 3527 } 3528 // After pcmpestri cnt1(rcx) contains mismatched element index 3529 3530 jccb(Assembler::below, VECTOR_NOT_EQUAL); // CF==1 3531 addptr(result, stride); 3532 subptr(cnt2, stride); 3533 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 3534 3535 // compare wide vectors tail 3536 testptr(result, result); 3537 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3538 3539 movl(cnt2, stride); 3540 movl(result, stride); 3541 negptr(result); 3542 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3543 movdqu(vec1, Address(str1, result, scale)); 3544 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 3545 } else { 3546 pmovzxbw(vec1, Address(str1, result, scale1)); 3547 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 3548 } 3549 jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL); 3550 3551 // Mismatched characters in the vectors 3552 bind(VECTOR_NOT_EQUAL); 3553 addptr(cnt1, result); 3554 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 3555 subl(result, cnt2); 3556 jmpb(POP_LABEL); 3557 3558 bind(COMPARE_TAIL); // limit is zero 3559 movl(cnt2, result); 3560 // Fallthru to tail compare 3561 } 3562 // Shift str2 and str1 to the end of the arrays, negate min 3563 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3564 lea(str1, Address(str1, cnt2, scale)); 3565 lea(str2, Address(str2, cnt2, scale)); 3566 } else { 3567 lea(str1, Address(str1, cnt2, scale1)); 3568 lea(str2, Address(str2, cnt2, scale2)); 3569 } 3570 decrementl(cnt2); // first character was compared already 3571 negptr(cnt2); 3572 3573 // Compare the rest of the elements 3574 bind(WHILE_HEAD_LABEL); 3575 load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae); 3576 subl(result, cnt1); 3577 jccb(Assembler::notZero, POP_LABEL); 3578 increment(cnt2); 3579 jccb(Assembler::notZero, WHILE_HEAD_LABEL); 3580 3581 // Strings are equal up to min length. Return the length difference. 
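// The difference was pushed before the main comparison; it is popped into result
// here. For UU it is still a difference of byte counts, so it is arithmetically
// shifted right by one to produce the char-count difference the caller expects.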
3582 bind(LENGTH_DIFF_LABEL); 3583 pop(result); 3584 if (ae == StrIntrinsicNode::UU) { 3585 // Divide diff by 2 to get number of chars 3586 sarl(result, 1); 3587 } 3588 jmpb(DONE_LABEL); 3589 3590 #ifdef _LP64 3591 if (VM_Version::supports_avx512vlbw()) { 3592 3593 bind(COMPARE_WIDE_VECTORS_LOOP_FAILED); 3594 3595 kmovql(cnt1, mask); 3596 notq(cnt1); 3597 bsfq(cnt2, cnt1); 3598 if (ae != StrIntrinsicNode::LL) { 3599 // Divide diff by 2 to get number of chars 3600 sarl(cnt2, 1); 3601 } 3602 addq(result, cnt2); 3603 if (ae == StrIntrinsicNode::LL) { 3604 load_unsigned_byte(cnt1, Address(str2, result)); 3605 load_unsigned_byte(result, Address(str1, result)); 3606 } else if (ae == StrIntrinsicNode::UU) { 3607 load_unsigned_short(cnt1, Address(str2, result, scale)); 3608 load_unsigned_short(result, Address(str1, result, scale)); 3609 } else { 3610 load_unsigned_short(cnt1, Address(str2, result, scale2)); 3611 load_unsigned_byte(result, Address(str1, result, scale1)); 3612 } 3613 subl(result, cnt1); 3614 jmpb(POP_LABEL); 3615 }//if (VM_Version::supports_avx512vlbw()) 3616 #endif // _LP64 3617 3618 // Discard the stored length difference 3619 bind(POP_LABEL); 3620 pop(cnt1); 3621 3622 // That's it 3623 bind(DONE_LABEL); 3624 if(ae == StrIntrinsicNode::UL) { 3625 negl(result); 3626 } 3627 3628 } 3629 3630 // Search for Non-ASCII character (Negative byte value) in a byte array, 3631 // return the index of the first such character, otherwise the length 3632 // of the array segment searched. 3633 // ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java 3634 // @IntrinsicCandidate 3635 // public static int countPositives(byte[] ba, int off, int len) { 3636 // for (int i = off; i < off + len; i++) { 3637 // if (ba[i] < 0) { 3638 // return i - off; 3639 // } 3640 // } 3641 // return len; 3642 // } 3643 void C2_MacroAssembler::count_positives(Register ary1, Register len, 3644 Register result, Register tmp1, 3645 XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) { 3646 // rsi: byte array 3647 // rcx: len 3648 // rax: result 3649 ShortBranchVerifier sbv(this); 3650 assert_different_registers(ary1, len, result, tmp1); 3651 assert_different_registers(vec1, vec2); 3652 Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE; 3653 3654 movl(result, len); // copy 3655 // len == 0 3656 testl(len, len); 3657 jcc(Assembler::zero, DONE); 3658 3659 if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512 3660 VM_Version::supports_avx512vlbw() && 3661 VM_Version::supports_bmi2()) { 3662 3663 Label test_64_loop, test_tail, BREAK_LOOP; 3664 Register tmp3_aliased = len; 3665 3666 movl(tmp1, len); 3667 vpxor(vec2, vec2, vec2, Assembler::AVX_512bit); 3668 3669 andl(tmp1, 64 - 1); // tail count (in chars) 0x3F 3670 andl(len, ~(64 - 1)); // vector count (in chars) 3671 jccb(Assembler::zero, test_tail); 3672 3673 lea(ary1, Address(ary1, len, Address::times_1)); 3674 negptr(len); 3675 3676 bind(test_64_loop); 3677 // Check whether our 64 elements of size byte contain negatives 3678 evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit); 3679 kortestql(mask1, mask1); 3680 jcc(Assembler::notZero, BREAK_LOOP); 3681 3682 addptr(len, 64); 3683 jccb(Assembler::notZero, test_64_loop); 3684 3685 bind(test_tail); 3686 // bail out when there is nothing to be done 3687 testl(tmp1, -1); 3688 jcc(Assembler::zero, DONE); 3689 3690 // ~(~0 << len) applied up to two times (for 32-bit scenario) 3691 #ifdef _LP64 3692 mov64(tmp3_aliased, 
0xFFFFFFFFFFFFFFFF);
3693 shlxq(tmp3_aliased, tmp3_aliased, tmp1);
3694 notq(tmp3_aliased);
3695 kmovql(mask2, tmp3_aliased);
3696 #else
3697 Label k_init;
3698 jmp(k_init);
3699
3700 // We cannot read 64 bits into a general purpose register here, so the data
3701 // required to compose 64 1's is moved into the instruction stream instead.
3702 // We emit a 64 byte wide series of the values 0..63 which is later used as the
3703 // compare target against the tail count contained in the tmp1 register.
3704 // The result is a k register with tmp1 consecutive 1's set,
3705 // counting from the least significant bit.
3706 address tmp = pc();
3707 emit_int64(0x0706050403020100);
3708 emit_int64(0x0F0E0D0C0B0A0908);
3709 emit_int64(0x1716151413121110);
3710 emit_int64(0x1F1E1D1C1B1A1918);
3711 emit_int64(0x2726252423222120);
3712 emit_int64(0x2F2E2D2C2B2A2928);
3713 emit_int64(0x3736353433323130);
3714 emit_int64(0x3F3E3D3C3B3A3938);
3715
3716 bind(k_init);
3717 lea(len, InternalAddress(tmp));
3718 // create mask to test for negative byte inside a vector
3719 evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
3720 evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit);
3721
3722 #endif
3723 evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit);
3724 ktestq(mask1, mask2);
3725 jcc(Assembler::zero, DONE);
3726
3727 bind(BREAK_LOOP);
3728 // At least one byte in the last 64 bytes is negative.
3729 // Set up to look at the last 64 bytes as if they were a tail
3730 lea(ary1, Address(ary1, len, Address::times_1));
3731 addptr(result, len);
3732 // Ignore the very last byte: if all others are positive,
3733 // it must be negative, so we can skip right to the 2+1 byte
3734 // end comparison at this point
3735 orl(result, 63);
3736 movl(len, 63);
3737 // Fallthru to tail compare
3738 } else {
3739
3740 if (UseAVX >= 2 && UseSSE >= 2) {
3741 // With AVX2, use 32-byte vector compare
3742 Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
3743
3744 // Compare 32-byte vectors
3745 testl(len, 0xffffffe0); // vector count (in bytes)
3746 jccb(Assembler::zero, TAIL_START);
3747
3748 andl(len, 0xffffffe0);
3749 lea(ary1, Address(ary1, len, Address::times_1));
3750 negptr(len);
3751
3752 movl(tmp1, 0x80808080); // create mask to test for Unicode chars in vector
3753 movdl(vec2, tmp1);
3754 vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);
3755
3756 bind(COMPARE_WIDE_VECTORS);
3757 vmovdqu(vec1, Address(ary1, len, Address::times_1));
3758 vptest(vec1, vec2);
3759 jccb(Assembler::notZero, BREAK_LOOP);
3760 addptr(len, 32);
3761 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
3762
3763 testl(result, 0x0000001f); // any bytes remaining?
3764 jcc(Assembler::zero, DONE);
3765
3766 // Quick test using the already prepared vector mask
3767 movl(len, result);
3768 andl(len, 0x0000001f);
3769 vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
3770 vptest(vec1, vec2);
3771 jcc(Assembler::zero, DONE);
3772 // There are negative bytes, jump to the tail to determine exactly where
3773 jmpb(TAIL_START);
3774
3775 bind(BREAK_LOOP);
3776 // At least one byte in the last 32-byte vector is negative.
3777 // Set up to look at the last 32 bytes as if they were a tail 3778 lea(ary1, Address(ary1, len, Address::times_1)); 3779 addptr(result, len); 3780 // Ignore the very last byte: if all others are positive, 3781 // it must be negative, so we can skip right to the 2+1 byte 3782 // end comparison at this point 3783 orl(result, 31); 3784 movl(len, 31); 3785 // Fallthru to tail compare 3786 } else if (UseSSE42Intrinsics) { 3787 // With SSE4.2, use double quad vector compare 3788 Label COMPARE_WIDE_VECTORS, BREAK_LOOP; 3789 3790 // Compare 16-byte vectors 3791 testl(len, 0xfffffff0); // vector count (in bytes) 3792 jcc(Assembler::zero, TAIL_START); 3793 3794 andl(len, 0xfffffff0); 3795 lea(ary1, Address(ary1, len, Address::times_1)); 3796 negptr(len); 3797 3798 movl(tmp1, 0x80808080); 3799 movdl(vec2, tmp1); 3800 pshufd(vec2, vec2, 0); 3801 3802 bind(COMPARE_WIDE_VECTORS); 3803 movdqu(vec1, Address(ary1, len, Address::times_1)); 3804 ptest(vec1, vec2); 3805 jccb(Assembler::notZero, BREAK_LOOP); 3806 addptr(len, 16); 3807 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 3808 3809 testl(result, 0x0000000f); // len is zero, any bytes remaining? 3810 jcc(Assembler::zero, DONE); 3811 3812 // Quick test using the already prepared vector mask 3813 movl(len, result); 3814 andl(len, 0x0000000f); // tail count (in bytes) 3815 movdqu(vec1, Address(ary1, len, Address::times_1, -16)); 3816 ptest(vec1, vec2); 3817 jcc(Assembler::zero, DONE); 3818 jmpb(TAIL_START); 3819 3820 bind(BREAK_LOOP); 3821 // At least one byte in the last 16-byte vector is negative. 3822 // Set up and look at the last 16 bytes as if they were a tail 3823 lea(ary1, Address(ary1, len, Address::times_1)); 3824 addptr(result, len); 3825 // Ignore the very last byte: if all others are positive, 3826 // it must be negative, so we can skip right to the 2+1 byte 3827 // end comparison at this point 3828 orl(result, 15); 3829 movl(len, 15); 3830 // Fallthru to tail compare 3831 } 3832 } 3833 3834 bind(TAIL_START); 3835 // Compare 4-byte vectors 3836 andl(len, 0xfffffffc); // vector count (in bytes) 3837 jccb(Assembler::zero, COMPARE_CHAR); 3838 3839 lea(ary1, Address(ary1, len, Address::times_1)); 3840 negptr(len); 3841 3842 bind(COMPARE_VECTORS); 3843 movl(tmp1, Address(ary1, len, Address::times_1)); 3844 andl(tmp1, 0x80808080); 3845 jccb(Assembler::notZero, TAIL_ADJUST); 3846 addptr(len, 4); 3847 jccb(Assembler::notZero, COMPARE_VECTORS); 3848 3849 // Compare trailing char (final 2-3 bytes), if any 3850 bind(COMPARE_CHAR); 3851 3852 testl(result, 0x2); // tail char 3853 jccb(Assembler::zero, COMPARE_BYTE); 3854 load_unsigned_short(tmp1, Address(ary1, 0)); 3855 andl(tmp1, 0x00008080); 3856 jccb(Assembler::notZero, CHAR_ADJUST); 3857 lea(ary1, Address(ary1, 2)); 3858 3859 bind(COMPARE_BYTE); 3860 testl(result, 0x1); // tail byte 3861 jccb(Assembler::zero, DONE); 3862 load_unsigned_byte(tmp1, Address(ary1, 0)); 3863 testl(tmp1, 0x00000080); 3864 jccb(Assembler::zero, DONE); 3865 subptr(result, 1); 3866 jmpb(DONE); 3867 3868 bind(TAIL_ADJUST); 3869 // there are negative bits in the last 4 byte block. 3870 // Adjust result and check the next three bytes 3871 addptr(result, len); 3872 orl(result, 3); 3873 lea(ary1, Address(ary1, len, Address::times_1)); 3874 jmpb(COMPARE_CHAR); 3875 3876 bind(CHAR_ADJUST); 3877 // We are looking at a char + optional byte tail, and found that one 3878 // of the bytes in the char is negative. Adjust the result, check the 3879 // first byte and readjust if needed. 
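// tmp1 still holds the 16-bit value loaded from the trailing char; x86 is
// little-endian, so its low byte (tested via 0x80 below) is the first byte of
// the pair, which decides whether result stays at the char boundary or is
// advanced by one.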
3880 andl(result, 0xfffffffc); 3881 testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first 3882 jccb(Assembler::notZero, DONE); 3883 addptr(result, 1); 3884 3885 // That's it 3886 bind(DONE); 3887 if (UseAVX >= 2 && UseSSE >= 2) { 3888 // clean upper bits of YMM registers 3889 vpxor(vec1, vec1); 3890 vpxor(vec2, vec2); 3891 } 3892 } 3893 3894 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings. 3895 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2, 3896 Register limit, Register result, Register chr, 3897 XMMRegister vec1, XMMRegister vec2, bool is_char, KRegister mask) { 3898 ShortBranchVerifier sbv(this); 3899 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE; 3900 3901 int length_offset = arrayOopDesc::length_offset_in_bytes(); 3902 int base_offset = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE); 3903 3904 if (is_array_equ) { 3905 // Check the input args 3906 cmpoop(ary1, ary2); 3907 jcc(Assembler::equal, TRUE_LABEL); 3908 3909 // Need additional checks for arrays_equals. 3910 testptr(ary1, ary1); 3911 jcc(Assembler::zero, FALSE_LABEL); 3912 testptr(ary2, ary2); 3913 jcc(Assembler::zero, FALSE_LABEL); 3914 3915 // Check the lengths 3916 movl(limit, Address(ary1, length_offset)); 3917 cmpl(limit, Address(ary2, length_offset)); 3918 jcc(Assembler::notEqual, FALSE_LABEL); 3919 } 3920 3921 // count == 0 3922 testl(limit, limit); 3923 jcc(Assembler::zero, TRUE_LABEL); 3924 3925 if (is_array_equ) { 3926 // Load array address 3927 lea(ary1, Address(ary1, base_offset)); 3928 lea(ary2, Address(ary2, base_offset)); 3929 } 3930 3931 if (is_array_equ && is_char) { 3932 // arrays_equals when used for char[]. 3933 shll(limit, 1); // byte count != 0 3934 } 3935 movl(result, limit); // copy 3936 3937 if (UseAVX >= 2) { 3938 // With AVX2, use 32-byte vector compare 3939 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 3940 3941 // Compare 32-byte vectors 3942 andl(result, 0x0000001f); // tail count (in bytes) 3943 andl(limit, 0xffffffe0); // vector count (in bytes) 3944 jcc(Assembler::zero, COMPARE_TAIL); 3945 3946 lea(ary1, Address(ary1, limit, Address::times_1)); 3947 lea(ary2, Address(ary2, limit, Address::times_1)); 3948 negptr(limit); 3949 3950 #ifdef _LP64 3951 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 3952 Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3; 3953 3954 cmpl(limit, -64); 3955 jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2); 3956 3957 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 3958 3959 evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit); 3960 evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit); 3961 kortestql(mask, mask); 3962 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 3963 addptr(limit, 64); // update since we already compared at this addr 3964 cmpl(limit, -64); 3965 jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3); 3966 3967 // At this point we may still need to compare -limit+result bytes. 3968 // We could execute the next two instruction and just continue via non-wide path: 3969 // cmpl(limit, 0); 3970 // jcc(Assembler::equal, COMPARE_TAIL); // true 3971 // But since we stopped at the points ary{1,2}+limit which are 3972 // not farther than 64 bytes from the ends of arrays ary{1,2}+result 3973 // (|limit| <= 32 and result < 32), 3974 // we may just compare the last 64 bytes. 
3975 // 3976 addptr(result, -64); // it is safe, bc we just came from this area 3977 evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit); 3978 evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit); 3979 kortestql(mask, mask); 3980 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 3981 3982 jmp(TRUE_LABEL); 3983 3984 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3985 3986 }//if (VM_Version::supports_avx512vlbw()) 3987 #endif //_LP64 3988 bind(COMPARE_WIDE_VECTORS); 3989 vmovdqu(vec1, Address(ary1, limit, Address::times_1)); 3990 vmovdqu(vec2, Address(ary2, limit, Address::times_1)); 3991 vpxor(vec1, vec2); 3992 3993 vptest(vec1, vec1); 3994 jcc(Assembler::notZero, FALSE_LABEL); 3995 addptr(limit, 32); 3996 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 3997 3998 testl(result, result); 3999 jcc(Assembler::zero, TRUE_LABEL); 4000 4001 vmovdqu(vec1, Address(ary1, result, Address::times_1, -32)); 4002 vmovdqu(vec2, Address(ary2, result, Address::times_1, -32)); 4003 vpxor(vec1, vec2); 4004 4005 vptest(vec1, vec1); 4006 jccb(Assembler::notZero, FALSE_LABEL); 4007 jmpb(TRUE_LABEL); 4008 4009 bind(COMPARE_TAIL); // limit is zero 4010 movl(limit, result); 4011 // Fallthru to tail compare 4012 } else if (UseSSE42Intrinsics) { 4013 // With SSE4.2, use double quad vector compare 4014 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 4015 4016 // Compare 16-byte vectors 4017 andl(result, 0x0000000f); // tail count (in bytes) 4018 andl(limit, 0xfffffff0); // vector count (in bytes) 4019 jcc(Assembler::zero, COMPARE_TAIL); 4020 4021 lea(ary1, Address(ary1, limit, Address::times_1)); 4022 lea(ary2, Address(ary2, limit, Address::times_1)); 4023 negptr(limit); 4024 4025 bind(COMPARE_WIDE_VECTORS); 4026 movdqu(vec1, Address(ary1, limit, Address::times_1)); 4027 movdqu(vec2, Address(ary2, limit, Address::times_1)); 4028 pxor(vec1, vec2); 4029 4030 ptest(vec1, vec1); 4031 jcc(Assembler::notZero, FALSE_LABEL); 4032 addptr(limit, 16); 4033 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4034 4035 testl(result, result); 4036 jcc(Assembler::zero, TRUE_LABEL); 4037 4038 movdqu(vec1, Address(ary1, result, Address::times_1, -16)); 4039 movdqu(vec2, Address(ary2, result, Address::times_1, -16)); 4040 pxor(vec1, vec2); 4041 4042 ptest(vec1, vec1); 4043 jccb(Assembler::notZero, FALSE_LABEL); 4044 jmpb(TRUE_LABEL); 4045 4046 bind(COMPARE_TAIL); // limit is zero 4047 movl(limit, result); 4048 // Fallthru to tail compare 4049 } 4050 4051 // Compare 4-byte vectors 4052 andl(limit, 0xfffffffc); // vector count (in bytes) 4053 jccb(Assembler::zero, COMPARE_CHAR); 4054 4055 lea(ary1, Address(ary1, limit, Address::times_1)); 4056 lea(ary2, Address(ary2, limit, Address::times_1)); 4057 negptr(limit); 4058 4059 bind(COMPARE_VECTORS); 4060 movl(chr, Address(ary1, limit, Address::times_1)); 4061 cmpl(chr, Address(ary2, limit, Address::times_1)); 4062 jccb(Assembler::notEqual, FALSE_LABEL); 4063 addptr(limit, 4); 4064 jcc(Assembler::notZero, COMPARE_VECTORS); 4065 4066 // Compare trailing char (final 2 bytes), if any 4067 bind(COMPARE_CHAR); 4068 testl(result, 0x2); // tail char 4069 jccb(Assembler::zero, COMPARE_BYTE); 4070 load_unsigned_short(chr, Address(ary1, 0)); 4071 load_unsigned_short(limit, Address(ary2, 0)); 4072 cmpl(chr, limit); 4073 jccb(Assembler::notEqual, FALSE_LABEL); 4074 4075 if (is_array_equ && is_char) { 4076 bind(COMPARE_BYTE); 4077 } else { 4078 lea(ary1, Address(ary1, 2)); 4079 lea(ary2, Address(ary2, 2)); 4080 4081 bind(COMPARE_BYTE); 4082 testl(result, 0x1); 
// tail byte 4083 jccb(Assembler::zero, TRUE_LABEL); 4084 load_unsigned_byte(chr, Address(ary1, 0)); 4085 load_unsigned_byte(limit, Address(ary2, 0)); 4086 cmpl(chr, limit); 4087 jccb(Assembler::notEqual, FALSE_LABEL); 4088 } 4089 bind(TRUE_LABEL); 4090 movl(result, 1); // return true 4091 jmpb(DONE); 4092 4093 bind(FALSE_LABEL); 4094 xorl(result, result); // return false 4095 4096 // That's it 4097 bind(DONE); 4098 if (UseAVX >= 2) { 4099 // clean upper bits of YMM registers 4100 vpxor(vec1, vec1); 4101 vpxor(vec2, vec2); 4102 } 4103 } 4104 4105 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4106 XMMRegister src1, int imm8, bool merge, int vlen_enc) { 4107 switch(ideal_opc) { 4108 case Op_LShiftVS: 4109 Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break; 4110 case Op_LShiftVI: 4111 Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break; 4112 case Op_LShiftVL: 4113 Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break; 4114 case Op_RShiftVS: 4115 Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break; 4116 case Op_RShiftVI: 4117 Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break; 4118 case Op_RShiftVL: 4119 Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break; 4120 case Op_URShiftVS: 4121 Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break; 4122 case Op_URShiftVI: 4123 Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break; 4124 case Op_URShiftVL: 4125 Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break; 4126 case Op_RotateRightV: 4127 evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4128 case Op_RotateLeftV: 4129 evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4130 default: 4131 fatal("Unsupported masked operation"); break; 4132 } 4133 } 4134 4135 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4136 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc, 4137 bool is_varshift) { 4138 switch (ideal_opc) { 4139 case Op_AddVB: 4140 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4141 case Op_AddVS: 4142 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4143 case Op_AddVI: 4144 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4145 case Op_AddVL: 4146 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4147 case Op_AddVF: 4148 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4149 case Op_AddVD: 4150 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4151 case Op_SubVB: 4152 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4153 case Op_SubVS: 4154 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4155 case Op_SubVI: 4156 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4157 case Op_SubVL: 4158 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4159 case Op_SubVF: 4160 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4161 case Op_SubVD: 4162 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4163 case Op_MulVS: 4164 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4165 case Op_MulVI: 4166 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4167 case Op_MulVL: 4168 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4169 case Op_MulVF: 4170 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4171 case Op_MulVD: 4172 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4173 case Op_DivVF: 4174 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 
4175 case Op_DivVD: 4176 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4177 case Op_SqrtVF: 4178 evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break; 4179 case Op_SqrtVD: 4180 evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break; 4181 case Op_AbsVB: 4182 evpabsb(dst, mask, src2, merge, vlen_enc); break; 4183 case Op_AbsVS: 4184 evpabsw(dst, mask, src2, merge, vlen_enc); break; 4185 case Op_AbsVI: 4186 evpabsd(dst, mask, src2, merge, vlen_enc); break; 4187 case Op_AbsVL: 4188 evpabsq(dst, mask, src2, merge, vlen_enc); break; 4189 case Op_FmaVF: 4190 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4191 case Op_FmaVD: 4192 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4193 case Op_VectorRearrange: 4194 evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break; 4195 case Op_LShiftVS: 4196 evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4197 case Op_LShiftVI: 4198 evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4199 case Op_LShiftVL: 4200 evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4201 case Op_RShiftVS: 4202 evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4203 case Op_RShiftVI: 4204 evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4205 case Op_RShiftVL: 4206 evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4207 case Op_URShiftVS: 4208 evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4209 case Op_URShiftVI: 4210 evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4211 case Op_URShiftVL: 4212 evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4213 case Op_RotateLeftV: 4214 evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4215 case Op_RotateRightV: 4216 evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4217 case Op_MaxV: 4218 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4219 case Op_MinV: 4220 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4221 case Op_XorV: 4222 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4223 case Op_OrV: 4224 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4225 case Op_AndV: 4226 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4227 default: 4228 fatal("Unsupported masked operation"); break; 4229 } 4230 } 4231 4232 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4233 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4234 switch (ideal_opc) { 4235 case Op_AddVB: 4236 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4237 case Op_AddVS: 4238 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4239 case Op_AddVI: 4240 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4241 case Op_AddVL: 4242 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4243 case Op_AddVF: 4244 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4245 case Op_AddVD: 4246 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4247 case Op_SubVB: 4248 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4249 case Op_SubVS: 4250 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4251 case Op_SubVI: 4252 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4253 case Op_SubVL: 4254 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4255 case Op_SubVF: 4256 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4257 case Op_SubVD: 4258 evsubpd(dst, mask, src1, src2, merge, vlen_enc); 
break; 4259 case Op_MulVS: 4260 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4261 case Op_MulVI: 4262 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4263 case Op_MulVL: 4264 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4265 case Op_MulVF: 4266 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4267 case Op_MulVD: 4268 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4269 case Op_DivVF: 4270 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4271 case Op_DivVD: 4272 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4273 case Op_FmaVF: 4274 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4275 case Op_FmaVD: 4276 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4277 case Op_MaxV: 4278 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4279 case Op_MinV: 4280 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4281 case Op_XorV: 4282 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4283 case Op_OrV: 4284 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4285 case Op_AndV: 4286 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4287 default: 4288 fatal("Unsupported masked operation"); break; 4289 } 4290 } 4291 4292 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst, 4293 KRegister src1, KRegister src2) { 4294 BasicType etype = T_ILLEGAL; 4295 switch(mask_len) { 4296 case 2: 4297 case 4: 4298 case 8: etype = T_BYTE; break; 4299 case 16: etype = T_SHORT; break; 4300 case 32: etype = T_INT; break; 4301 case 64: etype = T_LONG; break; 4302 default: fatal("Unsupported type"); break; 4303 } 4304 assert(etype != T_ILLEGAL, ""); 4305 switch(ideal_opc) { 4306 case Op_AndVMask: 4307 kand(etype, dst, src1, src2); break; 4308 case Op_OrVMask: 4309 kor(etype, dst, src1, src2); break; 4310 case Op_XorVMask: 4311 kxor(etype, dst, src1, src2); break; 4312 default: 4313 fatal("Unsupported masked operation"); break; 4314 } 4315 } 4316 4317 /* 4318 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 4319 * If src is NaN, the result is 0. 4320 * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE, 4321 * the result is equal to the value of Integer.MIN_VALUE. 4322 * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE, 4323 * the result is equal to the value of Integer.MAX_VALUE. 4324 */ 4325 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4326 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 4327 Register rscratch, AddressLiteral float_sign_flip, 4328 int vec_enc) { 4329 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4330 Label done; 4331 vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch); 4332 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 4333 vptest(xtmp2, xtmp2, vec_enc); 4334 jccb(Assembler::equal, done); 4335 4336 vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc); 4337 vpxor(xtmp1, xtmp1, xtmp4, vec_enc); 4338 4339 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 4340 vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc); 4341 vblendvps(dst, dst, xtmp4, xtmp3, vec_enc); 4342 4343 // Recompute the mask for remaining special value. 4344 vpxor(xtmp2, xtmp2, xtmp3, vec_enc); 4345 // Extract SRC values corresponding to TRUE mask lanes. 
4346 vpand(xtmp4, xtmp2, src, vec_enc); 4347 // Flip mask bits so that MSB bit of MASK lanes corresponding to +ve special 4348 // values are set. 4349 vpxor(xtmp3, xtmp2, xtmp4, vec_enc); 4350 4351 vblendvps(dst, dst, xtmp1, xtmp3, vec_enc); 4352 bind(done); 4353 } 4354 4355 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4356 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4357 Register rscratch, AddressLiteral float_sign_flip, 4358 int vec_enc) { 4359 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4360 Label done; 4361 evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 4362 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 4363 kortestwl(ktmp1, ktmp1); 4364 jccb(Assembler::equal, done); 4365 4366 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4367 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4368 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 4369 4370 kxorwl(ktmp1, ktmp1, ktmp2); 4371 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4372 vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4373 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 4374 bind(done); 4375 } 4376 4377 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4378 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4379 Register rscratch, AddressLiteral double_sign_flip, 4380 int vec_enc) { 4381 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing"); 4382 4383 Label done; 4384 evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch); 4385 Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc); 4386 kortestwl(ktmp1, ktmp1); 4387 jccb(Assembler::equal, done); 4388 4389 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4390 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4391 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc); 4392 4393 kxorwl(ktmp1, ktmp1, ktmp2); 4394 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4395 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4396 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc); 4397 bind(done); 4398 } 4399 4400 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4401 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4402 Register rscratch, AddressLiteral float_sign_flip, 4403 int vec_enc) { 4404 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4405 Label done; 4406 evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 4407 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 4408 kortestwl(ktmp1, ktmp1); 4409 jccb(Assembler::equal, done); 4410 4411 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4412 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4413 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 4414 4415 kxorwl(ktmp1, ktmp1, ktmp2); 4416 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4417 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4418 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 4419 bind(done); 4420 } 4421 4422 /* 4423 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 4424 * If src is NaN, the result is 0. 4425 * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE, 4426 * the result is equal to the value of Long.MIN_VALUE. 
4427 * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
4428 * the result is equal to the value of Long.MAX_VALUE.
4429 */
4430 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
4431 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
4432 Register rscratch, AddressLiteral double_sign_flip,
4433 int vec_enc) {
4434 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
4435
4436 Label done;
4437 evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch);
4438 evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
4439 kortestwl(ktmp1, ktmp1);
4440 jccb(Assembler::equal, done);
4441
4442 vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
4443 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
4444 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);
4445
4446 kxorwl(ktmp1, ktmp1, ktmp2);
4447 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
4448 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
4449 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
4450 bind(done);
4451 }
4452
4453 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero,
4454 XMMRegister xtmp, int index, int vec_enc) {
4455 assert(vec_enc < Assembler::AVX_512bit, "");
4456 if (vec_enc == Assembler::AVX_256bit) {
4457 vextractf128_high(xtmp, src);
4458 vshufps(dst, src, xtmp, index, vec_enc);
4459 } else {
4460 vshufps(dst, src, zero, index, vec_enc);
4461 }
4462 }
4463
4464 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
4465 XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch,
4466 AddressLiteral float_sign_flip, int src_vec_enc) {
4467 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
4468
4469 Label done;
4470 // Compare the destination lanes with float_sign_flip
4471 // value to get mask for all special values.
4472 movdqu(xtmp1, float_sign_flip, rscratch);
4473 vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
4474 ptest(xtmp2, xtmp2);
4475 jccb(Assembler::equal, done);
4476
4477 // Flip float_sign_flip to get max integer value.
4478 vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
4479 pxor(xtmp1, xtmp4);
4480
4481 // Set destination lanes corresponding to unordered source lanes to zero.
4482 vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
4483 vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);
4484
4485 // Shuffle the mask vector and pack the lower double word from each quadword lane.
4486 vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
4487 vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);
4488
4489 // Recompute the mask for the remaining special values.
4490 pxor(xtmp2, xtmp3);
4491 // Extract mask corresponding to non-negative source lanes.
4492 vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);
4493
4494 // Shuffle the mask vector and pack the lower double word from each quadword lane.
4495 vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
4496 pand(xtmp3, xtmp2);
4497
4498 // Replace destination lanes holding special value (0x80000000) with max int
4499 // if corresponding source lane holds a +ve value.
4500 vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit); 4501 bind(done); 4502 } 4503 4504 4505 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero, 4506 XMMRegister xtmp, Register rscratch, int vec_enc) { 4507 switch(to_elem_bt) { 4508 case T_SHORT: 4509 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing"); 4510 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch); 4511 vpackusdw(dst, dst, zero, vec_enc); 4512 if (vec_enc == Assembler::AVX_256bit) { 4513 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc); 4514 } 4515 break; 4516 case T_BYTE: 4517 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing"); 4518 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch); 4519 vpackusdw(dst, dst, zero, vec_enc); 4520 if (vec_enc == Assembler::AVX_256bit) { 4521 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc); 4522 } 4523 vpackuswb(dst, dst, zero, vec_enc); 4524 break; 4525 default: assert(false, "%s", type2name(to_elem_bt)); 4526 } 4527 } 4528 4529 /* 4530 * Algorithm for vector D2L and F2I conversions:- 4531 * a) Perform vector D2L/F2I cast. 4532 * b) Choose fast path if none of the result vector lane contains 0x80000000 value. 4533 * It signifies that source value could be any of the special floating point 4534 * values(NaN,-Inf,Inf,Max,-Min). 4535 * c) Set destination to zero if source is NaN value. 4536 * d) Replace 0x80000000 with MaxInt if source lane contains a +ve value. 4537 */ 4538 4539 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4540 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 4541 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 4542 int to_elem_sz = type2aelembytes(to_elem_bt); 4543 assert(to_elem_sz <= 4, ""); 4544 vcvttps2dq(dst, src, vec_enc); 4545 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc); 4546 if (to_elem_sz < 4) { 4547 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 4548 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc); 4549 } 4550 } 4551 4552 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4553 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip, 4554 Register rscratch, int vec_enc) { 4555 int to_elem_sz = type2aelembytes(to_elem_bt); 4556 assert(to_elem_sz <= 4, ""); 4557 vcvttps2dq(dst, src, vec_enc); 4558 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc); 4559 switch(to_elem_bt) { 4560 case T_INT: 4561 break; 4562 case T_SHORT: 4563 evpmovdw(dst, dst, vec_enc); 4564 break; 4565 case T_BYTE: 4566 evpmovdb(dst, dst, vec_enc); 4567 break; 4568 default: assert(false, "%s", type2name(to_elem_bt)); 4569 } 4570 } 4571 4572 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 4573 KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip, 4574 Register rscratch, int vec_enc) { 4575 evcvttps2qq(dst, src, vec_enc); 4576 vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, 
vec_enc); 4577 } 4578 4579 // Handling for downcasting from double to integer or sub-word types on AVX2. 4580 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4581 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, 4582 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 4583 int to_elem_sz = type2aelembytes(to_elem_bt); 4584 assert(to_elem_sz < 8, ""); 4585 vcvttpd2dq(dst, src, vec_enc); 4586 vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch, 4587 float_sign_flip, vec_enc); 4588 if (to_elem_sz < 4) { 4589 // xtmp4 holds all zero lanes. 4590 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit); 4591 } 4592 } 4593 4594 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, 4595 XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, 4596 KRegister ktmp2, AddressLiteral sign_flip, 4597 Register rscratch, int vec_enc) { 4598 if (VM_Version::supports_avx512dq()) { 4599 evcvttpd2qq(dst, src, vec_enc); 4600 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 4601 switch(to_elem_bt) { 4602 case T_LONG: 4603 break; 4604 case T_INT: 4605 evpmovsqd(dst, dst, vec_enc); 4606 break; 4607 case T_SHORT: 4608 evpmovsqd(dst, dst, vec_enc); 4609 evpmovdw(dst, dst, vec_enc); 4610 break; 4611 case T_BYTE: 4612 evpmovsqd(dst, dst, vec_enc); 4613 evpmovdb(dst, dst, vec_enc); 4614 break; 4615 default: assert(false, "%s", type2name(to_elem_bt)); 4616 } 4617 } else { 4618 assert(type2aelembytes(to_elem_bt) <= 4, ""); 4619 vcvttpd2dq(dst, src, vec_enc); 4620 vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 4621 switch(to_elem_bt) { 4622 case T_INT: 4623 break; 4624 case T_SHORT: 4625 evpmovdw(dst, dst, vec_enc); 4626 break; 4627 case T_BYTE: 4628 evpmovdb(dst, dst, vec_enc); 4629 break; 4630 default: assert(false, "%s", type2name(to_elem_bt)); 4631 } 4632 } 4633 } 4634 4635 #ifdef _LP64 4636 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src, 4637 AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 4638 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 4639 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 4640 // and re-instantiate original MXCSR.RC mode after that. 4641 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 4642 4643 mov64(tmp, julong_cast(0.5L)); 4644 evpbroadcastq(xtmp1, tmp, vec_enc); 4645 vaddpd(xtmp1, src , xtmp1, vec_enc); 4646 evcvtpd2qq(dst, xtmp1, vec_enc); 4647 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 4648 double_sign_flip, vec_enc);; 4649 4650 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 4651 } 4652 4653 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src, 4654 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 4655 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 4656 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 4657 // and re-instantiate original MXCSR.RC mode after that. 
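// In other words: with MXCSR.RC forced to round-toward-negative-infinity,
// vcvtps2dq effectively floors its input, so adding 0.5 beforehand gives
// round-half-up behaviour; the standard MXCSR value is reloaded from
// addr_mxcsr_std() at the end of the routine.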
4658 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 4659 4660 movl(tmp, jint_cast(0.5)); 4661 movq(xtmp1, tmp); 4662 vbroadcastss(xtmp1, xtmp1, vec_enc); 4663 vaddps(xtmp1, src , xtmp1, vec_enc); 4664 vcvtps2dq(dst, xtmp1, vec_enc); 4665 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 4666 float_sign_flip, vec_enc); 4667 4668 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 4669 } 4670 4671 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src, 4672 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 4673 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) { 4674 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 4675 // and re-instantiate original MXCSR.RC mode after that. 4676 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 4677 4678 movl(tmp, jint_cast(0.5)); 4679 movq(xtmp1, tmp); 4680 vbroadcastss(xtmp1, xtmp1, vec_enc); 4681 vaddps(xtmp1, src , xtmp1, vec_enc); 4682 vcvtps2dq(dst, xtmp1, vec_enc); 4683 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc); 4684 4685 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 4686 } 4687 #endif // _LP64 4688 4689 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 4690 BasicType from_elem_bt, BasicType to_elem_bt) { 4691 switch (from_elem_bt) { 4692 case T_BYTE: 4693 switch (to_elem_bt) { 4694 case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break; 4695 case T_INT: vpmovzxbd(dst, src, vlen_enc); break; 4696 case T_LONG: vpmovzxbq(dst, src, vlen_enc); break; 4697 default: ShouldNotReachHere(); 4698 } 4699 break; 4700 case T_SHORT: 4701 switch (to_elem_bt) { 4702 case T_INT: vpmovzxwd(dst, src, vlen_enc); break; 4703 case T_LONG: vpmovzxwq(dst, src, vlen_enc); break; 4704 default: ShouldNotReachHere(); 4705 } 4706 break; 4707 case T_INT: 4708 assert(to_elem_bt == T_LONG, ""); 4709 vpmovzxdq(dst, src, vlen_enc); 4710 break; 4711 default: 4712 ShouldNotReachHere(); 4713 } 4714 } 4715 4716 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src, 4717 BasicType dst_bt, BasicType src_bt, int vlen) { 4718 int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen); 4719 assert(vlen_enc != AVX_512bit, ""); 4720 4721 int dst_bt_size = type2aelembytes(dst_bt); 4722 int src_bt_size = type2aelembytes(src_bt); 4723 if (dst_bt_size > src_bt_size) { 4724 switch (dst_bt_size / src_bt_size) { 4725 case 2: vpmovsxbw(dst, src, vlen_enc); break; 4726 case 4: vpmovsxbd(dst, src, vlen_enc); break; 4727 case 8: vpmovsxbq(dst, src, vlen_enc); break; 4728 default: ShouldNotReachHere(); 4729 } 4730 } else { 4731 assert(dst_bt_size < src_bt_size, ""); 4732 switch (src_bt_size / dst_bt_size) { 4733 case 2: { 4734 if (vlen_enc == AVX_128bit) { 4735 vpacksswb(dst, src, src, vlen_enc); 4736 } else { 4737 vpacksswb(dst, src, src, vlen_enc); 4738 vpermq(dst, dst, 0x08, vlen_enc); 4739 } 4740 break; 4741 } 4742 case 4: { 4743 if (vlen_enc == AVX_128bit) { 4744 vpackssdw(dst, src, src, vlen_enc); 4745 vpacksswb(dst, dst, dst, vlen_enc); 4746 } else { 4747 vpackssdw(dst, src, src, vlen_enc); 4748 vpermq(dst, dst, 0x08, vlen_enc); 4749 vpacksswb(dst, dst, dst, AVX_128bit); 4750 } 4751 break; 4752 } 4753 case 8: { 4754 if (vlen_enc == AVX_128bit) { 4755 vpshufd(dst, src, 0x08, vlen_enc); 4756 
vpackssdw(dst, dst, dst, vlen_enc); 4757 vpacksswb(dst, dst, dst, vlen_enc); 4758 } else { 4759 vpshufd(dst, src, 0x08, vlen_enc); 4760 vpermq(dst, dst, 0x08, vlen_enc); 4761 vpackssdw(dst, dst, dst, AVX_128bit); 4762 vpacksswb(dst, dst, dst, AVX_128bit); 4763 } 4764 break; 4765 } 4766 default: ShouldNotReachHere(); 4767 } 4768 } 4769 } 4770 4771 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3, 4772 bool merge, BasicType bt, int vlen_enc) { 4773 if (bt == T_INT) { 4774 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 4775 } else { 4776 assert(bt == T_LONG, ""); 4777 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 4778 } 4779 } 4780 4781 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3, 4782 bool merge, BasicType bt, int vlen_enc) { 4783 if (bt == T_INT) { 4784 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 4785 } else { 4786 assert(bt == T_LONG, ""); 4787 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 4788 } 4789 } 4790 4791 #ifdef _LP64 4792 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1, 4793 Register rtmp2, XMMRegister xtmp, int mask_len, 4794 int vec_enc) { 4795 int index = 0; 4796 int vindex = 0; 4797 mov64(rtmp1, 0x0101010101010101L); 4798 pdepq(rtmp1, src, rtmp1); 4799 if (mask_len > 8) { 4800 movq(rtmp2, src); 4801 vpxor(xtmp, xtmp, xtmp, vec_enc); 4802 movq(xtmp, rtmp1); 4803 } 4804 movq(dst, rtmp1); 4805 4806 mask_len -= 8; 4807 while (mask_len > 0) { 4808 assert ((mask_len & 0x7) == 0, "mask must be multiple of 8"); 4809 index++; 4810 if ((index % 2) == 0) { 4811 pxor(xtmp, xtmp); 4812 } 4813 mov64(rtmp1, 0x0101010101010101L); 4814 shrq(rtmp2, 8); 4815 pdepq(rtmp1, rtmp2, rtmp1); 4816 pinsrq(xtmp, rtmp1, index % 2); 4817 vindex = index / 2; 4818 if (vindex) { 4819 // Write entire 16 byte vector when both 64 bit 4820 // lanes are update to save redundant instructions. 
4821 if (index % 2) { 4822 vinsertf128(dst, dst, xtmp, vindex); 4823 } 4824 } else { 4825 vmovdqu(dst, xtmp); 4826 } 4827 mask_len -= 8; 4828 } 4829 } 4830 4831 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) { 4832 switch(opc) { 4833 case Op_VectorMaskTrueCount: 4834 popcntq(dst, tmp); 4835 break; 4836 case Op_VectorMaskLastTrue: 4837 if (VM_Version::supports_lzcnt()) { 4838 lzcntq(tmp, tmp); 4839 movl(dst, 63); 4840 subl(dst, tmp); 4841 } else { 4842 movl(dst, -1); 4843 bsrq(tmp, tmp); 4844 cmov32(Assembler::notZero, dst, tmp); 4845 } 4846 break; 4847 case Op_VectorMaskFirstTrue: 4848 if (VM_Version::supports_bmi1()) { 4849 if (masklen < 32) { 4850 orl(tmp, 1 << masklen); 4851 tzcntl(dst, tmp); 4852 } else if (masklen == 32) { 4853 tzcntl(dst, tmp); 4854 } else { 4855 assert(masklen == 64, ""); 4856 tzcntq(dst, tmp); 4857 } 4858 } else { 4859 if (masklen < 32) { 4860 orl(tmp, 1 << masklen); 4861 bsfl(dst, tmp); 4862 } else { 4863 assert(masklen == 32 || masklen == 64, ""); 4864 movl(dst, masklen); 4865 if (masklen == 32) { 4866 bsfl(tmp, tmp); 4867 } else { 4868 bsfq(tmp, tmp); 4869 } 4870 cmov32(Assembler::notZero, dst, tmp); 4871 } 4872 } 4873 break; 4874 case Op_VectorMaskToLong: 4875 assert(dst == tmp, "Dst and tmp should be the same for toLong operations"); 4876 break; 4877 default: assert(false, "Unhandled mask operation"); 4878 } 4879 } 4880 4881 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp, 4882 int masklen, int masksize, int vec_enc) { 4883 assert(VM_Version::supports_popcnt(), ""); 4884 4885 if(VM_Version::supports_avx512bw()) { 4886 kmovql(tmp, mask); 4887 } else { 4888 assert(masklen <= 16, ""); 4889 kmovwl(tmp, mask); 4890 } 4891 4892 // Mask generated out of partial vector comparisons/replicate/mask manipulation 4893 // operations needs to be clipped. 4894 if (masksize < 16 && opc != Op_VectorMaskFirstTrue) { 4895 andq(tmp, (1 << masklen) - 1); 4896 } 4897 4898 vector_mask_operation_helper(opc, dst, tmp, masklen); 4899 } 4900 4901 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp, 4902 Register tmp, int masklen, BasicType bt, int vec_enc) { 4903 assert(vec_enc == AVX_128bit && VM_Version::supports_avx() || 4904 vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4), ""); 4905 assert(VM_Version::supports_popcnt(), ""); 4906 4907 bool need_clip = false; 4908 switch(bt) { 4909 case T_BOOLEAN: 4910 // While masks of other types contain 0, -1; boolean masks contain lane values of 0, 1 4911 vpxor(xtmp, xtmp, xtmp, vec_enc); 4912 vpsubb(xtmp, xtmp, mask, vec_enc); 4913 vpmovmskb(tmp, xtmp, vec_enc); 4914 need_clip = masklen < 16; 4915 break; 4916 case T_BYTE: 4917 vpmovmskb(tmp, mask, vec_enc); 4918 need_clip = masklen < 16; 4919 break; 4920 case T_SHORT: 4921 vpacksswb(xtmp, mask, mask, vec_enc); 4922 if (masklen >= 16) { 4923 vpermpd(xtmp, xtmp, 8, vec_enc); 4924 } 4925 vpmovmskb(tmp, xtmp, Assembler::AVX_128bit); 4926 need_clip = masklen < 16; 4927 break; 4928 case T_INT: 4929 case T_FLOAT: 4930 vmovmskps(tmp, mask, vec_enc); 4931 need_clip = masklen < 4; 4932 break; 4933 case T_LONG: 4934 case T_DOUBLE: 4935 vmovmskpd(tmp, mask, vec_enc); 4936 need_clip = masklen < 2; 4937 break; 4938 default: assert(false, "Unhandled type, %s", type2name(bt)); 4939 } 4940 4941 // Mask generated out of partial vector comparisons/replicate/mask manipulation 4942 // operations needs to be clipped. 
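  // For example (illustrative): an 8-lane (masklen == 8) T_BYTE mask is still read with
  // vpmovmskb over the full 128-bit register, which produces 16 bits; only the low
  // masklen bits are meaningful, so the rest are discarded with (1 << masklen) - 1.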
4943 if (need_clip && opc != Op_VectorMaskFirstTrue) { 4944 // need_clip implies masklen < 32 4945 andq(tmp, (1 << masklen) - 1); 4946 } 4947 4948 vector_mask_operation_helper(opc, dst, tmp, masklen); 4949 } 4950 4951 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1, 4952 Register rtmp2, int mask_len) { 4953 kmov(rtmp1, src); 4954 andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len))); 4955 mov64(rtmp2, -1L); 4956 pextq(rtmp2, rtmp2, rtmp1); 4957 kmov(dst, rtmp2); 4958 } 4959 4960 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask, 4961 bool merge, BasicType bt, int vec_enc) { 4962 if (opcode == Op_CompressV) { 4963 switch(bt) { 4964 case T_BYTE: 4965 evpcompressb(dst, mask, src, merge, vec_enc); 4966 break; 4967 case T_CHAR: 4968 case T_SHORT: 4969 evpcompressw(dst, mask, src, merge, vec_enc); 4970 break; 4971 case T_INT: 4972 evpcompressd(dst, mask, src, merge, vec_enc); 4973 break; 4974 case T_FLOAT: 4975 evcompressps(dst, mask, src, merge, vec_enc); 4976 break; 4977 case T_LONG: 4978 evpcompressq(dst, mask, src, merge, vec_enc); 4979 break; 4980 case T_DOUBLE: 4981 evcompresspd(dst, mask, src, merge, vec_enc); 4982 break; 4983 default: 4984 fatal("Unsupported type %s", type2name(bt)); 4985 break; 4986 } 4987 } else { 4988 assert(opcode == Op_ExpandV, ""); 4989 switch(bt) { 4990 case T_BYTE: 4991 evpexpandb(dst, mask, src, merge, vec_enc); 4992 break; 4993 case T_CHAR: 4994 case T_SHORT: 4995 evpexpandw(dst, mask, src, merge, vec_enc); 4996 break; 4997 case T_INT: 4998 evpexpandd(dst, mask, src, merge, vec_enc); 4999 break; 5000 case T_FLOAT: 5001 evexpandps(dst, mask, src, merge, vec_enc); 5002 break; 5003 case T_LONG: 5004 evpexpandq(dst, mask, src, merge, vec_enc); 5005 break; 5006 case T_DOUBLE: 5007 evexpandpd(dst, mask, src, merge, vec_enc); 5008 break; 5009 default: 5010 fatal("Unsupported type %s", type2name(bt)); 5011 break; 5012 } 5013 } 5014 } 5015 #endif 5016 5017 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5018 KRegister ktmp1, int vec_enc) { 5019 if (opcode == Op_SignumVD) { 5020 vsubpd(dst, zero, one, vec_enc); 5021 // if src < 0 ? -1 : 1 5022 evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5023 evblendmpd(dst, ktmp1, one, dst, true, vec_enc); 5024 // if src == NaN, -0.0 or 0.0 return src. 5025 evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5026 evblendmpd(dst, ktmp1, dst, src, true, vec_enc); 5027 } else { 5028 assert(opcode == Op_SignumVF, ""); 5029 vsubps(dst, zero, one, vec_enc); 5030 // if src < 0 ? -1 : 1 5031 evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5032 evblendmps(dst, ktmp1, one, dst, true, vec_enc); 5033 // if src == NaN, -0.0 or 0.0 return src. 5034 evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5035 evblendmps(dst, ktmp1, dst, src, true, vec_enc); 5036 } 5037 } 5038 5039 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5040 XMMRegister xtmp1, int vec_enc) { 5041 if (opcode == Op_SignumVD) { 5042 vsubpd(dst, zero, one, vec_enc); 5043 // if src < 0 ? -1 : 1 5044 vblendvpd(dst, one, dst, src, vec_enc); 5045 // if src == NaN, -0.0 or 0.0 return src. 5046 vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc); 5047 vblendvpd(dst, dst, src, xtmp1, vec_enc); 5048 } else { 5049 assert(opcode == Op_SignumVF, ""); 5050 vsubps(dst, zero, one, vec_enc); 5051 // if src < 0 ? 
-1 : 1
    vblendvps(dst, one, dst, src, vec_enc);
    // if src == NaN, -0.0 or 0.0 return src.
    vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
    vblendvps(dst, dst, src, xtmp1, vec_enc);
  }
}

void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
  if (VM_Version::supports_avx512bw()) {
    if (mask_len > 32) {
      kmovql(dst, src);
    } else {
      kmovdl(dst, src);
      if (mask_len != 32) {
        kshiftrdl(dst, dst, 32 - mask_len);
      }
    }
  } else {
    assert(mask_len <= 16, "");
    kmovwl(dst, src);
    if (mask_len != 16) {
      kshiftrwl(dst, dst, 16 - mask_len);
    }
  }
}

void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
  int lane_size = type2aelembytes(bt);
  bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
  if ((is_LP64 || lane_size < 8) &&
      ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
       (is_subword_type(bt) && VM_Version::supports_avx512vlbw()))) {
    movptr(rtmp, imm32);
    switch(lane_size) {
      case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
      case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
      case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
      case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
      default : fatal("Unsupported lane size %d", lane_size); break;
    }
  } else {
    movptr(rtmp, imm32);
    LP64_ONLY(movq(dst, rtmp)) NOT_LP64(movdl(dst, rtmp));
    switch(lane_size) {
      case 1 : vpbroadcastb(dst, dst, vec_enc); break;
      case 2 : vpbroadcastw(dst, dst, vec_enc); break;
      case 4 : vpbroadcastd(dst, dst, vec_enc); break;
      case 8 : vpbroadcastq(dst, dst, vec_enc); break;
      default : fatal("Unsupported lane size %d", lane_size); break;
    }
  }
}

//
// The following is a lookup-table-based popcount computation algorithm:
//           Index   Bit set count
//     [ 0000 ->   0,
//       0001 ->   1,
//       0010 ->   1,
//       0011 ->   2,
//       0100 ->   1,
//       0101 ->   2,
//       0110 ->   2,
//       0111 ->   3,
//       1000 ->   1,
//       1001 ->   2,
//       1010 ->   2,
//       1011 ->   3,
//       1100 ->   2,
//       1101 ->   3,
//       1110 ->   3,
//       1111 ->   4 ]
// a. Count the number of 1s in the 4 LSB bits of each byte. These bits are used as
//    shuffle indices for lookup table access.
// b. Right shift each byte of the vector lane by 4 positions.
// c. Count the number of 1s in the 4 MSB bits of each byte. These bits are used as
//    shuffle indices for lookup table access.
// d. Add the bit-set counts of the upper and lower 4 bits of each byte.
// e. Unpack double words to quad words and compute the sum of absolute differences of the
//    bit-set counts of all the bytes of a quadword.
// f. Perform step e. for the upper 128-bit vector lane.
// g. Pack the bit-set counts of the quadwords back to double words.
// h. The unpacking and packing operations are not needed for 64-bit vector lanes.
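// Illustrative scalar equivalent of steps a.-d. for a single byte (the table name is
// hypothetical; the vector code performs the same lookup with vpshufb against
// StubRoutines::x86::vector_popcount_lut()):
//   static const uint8_t POPCNT_LUT[16] = {0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4};
//   uint8_t lo = POPCNT_LUT[b & 0x0F];        // step a: popcount of the low nibble
//   uint8_t hi = POPCNT_LUT[(b >> 4) & 0x0F]; // steps b,c: popcount of the high nibble
//   uint8_t popcnt = lo + hi;                 // step d: per-byte bit count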
5136 5137 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5138 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5139 assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), ""); 5140 vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc); 5141 vpsrlw(dst, src, 4, vec_enc); 5142 vpand(dst, dst, xtmp1, vec_enc); 5143 vpand(xtmp1, src, xtmp1, vec_enc); 5144 vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg); 5145 vpshufb(xtmp1, xtmp2, xtmp1, vec_enc); 5146 vpshufb(dst, xtmp2, dst, vec_enc); 5147 vpaddb(dst, dst, xtmp1, vec_enc); 5148 } 5149 5150 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5151 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5152 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5153 // Following code is as per steps e,f,g and h of above algorithm. 5154 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5155 vpunpckhdq(dst, xtmp1, xtmp2, vec_enc); 5156 vpsadbw(dst, dst, xtmp2, vec_enc); 5157 vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc); 5158 vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc); 5159 vpackuswb(dst, xtmp1, dst, vec_enc); 5160 } 5161 5162 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5163 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5164 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5165 // Add the popcount of upper and lower bytes of word. 5166 vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc); 5167 vpsrlw(dst, xtmp1, 8, vec_enc); 5168 vpand(xtmp1, xtmp1, xtmp2, vec_enc); 5169 vpaddw(dst, dst, xtmp1, vec_enc); 5170 } 5171 5172 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5173 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5174 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5175 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5176 vpsadbw(dst, xtmp1, xtmp2, vec_enc); 5177 } 5178 5179 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5180 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5181 switch(bt) { 5182 case T_LONG: 5183 vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5184 break; 5185 case T_INT: 5186 vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5187 break; 5188 case T_CHAR: 5189 case T_SHORT: 5190 vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5191 break; 5192 case T_BYTE: 5193 case T_BOOLEAN: 5194 vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5195 break; 5196 default: 5197 fatal("Unsupported type %s", type2name(bt)); 5198 break; 5199 } 5200 } 5201 5202 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src, 5203 KRegister mask, bool merge, int vec_enc) { 5204 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 5205 switch(bt) { 5206 case T_LONG: 5207 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 5208 evpopcntq(dst, mask, src, merge, vec_enc); 5209 break; 5210 case T_INT: 5211 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 5212 evpopcntd(dst, mask, src, merge, vec_enc); 5213 break; 5214 case T_CHAR: 5215 case T_SHORT: 5216 assert(VM_Version::supports_avx512_bitalg(), ""); 5217 evpopcntw(dst, mask, src, merge, vec_enc); 5218 break; 5219 case T_BYTE: 5220 case T_BOOLEAN: 5221 assert(VM_Version::supports_avx512_bitalg(), ""); 5222 evpopcntb(dst, mask, 
src, merge, vec_enc); 5223 break; 5224 default: 5225 fatal("Unsupported type %s", type2name(bt)); 5226 break; 5227 } 5228 } 5229 5230 #ifndef _LP64 5231 void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) { 5232 assert(VM_Version::supports_avx512bw(), ""); 5233 kmovdl(tmp, src); 5234 kunpckdql(dst, tmp, tmp); 5235 } 5236 #endif 5237 5238 // Bit reversal algorithm first reverses the bits of each byte followed by 5239 // a byte level reversal for multi-byte primitive types (short/int/long). 5240 // Algorithm performs a lookup table access to get reverse bit sequence 5241 // corresponding to a 4 bit value. Thus a reverse bit sequence for a byte 5242 // is obtained by swapping the reverse bit sequences of upper and lower 5243 // nibble of a byte. 5244 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5245 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5246 if (VM_Version::supports_avx512vlbw()) { 5247 5248 // Get the reverse bit sequence of lower nibble of each byte. 5249 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg); 5250 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 5251 evpandq(dst, xtmp2, src, vec_enc); 5252 vpshufb(dst, xtmp1, dst, vec_enc); 5253 vpsllq(dst, dst, 4, vec_enc); 5254 5255 // Get the reverse bit sequence of upper nibble of each byte. 5256 vpandn(xtmp2, xtmp2, src, vec_enc); 5257 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 5258 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 5259 5260 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 5261 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 5262 evporq(xtmp2, dst, xtmp2, vec_enc); 5263 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 5264 5265 } else if(vec_enc == Assembler::AVX_512bit) { 5266 // Shift based bit reversal. 5267 assert(bt == T_LONG || bt == T_INT, ""); 5268 5269 // Swap lower and upper nibble of each byte. 5270 vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc); 5271 5272 // Swap two least and most significant bits of each nibble. 5273 vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc); 5274 5275 // Swap adjacent pair of bits. 5276 evmovdqul(xtmp1, k0, dst, true, vec_enc); 5277 vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc); 5278 5279 evmovdqul(xtmp1, k0, dst, true, vec_enc); 5280 vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc); 5281 } else { 5282 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp); 5283 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 5284 5285 // Get the reverse bit sequence of lower nibble of each byte. 5286 vpand(dst, xtmp2, src, vec_enc); 5287 vpshufb(dst, xtmp1, dst, vec_enc); 5288 vpsllq(dst, dst, 4, vec_enc); 5289 5290 // Get the reverse bit sequence of upper nibble of each byte. 5291 vpandn(xtmp2, xtmp2, src, vec_enc); 5292 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 5293 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 5294 5295 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 5296 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 
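    // Per-byte scalar equivalent of the nibble lookup above (illustrative; REV_LUT
    // stands for the table at StubRoutines::x86::vector_reverse_bit_lut(), mapping a
    // nibble to its 4-bit reversal):
    //   uint8_t rev = (uint8_t)((REV_LUT[b & 0x0F] << 4) | REV_LUT[(b >> 4) & 0x0F]);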
5297 vpor(xtmp2, dst, xtmp2, vec_enc); 5298 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 5299 } 5300 } 5301 5302 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc, 5303 XMMRegister xtmp, Register rscratch) { 5304 assert(VM_Version::supports_gfni(), ""); 5305 assert(rscratch != noreg || always_reachable(mask), "missing"); 5306 5307 // Galois field instruction based bit reversal based on following algorithm. 5308 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 5309 vpbroadcastq(xtmp, mask, vec_enc, rscratch); 5310 vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc); 5311 vector_reverse_byte(bt, dst, xtmp, vec_enc); 5312 } 5313 5314 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src, 5315 XMMRegister xtmp1, Register rtmp, int vec_enc) { 5316 vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc); 5317 evpandq(dst, xtmp1, src, vec_enc); 5318 vpsllq(dst, dst, nbits, vec_enc); 5319 vpandn(xtmp1, xtmp1, src, vec_enc); 5320 vpsrlq(xtmp1, xtmp1, nbits, vec_enc); 5321 evporq(dst, dst, xtmp1, vec_enc); 5322 } 5323 5324 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5325 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5326 // Shift based bit reversal. 5327 assert(VM_Version::supports_evex(), ""); 5328 switch(bt) { 5329 case T_LONG: 5330 // Swap upper and lower double word of each quad word. 5331 evprorq(xtmp1, k0, src, 32, true, vec_enc); 5332 evprord(xtmp1, k0, xtmp1, 16, true, vec_enc); 5333 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 5334 break; 5335 case T_INT: 5336 // Swap upper and lower word of each double word. 5337 evprord(xtmp1, k0, src, 16, true, vec_enc); 5338 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 5339 break; 5340 case T_CHAR: 5341 case T_SHORT: 5342 // Swap upper and lower byte of each word. 5343 vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc); 5344 break; 5345 case T_BYTE: 5346 evmovdquq(dst, k0, src, true, vec_enc); 5347 break; 5348 default: 5349 fatal("Unsupported type %s", type2name(bt)); 5350 break; 5351 } 5352 } 5353 5354 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) { 5355 if (bt == T_BYTE) { 5356 if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) { 5357 evmovdquq(dst, k0, src, true, vec_enc); 5358 } else { 5359 vmovdqu(dst, src); 5360 } 5361 return; 5362 } 5363 // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using 5364 // pre-computed shuffle indices. 
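  // For T_INT, for instance, the per-128-bit-lane shuffle indices are conceptually
  //   3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12
  // so vpshufb emits the four bytes of every int in reverse order.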
5365 switch(bt) { 5366 case T_LONG: 5367 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg); 5368 break; 5369 case T_INT: 5370 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg); 5371 break; 5372 case T_CHAR: 5373 case T_SHORT: 5374 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg); 5375 break; 5376 default: 5377 fatal("Unsupported type %s", type2name(bt)); 5378 break; 5379 } 5380 vpshufb(dst, src, dst, vec_enc); 5381 } 5382 5383 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src, 5384 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 5385 KRegister ktmp, Register rtmp, bool merge, int vec_enc) { 5386 assert(is_integral_type(bt), ""); 5387 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 5388 assert(VM_Version::supports_avx512cd(), ""); 5389 switch(bt) { 5390 case T_LONG: 5391 evplzcntq(dst, ktmp, src, merge, vec_enc); 5392 break; 5393 case T_INT: 5394 evplzcntd(dst, ktmp, src, merge, vec_enc); 5395 break; 5396 case T_SHORT: 5397 vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc); 5398 vpunpcklwd(xtmp2, xtmp1, src, vec_enc); 5399 evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc); 5400 vpunpckhwd(dst, xtmp1, src, vec_enc); 5401 evplzcntd(dst, ktmp, dst, merge, vec_enc); 5402 vpackusdw(dst, xtmp2, dst, vec_enc); 5403 break; 5404 case T_BYTE: 5405 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 5406 // accessing the lookup table. 5407 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 5408 // accessing the lookup table. 5409 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 5410 assert(VM_Version::supports_avx512bw(), ""); 5411 evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp); 5412 vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc); 5413 vpand(xtmp2, dst, src, vec_enc); 5414 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 5415 vpsrlw(xtmp3, src, 4, vec_enc); 5416 vpand(xtmp3, dst, xtmp3, vec_enc); 5417 vpshufb(dst, xtmp1, xtmp3, vec_enc); 5418 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 5419 evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc); 5420 evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc); 5421 break; 5422 default: 5423 fatal("Unsupported type %s", type2name(bt)); 5424 break; 5425 } 5426 } 5427 5428 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5429 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 5430 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp); 5431 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 5432 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 5433 // accessing the lookup table. 5434 vpand(dst, xtmp2, src, vec_enc); 5435 vpshufb(dst, xtmp1, dst, vec_enc); 5436 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 5437 // accessing the lookup table. 5438 vpsrlw(xtmp3, src, 4, vec_enc); 5439 vpand(xtmp3, xtmp2, xtmp3, vec_enc); 5440 vpshufb(xtmp2, xtmp1, xtmp3, vec_enc); 5441 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 
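  // Per-byte scalar equivalent (illustrative; CLZ_LUT stands for the table at
  // StubRoutines::x86::vector_count_leading_zeros_lut(), mapping a nibble to its
  // leading zero count in the range 0..4):
  //   uint8_t t1 = CLZ_LUT[b & 0x0F];                // low nibble
  //   uint8_t t2 = CLZ_LUT[(b >> 4) & 0x0F];         // high nibble
  //   uint8_t clz = ((b >> 4) == 0) ? t1 + t2 : t2;  // t2 == 4 when the high nibble is zero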
5442 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 5443 vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc); 5444 vpaddb(dst, dst, xtmp2, vec_enc); 5445 vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc); 5446 } 5447 5448 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5449 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 5450 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 5451 // Add zero counts of lower byte and upper byte of a word if 5452 // upper byte holds a zero value. 5453 vpsrlw(xtmp3, src, 8, vec_enc); 5454 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx. 5455 vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc); 5456 vpsllw(xtmp2, dst, 8, vec_enc); 5457 vpaddw(xtmp2, xtmp2, dst, vec_enc); 5458 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 5459 vpsrlw(dst, dst, 8, vec_enc); 5460 } 5461 5462 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5463 XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) { 5464 // Since IEEE 754 floating point format represents mantissa in 1.0 format 5465 // hence biased exponent can be used to compute leading zero count as per 5466 // following formula:- 5467 // LZCNT = 32 - (biased_exp - 127) 5468 // Special handling has been introduced for Zero, Max_Int and -ve source values. 5469 5470 // Broadcast 0xFF 5471 vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc); 5472 vpsrld(xtmp1, xtmp1, 24, vec_enc); 5473 5474 // Extract biased exponent. 5475 vcvtdq2ps(dst, src, vec_enc); 5476 vpsrld(dst, dst, 23, vec_enc); 5477 vpand(dst, dst, xtmp1, vec_enc); 5478 5479 // Broadcast 127. 5480 vpsrld(xtmp1, xtmp1, 1, vec_enc); 5481 // Exponent = biased_exp - 127 5482 vpsubd(dst, dst, xtmp1, vec_enc); 5483 5484 // Exponent = Exponent + 1 5485 vpsrld(xtmp3, xtmp1, 6, vec_enc); 5486 vpaddd(dst, dst, xtmp3, vec_enc); 5487 5488 // Replace -ve exponent with zero, exponent is -ve when src 5489 // lane contains a zero value. 5490 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5491 vblendvps(dst, dst, xtmp2, dst, vec_enc); 5492 5493 // Rematerialize broadcast 32. 5494 vpslld(xtmp1, xtmp3, 5, vec_enc); 5495 // Exponent is 32 if corresponding source lane contains max_int value. 5496 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 5497 // LZCNT = 32 - exponent 5498 vpsubd(dst, xtmp1, dst, vec_enc); 5499 5500 // Replace LZCNT with a value 1 if corresponding source lane 5501 // contains max_int value. 5502 vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc); 5503 5504 // Replace biased_exp with 0 if source lane value is less than zero. 5505 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5506 vblendvps(dst, dst, xtmp2, src, vec_enc); 5507 } 5508 5509 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5510 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 5511 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 5512 // Add zero counts of lower word and upper word of a double word if 5513 // upper word holds a zero value. 5514 vpsrld(xtmp3, src, 16, vec_enc); 5515 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx. 5516 vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc); 5517 vpslld(xtmp2, dst, 16, vec_enc); 5518 vpaddd(xtmp2, xtmp2, dst, vec_enc); 5519 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 5520 vpsrld(dst, dst, 16, vec_enc); 5521 // Add zero counts of lower doubleword and upper doubleword of a 5522 // quadword if upper doubleword holds a zero value. 
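  // Scalar view of the doubleword-to-quadword combine below (illustrative), with clz32()
  // denoting the per-doubleword counts already held in dst:
  //   clz64 = (hi32 == 0) ? clz32(hi32) + clz32(lo32)   // == 32 + clz32(lo32)
  //                       : clz32(hi32);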
5523 vpsrlq(xtmp3, src, 32, vec_enc); 5524 vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc); 5525 vpsllq(xtmp2, dst, 32, vec_enc); 5526 vpaddq(xtmp2, xtmp2, dst, vec_enc); 5527 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 5528 vpsrlq(dst, dst, 32, vec_enc); 5529 } 5530 5531 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, 5532 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 5533 Register rtmp, int vec_enc) { 5534 assert(is_integral_type(bt), "unexpected type"); 5535 assert(vec_enc < Assembler::AVX_512bit, ""); 5536 switch(bt) { 5537 case T_LONG: 5538 vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 5539 break; 5540 case T_INT: 5541 vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc); 5542 break; 5543 case T_SHORT: 5544 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 5545 break; 5546 case T_BYTE: 5547 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 5548 break; 5549 default: 5550 fatal("Unsupported type %s", type2name(bt)); 5551 break; 5552 } 5553 } 5554 5555 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) { 5556 switch(bt) { 5557 case T_BYTE: 5558 vpsubb(dst, src1, src2, vec_enc); 5559 break; 5560 case T_SHORT: 5561 vpsubw(dst, src1, src2, vec_enc); 5562 break; 5563 case T_INT: 5564 vpsubd(dst, src1, src2, vec_enc); 5565 break; 5566 case T_LONG: 5567 vpsubq(dst, src1, src2, vec_enc); 5568 break; 5569 default: 5570 fatal("Unsupported type %s", type2name(bt)); 5571 break; 5572 } 5573 } 5574 5575 // Trailing zero count computation is based on leading zero count operation as per 5576 // following equation. All AVX3 targets support AVX512CD feature which offers 5577 // direct vector instruction to compute leading zero count. 
5578 // CTZ = PRIM_TYPE_WIDHT - CLZ((x - 1) & ~x) 5579 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src, 5580 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 5581 XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) { 5582 assert(is_integral_type(bt), ""); 5583 // xtmp = -1 5584 vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc); 5585 // xtmp = xtmp + src 5586 vpadd(bt, xtmp4, xtmp4, src, vec_enc); 5587 // xtmp = xtmp & ~src 5588 vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc); 5589 vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc); 5590 vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc); 5591 vpsub(bt, dst, xtmp4, dst, vec_enc); 5592 } 5593 5594 // Trailing zero count computation for AVX2 targets is based on popcount operation as per following equation 5595 // CTZ = PRIM_TYPE_WIDHT - POPC(x | -x) 5596 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5597 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 5598 assert(is_integral_type(bt), ""); 5599 // xtmp = 0 5600 vpxor(xtmp3 , xtmp3, xtmp3, vec_enc); 5601 // xtmp = 0 - src 5602 vpsub(bt, xtmp3, xtmp3, src, vec_enc); 5603 // xtmp = xtmp | src 5604 vpor(xtmp3, xtmp3, src, vec_enc); 5605 vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc); 5606 vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc); 5607 vpsub(bt, dst, xtmp1, dst, vec_enc); 5608 } 5609 5610 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) { 5611 Label done; 5612 Label neg_divisor_fastpath; 5613 cmpl(divisor, 0); 5614 jccb(Assembler::less, neg_divisor_fastpath); 5615 xorl(rdx, rdx); 5616 divl(divisor); 5617 jmpb(done); 5618 bind(neg_divisor_fastpath); 5619 // Fastpath for divisor < 0: 5620 // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1) 5621 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned() 5622 movl(rdx, rax); 5623 subl(rdx, divisor); 5624 if (VM_Version::supports_bmi1()) { 5625 andnl(rax, rdx, rax); 5626 } else { 5627 notl(rdx); 5628 andl(rax, rdx); 5629 } 5630 shrl(rax, 31); 5631 bind(done); 5632 } 5633 5634 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) { 5635 Label done; 5636 Label neg_divisor_fastpath; 5637 cmpl(divisor, 0); 5638 jccb(Assembler::less, neg_divisor_fastpath); 5639 xorl(rdx, rdx); 5640 divl(divisor); 5641 jmpb(done); 5642 bind(neg_divisor_fastpath); 5643 // Fastpath when divisor < 0: 5644 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor) 5645 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned() 5646 movl(rdx, rax); 5647 subl(rax, divisor); 5648 if (VM_Version::supports_bmi1()) { 5649 andnl(rax, rax, rdx); 5650 } else { 5651 notl(rax); 5652 andl(rax, rdx); 5653 } 5654 sarl(rax, 31); 5655 andl(rax, divisor); 5656 subl(rdx, rax); 5657 bind(done); 5658 } 5659 5660 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) { 5661 Label done; 5662 Label neg_divisor_fastpath; 5663 5664 cmpl(divisor, 0); 5665 jccb(Assembler::less, neg_divisor_fastpath); 5666 xorl(rdx, rdx); 5667 divl(divisor); 5668 jmpb(done); 5669 bind(neg_divisor_fastpath); 5670 // Fastpath for divisor < 0: 5671 // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1) 5672 // 
remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor) 5673 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in 5674 // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned() 5675 movl(rdx, rax); 5676 subl(rax, divisor); 5677 if (VM_Version::supports_bmi1()) { 5678 andnl(rax, rax, rdx); 5679 } else { 5680 notl(rax); 5681 andl(rax, rdx); 5682 } 5683 movl(tmp, rax); 5684 shrl(rax, 31); // quotient 5685 sarl(tmp, 31); 5686 andl(tmp, divisor); 5687 subl(rdx, tmp); // remainder 5688 bind(done); 5689 } 5690 5691 #ifdef _LP64 5692 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1, 5693 XMMRegister xtmp2, Register rtmp) { 5694 if(VM_Version::supports_gfni()) { 5695 // Galois field instruction based bit reversal based on following algorithm. 5696 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 5697 mov64(rtmp, 0x8040201008040201L); 5698 movq(xtmp1, src); 5699 movq(xtmp2, rtmp); 5700 gf2p8affineqb(xtmp1, xtmp2, 0); 5701 movq(dst, xtmp1); 5702 } else { 5703 // Swap even and odd numbered bits. 5704 movl(rtmp, src); 5705 andl(rtmp, 0x55555555); 5706 shll(rtmp, 1); 5707 movl(dst, src); 5708 andl(dst, 0xAAAAAAAA); 5709 shrl(dst, 1); 5710 orl(dst, rtmp); 5711 5712 // Swap LSB and MSB 2 bits of each nibble. 5713 movl(rtmp, dst); 5714 andl(rtmp, 0x33333333); 5715 shll(rtmp, 2); 5716 andl(dst, 0xCCCCCCCC); 5717 shrl(dst, 2); 5718 orl(dst, rtmp); 5719 5720 // Swap LSB and MSB 4 bits of each byte. 5721 movl(rtmp, dst); 5722 andl(rtmp, 0x0F0F0F0F); 5723 shll(rtmp, 4); 5724 andl(dst, 0xF0F0F0F0); 5725 shrl(dst, 4); 5726 orl(dst, rtmp); 5727 } 5728 bswapl(dst); 5729 } 5730 5731 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1, 5732 XMMRegister xtmp2, Register rtmp1, Register rtmp2) { 5733 if(VM_Version::supports_gfni()) { 5734 // Galois field instruction based bit reversal based on following algorithm. 5735 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 5736 mov64(rtmp1, 0x8040201008040201L); 5737 movq(xtmp1, src); 5738 movq(xtmp2, rtmp1); 5739 gf2p8affineqb(xtmp1, xtmp2, 0); 5740 movq(dst, xtmp1); 5741 } else { 5742 // Swap even and odd numbered bits. 5743 movq(rtmp1, src); 5744 mov64(rtmp2, 0x5555555555555555L); 5745 andq(rtmp1, rtmp2); 5746 shlq(rtmp1, 1); 5747 movq(dst, src); 5748 notq(rtmp2); 5749 andq(dst, rtmp2); 5750 shrq(dst, 1); 5751 orq(dst, rtmp1); 5752 5753 // Swap LSB and MSB 2 bits of each nibble. 5754 movq(rtmp1, dst); 5755 mov64(rtmp2, 0x3333333333333333L); 5756 andq(rtmp1, rtmp2); 5757 shlq(rtmp1, 2); 5758 notq(rtmp2); 5759 andq(dst, rtmp2); 5760 shrq(dst, 2); 5761 orq(dst, rtmp1); 5762 5763 // Swap LSB and MSB 4 bits of each byte. 
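    // Each swap step follows the same mask-and-shift pattern (illustrative), here with
    // m = 0x0F0F0F0F0F0F0F0F and a shift distance of 4:
    //   x = ((x & m) << 4) | ((x & ~m) >> 4);
    // The final bswapq then reverses the byte order to complete the bit reversal.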
5764 movq(rtmp1, dst); 5765 mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL); 5766 andq(rtmp1, rtmp2); 5767 shlq(rtmp1, 4); 5768 notq(rtmp2); 5769 andq(dst, rtmp2); 5770 shrq(dst, 4); 5771 orq(dst, rtmp1); 5772 } 5773 bswapq(dst); 5774 } 5775 5776 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) { 5777 Label done; 5778 Label neg_divisor_fastpath; 5779 cmpq(divisor, 0); 5780 jccb(Assembler::less, neg_divisor_fastpath); 5781 xorl(rdx, rdx); 5782 divq(divisor); 5783 jmpb(done); 5784 bind(neg_divisor_fastpath); 5785 // Fastpath for divisor < 0: 5786 // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1) 5787 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned() 5788 movq(rdx, rax); 5789 subq(rdx, divisor); 5790 if (VM_Version::supports_bmi1()) { 5791 andnq(rax, rdx, rax); 5792 } else { 5793 notq(rdx); 5794 andq(rax, rdx); 5795 } 5796 shrq(rax, 63); 5797 bind(done); 5798 } 5799 5800 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) { 5801 Label done; 5802 Label neg_divisor_fastpath; 5803 cmpq(divisor, 0); 5804 jccb(Assembler::less, neg_divisor_fastpath); 5805 xorq(rdx, rdx); 5806 divq(divisor); 5807 jmp(done); 5808 bind(neg_divisor_fastpath); 5809 // Fastpath when divisor < 0: 5810 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor) 5811 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned() 5812 movq(rdx, rax); 5813 subq(rax, divisor); 5814 if (VM_Version::supports_bmi1()) { 5815 andnq(rax, rax, rdx); 5816 } else { 5817 notq(rax); 5818 andq(rax, rdx); 5819 } 5820 sarq(rax, 63); 5821 andq(rax, divisor); 5822 subq(rdx, rax); 5823 bind(done); 5824 } 5825 5826 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) { 5827 Label done; 5828 Label neg_divisor_fastpath; 5829 cmpq(divisor, 0); 5830 jccb(Assembler::less, neg_divisor_fastpath); 5831 xorq(rdx, rdx); 5832 divq(divisor); 5833 jmp(done); 5834 bind(neg_divisor_fastpath); 5835 // Fastpath for divisor < 0: 5836 // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1) 5837 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor) 5838 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in 5839 // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned() 5840 movq(rdx, rax); 5841 subq(rax, divisor); 5842 if (VM_Version::supports_bmi1()) { 5843 andnq(rax, rax, rdx); 5844 } else { 5845 notq(rax); 5846 andq(rax, rdx); 5847 } 5848 movq(tmp, rax); 5849 shrq(rax, 63); // quotient 5850 sarq(tmp, 63); 5851 andq(tmp, divisor); 5852 subq(rdx, tmp); // remainder 5853 bind(done); 5854 } 5855 #endif 5856 5857 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1, 5858 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp, 5859 int vlen_enc) { 5860 assert(VM_Version::supports_avx512bw(), ""); 5861 // Byte shuffles are inlane operations and indices are determined using 5862 // lower 4 bit of each shuffle lane, thus all shuffle indices are 5863 // normalized to index range 0-15. This makes sure that all the multiples 5864 // of an index value are placed at same relative position in 128 bit 5865 // lane i.e. elements corresponding to shuffle indices 16, 32 and 64 5866 // will be 16th element in their respective 128 bit lanes. 
  movl(rtmp, 16);
  evpbroadcastb(xtmp1, rtmp, vlen_enc);

  // Compute a mask for the shuffle vector by comparing indices with the expression INDEX < 16.
  // Broadcast the first 128-bit lane of src across the entire vector, shuffle it using the
  // original shuffle indices, and move the shuffled bytes selected by the mask into the
  // destination vector.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
  evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);

  // Perform the above steps with the lane comparison expression INDEX >= 16 && INDEX < 32
  // and broadcasting the second 128-bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
  vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Perform the above steps with the lane comparison expression INDEX >= 32 && INDEX < 48
  // and broadcasting the third 128-bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp2, Assembler::nlt, true, vlen_enc);
  vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);

  // Perform the above steps with the lane comparison expression INDEX >= 48 && INDEX < 64
  // and broadcasting the fourth 128-bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
  vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
}
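// Conceptual per-byte selection implemented by rearrange_bytes above (illustrative only):
//   int lane = shuffle[i] >> 4;                  // which 128-bit lane of src (0..3)
//   dst[i]   = src128[lane][shuffle[i] & 0x0F];  // in-lane vpshufb picks the byte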