1 /* 2 * Copyright (c) 2020, 2023, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 22 * 23 */ 24 25 #include "precompiled.hpp" 26 #include "asm/assembler.hpp" 27 #include "asm/assembler.inline.hpp" 28 #include "gc/shared/barrierSet.hpp" 29 #include "gc/shared/barrierSetAssembler.hpp" 30 #include "oops/methodData.hpp" 31 #include "opto/c2_MacroAssembler.hpp" 32 #include "opto/intrinsicnode.hpp" 33 #include "opto/output.hpp" 34 #include "opto/opcodes.hpp" 35 #include "opto/subnode.hpp" 36 #include "runtime/objectMonitor.hpp" 37 #include "runtime/stubRoutines.hpp" 38 #include "utilities/checkedCast.hpp" 39 40 #ifdef PRODUCT 41 #define BLOCK_COMMENT(str) /* nothing */ 42 #define STOP(error) stop(error) 43 #else 44 #define BLOCK_COMMENT(str) block_comment(str) 45 #define STOP(error) block_comment(error); stop(error) 46 #endif 47 48 // C2 compiled method's prolog code. 49 void C2_MacroAssembler::verified_entry(Compile* C, int sp_inc) { 50 if (C->clinit_barrier_on_entry()) { 51 assert(VM_Version::supports_fast_class_init_checks(), "sanity"); 52 assert(!C->method()->holder()->is_not_initialized(), "initialization should have been started"); 53 54 Label L_skip_barrier; 55 Register klass = rscratch1; 56 57 mov_metadata(klass, C->method()->holder()->constant_encoding()); 58 clinit_barrier(klass, r15_thread, &L_skip_barrier /*L_fast_path*/); 59 60 jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); // slow path 61 62 bind(L_skip_barrier); 63 } 64 65 int framesize = C->output()->frame_size_in_bytes(); 66 int bangsize = C->output()->bang_size_in_bytes(); 67 bool fp_mode_24b = false; 68 int stack_bang_size = C->output()->need_stack_bang(bangsize) ? bangsize : 0; 69 70 // WARNING: Initial instruction MUST be 5 bytes or longer so that 71 // NativeJump::patch_verified_entry will be able to patch out the entry 72 // code safely. The push to verify stack depth is ok at 5 bytes, 73 // the frame allocation can be either 3 or 6 bytes. So if we don't do 74 // stack bang then we must use the 6 byte frame allocation even if 75 // we have no frame. :-( 76 assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect"); 77 78 assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned"); 79 // Remove word for return addr 80 framesize -= wordSize; 81 stack_bang_size -= wordSize; 82 83 // Calls to C2R adapters often do not accept exceptional returns. 84 // We require that their callers must bang for them. 
But be careful, because 85 // some VM calls (such as call site linkage) can use several kilobytes of 86 // stack. But the stack safety zone should account for that. 87 // See bugs 4446381, 4468289, 4497237. 88 if (stack_bang_size > 0) { 89 generate_stack_overflow_check(stack_bang_size); 90 91 // We always push rbp, so that on return to interpreter rbp, will be 92 // restored correctly and we can correct the stack. 93 push(rbp); 94 // Save caller's stack pointer into RBP if the frame pointer is preserved. 95 if (PreserveFramePointer) { 96 mov(rbp, rsp); 97 } 98 // Remove word for ebp 99 framesize -= wordSize; 100 101 // Create frame 102 if (framesize) { 103 subptr(rsp, framesize); 104 } 105 } else { 106 // Create frame (force generation of a 4 byte immediate value) 107 subptr_imm32(rsp, framesize); 108 109 // Save RBP register now. 110 framesize -= wordSize; 111 movptr(Address(rsp, framesize), rbp); 112 // Save caller's stack pointer into RBP if the frame pointer is preserved. 113 if (PreserveFramePointer) { 114 movptr(rbp, rsp); 115 if (framesize > 0) { 116 addptr(rbp, framesize); 117 } 118 } 119 } 120 121 if (C->needs_stack_repair()) { 122 // Save stack increment just below the saved rbp (also account for fixed framesize and rbp) 123 assert((sp_inc & (StackAlignmentInBytes-1)) == 0, "stack increment not aligned"); 124 movptr(Address(rsp, framesize - wordSize), sp_inc + framesize + wordSize); 125 } 126 127 if (VerifyStackAtCalls) { // Majik cookie to verify stack depth 128 framesize -= wordSize; 129 movptr(Address(rsp, framesize), (int32_t)0xbadb100d); 130 } 131 132 #ifndef _LP64 133 // If method sets FPU control word do it now 134 if (fp_mode_24b) { 135 fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24())); 136 } 137 if (UseSSE >= 2 && VerifyFPU) { 138 verify_FPU(0, "FPU stack must be clean on entry"); 139 } 140 #endif 141 142 #ifdef ASSERT 143 if (VerifyStackAtCalls) { 144 Label L; 145 push(rax); 146 mov(rax, rsp); 147 andptr(rax, StackAlignmentInBytes-1); 148 cmpptr(rax, StackAlignmentInBytes-wordSize); 149 pop(rax); 150 jcc(Assembler::equal, L); 151 STOP("Stack is not properly aligned!"); 152 bind(L); 153 } 154 #endif 155 } 156 157 void C2_MacroAssembler::entry_barrier() { 158 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 159 #ifdef _LP64 160 if (BarrierSet::barrier_set()->barrier_set_nmethod() != nullptr) { 161 // We put the non-hot code of the nmethod entry barrier out-of-line in a stub. 162 Label dummy_slow_path; 163 Label dummy_continuation; 164 Label* slow_path = &dummy_slow_path; 165 Label* continuation = &dummy_continuation; 166 if (!Compile::current()->output()->in_scratch_emit_size()) { 167 // Use real labels from actual stub when not emitting code for the purpose of measuring its size 168 C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub(); 169 Compile::current()->output()->add_stub(stub); 170 slow_path = &stub->entry(); 171 continuation = &stub->continuation(); 172 } 173 bs->nmethod_entry_barrier(this, slow_path, continuation); 174 } 175 #else 176 // Don't bother with out-of-line nmethod entry barrier stub for x86_32. 
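  // Passing null labels below: no C2EntryBarrierStub is created for the slow path on
  // 32-bit; see the 64-bit branch above for the out-of-line stub version.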
177 bs->nmethod_entry_barrier(this, nullptr /* slow_path */, nullptr /* continuation */); 178 #endif 179 } 180 181 inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) { 182 switch (vlen_in_bytes) { 183 case 4: // fall-through 184 case 8: // fall-through 185 case 16: return Assembler::AVX_128bit; 186 case 32: return Assembler::AVX_256bit; 187 case 64: return Assembler::AVX_512bit; 188 189 default: { 190 ShouldNotReachHere(); 191 return Assembler::AVX_NoVec; 192 } 193 } 194 } 195 196 #if INCLUDE_RTM_OPT 197 198 // Update rtm_counters based on abort status 199 // input: abort_status 200 // rtm_counters (RTMLockingCounters*) 201 // flags are killed 202 void C2_MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) { 203 204 atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset())); 205 if (PrintPreciseRTMLockingStatistics) { 206 for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) { 207 Label check_abort; 208 testl(abort_status, (1<<i)); 209 jccb(Assembler::equal, check_abort); 210 atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx)))); 211 bind(check_abort); 212 } 213 } 214 } 215 216 // Branch if (random & (count-1) != 0), count is 2^n 217 // tmp, scr and flags are killed 218 void C2_MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) { 219 assert(tmp == rax, ""); 220 assert(scr == rdx, ""); 221 rdtsc(); // modifies EDX:EAX 222 andptr(tmp, count-1); 223 jccb(Assembler::notZero, brLabel); 224 } 225 226 // Perform abort ratio calculation, set no_rtm bit if high ratio 227 // input: rtm_counters_Reg (RTMLockingCounters* address) 228 // tmpReg, rtm_counters_Reg and flags are killed 229 void C2_MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg, 230 Register rtm_counters_Reg, 231 RTMLockingCounters* rtm_counters, 232 Metadata* method_data) { 233 Label L_done, L_check_always_rtm1, L_check_always_rtm2; 234 235 if (RTMLockingCalculationDelay > 0) { 236 // Delay calculation 237 movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr())); 238 testptr(tmpReg, tmpReg); 239 jccb(Assembler::equal, L_done); 240 } 241 // Abort ratio calculation only if abort_count > RTMAbortThreshold 242 // Aborted transactions = abort_count * 100 243 // All transactions = total_count * RTMTotalCountIncrRate 244 // Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio) 245 246 movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset())); 247 cmpptr(tmpReg, RTMAbortThreshold); 248 jccb(Assembler::below, L_check_always_rtm2); 249 imulptr(tmpReg, tmpReg, 100); 250 251 Register scrReg = rtm_counters_Reg; 252 movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset())); 253 imulptr(scrReg, scrReg, RTMTotalCountIncrRate); 254 imulptr(scrReg, scrReg, RTMAbortRatio); 255 cmpptr(tmpReg, scrReg); 256 jccb(Assembler::below, L_check_always_rtm1); 257 if (method_data != nullptr) { 258 // set rtm_state to "no rtm" in MDO 259 mov_metadata(tmpReg, method_data); 260 lock(); 261 orl(Address(tmpReg, MethodData::rtm_state_offset()), NoRTM); 262 } 263 jmpb(L_done); 264 bind(L_check_always_rtm1); 265 // Reload RTMLockingCounters* address 266 lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters)); 267 bind(L_check_always_rtm2); 268 movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset())); 269 cmpptr(tmpReg, 
RTMLockingThreshold / RTMTotalCountIncrRate); 270 jccb(Assembler::below, L_done); 271 if (method_data != nullptr) { 272 // set rtm_state to "always rtm" in MDO 273 mov_metadata(tmpReg, method_data); 274 lock(); 275 orl(Address(tmpReg, MethodData::rtm_state_offset()), UseRTM); 276 } 277 bind(L_done); 278 } 279 280 // Update counters and perform abort ratio calculation 281 // input: abort_status_Reg 282 // rtm_counters_Reg, flags are killed 283 void C2_MacroAssembler::rtm_profiling(Register abort_status_Reg, 284 Register rtm_counters_Reg, 285 RTMLockingCounters* rtm_counters, 286 Metadata* method_data, 287 bool profile_rtm) { 288 289 assert(rtm_counters != nullptr, "should not be null when profiling RTM"); 290 // update rtm counters based on rax value at abort 291 // reads abort_status_Reg, updates flags 292 lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters)); 293 rtm_counters_update(abort_status_Reg, rtm_counters_Reg); 294 if (profile_rtm) { 295 // Save abort status because abort_status_Reg is used by following code. 296 if (RTMRetryCount > 0) { 297 push(abort_status_Reg); 298 } 299 assert(rtm_counters != nullptr, "should not be null when profiling RTM"); 300 rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data); 301 // restore abort status 302 if (RTMRetryCount > 0) { 303 pop(abort_status_Reg); 304 } 305 } 306 } 307 308 // Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4) 309 // inputs: retry_count_Reg 310 // : abort_status_Reg 311 // output: retry_count_Reg decremented by 1 312 // flags are killed 313 void C2_MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) { 314 Label doneRetry; 315 assert(abort_status_Reg == rax, ""); 316 // The abort reason bits are in eax (see all states in rtmLocking.hpp) 317 // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4) 318 // if reason is in 0x6 and retry count != 0 then retry 319 andptr(abort_status_Reg, 0x6); 320 jccb(Assembler::zero, doneRetry); 321 testl(retry_count_Reg, retry_count_Reg); 322 jccb(Assembler::zero, doneRetry); 323 pause(); 324 decrementl(retry_count_Reg); 325 jmp(retryLabel); 326 bind(doneRetry); 327 } 328 329 // Spin and retry if lock is busy, 330 // inputs: box_Reg (monitor address) 331 // : retry_count_Reg 332 // output: retry_count_Reg decremented by 1 333 // : clear z flag if retry count exceeded 334 // tmp_Reg, scr_Reg, flags are killed 335 void C2_MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg, 336 Register tmp_Reg, Register scr_Reg, Label& retryLabel) { 337 Label SpinLoop, SpinExit, doneRetry; 338 int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner); 339 340 testl(retry_count_Reg, retry_count_Reg); 341 jccb(Assembler::zero, doneRetry); 342 decrementl(retry_count_Reg); 343 movptr(scr_Reg, RTMSpinLoopCount); 344 345 bind(SpinLoop); 346 pause(); 347 decrementl(scr_Reg); 348 jccb(Assembler::lessEqual, SpinExit); 349 movptr(tmp_Reg, Address(box_Reg, owner_offset)); 350 testptr(tmp_Reg, tmp_Reg); 351 jccb(Assembler::notZero, SpinLoop); 352 353 bind(SpinExit); 354 jmp(retryLabel); 355 bind(doneRetry); 356 incrementl(retry_count_Reg); // clear z flag 357 } 358 359 // Use RTM for normal stack locks 360 // Input: objReg (object to lock) 361 void C2_MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg, 362 Register retry_on_abort_count_Reg, 363 RTMLockingCounters* stack_rtm_counters, 364 Metadata* method_data, bool 
profile_rtm, 365 Label& DONE_LABEL, Label& IsInflated) { 366 assert(UseRTMForStackLocks, "why call this otherwise?"); 367 assert(tmpReg == rax, ""); 368 assert(scrReg == rdx, ""); 369 Label L_rtm_retry, L_decrement_retry, L_on_abort; 370 371 if (RTMRetryCount > 0) { 372 movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort 373 bind(L_rtm_retry); 374 } 375 movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); 376 testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral 377 jcc(Assembler::notZero, IsInflated); 378 379 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 380 Label L_noincrement; 381 if (RTMTotalCountIncrRate > 1) { 382 // tmpReg, scrReg and flags are killed 383 branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement); 384 } 385 assert(stack_rtm_counters != nullptr, "should not be null when profiling RTM"); 386 atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg); 387 bind(L_noincrement); 388 } 389 xbegin(L_on_abort); 390 movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword 391 andptr(tmpReg, markWord::lock_mask_in_place); // look at 2 lock bits 392 cmpptr(tmpReg, markWord::unlocked_value); // bits = 01 unlocked 393 jcc(Assembler::equal, DONE_LABEL); // all done if unlocked 394 395 Register abort_status_Reg = tmpReg; // status of abort is stored in RAX 396 if (UseRTMXendForLockBusy) { 397 xend(); 398 movptr(abort_status_Reg, 0x2); // Set the abort status to 2 (so we can retry) 399 jmp(L_decrement_retry); 400 } 401 else { 402 xabort(0); 403 } 404 bind(L_on_abort); 405 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 406 rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm); 407 } 408 bind(L_decrement_retry); 409 if (RTMRetryCount > 0) { 410 // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4) 411 rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry); 412 } 413 } 414 415 // Use RTM for inflating locks 416 // inputs: objReg (object to lock) 417 // boxReg (on-stack box address (displaced header location) - KILLED) 418 // tmpReg (ObjectMonitor address + markWord::monitor_value) 419 void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg, 420 Register scrReg, Register retry_on_busy_count_Reg, 421 Register retry_on_abort_count_Reg, 422 RTMLockingCounters* rtm_counters, 423 Metadata* method_data, bool profile_rtm, 424 Label& DONE_LABEL) { 425 assert(UseRTMLocking, "why call this otherwise?"); 426 assert(tmpReg == rax, ""); 427 assert(scrReg == rdx, ""); 428 Label L_rtm_retry, L_decrement_retry, L_on_abort; 429 int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner); 430 431 movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value())); 432 movptr(boxReg, tmpReg); // Save ObjectMonitor address 433 434 if (RTMRetryCount > 0) { 435 movl(retry_on_busy_count_Reg, RTMRetryCount); // Retry on lock busy 436 movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort 437 bind(L_rtm_retry); 438 } 439 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 440 Label L_noincrement; 441 if (RTMTotalCountIncrRate > 1) { 442 // tmpReg, scrReg and flags are killed 443 branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement); 444 } 445 assert(rtm_counters != nullptr, "should not be null when profiling RTM"); 446 
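    // Sampled increment of the total RTM lock count: when RTMTotalCountIncrRate > 1,
    // the branch above skips this update for all but roughly one in RTMTotalCountIncrRate calls.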
    atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
    bind(L_noincrement);
  }
  xbegin(L_on_abort);
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
  movptr(tmpReg, Address(tmpReg, owner_offset));
  testptr(tmpReg, tmpReg);
  jcc(Assembler::zero, DONE_LABEL);
  if (UseRTMXendForLockBusy) {
    xend();
    jmp(L_decrement_retry);
  }
  else {
    xabort(0);
  }
  bind(L_on_abort);
  Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
  }
  if (RTMRetryCount > 0) {
    // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
  }

  movptr(tmpReg, Address(boxReg, owner_offset)) ;
  testptr(tmpReg, tmpReg) ;
  jccb(Assembler::notZero, L_decrement_retry) ;

  // Appears unlocked - try to swing _owner from null to non-null.
  // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
#ifdef _LP64
  Register threadReg = r15_thread;
#else
  get_thread(scrReg);
  Register threadReg = scrReg;
#endif
  lock();
  cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg

  if (RTMRetryCount > 0) {
    // success done else retry
    jccb(Assembler::equal, DONE_LABEL) ;
    bind(L_decrement_retry);
    // Spin and retry if lock is busy.
    rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
  }
  else {
    bind(L_decrement_retry);
  }
}

#endif // INCLUDE_RTM_OPT

// fast_lock and fast_unlock used by C2

// Because the transitions from emitted code to the runtime
// monitorenter/exit helper stubs are so slow it's critical that
// we inline both the stack-locking fast path and the inflated fast path.
//
// See also: cmpFastLock and cmpFastUnlock.
//
// What follows is a specialized inline transliteration of the code
// in enter() and exit(). If we're concerned about I$ bloat another
// option would be to emit TrySlowEnter and TrySlowExit methods
// at startup-time. These methods would accept arguments as
// (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
// indications in the icc.ZFlag. fast_lock and fast_unlock would simply
// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
// In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
//
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods. That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
// to park() or unpark() threads. We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) explicit barriers or fence operations.
//
// TODO:
//
// * Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
//   This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
//   Given TLAB allocation, Self is usually manifested in a register, so passing it into
//   the lock operators would typically be faster than reifying Self.
//
// * Ideally I'd define the primitives as:
//     fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
//     fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
//   Unfortunately ADLC bugs prevent us from expressing the ideal form.
//   Instead, we're stuck with the rather awkward and brittle register assignments below.
//   Furthermore the register assignments are overconstrained, possibly resulting in
//   sub-optimal code near the synchronization site.
//
// * Eliminate the sp-proximity tests and just use "== Self" tests instead.
//   Alternately, use a better sp-proximity test.
//
// * Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
//   Either one is sufficient to uniquely identify a thread.
//   TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
//
// * Intrinsify notify() and notifyAll() for the common cases where the
//   object is locked by the calling thread but the waitlist is empty.
//   Avoid the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
//
// * Use jccb and jmpb instead of jcc and jmp to improve code density.
//   But beware of excessive branch density on AMD Opterons.
//
// * Both fast_lock and fast_unlock set the ICC.ZF to indicate success
//   or failure of the fast path. If the fast path fails then we pass
//   control to the slow path, typically in C. In fast_lock and
//   fast_unlock we often branch to DONE_LABEL, just to find that C2
//   will emit a conditional branch immediately after the node.
//   So we have branches to branches and lots of ICC.ZF games.
//   Instead, it might be better to have C2 pass a "FailureLabel"
//   into fast_lock and fast_unlock. In the case of success, control
//   will drop through the node. ICC.ZF is undefined at exit.
// In the case of failure, the node will branch directly to the
// FailureLabel.


// obj: object to lock
// box: on-stack box address (displaced header location) - KILLED
// rax,: tmp -- KILLED
// scr: tmp -- KILLED
void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
                                  Register scrReg, Register cx1Reg, Register cx2Reg, Register thread,
                                  RTMLockingCounters* rtm_counters,
                                  RTMLockingCounters* stack_rtm_counters,
                                  Metadata* method_data,
                                  bool use_rtm, bool profile_rtm) {
  // Ensure the register assignments are disjoint
  assert(tmpReg == rax, "");

  if (use_rtm) {
    assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
  } else {
    assert(cx1Reg == noreg, "");
    assert(cx2Reg == noreg, "");
    assert_different_registers(objReg, boxReg, tmpReg, scrReg);
  }

  // Possible cases that we'll encounter in fast_lock
  // ------------------------------------------------
  // * Inflated
  //    -- unlocked
  //    -- Locked
  //       = by self
  //       = by other
  // * neutral
  // * stack-locked
  //    -- by self
  //       = sp-proximity test hits
  //       = sp-proximity test generates false-negative
  //    -- by other
  //

  Label IsInflated, DONE_LABEL, NO_COUNT, COUNT;

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(tmpReg, objReg, scrReg);
    movl(tmpReg, Address(tmpReg, Klass::access_flags_offset()));
    testl(tmpReg, JVM_ACC_IS_VALUE_BASED_CLASS);
    jcc(Assembler::notZero, DONE_LABEL);
  }

#if INCLUDE_RTM_OPT
  if (UseRTMForStackLocks && use_rtm) {
    assert(LockingMode != LM_MONITOR, "LockingMode == 0 (LM_MONITOR) and +UseRTMForStackLocks are mutually exclusive");
    rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
                      stack_rtm_counters, method_data, profile_rtm,
                      DONE_LABEL, IsInflated);
  }
#endif // INCLUDE_RTM_OPT

  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));   // [FETCH]
  testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral
  jcc(Assembler::notZero, IsInflated);

  if (LockingMode == LM_MONITOR) {
    // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
    testptr(objReg, objReg);
  } else if (LockingMode == LM_LEGACY) {
    // Attempt stack-locking ...
    orptr (tmpReg, markWord::unlocked_value);
    if (EnableValhalla) {
      // Mask inline_type bit such that we go to the slow path if object is an inline type
      andptr(tmpReg, ~((int) markWord::inline_type_bit_in_place));
    }
    movptr(Address(boxReg, 0), tmpReg);       // Anticipate successful CAS
    lock();
    cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Updates tmpReg
    jcc(Assembler::equal, COUNT);             // Success

    // Recursive locking.
    // The object is stack-locked: markword contains stack pointer to BasicLock.
    // Locked by current thread if difference with current SP is less than one page.
    subptr(tmpReg, rsp);
    // Next instruction sets ZFlag == 1 (Success) if the difference is less than one page.
    andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - (int)os::vm_page_size())) );
    movptr(Address(boxReg, 0), tmpReg);
  } else {
    assert(LockingMode == LM_LIGHTWEIGHT, "");
    lightweight_lock(objReg, tmpReg, thread, scrReg, NO_COUNT);
    jmp(COUNT);
  }
  jmp(DONE_LABEL);

  bind(IsInflated);
  // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value

#if INCLUDE_RTM_OPT
  // Use the same RTM locking code in 32- and 64-bit VM.
  if (use_rtm) {
    rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
                         rtm_counters, method_data, profile_rtm, DONE_LABEL);
  } else {
#endif // INCLUDE_RTM_OPT

#ifndef _LP64
  // The object is inflated.

  // boxReg refers to the on-stack BasicLock in the current frame.
  // We'd like to write:
  //   set box->_displaced_header = markWord::unused_mark(). Any non-0 value suffices.
  // This is convenient but results in a ST-before-CAS penalty. The following CAS suffers
  // additional latency as we have another ST in the store buffer that must drain.

  // avoid ST-before-CAS
  // register juggle because we need tmpReg for cmpxchgptr below
  movptr(scrReg, boxReg);
  movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]

  // Optimistic form: consider XORL tmpReg,tmpReg
  movptr(tmpReg, NULL_WORD);

  // Appears unlocked - try to swing _owner from null to non-null.
  // Ideally, I'd manifest "Self" with get_thread and then attempt
  // to CAS the register containing Self into m->Owner.
  // But we don't have enough registers, so instead we can either try to CAS
  // rsp or the address of the box (in scr) into &m->owner. If the CAS succeeds
  // we later store "Self" into m->Owner. Transiently storing a stack address
  // (rsp or the address of the box) into m->owner is harmless.
  // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
  lock();
  cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  movptr(Address(scrReg, 0), 3);            // box->_displaced_header = 3
  // If we weren't able to swing _owner from null to the BasicLock
  // then take the slow path.
  jccb  (Assembler::notZero, NO_COUNT);
  // update _owner from BasicLock to thread
  get_thread (scrReg);                      // beware: clobbers ICCs
  movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
  xorptr(boxReg, boxReg);                   // set icc.ZFlag = 1 to indicate success

  // If the CAS fails we can either retry or pass control to the slow path.
  // We use the latter tactic.
  // Pass the CAS result in the icc.ZFlag into DONE_LABEL
  // If the CAS was successful ...
  //   Self has acquired the lock
  //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
  // Intentional fall-through into DONE_LABEL ...
#else // _LP64
  // It's inflated and we use scrReg for ObjectMonitor* in this section.
  movq(scrReg, tmpReg);
  xorq(tmpReg, tmpReg);
  lock();
  cmpxchgptr(thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // Unconditionally set box->_displaced_header = markWord::unused_mark().
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value()));
  // Propagate ICC.ZF from CAS above into DONE_LABEL.
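  // Three outcomes from here: the CAS succeeded (ZF == 1, branch to COUNT), the current
  // thread already owns the monitor (recursive case, bump _recursions and set ZF == 1),
  // or another thread owns it (jump to NO_COUNT with ZF == 0, forcing the slow path).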
  jccb(Assembler::equal, COUNT);          // CAS above succeeded; propagate ZF = 1 (success)

  cmpptr(thread, rax);                    // Check if we are already the owner (recursive lock)
  jccb(Assembler::notEqual, NO_COUNT);    // If not recursive, ZF = 0 at this point (fail)
  incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success
#endif // _LP64
#if INCLUDE_RTM_OPT
  } // use_rtm()
#endif
  bind(DONE_LABEL);

  // ZFlag == 1 count in fast path
  // ZFlag == 0 count in slow path
  jccb(Assembler::notZero, NO_COUNT); // jump if ZFlag == 0

  bind(COUNT);
  // Count monitors in fast path
  increment(Address(thread, JavaThread::held_monitor_count_offset()));

  xorl(tmpReg, tmpReg); // Set ZF == 1

  bind(NO_COUNT);

  // At NO_COUNT the icc ZFlag is set as follows ...
  // fast_unlock uses the same protocol.
  // ZFlag == 1 -> Success
  // ZFlag == 0 -> Failure - force control through the slow path
}

// obj: object to unlock
// box: box address (displaced header location), killed. Must be EAX.
// tmp: killed, cannot be obj nor box.
//
// Some commentary on balanced locking:
//
// fast_lock and fast_unlock are emitted only for provably balanced lock sites.
// Methods that don't have provably balanced locking are forced to run in the
// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
// The interpreter provides two properties:
// I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired by the current activation (frame). Recall that the
//      interpreter maintains an on-stack list of locks currently held by
//      a frame.
// I2:  If a method attempts to unlock an object that is not held by the
//      frame, the interpreter throws IMSX.
//
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
// B() doesn't have provably balanced locking so it runs in the interpreter.
// Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O
// is still locked by A().
//
// The only other source of unbalanced locking would be JNI. The "Java Native Interface:
// Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
// should not be unlocked by "normal" java-level locking and vice-versa. The specification
// doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably given that the spec legislates the JNI case as undefined our implementation
// could reasonably *avoid* checking owner in fast_unlock().
// In the interest of performance we elide the m->Owner==Self check in unlock.
// A perfectly viable alternative is to elide the owner check except when
// Xcheck:jni is enabled.
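//
// As with fast_lock, fast_unlock communicates its result through ZF:
// ZF == 1 means the unlock completed in the fast path, ZF == 0 forces the
// caller to pass control to the slow path (the runtime monitorexit helper).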
784 785 void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) { 786 assert(boxReg == rax, ""); 787 assert_different_registers(objReg, boxReg, tmpReg); 788 789 Label DONE_LABEL, Stacked, COUNT, NO_COUNT; 790 791 #if INCLUDE_RTM_OPT 792 if (UseRTMForStackLocks && use_rtm) { 793 assert(LockingMode != LM_MONITOR, "LockingMode == 0 (LM_MONITOR) and +UseRTMForStackLocks are mutually exclusive"); 794 Label L_regular_unlock; 795 movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword 796 andptr(tmpReg, markWord::lock_mask_in_place); // look at 2 lock bits 797 cmpptr(tmpReg, markWord::unlocked_value); // bits = 01 unlocked 798 jccb(Assembler::notEqual, L_regular_unlock); // if !HLE RegularLock 799 xend(); // otherwise end... 800 jmp(DONE_LABEL); // ... and we're done 801 bind(L_regular_unlock); 802 } 803 #endif 804 805 if (LockingMode == LM_LEGACY) { 806 cmpptr(Address(boxReg, 0), NULL_WORD); // Examine the displaced header 807 jcc (Assembler::zero, COUNT); // 0 indicates recursive stack-lock 808 } 809 movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Examine the object's markword 810 if (LockingMode != LM_MONITOR) { 811 testptr(tmpReg, markWord::monitor_value); // Inflated? 812 jcc(Assembler::zero, Stacked); 813 } 814 815 // It's inflated. 816 if (LockingMode == LM_LIGHTWEIGHT) { 817 // If the owner is ANONYMOUS, we need to fix it - in an outline stub. 818 testb(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t) ObjectMonitor::ANONYMOUS_OWNER); 819 #ifdef _LP64 820 if (!Compile::current()->output()->in_scratch_emit_size()) { 821 C2HandleAnonOMOwnerStub* stub = new (Compile::current()->comp_arena()) C2HandleAnonOMOwnerStub(tmpReg, boxReg); 822 Compile::current()->output()->add_stub(stub); 823 jcc(Assembler::notEqual, stub->entry()); 824 bind(stub->continuation()); 825 } else 826 #endif 827 { 828 // We can't easily implement this optimization on 32 bit because we don't have a thread register. 829 // Call the slow-path instead. 830 jcc(Assembler::notEqual, NO_COUNT); 831 } 832 } 833 834 #if INCLUDE_RTM_OPT 835 if (use_rtm) { 836 Label L_regular_inflated_unlock; 837 int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner); 838 movptr(boxReg, Address(tmpReg, owner_offset)); 839 testptr(boxReg, boxReg); 840 jccb(Assembler::notZero, L_regular_inflated_unlock); 841 xend(); 842 jmp(DONE_LABEL); 843 bind(L_regular_inflated_unlock); 844 } 845 #endif 846 847 // Despite our balanced locking property we still check that m->_owner == Self 848 // as java routines or native JNI code called by this thread might 849 // have released the lock. 850 // Refer to the comments in synchronizer.cpp for how we might encode extra 851 // state in _succ so we can avoid fetching EntryList|cxq. 852 // 853 // If there's no contention try a 1-0 exit. That is, exit without 854 // a costly MEMBAR or CAS. See synchronizer.cpp for details on how 855 // we detect and recover from the race that the 1-0 exit admits. 856 // 857 // Conceptually fast_unlock() must execute a STST|LDST "release" barrier 858 // before it STs null into _owner, releasing the lock. Updates 859 // to data protected by the critical section must be visible before 860 // we drop the lock (and thus before any other thread could acquire 861 // the lock and observe the fields protected by the lock). 862 // IA32's memory-model is SPO, so STs are ordered with respect to 863 // each other and there's no need for an explicit barrier (fence). 
864 // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html. 865 #ifndef _LP64 866 // Note that we could employ various encoding schemes to reduce 867 // the number of loads below (currently 4) to just 2 or 3. 868 // Refer to the comments in synchronizer.cpp. 869 // In practice the chain of fetches doesn't seem to impact performance, however. 870 xorptr(boxReg, boxReg); 871 orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions))); 872 jccb (Assembler::notZero, DONE_LABEL); 873 movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList))); 874 orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq))); 875 jccb (Assembler::notZero, DONE_LABEL); 876 movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD); 877 jmpb (DONE_LABEL); 878 #else // _LP64 879 // It's inflated 880 Label CheckSucc, LNotRecursive, LSuccess, LGoSlowPath; 881 882 cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)), 0); 883 jccb(Assembler::equal, LNotRecursive); 884 885 // Recursive inflated unlock 886 decq(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions))); 887 jmpb(LSuccess); 888 889 bind(LNotRecursive); 890 movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq))); 891 orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList))); 892 jccb (Assembler::notZero, CheckSucc); 893 // Without cast to int32_t this style of movptr will destroy r10 which is typically obj. 894 movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD); 895 jmpb (DONE_LABEL); 896 897 // Try to avoid passing control into the slow_path ... 898 bind (CheckSucc); 899 900 // The following optional optimization can be elided if necessary 901 // Effectively: if (succ == null) goto slow path 902 // The code reduces the window for a race, however, 903 // and thus benefits performance. 904 cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD); 905 jccb (Assembler::zero, LGoSlowPath); 906 907 xorptr(boxReg, boxReg); 908 // Without cast to int32_t this style of movptr will destroy r10 which is typically obj. 909 movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD); 910 911 // Memory barrier/fence 912 // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ 913 // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack. 914 // This is faster on Nehalem and AMD Shanghai/Barcelona. 915 // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences 916 // We might also restructure (ST Owner=0;barrier;LD _Succ) to 917 // (mov box,0; xchgq box, &m->Owner; LD _succ) . 918 lock(); addl(Address(rsp, 0), 0); 919 920 cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD); 921 jccb (Assembler::notZero, LSuccess); 922 923 // Rare inopportune interleaving - race. 924 // The successor vanished in the small window above. 925 // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor. 926 // We need to ensure progress and succession. 927 // Try to reacquire the lock. 928 // If that fails then the new owner is responsible for succession and this 929 // thread needs to take no further action and can exit via the fast path (success). 930 // If the re-acquire succeeds then pass control into the slow path. 931 // As implemented, this latter mode is horrible because we generated more 932 // coherence traffic on the lock *and* artificially extended the critical section 933 // length while by virtue of passing control into the slow path. 
934 935 // box is really RAX -- the following CMPXCHG depends on that binding 936 // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R) 937 lock(); 938 cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner))); 939 // There's no successor so we tried to regrab the lock. 940 // If that didn't work, then another thread grabbed the 941 // lock so we're done (and exit was a success). 942 jccb (Assembler::notEqual, LSuccess); 943 // Intentional fall-through into slow path 944 945 bind (LGoSlowPath); 946 orl (boxReg, 1); // set ICC.ZF=0 to indicate failure 947 jmpb (DONE_LABEL); 948 949 bind (LSuccess); 950 testl (boxReg, 0); // set ICC.ZF=1 to indicate success 951 jmpb (DONE_LABEL); 952 953 #endif 954 if (LockingMode != LM_MONITOR) { 955 bind (Stacked); 956 if (LockingMode == LM_LIGHTWEIGHT) { 957 mov(boxReg, tmpReg); 958 lightweight_unlock(objReg, boxReg, tmpReg, NO_COUNT); 959 jmp(COUNT); 960 } else if (LockingMode == LM_LEGACY) { 961 movptr(tmpReg, Address (boxReg, 0)); // re-fetch 962 lock(); 963 cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box 964 } 965 // Intentional fall-thru into DONE_LABEL 966 } 967 bind(DONE_LABEL); 968 969 // ZFlag == 1 count in fast path 970 // ZFlag == 0 count in slow path 971 jccb(Assembler::notZero, NO_COUNT); 972 973 bind(COUNT); 974 // Count monitors in fast path 975 #ifndef _LP64 976 get_thread(tmpReg); 977 decrementl(Address(tmpReg, JavaThread::held_monitor_count_offset())); 978 #else // _LP64 979 decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset())); 980 #endif 981 982 xorl(tmpReg, tmpReg); // Set ZF == 1 983 984 bind(NO_COUNT); 985 } 986 987 //------------------------------------------------------------------------------------------- 988 // Generic instructions support for use in .ad files C2 code generation 989 990 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) { 991 if (dst != src) { 992 movdqu(dst, src); 993 } 994 if (opcode == Op_AbsVD) { 995 andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg); 996 } else { 997 assert((opcode == Op_NegVD),"opcode should be Op_NegD"); 998 xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg); 999 } 1000 } 1001 1002 void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) { 1003 if (opcode == Op_AbsVD) { 1004 vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg); 1005 } else { 1006 assert((opcode == Op_NegVD),"opcode should be Op_NegD"); 1007 vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg); 1008 } 1009 } 1010 1011 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) { 1012 if (dst != src) { 1013 movdqu(dst, src); 1014 } 1015 if (opcode == Op_AbsVF) { 1016 andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg); 1017 } else { 1018 assert((opcode == Op_NegVF),"opcode should be Op_NegF"); 1019 xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg); 1020 } 1021 } 1022 1023 void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) { 1024 if (opcode == Op_AbsVF) { 1025 vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg); 1026 } else { 1027 assert((opcode == Op_NegVF),"opcode should be Op_NegF"); 1028 vxorps(dst, src, 
ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg); 1029 } 1030 } 1031 1032 void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) { 1033 assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity"); 1034 assert(tmp == xnoreg || elem_bt == T_LONG, "unused"); 1035 1036 if (opcode == Op_MinV) { 1037 if (elem_bt == T_BYTE) { 1038 pminsb(dst, src); 1039 } else if (elem_bt == T_SHORT) { 1040 pminsw(dst, src); 1041 } else if (elem_bt == T_INT) { 1042 pminsd(dst, src); 1043 } else { 1044 assert(elem_bt == T_LONG, "required"); 1045 assert(tmp == xmm0, "required"); 1046 assert_different_registers(dst, src, tmp); 1047 movdqu(xmm0, dst); 1048 pcmpgtq(xmm0, src); 1049 blendvpd(dst, src); // xmm0 as mask 1050 } 1051 } else { // opcode == Op_MaxV 1052 if (elem_bt == T_BYTE) { 1053 pmaxsb(dst, src); 1054 } else if (elem_bt == T_SHORT) { 1055 pmaxsw(dst, src); 1056 } else if (elem_bt == T_INT) { 1057 pmaxsd(dst, src); 1058 } else { 1059 assert(elem_bt == T_LONG, "required"); 1060 assert(tmp == xmm0, "required"); 1061 assert_different_registers(dst, src, tmp); 1062 movdqu(xmm0, src); 1063 pcmpgtq(xmm0, dst); 1064 blendvpd(dst, src); // xmm0 as mask 1065 } 1066 } 1067 } 1068 1069 void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt, 1070 XMMRegister dst, XMMRegister src1, XMMRegister src2, 1071 int vlen_enc) { 1072 assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity"); 1073 1074 if (opcode == Op_MinV) { 1075 if (elem_bt == T_BYTE) { 1076 vpminsb(dst, src1, src2, vlen_enc); 1077 } else if (elem_bt == T_SHORT) { 1078 vpminsw(dst, src1, src2, vlen_enc); 1079 } else if (elem_bt == T_INT) { 1080 vpminsd(dst, src1, src2, vlen_enc); 1081 } else { 1082 assert(elem_bt == T_LONG, "required"); 1083 if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) { 1084 vpminsq(dst, src1, src2, vlen_enc); 1085 } else { 1086 assert_different_registers(dst, src1, src2); 1087 vpcmpgtq(dst, src1, src2, vlen_enc); 1088 vblendvpd(dst, src1, src2, dst, vlen_enc); 1089 } 1090 } 1091 } else { // opcode == Op_MaxV 1092 if (elem_bt == T_BYTE) { 1093 vpmaxsb(dst, src1, src2, vlen_enc); 1094 } else if (elem_bt == T_SHORT) { 1095 vpmaxsw(dst, src1, src2, vlen_enc); 1096 } else if (elem_bt == T_INT) { 1097 vpmaxsd(dst, src1, src2, vlen_enc); 1098 } else { 1099 assert(elem_bt == T_LONG, "required"); 1100 if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) { 1101 vpmaxsq(dst, src1, src2, vlen_enc); 1102 } else { 1103 assert_different_registers(dst, src1, src2); 1104 vpcmpgtq(dst, src1, src2, vlen_enc); 1105 vblendvpd(dst, src2, src1, dst, vlen_enc); 1106 } 1107 } 1108 } 1109 } 1110 1111 // Float/Double min max 1112 1113 void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt, 1114 XMMRegister dst, XMMRegister a, XMMRegister b, 1115 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 1116 int vlen_enc) { 1117 assert(UseAVX > 0, "required"); 1118 assert(opcode == Op_MinV || opcode == Op_MinReductionV || 1119 opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity"); 1120 assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity"); 1121 assert_different_registers(a, b, tmp, atmp, btmp); 1122 1123 bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV); 1124 bool is_double_word = is_double_word_type(elem_bt); 1125 1126 if (!is_double_word && is_min) { 1127 vblendvps(atmp, a, b, a, vlen_enc); 1128 vblendvps(btmp, b, a, a, vlen_enc); 1129 vminps(tmp, atmp, btmp, 
vlen_enc); 1130 vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1131 vblendvps(dst, tmp, atmp, btmp, vlen_enc); 1132 } else if (!is_double_word && !is_min) { 1133 vblendvps(btmp, b, a, b, vlen_enc); 1134 vblendvps(atmp, a, b, b, vlen_enc); 1135 vmaxps(tmp, atmp, btmp, vlen_enc); 1136 vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1137 vblendvps(dst, tmp, atmp, btmp, vlen_enc); 1138 } else if (is_double_word && is_min) { 1139 vblendvpd(atmp, a, b, a, vlen_enc); 1140 vblendvpd(btmp, b, a, a, vlen_enc); 1141 vminpd(tmp, atmp, btmp, vlen_enc); 1142 vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1143 vblendvpd(dst, tmp, atmp, btmp, vlen_enc); 1144 } else { 1145 assert(is_double_word && !is_min, "sanity"); 1146 vblendvpd(btmp, b, a, b, vlen_enc); 1147 vblendvpd(atmp, a, b, b, vlen_enc); 1148 vmaxpd(tmp, atmp, btmp, vlen_enc); 1149 vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1150 vblendvpd(dst, tmp, atmp, btmp, vlen_enc); 1151 } 1152 } 1153 1154 void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt, 1155 XMMRegister dst, XMMRegister a, XMMRegister b, 1156 KRegister ktmp, XMMRegister atmp, XMMRegister btmp, 1157 int vlen_enc) { 1158 assert(UseAVX > 2, "required"); 1159 assert(opcode == Op_MinV || opcode == Op_MinReductionV || 1160 opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity"); 1161 assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity"); 1162 assert_different_registers(dst, a, b, atmp, btmp); 1163 1164 bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV); 1165 bool is_double_word = is_double_word_type(elem_bt); 1166 bool merge = true; 1167 1168 if (!is_double_word && is_min) { 1169 evpmovd2m(ktmp, a, vlen_enc); 1170 evblendmps(atmp, ktmp, a, b, merge, vlen_enc); 1171 evblendmps(btmp, ktmp, b, a, merge, vlen_enc); 1172 vminps(dst, atmp, btmp, vlen_enc); 1173 evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1174 evmovdqul(dst, ktmp, atmp, merge, vlen_enc); 1175 } else if (!is_double_word && !is_min) { 1176 evpmovd2m(ktmp, b, vlen_enc); 1177 evblendmps(atmp, ktmp, a, b, merge, vlen_enc); 1178 evblendmps(btmp, ktmp, b, a, merge, vlen_enc); 1179 vmaxps(dst, atmp, btmp, vlen_enc); 1180 evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1181 evmovdqul(dst, ktmp, atmp, merge, vlen_enc); 1182 } else if (is_double_word && is_min) { 1183 evpmovq2m(ktmp, a, vlen_enc); 1184 evblendmpd(atmp, ktmp, a, b, merge, vlen_enc); 1185 evblendmpd(btmp, ktmp, b, a, merge, vlen_enc); 1186 vminpd(dst, atmp, btmp, vlen_enc); 1187 evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1188 evmovdquq(dst, ktmp, atmp, merge, vlen_enc); 1189 } else { 1190 assert(is_double_word && !is_min, "sanity"); 1191 evpmovq2m(ktmp, b, vlen_enc); 1192 evblendmpd(atmp, ktmp, a, b, merge, vlen_enc); 1193 evblendmpd(btmp, ktmp, b, a, merge, vlen_enc); 1194 vmaxpd(dst, atmp, btmp, vlen_enc); 1195 evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc); 1196 evmovdquq(dst, ktmp, atmp, merge, vlen_enc); 1197 } 1198 } 1199 1200 // Float/Double signum 1201 void C2_MacroAssembler::signum_fp(int opcode, XMMRegister dst, XMMRegister zero, XMMRegister one) { 1202 assert(opcode == Op_SignumF || opcode == Op_SignumD, "sanity"); 1203 1204 Label DONE_LABEL; 1205 1206 if (opcode == Op_SignumF) { 1207 assert(UseSSE > 0, "required"); 1208 ucomiss(dst, zero); 1209 jcc(Assembler::equal, DONE_LABEL); // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument 1210 jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument 
NaN, return NaN 1211 movflt(dst, one); 1212 jcc(Assembler::above, DONE_LABEL); 1213 xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg); 1214 } else if (opcode == Op_SignumD) { 1215 assert(UseSSE > 1, "required"); 1216 ucomisd(dst, zero); 1217 jcc(Assembler::equal, DONE_LABEL); // handle special case +0.0/-0.0, if argument is +0.0/-0.0, return argument 1218 jcc(Assembler::parity, DONE_LABEL); // handle special case NaN, if argument NaN, return NaN 1219 movdbl(dst, one); 1220 jcc(Assembler::above, DONE_LABEL); 1221 xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg); 1222 } 1223 1224 bind(DONE_LABEL); 1225 } 1226 1227 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) { 1228 if (sign) { 1229 pmovsxbw(dst, src); 1230 } else { 1231 pmovzxbw(dst, src); 1232 } 1233 } 1234 1235 void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1236 if (sign) { 1237 vpmovsxbw(dst, src, vector_len); 1238 } else { 1239 vpmovzxbw(dst, src, vector_len); 1240 } 1241 } 1242 1243 void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1244 if (sign) { 1245 vpmovsxbd(dst, src, vector_len); 1246 } else { 1247 vpmovzxbd(dst, src, vector_len); 1248 } 1249 } 1250 1251 void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) { 1252 if (sign) { 1253 vpmovsxwd(dst, src, vector_len); 1254 } else { 1255 vpmovzxwd(dst, src, vector_len); 1256 } 1257 } 1258 1259 void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src, 1260 int shift, int vector_len) { 1261 if (opcode == Op_RotateLeftV) { 1262 if (etype == T_INT) { 1263 evprold(dst, src, shift, vector_len); 1264 } else { 1265 assert(etype == T_LONG, "expected type T_LONG"); 1266 evprolq(dst, src, shift, vector_len); 1267 } 1268 } else { 1269 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV"); 1270 if (etype == T_INT) { 1271 evprord(dst, src, shift, vector_len); 1272 } else { 1273 assert(etype == T_LONG, "expected type T_LONG"); 1274 evprorq(dst, src, shift, vector_len); 1275 } 1276 } 1277 } 1278 1279 void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src, 1280 XMMRegister shift, int vector_len) { 1281 if (opcode == Op_RotateLeftV) { 1282 if (etype == T_INT) { 1283 evprolvd(dst, src, shift, vector_len); 1284 } else { 1285 assert(etype == T_LONG, "expected type T_LONG"); 1286 evprolvq(dst, src, shift, vector_len); 1287 } 1288 } else { 1289 assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV"); 1290 if (etype == T_INT) { 1291 evprorvd(dst, src, shift, vector_len); 1292 } else { 1293 assert(etype == T_LONG, "expected type T_LONG"); 1294 evprorvq(dst, src, shift, vector_len); 1295 } 1296 } 1297 } 1298 1299 void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) { 1300 if (opcode == Op_RShiftVI) { 1301 psrad(dst, shift); 1302 } else if (opcode == Op_LShiftVI) { 1303 pslld(dst, shift); 1304 } else { 1305 assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI"); 1306 psrld(dst, shift); 1307 } 1308 } 1309 1310 void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) { 1311 switch (opcode) { 1312 case Op_RShiftVI: psrad(dst, shift); break; 1313 case Op_LShiftVI: pslld(dst, shift); break; 1314 case Op_URShiftVI: psrld(dst, shift); break; 1315 1316 default: assert(false, "%s", NodeClassNames[opcode]); 1317 } 1318 } 

void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
  if (opcode == Op_RShiftVI) {
    vpsrad(dst, nds, shift, vector_len);
  } else if (opcode == Op_LShiftVI) {
    vpslld(dst, nds, shift, vector_len);
  } else {
    assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
    vpsrld(dst, nds, shift, vector_len);
  }
}

void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
    case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
    case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  psraw(dst, shift); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  psllw(dst, shift); break;

    case Op_URShiftVS: // fall-through
    case Op_URShiftVB: psrlw(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;

    case Op_URShiftVS: // fall-through
    case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
    case Op_LShiftVL:  psllq(dst, shift); break;
    case Op_URShiftVL: psrlq(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
  if (opcode == Op_RShiftVL) {
    psrlq(dst, shift);  // using srl to implement sra on pre-avx512 systems
  } else if (opcode == Op_LShiftVL) {
    psllq(dst, shift);
  } else {
    assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
    psrlq(dst, shift);
  }
}

void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVL:  evpsraq(dst, src, shift, vlen_enc); break;
    case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
    case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
  if (opcode == Op_RShiftVL) {
    evpsraq(dst, nds, shift, vector_len);
  } else if (opcode == Op_LShiftVL) {
    vpsllq(dst, nds, shift, vector_len);
  } else {
    assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
    vpsrlq(dst, nds, shift, vector_len);
  }
}

void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst,
XMMRegister src, XMMRegister shift, int vlen_enc) { 1414 switch (opcode) { 1415 case Op_RShiftVB: // fall-through 1416 case Op_RShiftVS: // fall-through 1417 case Op_RShiftVI: vpsravd(dst, src, shift, vlen_enc); break; 1418 1419 case Op_LShiftVB: // fall-through 1420 case Op_LShiftVS: // fall-through 1421 case Op_LShiftVI: vpsllvd(dst, src, shift, vlen_enc); break; 1422 1423 case Op_URShiftVB: // fall-through 1424 case Op_URShiftVS: // fall-through 1425 case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break; 1426 1427 default: assert(false, "%s", NodeClassNames[opcode]); 1428 } 1429 } 1430 1431 void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) { 1432 switch (opcode) { 1433 case Op_RShiftVB: // fall-through 1434 case Op_RShiftVS: evpsravw(dst, src, shift, vlen_enc); break; 1435 1436 case Op_LShiftVB: // fall-through 1437 case Op_LShiftVS: evpsllvw(dst, src, shift, vlen_enc); break; 1438 1439 case Op_URShiftVB: // fall-through 1440 case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break; 1441 1442 default: assert(false, "%s", NodeClassNames[opcode]); 1443 } 1444 } 1445 1446 void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) { 1447 assert(UseAVX >= 2, "required"); 1448 switch (opcode) { 1449 case Op_RShiftVL: { 1450 if (UseAVX > 2) { 1451 assert(tmp == xnoreg, "not used"); 1452 if (!VM_Version::supports_avx512vl()) { 1453 vlen_enc = Assembler::AVX_512bit; 1454 } 1455 evpsravq(dst, src, shift, vlen_enc); 1456 } else { 1457 vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask())); 1458 vpsrlvq(dst, src, shift, vlen_enc); 1459 vpsrlvq(tmp, tmp, shift, vlen_enc); 1460 vpxor(dst, dst, tmp, vlen_enc); 1461 vpsubq(dst, dst, tmp, vlen_enc); 1462 } 1463 break; 1464 } 1465 case Op_LShiftVL: { 1466 assert(tmp == xnoreg, "not used"); 1467 vpsllvq(dst, src, shift, vlen_enc); 1468 break; 1469 } 1470 case Op_URShiftVL: { 1471 assert(tmp == xnoreg, "not used"); 1472 vpsrlvq(dst, src, shift, vlen_enc); 1473 break; 1474 } 1475 default: assert(false, "%s", NodeClassNames[opcode]); 1476 } 1477 } 1478 1479 // Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst 1480 void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) { 1481 assert(opcode == Op_LShiftVB || 1482 opcode == Op_RShiftVB || 1483 opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]); 1484 bool sign = (opcode != Op_URShiftVB); 1485 assert(vector_len == 0, "required"); 1486 vextendbd(sign, dst, src, 1); 1487 vpmovzxbd(vtmp, shift, 1); 1488 varshiftd(opcode, dst, dst, vtmp, 1); 1489 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, noreg); 1490 vextracti128_high(vtmp, dst); 1491 vpackusdw(dst, dst, vtmp, 0); 1492 } 1493 1494 // Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst 1495 void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp) { 1496 assert(opcode == Op_LShiftVB || 1497 opcode == Op_RShiftVB || 1498 opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]); 1499 bool sign = (opcode != Op_URShiftVB); 1500 int ext_vector_len = vector_len + 1; 1501 vextendbw(sign, dst, src, ext_vector_len); 1502 vpmovzxbw(vtmp, shift, ext_vector_len); 1503 varshiftw(opcode, dst, dst, vtmp, ext_vector_len); 1504 vpand(dst, dst, 
ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, noreg); 1505 if (vector_len == 0) { 1506 vextracti128_high(vtmp, dst); 1507 vpackuswb(dst, dst, vtmp, vector_len); 1508 } else { 1509 vextracti64x4_high(vtmp, dst); 1510 vpackuswb(dst, dst, vtmp, vector_len); 1511 vpermq(dst, dst, 0xD8, vector_len); 1512 } 1513 } 1514 1515 void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) { 1516 switch(typ) { 1517 case T_BYTE: 1518 pinsrb(dst, val, idx); 1519 break; 1520 case T_SHORT: 1521 pinsrw(dst, val, idx); 1522 break; 1523 case T_INT: 1524 pinsrd(dst, val, idx); 1525 break; 1526 case T_LONG: 1527 pinsrq(dst, val, idx); 1528 break; 1529 default: 1530 assert(false,"Should not reach here."); 1531 break; 1532 } 1533 } 1534 1535 void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) { 1536 switch(typ) { 1537 case T_BYTE: 1538 vpinsrb(dst, src, val, idx); 1539 break; 1540 case T_SHORT: 1541 vpinsrw(dst, src, val, idx); 1542 break; 1543 case T_INT: 1544 vpinsrd(dst, src, val, idx); 1545 break; 1546 case T_LONG: 1547 vpinsrq(dst, src, val, idx); 1548 break; 1549 default: 1550 assert(false,"Should not reach here."); 1551 break; 1552 } 1553 } 1554 1555 void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) { 1556 switch(typ) { 1557 case T_INT: 1558 vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len); 1559 break; 1560 case T_FLOAT: 1561 vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len); 1562 break; 1563 case T_LONG: 1564 vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len); 1565 break; 1566 case T_DOUBLE: 1567 vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len); 1568 break; 1569 default: 1570 assert(false,"Should not reach here."); 1571 break; 1572 } 1573 } 1574 1575 void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) { 1576 switch(typ) { 1577 case T_INT: 1578 evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len); 1579 break; 1580 case T_FLOAT: 1581 evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len); 1582 break; 1583 case T_LONG: 1584 evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len); 1585 break; 1586 case T_DOUBLE: 1587 evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len); 1588 break; 1589 default: 1590 assert(false,"Should not reach here."); 1591 break; 1592 } 1593 } 1594 1595 void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) { 1596 switch(typ) { 1597 case T_INT: 1598 evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len); 1599 break; 1600 case T_FLOAT: 1601 evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len); 1602 break; 1603 case T_LONG: 1604 evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len); 1605 break; 1606 case T_DOUBLE: 1607 evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len); 1608 break; 1609 default: 1610 assert(false,"Should not reach here."); 1611 break; 1612 } 1613 } 1614 1615 void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt, bool is_legacy) { 1616 if (vlen_in_bytes <= 16) { 1617 pxor (dst, dst); 1618 psubb(dst, src); 1619 switch (elem_bt) { 1620 
case T_BYTE: /* nothing to do */ break; 1621 case T_SHORT: pmovsxbw(dst, dst); break; 1622 case T_INT: pmovsxbd(dst, dst); break; 1623 case T_FLOAT: pmovsxbd(dst, dst); break; 1624 case T_LONG: pmovsxbq(dst, dst); break; 1625 case T_DOUBLE: pmovsxbq(dst, dst); break; 1626 1627 default: assert(false, "%s", type2name(elem_bt)); 1628 } 1629 } else { 1630 assert(!is_legacy || !is_subword_type(elem_bt) || vlen_in_bytes < 64, ""); 1631 int vlen_enc = vector_length_encoding(vlen_in_bytes); 1632 1633 vpxor (dst, dst, dst, vlen_enc); 1634 vpsubb(dst, dst, src, is_legacy ? AVX_256bit : vlen_enc); 1635 1636 switch (elem_bt) { 1637 case T_BYTE: /* nothing to do */ break; 1638 case T_SHORT: vpmovsxbw(dst, dst, vlen_enc); break; 1639 case T_INT: vpmovsxbd(dst, dst, vlen_enc); break; 1640 case T_FLOAT: vpmovsxbd(dst, dst, vlen_enc); break; 1641 case T_LONG: vpmovsxbq(dst, dst, vlen_enc); break; 1642 case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break; 1643 1644 default: assert(false, "%s", type2name(elem_bt)); 1645 } 1646 } 1647 } 1648 1649 void C2_MacroAssembler::load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc) { 1650 if (novlbwdq) { 1651 vpmovsxbd(xtmp, src, vlen_enc); 1652 evpcmpd(dst, k0, xtmp, ExternalAddress(StubRoutines::x86::vector_int_mask_cmp_bits()), 1653 Assembler::eq, true, vlen_enc, noreg); 1654 } else { 1655 vpxor(xtmp, xtmp, xtmp, vlen_enc); 1656 vpsubb(xtmp, xtmp, src, vlen_enc); 1657 evpmovb2m(dst, xtmp, vlen_enc); 1658 } 1659 } 1660 1661 void C2_MacroAssembler::load_vector(XMMRegister dst, Address src, int vlen_in_bytes) { 1662 switch (vlen_in_bytes) { 1663 case 4: movdl(dst, src); break; 1664 case 8: movq(dst, src); break; 1665 case 16: movdqu(dst, src); break; 1666 case 32: vmovdqu(dst, src); break; 1667 case 64: evmovdqul(dst, src, Assembler::AVX_512bit); break; 1668 default: ShouldNotReachHere(); 1669 } 1670 } 1671 1672 void C2_MacroAssembler::load_vector(XMMRegister dst, AddressLiteral src, int vlen_in_bytes, Register rscratch) { 1673 assert(rscratch != noreg || always_reachable(src), "missing"); 1674 1675 if (reachable(src)) { 1676 load_vector(dst, as_Address(src), vlen_in_bytes); 1677 } else { 1678 lea(rscratch, src); 1679 load_vector(dst, Address(rscratch, 0), vlen_in_bytes); 1680 } 1681 } 1682 1683 void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen) { 1684 int vlen_enc = vector_length_encoding(vlen); 1685 if (VM_Version::supports_avx()) { 1686 if (bt == T_LONG) { 1687 if (VM_Version::supports_avx2()) { 1688 vpbroadcastq(dst, src, vlen_enc); 1689 } else { 1690 vmovddup(dst, src, vlen_enc); 1691 } 1692 } else if (bt == T_DOUBLE) { 1693 if (vlen_enc != Assembler::AVX_128bit) { 1694 vbroadcastsd(dst, src, vlen_enc, noreg); 1695 } else { 1696 vmovddup(dst, src, vlen_enc); 1697 } 1698 } else { 1699 if (VM_Version::supports_avx2() && is_integral_type(bt)) { 1700 vpbroadcastd(dst, src, vlen_enc); 1701 } else { 1702 vbroadcastss(dst, src, vlen_enc); 1703 } 1704 } 1705 } else if (VM_Version::supports_sse3()) { 1706 movddup(dst, src); 1707 } else { 1708 movq(dst, src); 1709 if (vlen == 16) { 1710 punpcklqdq(dst, dst); 1711 } 1712 } 1713 } 1714 1715 void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) { 1716 // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64. 
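// A worked example of the offset computation below (assuming the B/S/I/L/F/D layout
// described above): T_SHORT has exact_log2(2) = 1, giving offset = 1 << 6 = 64; T_FLOAT
// has element size 4, giving 2 << 6 = 128, plus 128 for the floating-point block = 256.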
1717 int offset = exact_log2(type2aelembytes(bt)) << 6; 1718 if (is_floating_point_type(bt)) { 1719 offset += 128; 1720 } 1721 ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset); 1722 load_vector(dst, addr, vlen_in_bytes); 1723 } 1724 1725 // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles. 1726 1727 void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) { 1728 int vector_len = Assembler::AVX_128bit; 1729 1730 switch (opcode) { 1731 case Op_AndReductionV: pand(dst, src); break; 1732 case Op_OrReductionV: por (dst, src); break; 1733 case Op_XorReductionV: pxor(dst, src); break; 1734 case Op_MinReductionV: 1735 switch (typ) { 1736 case T_BYTE: pminsb(dst, src); break; 1737 case T_SHORT: pminsw(dst, src); break; 1738 case T_INT: pminsd(dst, src); break; 1739 case T_LONG: assert(UseAVX > 2, "required"); 1740 vpminsq(dst, dst, src, Assembler::AVX_128bit); break; 1741 default: assert(false, "wrong type"); 1742 } 1743 break; 1744 case Op_MaxReductionV: 1745 switch (typ) { 1746 case T_BYTE: pmaxsb(dst, src); break; 1747 case T_SHORT: pmaxsw(dst, src); break; 1748 case T_INT: pmaxsd(dst, src); break; 1749 case T_LONG: assert(UseAVX > 2, "required"); 1750 vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break; 1751 default: assert(false, "wrong type"); 1752 } 1753 break; 1754 case Op_AddReductionVF: addss(dst, src); break; 1755 case Op_AddReductionVD: addsd(dst, src); break; 1756 case Op_AddReductionVI: 1757 switch (typ) { 1758 case T_BYTE: paddb(dst, src); break; 1759 case T_SHORT: paddw(dst, src); break; 1760 case T_INT: paddd(dst, src); break; 1761 default: assert(false, "wrong type"); 1762 } 1763 break; 1764 case Op_AddReductionVL: paddq(dst, src); break; 1765 case Op_MulReductionVF: mulss(dst, src); break; 1766 case Op_MulReductionVD: mulsd(dst, src); break; 1767 case Op_MulReductionVI: 1768 switch (typ) { 1769 case T_SHORT: pmullw(dst, src); break; 1770 case T_INT: pmulld(dst, src); break; 1771 default: assert(false, "wrong type"); 1772 } 1773 break; 1774 case Op_MulReductionVL: assert(UseAVX > 2, "required"); 1775 evpmullq(dst, dst, src, vector_len); break; 1776 default: assert(false, "wrong opcode"); 1777 } 1778 } 1779 1780 void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) { 1781 int vector_len = Assembler::AVX_256bit; 1782 1783 switch (opcode) { 1784 case Op_AndReductionV: vpand(dst, src1, src2, vector_len); break; 1785 case Op_OrReductionV: vpor (dst, src1, src2, vector_len); break; 1786 case Op_XorReductionV: vpxor(dst, src1, src2, vector_len); break; 1787 case Op_MinReductionV: 1788 switch (typ) { 1789 case T_BYTE: vpminsb(dst, src1, src2, vector_len); break; 1790 case T_SHORT: vpminsw(dst, src1, src2, vector_len); break; 1791 case T_INT: vpminsd(dst, src1, src2, vector_len); break; 1792 case T_LONG: assert(UseAVX > 2, "required"); 1793 vpminsq(dst, src1, src2, vector_len); break; 1794 default: assert(false, "wrong type"); 1795 } 1796 break; 1797 case Op_MaxReductionV: 1798 switch (typ) { 1799 case T_BYTE: vpmaxsb(dst, src1, src2, vector_len); break; 1800 case T_SHORT: vpmaxsw(dst, src1, src2, vector_len); break; 1801 case T_INT: vpmaxsd(dst, src1, src2, vector_len); break; 1802 case T_LONG: assert(UseAVX > 2, "required"); 1803 vpmaxsq(dst, src1, src2, vector_len); break; 1804 default: assert(false, "wrong type"); 1805 } 1806 break; 1807 case Op_AddReductionVI: 1808 switch (typ) { 1809 case T_BYTE: vpaddb(dst, src1, 
src2, vector_len); break; 1810 case T_SHORT: vpaddw(dst, src1, src2, vector_len); break; 1811 case T_INT: vpaddd(dst, src1, src2, vector_len); break; 1812 default: assert(false, "wrong type"); 1813 } 1814 break; 1815 case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break; 1816 case Op_MulReductionVI: 1817 switch (typ) { 1818 case T_SHORT: vpmullw(dst, src1, src2, vector_len); break; 1819 case T_INT: vpmulld(dst, src1, src2, vector_len); break; 1820 default: assert(false, "wrong type"); 1821 } 1822 break; 1823 case Op_MulReductionVL: evpmullq(dst, src1, src2, vector_len); break; 1824 default: assert(false, "wrong opcode"); 1825 } 1826 } 1827 1828 void C2_MacroAssembler::reduce_fp(int opcode, int vlen, 1829 XMMRegister dst, XMMRegister src, 1830 XMMRegister vtmp1, XMMRegister vtmp2) { 1831 switch (opcode) { 1832 case Op_AddReductionVF: 1833 case Op_MulReductionVF: 1834 reduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 1835 break; 1836 1837 case Op_AddReductionVD: 1838 case Op_MulReductionVD: 1839 reduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 1840 break; 1841 1842 default: assert(false, "wrong opcode"); 1843 } 1844 } 1845 1846 void C2_MacroAssembler::reduceB(int opcode, int vlen, 1847 Register dst, Register src1, XMMRegister src2, 1848 XMMRegister vtmp1, XMMRegister vtmp2) { 1849 switch (vlen) { 1850 case 8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1851 case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1852 case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1853 case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1854 1855 default: assert(false, "wrong vector length"); 1856 } 1857 } 1858 1859 void C2_MacroAssembler::mulreduceB(int opcode, int vlen, 1860 Register dst, Register src1, XMMRegister src2, 1861 XMMRegister vtmp1, XMMRegister vtmp2) { 1862 switch (vlen) { 1863 case 8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1864 case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1865 case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1866 case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1867 1868 default: assert(false, "wrong vector length"); 1869 } 1870 } 1871 1872 void C2_MacroAssembler::reduceS(int opcode, int vlen, 1873 Register dst, Register src1, XMMRegister src2, 1874 XMMRegister vtmp1, XMMRegister vtmp2) { 1875 switch (vlen) { 1876 case 4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1877 case 8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1878 case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1879 case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1880 1881 default: assert(false, "wrong vector length"); 1882 } 1883 } 1884 1885 void C2_MacroAssembler::reduceI(int opcode, int vlen, 1886 Register dst, Register src1, XMMRegister src2, 1887 XMMRegister vtmp1, XMMRegister vtmp2) { 1888 switch (vlen) { 1889 case 2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1890 case 4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1891 case 8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1892 case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1893 1894 default: assert(false, "wrong vector length"); 1895 } 1896 } 1897 1898 #ifdef _LP64 1899 void C2_MacroAssembler::reduceL(int opcode, int vlen, 1900 Register dst, Register src1, XMMRegister src2, 1901 XMMRegister vtmp1, XMMRegister vtmp2) { 1902 switch (vlen) { 1903 case 2: reduce2L(opcode, dst, src1, src2, vtmp1, 
vtmp2); break; 1904 case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1905 case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1906 1907 default: assert(false, "wrong vector length"); 1908 } 1909 } 1910 #endif // _LP64 1911 1912 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 1913 switch (vlen) { 1914 case 2: 1915 assert(vtmp2 == xnoreg, ""); 1916 reduce2F(opcode, dst, src, vtmp1); 1917 break; 1918 case 4: 1919 assert(vtmp2 == xnoreg, ""); 1920 reduce4F(opcode, dst, src, vtmp1); 1921 break; 1922 case 8: 1923 reduce8F(opcode, dst, src, vtmp1, vtmp2); 1924 break; 1925 case 16: 1926 reduce16F(opcode, dst, src, vtmp1, vtmp2); 1927 break; 1928 default: assert(false, "wrong vector length"); 1929 } 1930 } 1931 1932 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 1933 switch (vlen) { 1934 case 2: 1935 assert(vtmp2 == xnoreg, ""); 1936 reduce2D(opcode, dst, src, vtmp1); 1937 break; 1938 case 4: 1939 reduce4D(opcode, dst, src, vtmp1, vtmp2); 1940 break; 1941 case 8: 1942 reduce8D(opcode, dst, src, vtmp1, vtmp2); 1943 break; 1944 default: assert(false, "wrong vector length"); 1945 } 1946 } 1947 1948 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1949 if (opcode == Op_AddReductionVI) { 1950 if (vtmp1 != src2) { 1951 movdqu(vtmp1, src2); 1952 } 1953 phaddd(vtmp1, vtmp1); 1954 } else { 1955 pshufd(vtmp1, src2, 0x1); 1956 reduce_operation_128(T_INT, opcode, vtmp1, src2); 1957 } 1958 movdl(vtmp2, src1); 1959 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 1960 movdl(dst, vtmp1); 1961 } 1962 1963 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1964 if (opcode == Op_AddReductionVI) { 1965 if (vtmp1 != src2) { 1966 movdqu(vtmp1, src2); 1967 } 1968 phaddd(vtmp1, src2); 1969 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1970 } else { 1971 pshufd(vtmp2, src2, 0xE); 1972 reduce_operation_128(T_INT, opcode, vtmp2, src2); 1973 reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1974 } 1975 } 1976 1977 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1978 if (opcode == Op_AddReductionVI) { 1979 vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit); 1980 vextracti128_high(vtmp2, vtmp1); 1981 vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit); 1982 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1983 } else { 1984 vextracti128_high(vtmp1, src2); 1985 reduce_operation_128(T_INT, opcode, vtmp1, src2); 1986 reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1987 } 1988 } 1989 1990 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1991 vextracti64x4_high(vtmp2, src2); 1992 reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2); 1993 reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1994 } 1995 1996 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1997 pshufd(vtmp2, src2, 0x1); 1998 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 1999 movdqu(vtmp1, vtmp2); 2000 psrldq(vtmp1, 2); 2001 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 2002 movdqu(vtmp2, vtmp1); 2003 psrldq(vtmp2, 1); 2004 reduce_operation_128(T_BYTE, 
opcode, vtmp1, vtmp2); 2005 movdl(vtmp2, src1); 2006 pmovsxbd(vtmp1, vtmp1); 2007 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2008 pextrb(dst, vtmp1, 0x0); 2009 movsbl(dst, dst); 2010 } 2011 2012 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2013 pshufd(vtmp1, src2, 0xE); 2014 reduce_operation_128(T_BYTE, opcode, vtmp1, src2); 2015 reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2016 } 2017 2018 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2019 vextracti128_high(vtmp2, src2); 2020 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 2021 reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2022 } 2023 2024 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2025 vextracti64x4_high(vtmp1, src2); 2026 reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2); 2027 reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2028 } 2029 2030 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2031 pmovsxbw(vtmp2, src2); 2032 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2033 } 2034 2035 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2036 if (UseAVX > 1) { 2037 int vector_len = Assembler::AVX_256bit; 2038 vpmovsxbw(vtmp1, src2, vector_len); 2039 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2040 } else { 2041 pmovsxbw(vtmp2, src2); 2042 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2043 pshufd(vtmp2, src2, 0x1); 2044 pmovsxbw(vtmp2, src2); 2045 reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2046 } 2047 } 2048 2049 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2050 if (UseAVX > 2 && VM_Version::supports_avx512bw()) { 2051 int vector_len = Assembler::AVX_512bit; 2052 vpmovsxbw(vtmp1, src2, vector_len); 2053 reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2054 } else { 2055 assert(UseAVX >= 2,"Should not reach here."); 2056 mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); 2057 vextracti128_high(vtmp2, src2); 2058 mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2059 } 2060 } 2061 2062 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2063 mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); 2064 vextracti64x4_high(vtmp2, src2); 2065 mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 2066 } 2067 2068 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2069 if (opcode == Op_AddReductionVI) { 2070 if (vtmp1 != src2) { 2071 movdqu(vtmp1, src2); 2072 } 2073 phaddw(vtmp1, vtmp1); 2074 phaddw(vtmp1, vtmp1); 2075 } else { 2076 pshufd(vtmp2, src2, 0x1); 2077 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2078 movdqu(vtmp1, vtmp2); 2079 psrldq(vtmp1, 2); 2080 reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2); 2081 } 2082 movdl(vtmp2, src1); 2083 pmovsxwd(vtmp1, vtmp1); 2084 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 2085 pextrw(dst, vtmp1, 0x0); 2086 movswl(dst, dst); 2087 } 2088 2089 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, 
XMMRegister vtmp2) { 2090 if (opcode == Op_AddReductionVI) { 2091 if (vtmp1 != src2) { 2092 movdqu(vtmp1, src2); 2093 } 2094 phaddw(vtmp1, src2); 2095 } else { 2096 pshufd(vtmp1, src2, 0xE); 2097 reduce_operation_128(T_SHORT, opcode, vtmp1, src2); 2098 } 2099 reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2100 } 2101 2102 void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2103 if (opcode == Op_AddReductionVI) { 2104 int vector_len = Assembler::AVX_256bit; 2105 vphaddw(vtmp2, src2, src2, vector_len); 2106 vpermq(vtmp2, vtmp2, 0xD8, vector_len); 2107 } else { 2108 vextracti128_high(vtmp2, src2); 2109 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 2110 } 2111 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2112 } 2113 2114 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2115 int vector_len = Assembler::AVX_256bit; 2116 vextracti64x4_high(vtmp1, src2); 2117 reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2); 2118 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2119 } 2120 2121 #ifdef _LP64 2122 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2123 pshufd(vtmp2, src2, 0xE); 2124 reduce_operation_128(T_LONG, opcode, vtmp2, src2); 2125 movdq(vtmp1, src1); 2126 reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2); 2127 movdq(dst, vtmp1); 2128 } 2129 2130 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2131 vextracti128_high(vtmp1, src2); 2132 reduce_operation_128(T_LONG, opcode, vtmp1, src2); 2133 reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 2134 } 2135 2136 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 2137 vextracti64x4_high(vtmp2, src2); 2138 reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2); 2139 reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 2140 } 2141 2142 void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) { 2143 mov64(temp, -1L); 2144 bzhiq(temp, temp, len); 2145 kmovql(dst, temp); 2146 } 2147 #endif // _LP64 2148 2149 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2150 reduce_operation_128(T_FLOAT, opcode, dst, src); 2151 pshufd(vtmp, src, 0x1); 2152 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2153 } 2154 2155 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2156 reduce2F(opcode, dst, src, vtmp); 2157 pshufd(vtmp, src, 0x2); 2158 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2159 pshufd(vtmp, src, 0x3); 2160 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 2161 } 2162 2163 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2164 reduce4F(opcode, dst, src, vtmp2); 2165 vextractf128_high(vtmp2, src); 2166 reduce4F(opcode, dst, vtmp2, vtmp1); 2167 } 2168 2169 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2170 reduce8F(opcode, dst, src, vtmp1, vtmp2); 2171 vextracti64x4_high(vtmp1, src); 2172 reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2); 2173 } 2174 2175 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 2176 
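  // Fold both double lanes of src into dst: combine dst with the low lane first, then
  // shuffle the high 64 bits down (pshufd immediate 0xE) and combine again. For an add
  // or mul reduction this computes, in effect, dst = op(op(dst, src[0]), src[1]).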
reduce_operation_128(T_DOUBLE, opcode, dst, src); 2177 pshufd(vtmp, src, 0xE); 2178 reduce_operation_128(T_DOUBLE, opcode, dst, vtmp); 2179 } 2180 2181 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2182 reduce2D(opcode, dst, src, vtmp2); 2183 vextractf128_high(vtmp2, src); 2184 reduce2D(opcode, dst, vtmp2, vtmp1); 2185 } 2186 2187 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 2188 reduce4D(opcode, dst, src, vtmp1, vtmp2); 2189 vextracti64x4_high(vtmp1, src); 2190 reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2); 2191 } 2192 2193 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) { 2194 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2195 } 2196 2197 void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) { 2198 MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len); 2199 } 2200 2201 void C2_MacroAssembler::vmovmask(BasicType elem_bt, XMMRegister dst, Address src, XMMRegister mask, 2202 int vec_enc) { 2203 switch(elem_bt) { 2204 case T_INT: 2205 case T_FLOAT: 2206 vmaskmovps(dst, src, mask, vec_enc); 2207 break; 2208 case T_LONG: 2209 case T_DOUBLE: 2210 vmaskmovpd(dst, src, mask, vec_enc); 2211 break; 2212 default: 2213 fatal("Unsupported type %s", type2name(elem_bt)); 2214 break; 2215 } 2216 } 2217 2218 void C2_MacroAssembler::vmovmask(BasicType elem_bt, Address dst, XMMRegister src, XMMRegister mask, 2219 int vec_enc) { 2220 switch(elem_bt) { 2221 case T_INT: 2222 case T_FLOAT: 2223 vmaskmovps(dst, src, mask, vec_enc); 2224 break; 2225 case T_LONG: 2226 case T_DOUBLE: 2227 vmaskmovpd(dst, src, mask, vec_enc); 2228 break; 2229 default: 2230 fatal("Unsupported type %s", type2name(elem_bt)); 2231 break; 2232 } 2233 } 2234 2235 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid, 2236 XMMRegister dst, XMMRegister src, 2237 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2238 XMMRegister xmm_0, XMMRegister xmm_1) { 2239 const int permconst[] = {1, 14}; 2240 XMMRegister wsrc = src; 2241 XMMRegister wdst = xmm_0; 2242 XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1; 2243 2244 int vlen_enc = Assembler::AVX_128bit; 2245 if (vlen == 16) { 2246 vlen_enc = Assembler::AVX_256bit; 2247 } 2248 2249 for (int i = log2(vlen) - 1; i >=0; i--) { 2250 if (i == 0 && !is_dst_valid) { 2251 wdst = dst; 2252 } 2253 if (i == 3) { 2254 vextracti64x4_high(wtmp, wsrc); 2255 } else if (i == 2) { 2256 vextracti128_high(wtmp, wsrc); 2257 } else { // i = [0,1] 2258 vpermilps(wtmp, wsrc, permconst[i], vlen_enc); 2259 } 2260 vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2261 wsrc = wdst; 2262 vlen_enc = Assembler::AVX_128bit; 2263 } 2264 if (is_dst_valid) { 2265 vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2266 } 2267 } 2268 2269 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src, 2270 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 2271 XMMRegister xmm_0, XMMRegister xmm_1) { 2272 XMMRegister wsrc = src; 2273 XMMRegister wdst = xmm_0; 2274 XMMRegister wtmp = (xmm_1 == xnoreg) ? 
xmm_0: xmm_1; 2275 int vlen_enc = Assembler::AVX_128bit; 2276 if (vlen == 8) { 2277 vlen_enc = Assembler::AVX_256bit; 2278 } 2279 for (int i = log2(vlen) - 1; i >=0; i--) { 2280 if (i == 0 && !is_dst_valid) { 2281 wdst = dst; 2282 } 2283 if (i == 1) { 2284 vextracti128_high(wtmp, wsrc); 2285 } else if (i == 2) { 2286 vextracti64x4_high(wtmp, wsrc); 2287 } else { 2288 assert(i == 0, "%d", i); 2289 vpermilpd(wtmp, wsrc, 1, vlen_enc); 2290 } 2291 vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 2292 wsrc = wdst; 2293 vlen_enc = Assembler::AVX_128bit; 2294 } 2295 if (is_dst_valid) { 2296 vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 2297 } 2298 } 2299 2300 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) { 2301 switch (bt) { 2302 case T_BYTE: pextrb(dst, src, idx); break; 2303 case T_SHORT: pextrw(dst, src, idx); break; 2304 case T_INT: pextrd(dst, src, idx); break; 2305 case T_LONG: pextrq(dst, src, idx); break; 2306 2307 default: 2308 assert(false,"Should not reach here."); 2309 break; 2310 } 2311 } 2312 2313 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) { 2314 int esize = type2aelembytes(typ); 2315 int elem_per_lane = 16/esize; 2316 int lane = elemindex / elem_per_lane; 2317 int eindex = elemindex % elem_per_lane; 2318 2319 if (lane >= 2) { 2320 assert(UseAVX > 2, "required"); 2321 vextractf32x4(dst, src, lane & 3); 2322 return dst; 2323 } else if (lane > 0) { 2324 assert(UseAVX > 0, "required"); 2325 vextractf128(dst, src, lane); 2326 return dst; 2327 } else { 2328 return src; 2329 } 2330 } 2331 2332 void C2_MacroAssembler::movsxl(BasicType typ, Register dst) { 2333 if (typ == T_BYTE) { 2334 movsbl(dst, dst); 2335 } else if (typ == T_SHORT) { 2336 movswl(dst, dst); 2337 } 2338 } 2339 2340 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) { 2341 int esize = type2aelembytes(typ); 2342 int elem_per_lane = 16/esize; 2343 int eindex = elemindex % elem_per_lane; 2344 assert(is_integral_type(typ),"required"); 2345 2346 if (eindex == 0) { 2347 if (typ == T_LONG) { 2348 movq(dst, src); 2349 } else { 2350 movdl(dst, src); 2351 movsxl(typ, dst); 2352 } 2353 } else { 2354 extract(typ, dst, src, eindex); 2355 movsxl(typ, dst); 2356 } 2357 } 2358 2359 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, XMMRegister vtmp) { 2360 int esize = type2aelembytes(typ); 2361 int elem_per_lane = 16/esize; 2362 int eindex = elemindex % elem_per_lane; 2363 assert((typ == T_FLOAT || typ == T_DOUBLE),"required"); 2364 2365 if (eindex == 0) { 2366 movq(dst, src); 2367 } else { 2368 if (typ == T_FLOAT) { 2369 if (UseAVX == 0) { 2370 movdqu(dst, src); 2371 shufps(dst, dst, eindex); 2372 } else { 2373 vshufps(dst, src, src, eindex, Assembler::AVX_128bit); 2374 } 2375 } else { 2376 if (UseAVX == 0) { 2377 movdqu(dst, src); 2378 psrldq(dst, eindex*esize); 2379 } else { 2380 vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit); 2381 } 2382 movq(dst, dst); 2383 } 2384 } 2385 // Zero upper bits 2386 if (typ == T_FLOAT) { 2387 if (UseAVX == 0) { 2388 assert(vtmp != xnoreg, "required."); 2389 movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), noreg); 2390 pand(dst, vtmp); 2391 } else { 2392 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, noreg); 2393 } 2394 } 2395 } 2396 2397 void 
C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) { 2398 switch(typ) { 2399 case T_BYTE: 2400 case T_BOOLEAN: 2401 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2402 break; 2403 case T_SHORT: 2404 case T_CHAR: 2405 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2406 break; 2407 case T_INT: 2408 case T_FLOAT: 2409 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2410 break; 2411 case T_LONG: 2412 case T_DOUBLE: 2413 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len); 2414 break; 2415 default: 2416 assert(false,"Should not reach here."); 2417 break; 2418 } 2419 } 2420 2421 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral src2, int comparison, int vector_len, Register rscratch) { 2422 assert(rscratch != noreg || always_reachable(src2), "missing"); 2423 2424 switch(typ) { 2425 case T_BOOLEAN: 2426 case T_BYTE: 2427 evpcmpb(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2428 break; 2429 case T_CHAR: 2430 case T_SHORT: 2431 evpcmpw(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2432 break; 2433 case T_INT: 2434 case T_FLOAT: 2435 evpcmpd(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2436 break; 2437 case T_LONG: 2438 case T_DOUBLE: 2439 evpcmpq(kdmask, ksmask, src1, src2, comparison, /*signed*/ true, vector_len, rscratch); 2440 break; 2441 default: 2442 assert(false,"Should not reach here."); 2443 break; 2444 } 2445 } 2446 2447 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) { 2448 switch(typ) { 2449 case T_BYTE: 2450 evpblendmb(dst, kmask, src1, src2, merge, vector_len); 2451 break; 2452 case T_SHORT: 2453 evpblendmw(dst, kmask, src1, src2, merge, vector_len); 2454 break; 2455 case T_INT: 2456 case T_FLOAT: 2457 evpblendmd(dst, kmask, src1, src2, merge, vector_len); 2458 break; 2459 case T_LONG: 2460 case T_DOUBLE: 2461 evpblendmq(dst, kmask, src1, src2, merge, vector_len); 2462 break; 2463 default: 2464 assert(false,"Should not reach here."); 2465 break; 2466 } 2467 } 2468 2469 void C2_MacroAssembler::vectortest(BasicType bt, XMMRegister src1, XMMRegister src2, XMMRegister vtmp, int vlen_in_bytes) { 2470 assert(vlen_in_bytes <= 32, ""); 2471 int esize = type2aelembytes(bt); 2472 if (vlen_in_bytes == 32) { 2473 assert(vtmp == xnoreg, "required."); 2474 if (esize >= 4) { 2475 vtestps(src1, src2, AVX_256bit); 2476 } else { 2477 vptest(src1, src2, AVX_256bit); 2478 } 2479 return; 2480 } 2481 if (vlen_in_bytes < 16) { 2482 // Duplicate the lower part to fill the whole register, 2483 // Don't need to do so for src2 2484 assert(vtmp != xnoreg, "required"); 2485 int shuffle_imm = (vlen_in_bytes == 4) ? 
0x00 : 0x04; 2486 pshufd(vtmp, src1, shuffle_imm); 2487 } else { 2488 assert(vtmp == xnoreg, "required"); 2489 vtmp = src1; 2490 } 2491 if (esize >= 4 && VM_Version::supports_avx()) { 2492 vtestps(vtmp, src2, AVX_128bit); 2493 } else { 2494 ptest(vtmp, src2); 2495 } 2496 } 2497 2498 void C2_MacroAssembler::vpadd(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) { 2499 assert(UseAVX >= 2, "required"); 2500 #ifdef ASSERT 2501 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2502 bool is_bw_supported = VM_Version::supports_avx512bw(); 2503 if (is_bw && !is_bw_supported) { 2504 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2505 assert((dst->encoding() < 16) && (src1->encoding() < 16) && (src2->encoding() < 16), 2506 "XMM register should be 0-15"); 2507 } 2508 #endif // ASSERT 2509 switch (elem_bt) { 2510 case T_BYTE: vpaddb(dst, src1, src2, vlen_enc); return; 2511 case T_SHORT: vpaddw(dst, src1, src2, vlen_enc); return; 2512 case T_INT: vpaddd(dst, src1, src2, vlen_enc); return; 2513 case T_FLOAT: vaddps(dst, src1, src2, vlen_enc); return; 2514 case T_LONG: vpaddq(dst, src1, src2, vlen_enc); return; 2515 case T_DOUBLE: vaddpd(dst, src1, src2, vlen_enc); return; 2516 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2517 } 2518 } 2519 2520 #ifdef _LP64 2521 void C2_MacroAssembler::vpbroadcast(BasicType elem_bt, XMMRegister dst, Register src, int vlen_enc) { 2522 assert(UseAVX >= 2, "required"); 2523 bool is_bw = ((elem_bt == T_BYTE) || (elem_bt == T_SHORT)); 2524 bool is_vl = vlen_enc != Assembler::AVX_512bit; 2525 if ((UseAVX > 2) && 2526 (!is_bw || VM_Version::supports_avx512bw()) && 2527 (!is_vl || VM_Version::supports_avx512vl())) { 2528 switch (elem_bt) { 2529 case T_BYTE: evpbroadcastb(dst, src, vlen_enc); return; 2530 case T_SHORT: evpbroadcastw(dst, src, vlen_enc); return; 2531 case T_FLOAT: case T_INT: evpbroadcastd(dst, src, vlen_enc); return; 2532 case T_DOUBLE: case T_LONG: evpbroadcastq(dst, src, vlen_enc); return; 2533 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2534 } 2535 } else { 2536 assert(vlen_enc != Assembler::AVX_512bit, "required"); 2537 assert((dst->encoding() < 16),"XMM register should be 0-15"); 2538 switch (elem_bt) { 2539 case T_BYTE: movdl(dst, src); vpbroadcastb(dst, dst, vlen_enc); return; 2540 case T_SHORT: movdl(dst, src); vpbroadcastw(dst, dst, vlen_enc); return; 2541 case T_INT: movdl(dst, src); vpbroadcastd(dst, dst, vlen_enc); return; 2542 case T_FLOAT: movdl(dst, src); vbroadcastss(dst, dst, vlen_enc); return; 2543 case T_LONG: movdq(dst, src); vpbroadcastq(dst, dst, vlen_enc); return; 2544 case T_DOUBLE: movdq(dst, src); vbroadcastsd(dst, dst, vlen_enc); return; 2545 default: fatal("Unsupported type %s", type2name(elem_bt)); return; 2546 } 2547 } 2548 } 2549 #endif 2550 2551 void C2_MacroAssembler::vconvert_b2x(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, int vlen_enc) { 2552 switch (to_elem_bt) { 2553 case T_SHORT: 2554 vpmovsxbw(dst, src, vlen_enc); 2555 break; 2556 case T_INT: 2557 vpmovsxbd(dst, src, vlen_enc); 2558 break; 2559 case T_FLOAT: 2560 vpmovsxbd(dst, src, vlen_enc); 2561 vcvtdq2ps(dst, dst, vlen_enc); 2562 break; 2563 case T_LONG: 2564 vpmovsxbq(dst, src, vlen_enc); 2565 break; 2566 case T_DOUBLE: { 2567 int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? 
Assembler::AVX_256bit : Assembler::AVX_128bit; 2568 vpmovsxbd(dst, src, mid_vlen_enc); 2569 vcvtdq2pd(dst, dst, vlen_enc); 2570 break; 2571 } 2572 default: 2573 fatal("Unsupported type %s", type2name(to_elem_bt)); 2574 break; 2575 } 2576 } 2577 2578 //------------------------------------------------------------------------------------------- 2579 2580 // IndexOf for constant substrings with size >= 8 chars 2581 // which don't need to be loaded through stack. 2582 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2, 2583 Register cnt1, Register cnt2, 2584 int int_cnt2, Register result, 2585 XMMRegister vec, Register tmp, 2586 int ae) { 2587 ShortBranchVerifier sbv(this); 2588 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2589 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2590 2591 // This method uses the pcmpestri instruction with bound registers 2592 // inputs: 2593 // xmm - substring 2594 // rax - substring length (elements count) 2595 // mem - scanned string 2596 // rdx - string length (elements count) 2597 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2598 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2599 // outputs: 2600 // rcx - matched index in string 2601 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2602 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2603 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2604 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2605 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1; 2606 2607 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, 2608 RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR, 2609 MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE; 2610 2611 // Note, inline_string_indexOf() generates checks: 2612 // if (substr.count > string.count) return -1; 2613 // if (substr.count == 0) return 0; 2614 assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars"); 2615 2616 // Load substring. 2617 if (ae == StrIntrinsicNode::UL) { 2618 pmovzxbw(vec, Address(str2, 0)); 2619 } else { 2620 movdqu(vec, Address(str2, 0)); 2621 } 2622 movl(cnt2, int_cnt2); 2623 movptr(result, str1); // string addr 2624 2625 if (int_cnt2 > stride) { 2626 jmpb(SCAN_TO_SUBSTR); 2627 2628 // Reload substr for rescan, this code 2629 // is executed only for large substrings (> 8 chars) 2630 bind(RELOAD_SUBSTR); 2631 if (ae == StrIntrinsicNode::UL) { 2632 pmovzxbw(vec, Address(str2, 0)); 2633 } else { 2634 movdqu(vec, Address(str2, 0)); 2635 } 2636 negptr(cnt2); // Jumped here with negative cnt2, convert to positive 2637 2638 bind(RELOAD_STR); 2639 // We came here after the beginning of the substring was 2640 // matched but the rest of it was not so we need to search 2641 // again. Start from the next element after the previous match. 2642 2643 // cnt2 is the number of remaining substring elements and 2644 // cnt1 is the number of remaining string elements when the compare failed.
2645 // Restored cnt1 = cnt1 - cnt2 + int_cnt2 2646 subl(cnt1, cnt2); 2647 addl(cnt1, int_cnt2); 2648 movl(cnt2, int_cnt2); // Now restore cnt2 2649 2650 decrementl(cnt1); // Shift to next element 2651 cmpl(cnt1, cnt2); 2652 jcc(Assembler::negative, RET_NOT_FOUND); // Left less than substring 2653 2654 addptr(result, (1<<scale1)); 2655 2656 } // (int_cnt2 > 8) 2657 2658 // Scan string for start of substr in 16-byte vectors 2659 bind(SCAN_TO_SUBSTR); 2660 pcmpestri(vec, Address(result, 0), mode); 2661 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 2662 subl(cnt1, stride); 2663 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 2664 cmpl(cnt1, cnt2); 2665 jccb(Assembler::negative, RET_NOT_FOUND); // Left less than substring 2666 addptr(result, 16); 2667 jmpb(SCAN_TO_SUBSTR); 2668 2669 // Found a potential substr 2670 bind(FOUND_CANDIDATE); 2671 // Matched whole vector if first element matched (tmp(rcx) == 0). 2672 if (int_cnt2 == stride) { 2673 jccb(Assembler::overflow, RET_FOUND); // OF == 1 2674 } else { // int_cnt2 > 8 2675 jccb(Assembler::overflow, FOUND_SUBSTR); 2676 } 2677 // After pcmpestri tmp(rcx) contains matched element index 2678 // Compute start addr of substr 2679 lea(result, Address(result, tmp, scale1)); 2680 2681 // Make sure string is still long enough 2682 subl(cnt1, tmp); 2683 cmpl(cnt1, cnt2); 2684 if (int_cnt2 == stride) { 2685 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 2686 } else { // int_cnt2 > 8 2687 jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD); 2688 } 2689 // Left less than substring. 2690 2691 bind(RET_NOT_FOUND); 2692 movl(result, -1); 2693 jmp(EXIT); 2694 2695 if (int_cnt2 > stride) { 2696 // This code is optimized for the case when the whole substring 2697 // is matched if its head is matched. 2698 bind(MATCH_SUBSTR_HEAD); 2699 pcmpestri(vec, Address(result, 0), mode); 2700 // Reload only string if it does not match 2701 jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0 2702 2703 Label CONT_SCAN_SUBSTR; 2704 // Compare the rest of substring (> 8 chars). 2705 bind(FOUND_SUBSTR); 2706 // First 8 chars are already matched.
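    // Note: the remaining tail is scanned with a negative index. cnt2 is negated and
    // biased by the stride so it counts up toward zero, and below it is used as a
    // negative element offset from the end of the substring when forming the compare
    // addresses.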
2707 negptr(cnt2); 2708 addptr(cnt2, stride); 2709 2710 bind(SCAN_SUBSTR); 2711 subl(cnt1, stride); 2712 cmpl(cnt2, -stride); // Do not read beyond substring 2713 jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR); 2714 // Back-up strings to avoid reading beyond substring: 2715 // cnt1 = cnt1 - cnt2 + 8 2716 addl(cnt1, cnt2); // cnt2 is negative 2717 addl(cnt1, stride); 2718 movl(cnt2, stride); negptr(cnt2); 2719 bind(CONT_SCAN_SUBSTR); 2720 if (int_cnt2 < (int)G) { 2721 int tail_off1 = int_cnt2<<scale1; 2722 int tail_off2 = int_cnt2<<scale2; 2723 if (ae == StrIntrinsicNode::UL) { 2724 pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2)); 2725 } else { 2726 movdqu(vec, Address(str2, cnt2, scale2, tail_off2)); 2727 } 2728 pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode); 2729 } else { 2730 // calculate index in register to avoid integer overflow (int_cnt2*2) 2731 movl(tmp, int_cnt2); 2732 addptr(tmp, cnt2); 2733 if (ae == StrIntrinsicNode::UL) { 2734 pmovzxbw(vec, Address(str2, tmp, scale2, 0)); 2735 } else { 2736 movdqu(vec, Address(str2, tmp, scale2, 0)); 2737 } 2738 pcmpestri(vec, Address(result, tmp, scale1, 0), mode); 2739 } 2740 // Need to reload strings pointers if not matched whole vector 2741 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 2742 addptr(cnt2, stride); 2743 jcc(Assembler::negative, SCAN_SUBSTR); 2744 // Fall through if found full substring 2745 2746 } // (int_cnt2 > 8) 2747 2748 bind(RET_FOUND); 2749 // Found result if we matched full small substring. 2750 // Compute substr offset 2751 subptr(result, str1); 2752 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 2753 shrl(result, 1); // index 2754 } 2755 bind(EXIT); 2756 2757 } // string_indexofC8 2758 2759 // Small strings are loaded through stack if they cross page boundary. 2760 void C2_MacroAssembler::string_indexof(Register str1, Register str2, 2761 Register cnt1, Register cnt2, 2762 int int_cnt2, Register result, 2763 XMMRegister vec, Register tmp, 2764 int ae) { 2765 ShortBranchVerifier sbv(this); 2766 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2767 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2768 2769 // 2770 // int_cnt2 is length of small (< 8 chars) constant substring 2771 // or (-1) for non constant substring in which case its length 2772 // is in cnt2 register. 2773 // 2774 // Note, inline_string_indexOf() generates checks: 2775 // if (substr.count > string.count) return -1; 2776 // if (substr.count == 0) return 0; 2777 // 2778 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2779 assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0"); 2780 // This method uses the pcmpestri instruction with bound registers 2781 // inputs: 2782 // xmm - substring 2783 // rax - substring length (elements count) 2784 // mem - scanned string 2785 // rdx - string length (elements count) 2786 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2787 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2788 // outputs: 2789 // rcx - matched index in string 2790 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2791 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2792 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2793 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? 
Address::times_1 : scale1; 2794 2795 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR, 2796 RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR, 2797 FOUND_CANDIDATE; 2798 2799 { //======================================================== 2800 // We don't know where these strings are located 2801 // and we can't read beyond them. Load them through stack. 2802 Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR; 2803 2804 movptr(tmp, rsp); // save old SP 2805 2806 if (int_cnt2 > 0) { // small (< 8 chars) constant substring 2807 if (int_cnt2 == (1>>scale2)) { // One byte 2808 assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding"); 2809 load_unsigned_byte(result, Address(str2, 0)); 2810 movdl(vec, result); // move 32 bits 2811 } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) { // Three bytes 2812 // Not enough header space in 32-bit VM: 12+3 = 15. 2813 movl(result, Address(str2, -1)); 2814 shrl(result, 8); 2815 movdl(vec, result); // move 32 bits 2816 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) { // One char 2817 load_unsigned_short(result, Address(str2, 0)); 2818 movdl(vec, result); // move 32 bits 2819 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars 2820 movdl(vec, Address(str2, 0)); // move 32 bits 2821 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars 2822 movq(vec, Address(str2, 0)); // move 64 bits 2823 } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 = {2, ..., 7}) 2824 // Array header size is 12 bytes in 32-bit VM 2825 // + 6 bytes for 3 chars == 18 bytes, 2826 // enough space to load vec and shift. 2827 assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity"); 2828 if (ae == StrIntrinsicNode::UL) { 2829 int tail_off = int_cnt2-8; 2830 pmovzxbw(vec, Address(str2, tail_off)); 2831 psrldq(vec, -2*tail_off); 2832 } 2833 else { 2834 int tail_off = int_cnt2*(1<<scale2); 2835 movdqu(vec, Address(str2, tail_off-16)); 2836 psrldq(vec, 16-tail_off); 2837 } 2838 } 2839 } else { // not constant substring 2840 cmpl(cnt2, stride); 2841 jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough 2842 2843 // We can read beyond string if str+16 does not cross page boundary 2844 // since heaps are aligned and mapped by pages. 2845 assert(os::vm_page_size() < (int)G, "default page should be small"); 2846 movl(result, str2); // We need only low 32 bits 2847 andl(result, ((int)os::vm_page_size()-1)); 2848 cmpl(result, ((int)os::vm_page_size()-16)); 2849 jccb(Assembler::belowEqual, CHECK_STR); 2850 2851 // Move small strings to stack to allow loading 16 bytes into vec. 2852 subptr(rsp, 16); 2853 int stk_offset = wordSize-(1<<scale2); 2854 push(cnt2); 2855 2856 bind(COPY_SUBSTR); 2857 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) { 2858 load_unsigned_byte(result, Address(str2, cnt2, scale2, -1)); 2859 movb(Address(rsp, cnt2, scale2, stk_offset), result); 2860 } else if (ae == StrIntrinsicNode::UU) { 2861 load_unsigned_short(result, Address(str2, cnt2, scale2, -2)); 2862 movw(Address(rsp, cnt2, scale2, stk_offset), result); 2863 } 2864 decrement(cnt2); 2865 jccb(Assembler::notZero, COPY_SUBSTR); 2866 2867 pop(cnt2); 2868 movptr(str2, rsp); // New substring address 2869 } // non constant 2870 2871 bind(CHECK_STR); 2872 cmpl(cnt1, stride); 2873 jccb(Assembler::aboveEqual, BIG_STRINGS); 2874 2875 // Check cross page boundary.
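    // A 16-byte load starting at str1 stays within one page iff
    // (str1 & (page_size - 1)) <= page_size - 16; in that case the string is scanned in
    // place (BIG_STRINGS), otherwise it is copied onto the stack below.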
2876 movl(result, str1); // We need only low 32 bits 2877 andl(result, ((int)os::vm_page_size()-1)); 2878 cmpl(result, ((int)os::vm_page_size()-16)); 2879 jccb(Assembler::belowEqual, BIG_STRINGS); 2880 2881 subptr(rsp, 16); 2882 int stk_offset = -(1<<scale1); 2883 if (int_cnt2 < 0) { // not constant 2884 push(cnt2); 2885 stk_offset += wordSize; 2886 } 2887 movl(cnt2, cnt1); 2888 2889 bind(COPY_STR); 2890 if (ae == StrIntrinsicNode::LL) { 2891 load_unsigned_byte(result, Address(str1, cnt2, scale1, -1)); 2892 movb(Address(rsp, cnt2, scale1, stk_offset), result); 2893 } else { 2894 load_unsigned_short(result, Address(str1, cnt2, scale1, -2)); 2895 movw(Address(rsp, cnt2, scale1, stk_offset), result); 2896 } 2897 decrement(cnt2); 2898 jccb(Assembler::notZero, COPY_STR); 2899 2900 if (int_cnt2 < 0) { // not constant 2901 pop(cnt2); 2902 } 2903 movptr(str1, rsp); // New string address 2904 2905 bind(BIG_STRINGS); 2906 // Load substring. 2907 if (int_cnt2 < 0) { // -1 2908 if (ae == StrIntrinsicNode::UL) { 2909 pmovzxbw(vec, Address(str2, 0)); 2910 } else { 2911 movdqu(vec, Address(str2, 0)); 2912 } 2913 push(cnt2); // substr count 2914 push(str2); // substr addr 2915 push(str1); // string addr 2916 } else { 2917 // Small (< 8 chars) constant substrings are loaded already. 2918 movl(cnt2, int_cnt2); 2919 } 2920 push(tmp); // original SP 2921 2922 } // Finished loading 2923 2924 //======================================================== 2925 // Start search 2926 // 2927 2928 movptr(result, str1); // string addr 2929 2930 if (int_cnt2 < 0) { // Only for non constant substring 2931 jmpb(SCAN_TO_SUBSTR); 2932 2933 // SP saved at sp+0 2934 // String saved at sp+1*wordSize 2935 // Substr saved at sp+2*wordSize 2936 // Substr count saved at sp+3*wordSize 2937 2938 // Reload substr for rescan, this code 2939 // is executed only for large substrings (> 8 chars) 2940 bind(RELOAD_SUBSTR); 2941 movptr(str2, Address(rsp, 2*wordSize)); 2942 movl(cnt2, Address(rsp, 3*wordSize)); 2943 if (ae == StrIntrinsicNode::UL) { 2944 pmovzxbw(vec, Address(str2, 0)); 2945 } else { 2946 movdqu(vec, Address(str2, 0)); 2947 } 2948 // We came here after the beginning of the substring was 2949 // matched but the rest of it was not so we need to search 2950 // again. Start from the next element after the previous match. 2951 subptr(str1, result); // Restore counter 2952 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 2953 shrl(str1, 1); 2954 } 2955 addl(cnt1, str1); 2956 decrementl(cnt1); // Shift to next element 2957 cmpl(cnt1, cnt2); 2958 jcc(Assembler::negative, RET_NOT_FOUND); // Left less then substring 2959 2960 addptr(result, (1<<scale1)); 2961 } // non constant 2962 2963 // Scan string for start of substr in 16-byte vectors 2964 bind(SCAN_TO_SUBSTR); 2965 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2966 pcmpestri(vec, Address(result, 0), mode); 2967 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 2968 subl(cnt1, stride); 2969 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 2970 cmpl(cnt1, cnt2); 2971 jccb(Assembler::negative, RET_NOT_FOUND); // Left less then substring 2972 addptr(result, 16); 2973 2974 bind(ADJUST_STR); 2975 cmpl(cnt1, stride); // Do not read beyond string 2976 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 2977 // Back-up string to avoid reading beyond string. 
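  // Re-position the scan so the final 16-byte read ends exactly at the end of the
  // string: result is moved to (string end - 16) and the remaining count is treated as
  // one full stride.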
2978 lea(result, Address(result, cnt1, scale1, -16)); 2979 movl(cnt1, stride); 2980 jmpb(SCAN_TO_SUBSTR); 2981 2982 // Found a potential substr 2983 bind(FOUND_CANDIDATE); 2984 // After pcmpestri tmp(rcx) contains matched element index 2985 2986 // Make sure string is still long enough 2987 subl(cnt1, tmp); 2988 cmpl(cnt1, cnt2); 2989 jccb(Assembler::greaterEqual, FOUND_SUBSTR); 2990 // Left less then substring. 2991 2992 bind(RET_NOT_FOUND); 2993 movl(result, -1); 2994 jmp(CLEANUP); 2995 2996 bind(FOUND_SUBSTR); 2997 // Compute start addr of substr 2998 lea(result, Address(result, tmp, scale1)); 2999 if (int_cnt2 > 0) { // Constant substring 3000 // Repeat search for small substring (< 8 chars) 3001 // from new point without reloading substring. 3002 // Have to check that we don't read beyond string. 3003 cmpl(tmp, stride-int_cnt2); 3004 jccb(Assembler::greater, ADJUST_STR); 3005 // Fall through if matched whole substring. 3006 } else { // non constant 3007 assert(int_cnt2 == -1, "should be != 0"); 3008 3009 addl(tmp, cnt2); 3010 // Found result if we matched whole substring. 3011 cmpl(tmp, stride); 3012 jcc(Assembler::lessEqual, RET_FOUND); 3013 3014 // Repeat search for small substring (<= 8 chars) 3015 // from new point 'str1' without reloading substring. 3016 cmpl(cnt2, stride); 3017 // Have to check that we don't read beyond string. 3018 jccb(Assembler::lessEqual, ADJUST_STR); 3019 3020 Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG; 3021 // Compare the rest of substring (> 8 chars). 3022 movptr(str1, result); 3023 3024 cmpl(tmp, cnt2); 3025 // First 8 chars are already matched. 3026 jccb(Assembler::equal, CHECK_NEXT); 3027 3028 bind(SCAN_SUBSTR); 3029 pcmpestri(vec, Address(str1, 0), mode); 3030 // Need to reload strings pointers if not matched whole vector 3031 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 3032 3033 bind(CHECK_NEXT); 3034 subl(cnt2, stride); 3035 jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring 3036 addptr(str1, 16); 3037 if (ae == StrIntrinsicNode::UL) { 3038 addptr(str2, 8); 3039 } else { 3040 addptr(str2, 16); 3041 } 3042 subl(cnt1, stride); 3043 cmpl(cnt2, stride); // Do not read beyond substring 3044 jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR); 3045 // Back-up strings to avoid reading beyond substring. 
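// str2 is pulled back so that its final load (8 bytes for the byte-encoded
// substring in the UL case, 16 bytes otherwise) ends exactly at the end of the
// substring; str1 is pulled back by the same number of elements, and
// cnt1/cnt2 are adjusted accordingly.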
3046 3047 if (ae == StrIntrinsicNode::UL) { 3048 lea(str2, Address(str2, cnt2, scale2, -8)); 3049 lea(str1, Address(str1, cnt2, scale1, -16)); 3050 } else { 3051 lea(str2, Address(str2, cnt2, scale2, -16)); 3052 lea(str1, Address(str1, cnt2, scale1, -16)); 3053 } 3054 subl(cnt1, cnt2); 3055 movl(cnt2, stride); 3056 addl(cnt1, stride); 3057 bind(CONT_SCAN_SUBSTR); 3058 if (ae == StrIntrinsicNode::UL) { 3059 pmovzxbw(vec, Address(str2, 0)); 3060 } else { 3061 movdqu(vec, Address(str2, 0)); 3062 } 3063 jmp(SCAN_SUBSTR); 3064 3065 bind(RET_FOUND_LONG); 3066 movptr(str1, Address(rsp, wordSize)); 3067 } // non constant 3068 3069 bind(RET_FOUND); 3070 // Compute substr offset 3071 subptr(result, str1); 3072 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 3073 shrl(result, 1); // index 3074 } 3075 bind(CLEANUP); 3076 pop(rsp); // restore SP 3077 3078 } // string_indexof 3079 3080 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3081 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3082 ShortBranchVerifier sbv(this); 3083 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3084 3085 int stride = 8; 3086 3087 Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP, 3088 SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP, 3089 RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT, 3090 FOUND_SEQ_CHAR, DONE_LABEL; 3091 3092 movptr(result, str1); 3093 if (UseAVX >= 2) { 3094 cmpl(cnt1, stride); 3095 jcc(Assembler::less, SCAN_TO_CHAR); 3096 cmpl(cnt1, 2*stride); 3097 jcc(Assembler::less, SCAN_TO_8_CHAR_INIT); 3098 movdl(vec1, ch); 3099 vpbroadcastw(vec1, vec1, Assembler::AVX_256bit); 3100 vpxor(vec2, vec2); 3101 movl(tmp, cnt1); 3102 andl(tmp, 0xFFFFFFF0); //vector count (in chars) 3103 andl(cnt1,0x0000000F); //tail count (in chars) 3104 3105 bind(SCAN_TO_16_CHAR_LOOP); 3106 vmovdqu(vec3, Address(result, 0)); 3107 vpcmpeqw(vec3, vec3, vec1, 1); 3108 vptest(vec2, vec3); 3109 jcc(Assembler::carryClear, FOUND_CHAR); 3110 addptr(result, 32); 3111 subl(tmp, 2*stride); 3112 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); 3113 jmp(SCAN_TO_8_CHAR); 3114 bind(SCAN_TO_8_CHAR_INIT); 3115 movdl(vec1, ch); 3116 pshuflw(vec1, vec1, 0x00); 3117 pshufd(vec1, vec1, 0); 3118 pxor(vec2, vec2); 3119 } 3120 bind(SCAN_TO_8_CHAR); 3121 cmpl(cnt1, stride); 3122 jcc(Assembler::less, SCAN_TO_CHAR); 3123 if (UseAVX < 2) { 3124 movdl(vec1, ch); 3125 pshuflw(vec1, vec1, 0x00); 3126 pshufd(vec1, vec1, 0); 3127 pxor(vec2, vec2); 3128 } 3129 movl(tmp, cnt1); 3130 andl(tmp, 0xFFFFFFF8); //vector count (in chars) 3131 andl(cnt1,0x00000007); //tail count (in chars) 3132 3133 bind(SCAN_TO_8_CHAR_LOOP); 3134 movdqu(vec3, Address(result, 0)); 3135 pcmpeqw(vec3, vec1); 3136 ptest(vec2, vec3); 3137 jcc(Assembler::carryClear, FOUND_CHAR); 3138 addptr(result, 16); 3139 subl(tmp, stride); 3140 jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP); 3141 bind(SCAN_TO_CHAR); 3142 testl(cnt1, cnt1); 3143 jcc(Assembler::zero, RET_NOT_FOUND); 3144 bind(SCAN_TO_CHAR_LOOP); 3145 load_unsigned_short(tmp, Address(result, 0)); 3146 cmpl(ch, tmp); 3147 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3148 addptr(result, 2); 3149 subl(cnt1, 1); 3150 jccb(Assembler::zero, RET_NOT_FOUND); 3151 jmp(SCAN_TO_CHAR_LOOP); 3152 3153 bind(RET_NOT_FOUND); 3154 movl(result, -1); 3155 jmpb(DONE_LABEL); 3156 3157 bind(FOUND_CHAR); 3158 if (UseAVX >= 2) { 3159 vpmovmskb(tmp, vec3); 3160 } else { 3161 pmovmskb(tmp, vec3); 3162 } 3163 bsfl(ch, tmp); 3164 addptr(result, ch); 3165 3166 bind(FOUND_SEQ_CHAR); 3167 
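// 'result' currently holds the address of the matching char; convert it into
// a char index relative to str1 (byte offset divided by 2 for UTF-16 data).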
subptr(result, str1); 3168 shrl(result, 1); 3169 3170 bind(DONE_LABEL); 3171 } // string_indexof_char 3172 3173 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result, 3174 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 3175 ShortBranchVerifier sbv(this); 3176 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 3177 3178 int stride = 16; 3179 3180 Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP, 3181 SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP, 3182 RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT, 3183 FOUND_SEQ_CHAR, DONE_LABEL; 3184 3185 movptr(result, str1); 3186 if (UseAVX >= 2) { 3187 cmpl(cnt1, stride); 3188 jcc(Assembler::less, SCAN_TO_CHAR_INIT); 3189 cmpl(cnt1, stride*2); 3190 jcc(Assembler::less, SCAN_TO_16_CHAR_INIT); 3191 movdl(vec1, ch); 3192 vpbroadcastb(vec1, vec1, Assembler::AVX_256bit); 3193 vpxor(vec2, vec2); 3194 movl(tmp, cnt1); 3195 andl(tmp, 0xFFFFFFE0); //vector count (in chars) 3196 andl(cnt1,0x0000001F); //tail count (in chars) 3197 3198 bind(SCAN_TO_32_CHAR_LOOP); 3199 vmovdqu(vec3, Address(result, 0)); 3200 vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit); 3201 vptest(vec2, vec3); 3202 jcc(Assembler::carryClear, FOUND_CHAR); 3203 addptr(result, 32); 3204 subl(tmp, stride*2); 3205 jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP); 3206 jmp(SCAN_TO_16_CHAR); 3207 3208 bind(SCAN_TO_16_CHAR_INIT); 3209 movdl(vec1, ch); 3210 pxor(vec2, vec2); 3211 pshufb(vec1, vec2); 3212 } 3213 3214 bind(SCAN_TO_16_CHAR); 3215 cmpl(cnt1, stride); 3216 jcc(Assembler::less, SCAN_TO_CHAR_INIT);//less than 16 entries left 3217 if (UseAVX < 2) { 3218 movdl(vec1, ch); 3219 pxor(vec2, vec2); 3220 pshufb(vec1, vec2); 3221 } 3222 movl(tmp, cnt1); 3223 andl(tmp, 0xFFFFFFF0); //vector count (in bytes) 3224 andl(cnt1,0x0000000F); //tail count (in bytes) 3225 3226 bind(SCAN_TO_16_CHAR_LOOP); 3227 movdqu(vec3, Address(result, 0)); 3228 pcmpeqb(vec3, vec1); 3229 ptest(vec2, vec3); 3230 jcc(Assembler::carryClear, FOUND_CHAR); 3231 addptr(result, 16); 3232 subl(tmp, stride); 3233 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);//last 16 items... 
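// Scalar tail for the Latin-1 case: at most 15 bytes remain here, so compare
// them one at a time.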
3234 3235 bind(SCAN_TO_CHAR_INIT); 3236 testl(cnt1, cnt1); 3237 jcc(Assembler::zero, RET_NOT_FOUND); 3238 bind(SCAN_TO_CHAR_LOOP); 3239 load_unsigned_byte(tmp, Address(result, 0)); 3240 cmpl(ch, tmp); 3241 jccb(Assembler::equal, FOUND_SEQ_CHAR); 3242 addptr(result, 1); 3243 subl(cnt1, 1); 3244 jccb(Assembler::zero, RET_NOT_FOUND); 3245 jmp(SCAN_TO_CHAR_LOOP); 3246 3247 bind(RET_NOT_FOUND); 3248 movl(result, -1); 3249 jmpb(DONE_LABEL); 3250 3251 bind(FOUND_CHAR); 3252 if (UseAVX >= 2) { 3253 vpmovmskb(tmp, vec3); 3254 } else { 3255 pmovmskb(tmp, vec3); 3256 } 3257 bsfl(ch, tmp); 3258 addptr(result, ch); 3259 3260 bind(FOUND_SEQ_CHAR); 3261 subptr(result, str1); 3262 3263 bind(DONE_LABEL); 3264 } // stringL_indexof_char 3265 3266 int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) { 3267 switch (eltype) { 3268 case T_BOOLEAN: return sizeof(jboolean); 3269 case T_BYTE: return sizeof(jbyte); 3270 case T_SHORT: return sizeof(jshort); 3271 case T_CHAR: return sizeof(jchar); 3272 case T_INT: return sizeof(jint); 3273 default: 3274 ShouldNotReachHere(); 3275 return -1; 3276 } 3277 } 3278 3279 void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) { 3280 switch (eltype) { 3281 // T_BOOLEAN used as surrogate for unsigned byte 3282 case T_BOOLEAN: movzbl(dst, src); break; 3283 case T_BYTE: movsbl(dst, src); break; 3284 case T_SHORT: movswl(dst, src); break; 3285 case T_CHAR: movzwl(dst, src); break; 3286 case T_INT: movl(dst, src); break; 3287 default: 3288 ShouldNotReachHere(); 3289 } 3290 } 3291 3292 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) { 3293 load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8); 3294 } 3295 3296 void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) { 3297 load_vector(dst, src, arrays_hashcode_elsize(eltype) * 8); 3298 } 3299 3300 void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) { 3301 const int vlen = Assembler::AVX_256bit; 3302 switch (eltype) { 3303 case T_BOOLEAN: vector_unsigned_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3304 case T_BYTE: vector_signed_cast(dst, dst, vlen, T_BYTE, T_INT); break; 3305 case T_SHORT: vector_signed_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3306 case T_CHAR: vector_unsigned_cast(dst, dst, vlen, T_SHORT, T_INT); break; 3307 case T_INT: 3308 // do nothing 3309 break; 3310 default: 3311 ShouldNotReachHere(); 3312 } 3313 } 3314 3315 void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result, 3316 Register index, Register tmp2, Register tmp3, XMMRegister vnext, 3317 XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3, 3318 XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3, 3319 XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3, 3320 BasicType eltype) { 3321 ShortBranchVerifier sbv(this); 3322 assert(UseAVX >= 2, "AVX2 intrinsics are required"); 3323 assert_different_registers(ary1, cnt1, result, index, tmp2, tmp3); 3324 assert_different_registers(vnext, vcoef0, vcoef1, vcoef2, vcoef3, vresult0, vresult1, vresult2, vresult3, vtmp0, vtmp1, vtmp2, vtmp3); 3325 3326 Label SHORT_UNROLLED_BEGIN, SHORT_UNROLLED_LOOP_BEGIN, 3327 SHORT_UNROLLED_LOOP_EXIT, 3328 UNROLLED_SCALAR_LOOP_BEGIN, UNROLLED_SCALAR_SKIP, UNROLLED_SCALAR_RESUME, 3329 UNROLLED_VECTOR_LOOP_BEGIN, 3330 END; 3331 switch (eltype) { 3332 case T_BOOLEAN: 
BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); break; 3333 case T_CHAR: BLOCK_COMMENT("arrays_hashcode(char) {"); break; 3334 case T_BYTE: BLOCK_COMMENT("arrays_hashcode(byte) {"); break; 3335 case T_SHORT: BLOCK_COMMENT("arrays_hashcode(short) {"); break; 3336 case T_INT: BLOCK_COMMENT("arrays_hashcode(int) {"); break; 3337 default: BLOCK_COMMENT("arrays_hashcode {"); break; 3338 } 3339 3340 // For "renaming" for readibility of the code 3341 const XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 }, 3342 vresult[] = { vresult0, vresult1, vresult2, vresult3 }, 3343 vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 }; 3344 3345 const int elsize = arrays_hashcode_elsize(eltype); 3346 3347 /* 3348 if (cnt1 >= 2) { 3349 if (cnt1 >= 32) { 3350 UNROLLED VECTOR LOOP 3351 } 3352 UNROLLED SCALAR LOOP 3353 } 3354 SINGLE SCALAR 3355 */ 3356 3357 cmpl(cnt1, 32); 3358 jcc(Assembler::less, SHORT_UNROLLED_BEGIN); 3359 3360 // cnt1 >= 32 && generate_vectorized_loop 3361 xorl(index, index); 3362 3363 // vresult = IntVector.zero(I256); 3364 for (int idx = 0; idx < 4; idx++) { 3365 vpxor(vresult[idx], vresult[idx]); 3366 } 3367 // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]); 3368 Register bound = tmp2; 3369 Register next = tmp3; 3370 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + (0 * sizeof(jint)))); 3371 movl(next, Address(tmp2, 0)); 3372 movdl(vnext, next); 3373 vpbroadcastd(vnext, vnext, Assembler::AVX_256bit); 3374 3375 // index = 0; 3376 // bound = cnt1 & ~(32 - 1); 3377 movl(bound, cnt1); 3378 andl(bound, ~(32 - 1)); 3379 // for (; index < bound; index += 32) { 3380 bind(UNROLLED_VECTOR_LOOP_BEGIN); 3381 // result *= next; 3382 imull(result, next); 3383 // loop fission to upfront the cost of fetching from memory, OOO execution 3384 // can then hopefully do a better job of prefetching 3385 for (int idx = 0; idx < 4; idx++) { 3386 arrays_hashcode_elvload(vtmp[idx], Address(ary1, index, Address::times(elsize), 8 * idx * elsize), eltype); 3387 } 3388 // vresult = vresult * vnext + ary1[index+8*idx:index+8*idx+7]; 3389 for (int idx = 0; idx < 4; idx++) { 3390 vpmulld(vresult[idx], vresult[idx], vnext, Assembler::AVX_256bit); 3391 arrays_hashcode_elvcast(vtmp[idx], eltype); 3392 vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit); 3393 } 3394 // index += 32; 3395 addl(index, 32); 3396 // index < bound; 3397 cmpl(index, bound); 3398 jcc(Assembler::less, UNROLLED_VECTOR_LOOP_BEGIN); 3399 // } 3400 3401 lea(ary1, Address(ary1, bound, Address::times(elsize))); 3402 subl(cnt1, bound); 3403 // release bound 3404 3405 // vresult *= IntVector.fromArray(I256, power_of_31_backwards, 1); 3406 for (int idx = 0; idx < 4; idx++) { 3407 lea(tmp2, ExternalAddress(StubRoutines::x86::arrays_hashcode_powers_of_31() + ((8 * idx + 1) * sizeof(jint)))); 3408 arrays_hashcode_elvload(vcoef[idx], Address(tmp2, 0), T_INT); 3409 vpmulld(vresult[idx], vresult[idx], vcoef[idx], Assembler::AVX_256bit); 3410 } 3411 // result += vresult.reduceLanes(ADD); 3412 for (int idx = 0; idx < 4; idx++) { 3413 reduceI(Op_AddReductionVI, 256/(sizeof(jint) * 8), result, result, vresult[idx], vtmp[(idx * 2 + 0) % 4], vtmp[(idx * 2 + 1) % 4]); 3414 } 3415 3416 // } else if (cnt1 < 32) { 3417 3418 bind(SHORT_UNROLLED_BEGIN); 3419 // int i = 1; 3420 movl(index, 1); 3421 cmpl(index, cnt1); 3422 jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_EXIT); 3423 3424 // for (; i < cnt1 ; i += 2) { 3425 bind(SHORT_UNROLLED_LOOP_BEGIN); 3426 movl(tmp3, 961); 3427 imull(result, tmp3); 3428 
arrays_hashcode_elload(tmp2, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3429 movl(tmp3, tmp2); 3430 shll(tmp3, 5); 3431 subl(tmp3, tmp2); 3432 addl(result, tmp3); 3433 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize)), eltype); 3434 addl(result, tmp3); 3435 addl(index, 2); 3436 cmpl(index, cnt1); 3437 jccb(Assembler::less, SHORT_UNROLLED_LOOP_BEGIN); 3438 3439 // } 3440 // if (i >= cnt1) { 3441 bind(SHORT_UNROLLED_LOOP_EXIT); 3442 jccb(Assembler::greater, END); 3443 movl(tmp2, result); 3444 shll(result, 5); 3445 subl(result, tmp2); 3446 arrays_hashcode_elload(tmp3, Address(ary1, index, Address::times(elsize), -elsize), eltype); 3447 addl(result, tmp3); 3448 // } 3449 bind(END); 3450 3451 BLOCK_COMMENT("} // arrays_hashcode"); 3452 3453 } // arrays_hashcode 3454 3455 // helper function for string_compare 3456 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2, 3457 Address::ScaleFactor scale, Address::ScaleFactor scale1, 3458 Address::ScaleFactor scale2, Register index, int ae) { 3459 if (ae == StrIntrinsicNode::LL) { 3460 load_unsigned_byte(elem1, Address(str1, index, scale, 0)); 3461 load_unsigned_byte(elem2, Address(str2, index, scale, 0)); 3462 } else if (ae == StrIntrinsicNode::UU) { 3463 load_unsigned_short(elem1, Address(str1, index, scale, 0)); 3464 load_unsigned_short(elem2, Address(str2, index, scale, 0)); 3465 } else { 3466 load_unsigned_byte(elem1, Address(str1, index, scale1, 0)); 3467 load_unsigned_short(elem2, Address(str2, index, scale2, 0)); 3468 } 3469 } 3470 3471 // Compare strings, used for char[] and byte[]. 3472 void C2_MacroAssembler::string_compare(Register str1, Register str2, 3473 Register cnt1, Register cnt2, Register result, 3474 XMMRegister vec1, int ae, KRegister mask) { 3475 ShortBranchVerifier sbv(this); 3476 Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL; 3477 Label COMPARE_WIDE_VECTORS_LOOP_FAILED; // used only _LP64 && AVX3 3478 int stride, stride2, adr_stride, adr_stride1, adr_stride2; 3479 int stride2x2 = 0x40; 3480 Address::ScaleFactor scale = Address::no_scale; 3481 Address::ScaleFactor scale1 = Address::no_scale; 3482 Address::ScaleFactor scale2 = Address::no_scale; 3483 3484 if (ae != StrIntrinsicNode::LL) { 3485 stride2x2 = 0x20; 3486 } 3487 3488 if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) { 3489 shrl(cnt2, 1); 3490 } 3491 // Compute the minimum of the string lengths and the 3492 // difference of the string lengths (stack). 3493 // Do the conditional move stuff 3494 movl(result, cnt1); 3495 subl(cnt1, cnt2); 3496 push(cnt1); 3497 cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2) 3498 3499 // Is the minimum length zero? 
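// If the minimum length is zero, one string is a prefix of the other, so the
// length difference pushed above is the final answer.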
3500 testl(cnt2, cnt2); 3501 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3502 if (ae == StrIntrinsicNode::LL) { 3503 // Load first bytes 3504 load_unsigned_byte(result, Address(str1, 0)); // result = str1[0] 3505 load_unsigned_byte(cnt1, Address(str2, 0)); // cnt1 = str2[0] 3506 } else if (ae == StrIntrinsicNode::UU) { 3507 // Load first characters 3508 load_unsigned_short(result, Address(str1, 0)); 3509 load_unsigned_short(cnt1, Address(str2, 0)); 3510 } else { 3511 load_unsigned_byte(result, Address(str1, 0)); 3512 load_unsigned_short(cnt1, Address(str2, 0)); 3513 } 3514 subl(result, cnt1); 3515 jcc(Assembler::notZero, POP_LABEL); 3516 3517 if (ae == StrIntrinsicNode::UU) { 3518 // Divide length by 2 to get number of chars 3519 shrl(cnt2, 1); 3520 } 3521 cmpl(cnt2, 1); 3522 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3523 3524 // Check if the strings start at the same location and setup scale and stride 3525 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3526 cmpptr(str1, str2); 3527 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 3528 if (ae == StrIntrinsicNode::LL) { 3529 scale = Address::times_1; 3530 stride = 16; 3531 } else { 3532 scale = Address::times_2; 3533 stride = 8; 3534 } 3535 } else { 3536 scale1 = Address::times_1; 3537 scale2 = Address::times_2; 3538 // scale not used 3539 stride = 8; 3540 } 3541 3542 if (UseAVX >= 2 && UseSSE42Intrinsics) { 3543 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR; 3544 Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR; 3545 Label COMPARE_WIDE_VECTORS_LOOP_AVX2; 3546 Label COMPARE_TAIL_LONG; 3547 Label COMPARE_WIDE_VECTORS_LOOP_AVX3; // used only _LP64 && AVX3 3548 3549 int pcmpmask = 0x19; 3550 if (ae == StrIntrinsicNode::LL) { 3551 pcmpmask &= ~0x01; 3552 } 3553 3554 // Setup to compare 16-chars (32-bytes) vectors, 3555 // start from first character again because it has aligned address. 3556 if (ae == StrIntrinsicNode::LL) { 3557 stride2 = 32; 3558 } else { 3559 stride2 = 16; 3560 } 3561 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3562 adr_stride = stride << scale; 3563 } else { 3564 adr_stride1 = 8; //stride << scale1; 3565 adr_stride2 = 16; //stride << scale2; 3566 } 3567 3568 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3569 // rax and rdx are used by pcmpestri as elements counters 3570 movl(result, cnt2); 3571 andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count 3572 jcc(Assembler::zero, COMPARE_TAIL_LONG); 3573 3574 // fast path : compare first 2 8-char vectors. 
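// pcmpestri with imm8 0x19 (0x18 for LL) does an 'equal each' string compare
// on unsigned shorts (bytes for LL) with negated result, so CF == 1 ('below')
// means a mismatch was found and rcx holds its element index.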
3575 bind(COMPARE_16_CHARS); 3576 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3577 movdqu(vec1, Address(str1, 0)); 3578 } else { 3579 pmovzxbw(vec1, Address(str1, 0)); 3580 } 3581 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3582 jccb(Assembler::below, COMPARE_INDEX_CHAR); 3583 3584 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3585 movdqu(vec1, Address(str1, adr_stride)); 3586 pcmpestri(vec1, Address(str2, adr_stride), pcmpmask); 3587 } else { 3588 pmovzxbw(vec1, Address(str1, adr_stride1)); 3589 pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask); 3590 } 3591 jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS); 3592 addl(cnt1, stride); 3593 3594 // Compare the characters at index in cnt1 3595 bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character 3596 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 3597 subl(result, cnt2); 3598 jmp(POP_LABEL); 3599 3600 // Setup the registers to start vector comparison loop 3601 bind(COMPARE_WIDE_VECTORS); 3602 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3603 lea(str1, Address(str1, result, scale)); 3604 lea(str2, Address(str2, result, scale)); 3605 } else { 3606 lea(str1, Address(str1, result, scale1)); 3607 lea(str2, Address(str2, result, scale2)); 3608 } 3609 subl(result, stride2); 3610 subl(cnt2, stride2); 3611 jcc(Assembler::zero, COMPARE_WIDE_TAIL); 3612 negptr(result); 3613 3614 // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest) 3615 bind(COMPARE_WIDE_VECTORS_LOOP); 3616 3617 #ifdef _LP64 3618 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 3619 cmpl(cnt2, stride2x2); 3620 jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2); 3621 testl(cnt2, stride2x2-1); // cnt2 holds the vector count 3622 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2); // means we cannot subtract by 0x40 3623 3624 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 3625 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3626 evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit); 3627 evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3628 } else { 3629 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit); 3630 evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 3631 } 3632 kortestql(mask, mask); 3633 jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED); // miscompare 3634 addptr(result, stride2x2); // update since we already compared at this addr 3635 subl(cnt2, stride2x2); // and sub the size too 3636 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3); 3637 3638 vpxor(vec1, vec1); 3639 jmpb(COMPARE_WIDE_TAIL); 3640 }//if (VM_Version::supports_avx512vlbw()) 3641 #endif // _LP64 3642 3643 3644 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3645 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3646 vmovdqu(vec1, Address(str1, result, scale)); 3647 vpxor(vec1, Address(str2, result, scale)); 3648 } else { 3649 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit); 3650 vpxor(vec1, Address(str2, result, scale2)); 3651 } 3652 vptest(vec1, vec1); 3653 jcc(Assembler::notZero, VECTOR_NOT_EQUAL); 3654 addptr(result, stride2); 3655 subl(cnt2, stride2); 3656 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP); 3657 // clean upper bits of YMM registers 
3658 vpxor(vec1, vec1); 3659 3660 // compare wide vectors tail 3661 bind(COMPARE_WIDE_TAIL); 3662 testptr(result, result); 3663 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3664 3665 movl(result, stride2); 3666 movl(cnt2, result); 3667 negptr(result); 3668 jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3669 3670 // Identifies the mismatching (higher or lower) 16 bytes in the 32-byte vectors. 3671 bind(VECTOR_NOT_EQUAL); 3672 // clean upper bits of YMM registers 3673 vpxor(vec1, vec1); 3674 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3675 lea(str1, Address(str1, result, scale)); 3676 lea(str2, Address(str2, result, scale)); 3677 } else { 3678 lea(str1, Address(str1, result, scale1)); 3679 lea(str2, Address(str2, result, scale2)); 3680 } 3681 jmp(COMPARE_16_CHARS); 3682 3683 // Compare tail chars, length between 1 and 15 chars 3684 bind(COMPARE_TAIL_LONG); 3685 movl(cnt2, result); 3686 cmpl(cnt2, stride); 3687 jcc(Assembler::less, COMPARE_SMALL_STR); 3688 3689 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3690 movdqu(vec1, Address(str1, 0)); 3691 } else { 3692 pmovzxbw(vec1, Address(str1, 0)); 3693 } 3694 pcmpestri(vec1, Address(str2, 0), pcmpmask); 3695 jcc(Assembler::below, COMPARE_INDEX_CHAR); 3696 subptr(cnt2, stride); 3697 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3698 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3699 lea(str1, Address(str1, result, scale)); 3700 lea(str2, Address(str2, result, scale)); 3701 } else { 3702 lea(str1, Address(str1, result, scale1)); 3703 lea(str2, Address(str2, result, scale2)); 3704 } 3705 negptr(cnt2); 3706 jmpb(WHILE_HEAD_LABEL); 3707 3708 bind(COMPARE_SMALL_STR); 3709 } else if (UseSSE42Intrinsics) { 3710 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL; 3711 int pcmpmask = 0x19; 3712 // Set up to compare 8-char (16-byte) vectors, 3713 // start from first character again because it has aligned address.
3714 movl(result, cnt2); 3715 andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count 3716 if (ae == StrIntrinsicNode::LL) { 3717 pcmpmask &= ~0x01; 3718 } 3719 jcc(Assembler::zero, COMPARE_TAIL); 3720 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3721 lea(str1, Address(str1, result, scale)); 3722 lea(str2, Address(str2, result, scale)); 3723 } else { 3724 lea(str1, Address(str1, result, scale1)); 3725 lea(str2, Address(str2, result, scale2)); 3726 } 3727 negptr(result); 3728 3729 // pcmpestri 3730 // inputs: 3731 // vec1- substring 3732 // rax - negative string length (elements count) 3733 // mem - scanned string 3734 // rdx - string length (elements count) 3735 // pcmpmask - cmp mode: 11000 (string compare with negated result) 3736 // + 00 (unsigned bytes) or + 01 (unsigned shorts) 3737 // outputs: 3738 // rcx - first mismatched element index 3739 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 3740 3741 bind(COMPARE_WIDE_VECTORS); 3742 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3743 movdqu(vec1, Address(str1, result, scale)); 3744 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 3745 } else { 3746 pmovzxbw(vec1, Address(str1, result, scale1)); 3747 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 3748 } 3749 // After pcmpestri cnt1(rcx) contains mismatched element index 3750 3751 jccb(Assembler::below, VECTOR_NOT_EQUAL); // CF==1 3752 addptr(result, stride); 3753 subptr(cnt2, stride); 3754 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 3755 3756 // compare wide vectors tail 3757 testptr(result, result); 3758 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 3759 3760 movl(cnt2, stride); 3761 movl(result, stride); 3762 negptr(result); 3763 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3764 movdqu(vec1, Address(str1, result, scale)); 3765 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 3766 } else { 3767 pmovzxbw(vec1, Address(str1, result, scale1)); 3768 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 3769 } 3770 jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL); 3771 3772 // Mismatched characters in the vectors 3773 bind(VECTOR_NOT_EQUAL); 3774 addptr(cnt1, result); 3775 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 3776 subl(result, cnt2); 3777 jmpb(POP_LABEL); 3778 3779 bind(COMPARE_TAIL); // limit is zero 3780 movl(cnt2, result); 3781 // Fallthru to tail compare 3782 } 3783 // Shift str2 and str1 to the end of the arrays, negate min 3784 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3785 lea(str1, Address(str1, cnt2, scale)); 3786 lea(str2, Address(str2, cnt2, scale)); 3787 } else { 3788 lea(str1, Address(str1, cnt2, scale1)); 3789 lea(str2, Address(str2, cnt2, scale2)); 3790 } 3791 decrementl(cnt2); // first character was compared already 3792 negptr(cnt2); 3793 3794 // Compare the rest of the elements 3795 bind(WHILE_HEAD_LABEL); 3796 load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae); 3797 subl(result, cnt1); 3798 jccb(Assembler::notZero, POP_LABEL); 3799 increment(cnt2); 3800 jccb(Assembler::notZero, WHILE_HEAD_LABEL); 3801 3802 // Strings are equal up to min length. Return the length difference. 
3803 bind(LENGTH_DIFF_LABEL); 3804 pop(result); 3805 if (ae == StrIntrinsicNode::UU) { 3806 // Divide diff by 2 to get number of chars 3807 sarl(result, 1); 3808 } 3809 jmpb(DONE_LABEL); 3810 3811 #ifdef _LP64 3812 if (VM_Version::supports_avx512vlbw()) { 3813 3814 bind(COMPARE_WIDE_VECTORS_LOOP_FAILED); 3815 3816 kmovql(cnt1, mask); 3817 notq(cnt1); 3818 bsfq(cnt2, cnt1); 3819 if (ae != StrIntrinsicNode::LL) { 3820 // Divide diff by 2 to get number of chars 3821 sarl(cnt2, 1); 3822 } 3823 addq(result, cnt2); 3824 if (ae == StrIntrinsicNode::LL) { 3825 load_unsigned_byte(cnt1, Address(str2, result)); 3826 load_unsigned_byte(result, Address(str1, result)); 3827 } else if (ae == StrIntrinsicNode::UU) { 3828 load_unsigned_short(cnt1, Address(str2, result, scale)); 3829 load_unsigned_short(result, Address(str1, result, scale)); 3830 } else { 3831 load_unsigned_short(cnt1, Address(str2, result, scale2)); 3832 load_unsigned_byte(result, Address(str1, result, scale1)); 3833 } 3834 subl(result, cnt1); 3835 jmpb(POP_LABEL); 3836 }//if (VM_Version::supports_avx512vlbw()) 3837 #endif // _LP64 3838 3839 // Discard the stored length difference 3840 bind(POP_LABEL); 3841 pop(cnt1); 3842 3843 // That's it 3844 bind(DONE_LABEL); 3845 if(ae == StrIntrinsicNode::UL) { 3846 negl(result); 3847 } 3848 3849 } 3850 3851 // Search for Non-ASCII character (Negative byte value) in a byte array, 3852 // return the index of the first such character, otherwise the length 3853 // of the array segment searched. 3854 // ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java 3855 // @IntrinsicCandidate 3856 // public static int countPositives(byte[] ba, int off, int len) { 3857 // for (int i = off; i < off + len; i++) { 3858 // if (ba[i] < 0) { 3859 // return i - off; 3860 // } 3861 // } 3862 // return len; 3863 // } 3864 void C2_MacroAssembler::count_positives(Register ary1, Register len, 3865 Register result, Register tmp1, 3866 XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) { 3867 // rsi: byte array 3868 // rcx: len 3869 // rax: result 3870 ShortBranchVerifier sbv(this); 3871 assert_different_registers(ary1, len, result, tmp1); 3872 assert_different_registers(vec1, vec2); 3873 Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE; 3874 3875 movl(result, len); // copy 3876 // len == 0 3877 testl(len, len); 3878 jcc(Assembler::zero, DONE); 3879 3880 if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512 3881 VM_Version::supports_avx512vlbw() && 3882 VM_Version::supports_bmi2()) { 3883 3884 Label test_64_loop, test_tail, BREAK_LOOP; 3885 movl(tmp1, len); 3886 vpxor(vec2, vec2, vec2, Assembler::AVX_512bit); 3887 3888 andl(tmp1, 0x0000003f); // tail count (in chars) 0x3F 3889 andl(len, 0xffffffc0); // vector count (in chars) 3890 jccb(Assembler::zero, test_tail); 3891 3892 lea(ary1, Address(ary1, len, Address::times_1)); 3893 negptr(len); 3894 3895 bind(test_64_loop); 3896 // Check whether our 64 elements of size byte contain negatives 3897 evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit); 3898 kortestql(mask1, mask1); 3899 jcc(Assembler::notZero, BREAK_LOOP); 3900 3901 addptr(len, 64); 3902 jccb(Assembler::notZero, test_64_loop); 3903 3904 bind(test_tail); 3905 // bail out when there is nothing to be done 3906 testl(tmp1, -1); 3907 jcc(Assembler::zero, DONE); 3908 3909 3910 // check the tail for absense of negatives 3911 // ~(~0 << len) applied up to two times (for 32-bit scenario) 3912 #ifdef _LP64 3913 { 3914 
Register tmp3_aliased = len; 3915 mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF); 3916 shlxq(tmp3_aliased, tmp3_aliased, tmp1); 3917 notq(tmp3_aliased); 3918 kmovql(mask2, tmp3_aliased); 3919 } 3920 #else 3921 Label k_init; 3922 jmp(k_init); 3923 3924 // We could not read 64-bits from a general purpose register thus we move 3925 // data required to compose 64 1's to the instruction stream 3926 // We emit 64 byte wide series of elements from 0..63 which later on would 3927 // be used as a compare targets with tail count contained in tmp1 register. 3928 // Result would be a k register having tmp1 consecutive number or 1 3929 // counting from least significant bit. 3930 address tmp = pc(); 3931 emit_int64(0x0706050403020100); 3932 emit_int64(0x0F0E0D0C0B0A0908); 3933 emit_int64(0x1716151413121110); 3934 emit_int64(0x1F1E1D1C1B1A1918); 3935 emit_int64(0x2726252423222120); 3936 emit_int64(0x2F2E2D2C2B2A2928); 3937 emit_int64(0x3736353433323130); 3938 emit_int64(0x3F3E3D3C3B3A3938); 3939 3940 bind(k_init); 3941 lea(len, InternalAddress(tmp)); 3942 // create mask to test for negative byte inside a vector 3943 evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit); 3944 evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit); 3945 3946 #endif 3947 evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit); 3948 ktestq(mask1, mask2); 3949 jcc(Assembler::zero, DONE); 3950 3951 // do a full check for negative registers in the tail 3952 movl(len, tmp1); // tmp1 holds low 6-bit from original len; 3953 // ary1 already pointing to the right place 3954 jmpb(TAIL_START); 3955 3956 bind(BREAK_LOOP); 3957 // At least one byte in the last 64 byte block was negative. 3958 // Set up to look at the last 64 bytes as if they were a tail 3959 lea(ary1, Address(ary1, len, Address::times_1)); 3960 addptr(result, len); 3961 // Ignore the very last byte: if all others are positive, 3962 // it must be negative, so we can skip right to the 2+1 byte 3963 // end comparison at this point 3964 orl(result, 63); 3965 movl(len, 63); 3966 // Fallthru to tail compare 3967 } else { 3968 3969 if (UseAVX >= 2 && UseSSE >= 2) { 3970 // With AVX2, use 32-byte vector compare 3971 Label COMPARE_WIDE_VECTORS, BREAK_LOOP; 3972 3973 // Compare 32-byte vectors 3974 testl(len, 0xffffffe0); // vector count (in bytes) 3975 jccb(Assembler::zero, TAIL_START); 3976 3977 andl(len, 0xffffffe0); 3978 lea(ary1, Address(ary1, len, Address::times_1)); 3979 negptr(len); 3980 3981 movl(tmp1, 0x80808080); // create mask to test for Unicode chars in vector 3982 movdl(vec2, tmp1); 3983 vpbroadcastd(vec2, vec2, Assembler::AVX_256bit); 3984 3985 bind(COMPARE_WIDE_VECTORS); 3986 vmovdqu(vec1, Address(ary1, len, Address::times_1)); 3987 vptest(vec1, vec2); 3988 jccb(Assembler::notZero, BREAK_LOOP); 3989 addptr(len, 32); 3990 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 3991 3992 testl(result, 0x0000001f); // any bytes remaining? 3993 jcc(Assembler::zero, DONE); 3994 3995 // Quick test using the already prepared vector mask 3996 movl(len, result); 3997 andl(len, 0x0000001f); 3998 vmovdqu(vec1, Address(ary1, len, Address::times_1, -32)); 3999 vptest(vec1, vec2); 4000 jcc(Assembler::zero, DONE); 4001 // There are zeros, jump to the tail to determine exactly where 4002 jmpb(TAIL_START); 4003 4004 bind(BREAK_LOOP); 4005 // At least one byte in the last 32-byte vector is negative. 
4006 // Set up to look at the last 32 bytes as if they were a tail 4007 lea(ary1, Address(ary1, len, Address::times_1)); 4008 addptr(result, len); 4009 // Ignore the very last byte: if all others are positive, 4010 // it must be negative, so we can skip right to the 2+1 byte 4011 // end comparison at this point 4012 orl(result, 31); 4013 movl(len, 31); 4014 // Fallthru to tail compare 4015 } else if (UseSSE42Intrinsics) { 4016 // With SSE4.2, use double quad vector compare 4017 Label COMPARE_WIDE_VECTORS, BREAK_LOOP; 4018 4019 // Compare 16-byte vectors 4020 testl(len, 0xfffffff0); // vector count (in bytes) 4021 jcc(Assembler::zero, TAIL_START); 4022 4023 andl(len, 0xfffffff0); 4024 lea(ary1, Address(ary1, len, Address::times_1)); 4025 negptr(len); 4026 4027 movl(tmp1, 0x80808080); 4028 movdl(vec2, tmp1); 4029 pshufd(vec2, vec2, 0); 4030 4031 bind(COMPARE_WIDE_VECTORS); 4032 movdqu(vec1, Address(ary1, len, Address::times_1)); 4033 ptest(vec1, vec2); 4034 jccb(Assembler::notZero, BREAK_LOOP); 4035 addptr(len, 16); 4036 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 4037 4038 testl(result, 0x0000000f); // len is zero, any bytes remaining? 4039 jcc(Assembler::zero, DONE); 4040 4041 // Quick test using the already prepared vector mask 4042 movl(len, result); 4043 andl(len, 0x0000000f); // tail count (in bytes) 4044 movdqu(vec1, Address(ary1, len, Address::times_1, -16)); 4045 ptest(vec1, vec2); 4046 jcc(Assembler::zero, DONE); 4047 jmpb(TAIL_START); 4048 4049 bind(BREAK_LOOP); 4050 // At least one byte in the last 16-byte vector is negative. 4051 // Set up and look at the last 16 bytes as if they were a tail 4052 lea(ary1, Address(ary1, len, Address::times_1)); 4053 addptr(result, len); 4054 // Ignore the very last byte: if all others are positive, 4055 // it must be negative, so we can skip right to the 2+1 byte 4056 // end comparison at this point 4057 orl(result, 15); 4058 movl(len, 15); 4059 // Fallthru to tail compare 4060 } 4061 } 4062 4063 bind(TAIL_START); 4064 // Compare 4-byte vectors 4065 andl(len, 0xfffffffc); // vector count (in bytes) 4066 jccb(Assembler::zero, COMPARE_CHAR); 4067 4068 lea(ary1, Address(ary1, len, Address::times_1)); 4069 negptr(len); 4070 4071 bind(COMPARE_VECTORS); 4072 movl(tmp1, Address(ary1, len, Address::times_1)); 4073 andl(tmp1, 0x80808080); 4074 jccb(Assembler::notZero, TAIL_ADJUST); 4075 addptr(len, 4); 4076 jccb(Assembler::notZero, COMPARE_VECTORS); 4077 4078 // Compare trailing char (final 2-3 bytes), if any 4079 bind(COMPARE_CHAR); 4080 4081 testl(result, 0x2); // tail char 4082 jccb(Assembler::zero, COMPARE_BYTE); 4083 load_unsigned_short(tmp1, Address(ary1, 0)); 4084 andl(tmp1, 0x00008080); 4085 jccb(Assembler::notZero, CHAR_ADJUST); 4086 lea(ary1, Address(ary1, 2)); 4087 4088 bind(COMPARE_BYTE); 4089 testl(result, 0x1); // tail byte 4090 jccb(Assembler::zero, DONE); 4091 load_unsigned_byte(tmp1, Address(ary1, 0)); 4092 testl(tmp1, 0x00000080); 4093 jccb(Assembler::zero, DONE); 4094 subptr(result, 1); 4095 jmpb(DONE); 4096 4097 bind(TAIL_ADJUST); 4098 // there are negative bits in the last 4 byte block. 4099 // Adjust result and check the next three bytes 4100 addptr(result, len); 4101 orl(result, 3); 4102 lea(ary1, Address(ary1, len, Address::times_1)); 4103 jmpb(COMPARE_CHAR); 4104 4105 bind(CHAR_ADJUST); 4106 // We are looking at a char + optional byte tail, and found that one 4107 // of the bytes in the char is negative. Adjust the result, check the 4108 // first byte and readjust if needed. 
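// The andl below rounds result down to the count of bytes preceding this
// 2-3 byte tail. If the first (lowest-addressed; the load is little-endian)
// byte of the char is the negative one, that count is already the answer;
// otherwise the negative byte is the second one and the count is bumped by 1.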
4109 andl(result, 0xfffffffc); 4110 testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first 4111 jccb(Assembler::notZero, DONE); 4112 addptr(result, 1); 4113 4114 // That's it 4115 bind(DONE); 4116 if (UseAVX >= 2 && UseSSE >= 2) { 4117 // clean upper bits of YMM registers 4118 vpxor(vec1, vec1); 4119 vpxor(vec2, vec2); 4120 } 4121 } 4122 4123 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings. 4124 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2, 4125 Register limit, Register result, Register chr, 4126 XMMRegister vec1, XMMRegister vec2, bool is_char, KRegister mask) { 4127 ShortBranchVerifier sbv(this); 4128 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE; 4129 4130 int length_offset = arrayOopDesc::length_offset_in_bytes(); 4131 int base_offset = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE); 4132 4133 if (is_array_equ) { 4134 // Check the input args 4135 cmpoop(ary1, ary2); 4136 jcc(Assembler::equal, TRUE_LABEL); 4137 4138 // Need additional checks for arrays_equals. 4139 testptr(ary1, ary1); 4140 jcc(Assembler::zero, FALSE_LABEL); 4141 testptr(ary2, ary2); 4142 jcc(Assembler::zero, FALSE_LABEL); 4143 4144 // Check the lengths 4145 movl(limit, Address(ary1, length_offset)); 4146 cmpl(limit, Address(ary2, length_offset)); 4147 jcc(Assembler::notEqual, FALSE_LABEL); 4148 } 4149 4150 // count == 0 4151 testl(limit, limit); 4152 jcc(Assembler::zero, TRUE_LABEL); 4153 4154 if (is_array_equ) { 4155 // Load array address 4156 lea(ary1, Address(ary1, base_offset)); 4157 lea(ary2, Address(ary2, base_offset)); 4158 } 4159 4160 if (is_array_equ && is_char) { 4161 // arrays_equals when used for char[]. 4162 shll(limit, 1); // byte count != 0 4163 } 4164 movl(result, limit); // copy 4165 4166 if (UseAVX >= 2) { 4167 // With AVX2, use 32-byte vector compare 4168 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 4169 4170 // Compare 32-byte vectors 4171 andl(result, 0x0000001f); // tail count (in bytes) 4172 andl(limit, 0xffffffe0); // vector count (in bytes) 4173 jcc(Assembler::zero, COMPARE_TAIL); 4174 4175 lea(ary1, Address(ary1, limit, Address::times_1)); 4176 lea(ary2, Address(ary2, limit, Address::times_1)); 4177 negptr(limit); 4178 4179 #ifdef _LP64 4180 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 4181 Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3; 4182 4183 cmpl(limit, -64); 4184 jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2); 4185 4186 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 4187 4188 evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit); 4189 evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit); 4190 kortestql(mask, mask); 4191 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4192 addptr(limit, 64); // update since we already compared at this addr 4193 cmpl(limit, -64); 4194 jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3); 4195 4196 // At this point we may still need to compare -limit+result bytes. 4197 // We could execute the next two instruction and just continue via non-wide path: 4198 // cmpl(limit, 0); 4199 // jcc(Assembler::equal, COMPARE_TAIL); // true 4200 // But since we stopped at the points ary{1,2}+limit which are 4201 // not farther than 64 bytes from the ends of arrays ary{1,2}+result 4202 // (|limit| <= 32 and result < 32), 4203 // we may just compare the last 64 bytes. 
4204 // 4205 addptr(result, -64); // it is safe, bc we just came from this area 4206 evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit); 4207 evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit); 4208 kortestql(mask, mask); 4209 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 4210 4211 jmp(TRUE_LABEL); 4212 4213 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 4214 4215 }//if (VM_Version::supports_avx512vlbw()) 4216 #endif //_LP64 4217 bind(COMPARE_WIDE_VECTORS); 4218 vmovdqu(vec1, Address(ary1, limit, Address::times_1)); 4219 vmovdqu(vec2, Address(ary2, limit, Address::times_1)); 4220 vpxor(vec1, vec2); 4221 4222 vptest(vec1, vec1); 4223 jcc(Assembler::notZero, FALSE_LABEL); 4224 addptr(limit, 32); 4225 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4226 4227 testl(result, result); 4228 jcc(Assembler::zero, TRUE_LABEL); 4229 4230 vmovdqu(vec1, Address(ary1, result, Address::times_1, -32)); 4231 vmovdqu(vec2, Address(ary2, result, Address::times_1, -32)); 4232 vpxor(vec1, vec2); 4233 4234 vptest(vec1, vec1); 4235 jccb(Assembler::notZero, FALSE_LABEL); 4236 jmpb(TRUE_LABEL); 4237 4238 bind(COMPARE_TAIL); // limit is zero 4239 movl(limit, result); 4240 // Fallthru to tail compare 4241 } else if (UseSSE42Intrinsics) { 4242 // With SSE4.2, use double quad vector compare 4243 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 4244 4245 // Compare 16-byte vectors 4246 andl(result, 0x0000000f); // tail count (in bytes) 4247 andl(limit, 0xfffffff0); // vector count (in bytes) 4248 jcc(Assembler::zero, COMPARE_TAIL); 4249 4250 lea(ary1, Address(ary1, limit, Address::times_1)); 4251 lea(ary2, Address(ary2, limit, Address::times_1)); 4252 negptr(limit); 4253 4254 bind(COMPARE_WIDE_VECTORS); 4255 movdqu(vec1, Address(ary1, limit, Address::times_1)); 4256 movdqu(vec2, Address(ary2, limit, Address::times_1)); 4257 pxor(vec1, vec2); 4258 4259 ptest(vec1, vec1); 4260 jcc(Assembler::notZero, FALSE_LABEL); 4261 addptr(limit, 16); 4262 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 4263 4264 testl(result, result); 4265 jcc(Assembler::zero, TRUE_LABEL); 4266 4267 movdqu(vec1, Address(ary1, result, Address::times_1, -16)); 4268 movdqu(vec2, Address(ary2, result, Address::times_1, -16)); 4269 pxor(vec1, vec2); 4270 4271 ptest(vec1, vec1); 4272 jccb(Assembler::notZero, FALSE_LABEL); 4273 jmpb(TRUE_LABEL); 4274 4275 bind(COMPARE_TAIL); // limit is zero 4276 movl(limit, result); 4277 // Fallthru to tail compare 4278 } 4279 4280 // Compare 4-byte vectors 4281 andl(limit, 0xfffffffc); // vector count (in bytes) 4282 jccb(Assembler::zero, COMPARE_CHAR); 4283 4284 lea(ary1, Address(ary1, limit, Address::times_1)); 4285 lea(ary2, Address(ary2, limit, Address::times_1)); 4286 negptr(limit); 4287 4288 bind(COMPARE_VECTORS); 4289 movl(chr, Address(ary1, limit, Address::times_1)); 4290 cmpl(chr, Address(ary2, limit, Address::times_1)); 4291 jccb(Assembler::notEqual, FALSE_LABEL); 4292 addptr(limit, 4); 4293 jcc(Assembler::notZero, COMPARE_VECTORS); 4294 4295 // Compare trailing char (final 2 bytes), if any 4296 bind(COMPARE_CHAR); 4297 testl(result, 0x2); // tail char 4298 jccb(Assembler::zero, COMPARE_BYTE); 4299 load_unsigned_short(chr, Address(ary1, 0)); 4300 load_unsigned_short(limit, Address(ary2, 0)); 4301 cmpl(chr, limit); 4302 jccb(Assembler::notEqual, FALSE_LABEL); 4303 4304 if (is_array_equ && is_char) { 4305 bind(COMPARE_BYTE); 4306 } else { 4307 lea(ary1, Address(ary1, 2)); 4308 lea(ary2, Address(ary2, 2)); 4309 4310 bind(COMPARE_BYTE); 4311 testl(result, 0x1); 
// tail byte 4312 jccb(Assembler::zero, TRUE_LABEL); 4313 load_unsigned_byte(chr, Address(ary1, 0)); 4314 load_unsigned_byte(limit, Address(ary2, 0)); 4315 cmpl(chr, limit); 4316 jccb(Assembler::notEqual, FALSE_LABEL); 4317 } 4318 bind(TRUE_LABEL); 4319 movl(result, 1); // return true 4320 jmpb(DONE); 4321 4322 bind(FALSE_LABEL); 4323 xorl(result, result); // return false 4324 4325 // That's it 4326 bind(DONE); 4327 if (UseAVX >= 2) { 4328 // clean upper bits of YMM registers 4329 vpxor(vec1, vec1); 4330 vpxor(vec2, vec2); 4331 } 4332 } 4333 4334 #ifdef _LP64 4335 4336 static void convertF2I_slowpath(C2_MacroAssembler& masm, C2GeneralStub<Register, XMMRegister, address>& stub) { 4337 #define __ masm. 4338 Register dst = stub.data<0>(); 4339 XMMRegister src = stub.data<1>(); 4340 address target = stub.data<2>(); 4341 __ bind(stub.entry()); 4342 __ subptr(rsp, 8); 4343 __ movdbl(Address(rsp), src); 4344 __ call(RuntimeAddress(target)); 4345 __ pop(dst); 4346 __ jmp(stub.continuation()); 4347 #undef __ 4348 } 4349 4350 void C2_MacroAssembler::convertF2I(BasicType dst_bt, BasicType src_bt, Register dst, XMMRegister src) { 4351 assert(dst_bt == T_INT || dst_bt == T_LONG, ""); 4352 assert(src_bt == T_FLOAT || src_bt == T_DOUBLE, ""); 4353 4354 address slowpath_target; 4355 if (dst_bt == T_INT) { 4356 if (src_bt == T_FLOAT) { 4357 cvttss2sil(dst, src); 4358 cmpl(dst, 0x80000000); 4359 slowpath_target = StubRoutines::x86::f2i_fixup(); 4360 } else { 4361 cvttsd2sil(dst, src); 4362 cmpl(dst, 0x80000000); 4363 slowpath_target = StubRoutines::x86::d2i_fixup(); 4364 } 4365 } else { 4366 if (src_bt == T_FLOAT) { 4367 cvttss2siq(dst, src); 4368 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip())); 4369 slowpath_target = StubRoutines::x86::f2l_fixup(); 4370 } else { 4371 cvttsd2siq(dst, src); 4372 cmp64(dst, ExternalAddress(StubRoutines::x86::double_sign_flip())); 4373 slowpath_target = StubRoutines::x86::d2l_fixup(); 4374 } 4375 } 4376 4377 auto stub = C2CodeStub::make<Register, XMMRegister, address>(dst, src, slowpath_target, 23, convertF2I_slowpath); 4378 jcc(Assembler::equal, stub->entry()); 4379 bind(stub->continuation()); 4380 } 4381 4382 #endif // _LP64 4383 4384 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4385 XMMRegister src1, int imm8, bool merge, int vlen_enc) { 4386 switch(ideal_opc) { 4387 case Op_LShiftVS: 4388 Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); break; 4389 case Op_LShiftVI: 4390 Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); break; 4391 case Op_LShiftVL: 4392 Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); break; 4393 case Op_RShiftVS: 4394 Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); break; 4395 case Op_RShiftVI: 4396 Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); break; 4397 case Op_RShiftVL: 4398 Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); break; 4399 case Op_URShiftVS: 4400 Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); break; 4401 case Op_URShiftVI: 4402 Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); break; 4403 case Op_URShiftVL: 4404 Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); break; 4405 case Op_RotateRightV: 4406 evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4407 case Op_RotateLeftV: 4408 evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); break; 4409 default: 4410 fatal("Unsupported masked operation"); break; 4411 } 4412 } 4413 4414 void 
C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4415 XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc, 4416 bool is_varshift) { 4417 switch (ideal_opc) { 4418 case Op_AddVB: 4419 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4420 case Op_AddVS: 4421 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4422 case Op_AddVI: 4423 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4424 case Op_AddVL: 4425 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4426 case Op_AddVF: 4427 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4428 case Op_AddVD: 4429 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4430 case Op_SubVB: 4431 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4432 case Op_SubVS: 4433 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4434 case Op_SubVI: 4435 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4436 case Op_SubVL: 4437 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4438 case Op_SubVF: 4439 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4440 case Op_SubVD: 4441 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4442 case Op_MulVS: 4443 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4444 case Op_MulVI: 4445 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4446 case Op_MulVL: 4447 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4448 case Op_MulVF: 4449 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4450 case Op_MulVD: 4451 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4452 case Op_DivVF: 4453 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4454 case Op_DivVD: 4455 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4456 case Op_SqrtVF: 4457 evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break; 4458 case Op_SqrtVD: 4459 evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break; 4460 case Op_AbsVB: 4461 evpabsb(dst, mask, src2, merge, vlen_enc); break; 4462 case Op_AbsVS: 4463 evpabsw(dst, mask, src2, merge, vlen_enc); break; 4464 case Op_AbsVI: 4465 evpabsd(dst, mask, src2, merge, vlen_enc); break; 4466 case Op_AbsVL: 4467 evpabsq(dst, mask, src2, merge, vlen_enc); break; 4468 case Op_FmaVF: 4469 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4470 case Op_FmaVD: 4471 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4472 case Op_VectorRearrange: 4473 evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break; 4474 case Op_LShiftVS: 4475 evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4476 case Op_LShiftVI: 4477 evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4478 case Op_LShiftVL: 4479 evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4480 case Op_RShiftVS: 4481 evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4482 case Op_RShiftVI: 4483 evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4484 case Op_RShiftVL: 4485 evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4486 case Op_URShiftVS: 4487 evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4488 case Op_URShiftVI: 4489 evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4490 case Op_URShiftVL: 4491 evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; 4492 case Op_RotateLeftV: 4493 evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4494 case Op_RotateRightV: 4495 evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break; 
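// The remaining cases pass the element type down to the helper, so a single
// case label covers every supported element width.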
4496 case Op_MaxV: 4497 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4498 case Op_MinV: 4499 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4500 case Op_XorV: 4501 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4502 case Op_OrV: 4503 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4504 case Op_AndV: 4505 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4506 default: 4507 fatal("Unsupported masked operation"); break; 4508 } 4509 } 4510 4511 void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst, 4512 XMMRegister src1, Address src2, bool merge, int vlen_enc) { 4513 switch (ideal_opc) { 4514 case Op_AddVB: 4515 evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; 4516 case Op_AddVS: 4517 evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; 4518 case Op_AddVI: 4519 evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; 4520 case Op_AddVL: 4521 evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; 4522 case Op_AddVF: 4523 evaddps(dst, mask, src1, src2, merge, vlen_enc); break; 4524 case Op_AddVD: 4525 evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; 4526 case Op_SubVB: 4527 evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; 4528 case Op_SubVS: 4529 evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; 4530 case Op_SubVI: 4531 evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; 4532 case Op_SubVL: 4533 evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; 4534 case Op_SubVF: 4535 evsubps(dst, mask, src1, src2, merge, vlen_enc); break; 4536 case Op_SubVD: 4537 evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; 4538 case Op_MulVS: 4539 evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; 4540 case Op_MulVI: 4541 evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; 4542 case Op_MulVL: 4543 evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; 4544 case Op_MulVF: 4545 evmulps(dst, mask, src1, src2, merge, vlen_enc); break; 4546 case Op_MulVD: 4547 evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; 4548 case Op_DivVF: 4549 evdivps(dst, mask, src1, src2, merge, vlen_enc); break; 4550 case Op_DivVD: 4551 evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; 4552 case Op_FmaVF: 4553 evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; 4554 case Op_FmaVD: 4555 evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; 4556 case Op_MaxV: 4557 evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4558 case Op_MinV: 4559 evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4560 case Op_XorV: 4561 evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4562 case Op_OrV: 4563 evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4564 case Op_AndV: 4565 evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; 4566 default: 4567 fatal("Unsupported masked operation"); break; 4568 } 4569 } 4570 4571 void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst, 4572 KRegister src1, KRegister src2) { 4573 BasicType etype = T_ILLEGAL; 4574 switch(mask_len) { 4575 case 2: 4576 case 4: 4577 case 8: etype = T_BYTE; break; 4578 case 16: etype = T_SHORT; break; 4579 case 32: etype = T_INT; break; 4580 case 64: etype = T_LONG; break; 4581 default: fatal("Unsupported type"); break; 4582 } 4583 assert(etype != T_ILLEGAL, ""); 4584 switch(ideal_opc) { 4585 case Op_AndVMask: 4586 kand(etype, dst, src1, src2); break; 4587 case Op_OrVMask: 4588 kor(etype, dst, src1, src2); break; 4589 case Op_XorVMask: 
4590 kxor(etype, dst, src1, src2); break; 4591 default: 4592 fatal("Unsupported masked operation"); break; 4593 } 4594 } 4595 4596 /* 4597 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 4598 * If src is NaN, the result is 0. 4599 * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE, 4600 * the result is equal to the value of Integer.MIN_VALUE. 4601 * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE, 4602 * the result is equal to the value of Integer.MAX_VALUE. 4603 */ 4604 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4605 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 4606 Register rscratch, AddressLiteral float_sign_flip, 4607 int vec_enc) { 4608 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4609 Label done; 4610 vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch); 4611 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 4612 vptest(xtmp2, xtmp2, vec_enc); 4613 jccb(Assembler::equal, done); 4614 4615 vpcmpeqd(xtmp4, xtmp4, xtmp4, vec_enc); 4616 vpxor(xtmp1, xtmp1, xtmp4, vec_enc); 4617 4618 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 4619 vcmpps(xtmp3, src, src, Assembler::UNORD_Q, vec_enc); 4620 vblendvps(dst, dst, xtmp4, xtmp3, vec_enc); 4621 4622 // Recompute the mask for remaining special value. 4623 vpxor(xtmp2, xtmp2, xtmp3, vec_enc); 4624 // Extract SRC values corresponding to TRUE mask lanes. 4625 vpand(xtmp4, xtmp2, src, vec_enc); 4626 // Flip mask bits so that MSB bit of MASK lanes corresponding to +ve special 4627 // values are set. 4628 vpxor(xtmp3, xtmp2, xtmp4, vec_enc); 4629 4630 vblendvps(dst, dst, xtmp1, xtmp3, vec_enc); 4631 bind(done); 4632 } 4633 4634 void C2_MacroAssembler::vector_cast_float_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4635 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4636 Register rscratch, AddressLiteral float_sign_flip, 4637 int vec_enc) { 4638 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4639 Label done; 4640 evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 4641 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 4642 kortestwl(ktmp1, ktmp1); 4643 jccb(Assembler::equal, done); 4644 4645 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4646 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4647 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 4648 4649 kxorwl(ktmp1, ktmp1, ktmp2); 4650 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4651 vpternlogd(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4652 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 4653 bind(done); 4654 } 4655 4656 void C2_MacroAssembler::vector_cast_float_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4657 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4658 Register rscratch, AddressLiteral double_sign_flip, 4659 int vec_enc) { 4660 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing"); 4661 4662 Label done; 4663 evmovdquq(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch); 4664 Assembler::evpcmpeqq(ktmp1, k0, xtmp1, dst, vec_enc); 4665 kortestwl(ktmp1, ktmp1); 4666 jccb(Assembler::equal, done); 4667 4668 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4669 evcmpps(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4670 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc); 4671 4672 kxorwl(ktmp1, ktmp1, ktmp2); 
4673 evcmpps(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4674 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4675 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc); 4676 bind(done); 4677 } 4678 4679 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4680 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4681 Register rscratch, AddressLiteral float_sign_flip, 4682 int vec_enc) { 4683 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4684 Label done; 4685 evmovdquq(xtmp1, k0, float_sign_flip, false, vec_enc, rscratch); 4686 Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc); 4687 kortestwl(ktmp1, ktmp1); 4688 jccb(Assembler::equal, done); 4689 4690 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4691 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4692 evmovdqul(dst, ktmp2, xtmp2, true, vec_enc); 4693 4694 kxorwl(ktmp1, ktmp1, ktmp2); 4695 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4696 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4697 evmovdqul(dst, ktmp1, xtmp2, true, vec_enc); 4698 bind(done); 4699 } 4700 4701 /* 4702 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation. 4703 * If src is NaN, the result is 0. 4704 * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE, 4705 * the result is equal to the value of Long.MIN_VALUE. 4706 * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE, 4707 * the result is equal to the value of Long.MAX_VALUE. 4708 */ 4709 void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4710 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, 4711 Register rscratch, AddressLiteral double_sign_flip, 4712 int vec_enc) { 4713 assert(rscratch != noreg || always_reachable(double_sign_flip), "missing"); 4714 4715 Label done; 4716 evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, rscratch); 4717 evpcmpeqq(ktmp1, xtmp1, dst, vec_enc); 4718 kortestwl(ktmp1, ktmp1); 4719 jccb(Assembler::equal, done); 4720 4721 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 4722 evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc); 4723 evmovdquq(dst, ktmp2, xtmp2, true, vec_enc); 4724 4725 kxorwl(ktmp1, ktmp1, ktmp2); 4726 evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc); 4727 vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc); 4728 evmovdquq(dst, ktmp1, xtmp2, true, vec_enc); 4729 bind(done); 4730 } 4731 4732 void C2_MacroAssembler::vector_crosslane_doubleword_pack_avx(XMMRegister dst, XMMRegister src, XMMRegister zero, 4733 XMMRegister xtmp, int index, int vec_enc) { 4734 assert(vec_enc < Assembler::AVX_512bit, ""); 4735 if (vec_enc == Assembler::AVX_256bit) { 4736 vextractf128_high(xtmp, src); 4737 vshufps(dst, src, xtmp, index, vec_enc); 4738 } else { 4739 vshufps(dst, src, zero, index, vec_enc); 4740 } 4741 } 4742 4743 void C2_MacroAssembler::vector_cast_double_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 4744 XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, Register rscratch, 4745 AddressLiteral float_sign_flip, int src_vec_enc) { 4746 assert(rscratch != noreg || always_reachable(float_sign_flip), "missing"); 4747 4748 Label done; 4749 // Compare the destination lanes with float_sign_flip 4750 // value to get mask for all special values. 
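  // At this point dst holds the raw vcvttpd2dq result; that instruction writes
  // 0x80000000 (the value float_sign_flip points at) into every lane whose
  // source was NaN, an infinity or out of int range. A scalar sketch of the
  // fix-up performed below (illustrative only, the helper is not part of this
  // file):
  //
  //   int32_t fixup(double src, int32_t cvt) {   // cvt = raw conversion result
  //     if (cvt != INT_MIN) return cvt;          // fast path, no special value
  //     if (src != src)     return 0;            // NaN          -> 0
  //     if (src >= 0.0)     return INT_MAX;      // +Inf / large -> MAX_VALUE
  //     return INT_MIN;                          // -Inf / small -> MIN_VALUE
  //   }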
4751 movdqu(xtmp1, float_sign_flip, rscratch);
4752 vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
4753 ptest(xtmp2, xtmp2);
4754 jccb(Assembler::equal, done);
4755
4756 // Flip float_sign_flip to get max integer value.
4757 vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
4758 pxor(xtmp1, xtmp4);
4759
4760 // Set destination lanes corresponding to unordered source lanes as zero.
4761 vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
4762 vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);
4763
4764 // Shuffle mask vector and pack lower doubleword from each quadword lane.
4765 vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
4766 vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);
4767
4768 // Recompute the mask for remaining special value.
4769 pxor(xtmp2, xtmp3);
4770 // Extract mask corresponding to non-negative source lanes.
4771 vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);
4772
4773 // Shuffle mask vector and pack lower doubleword from each quadword lane.
4774 vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
4775 pand(xtmp3, xtmp2);
4776
4777 // Replace destination lanes holding special value (0x80000000) with max int
4778 // if corresponding source lane holds a +ve value.
4779 vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
4780 bind(done);
4781 }
4782
4783
4784 void C2_MacroAssembler::vector_cast_int_to_subword(BasicType to_elem_bt, XMMRegister dst, XMMRegister zero,
4785 XMMRegister xtmp, Register rscratch, int vec_enc) {
4786 switch(to_elem_bt) {
4787 case T_SHORT:
4788 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_short_mask())), "missing");
4789 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_short_mask()), vec_enc, rscratch);
4790 vpackusdw(dst, dst, zero, vec_enc);
4791 if (vec_enc == Assembler::AVX_256bit) {
4792 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
4793 }
4794 break;
4795 case T_BYTE:
4796 assert(rscratch != noreg || always_reachable(ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask())), "missing");
4797 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), vec_enc, rscratch);
4798 vpackusdw(dst, dst, zero, vec_enc);
4799 if (vec_enc == Assembler::AVX_256bit) {
4800 vector_crosslane_doubleword_pack_avx(dst, dst, zero, xtmp, 0x44, vec_enc);
4801 }
4802 vpackuswb(dst, dst, zero, vec_enc);
4803 break;
4804 default: assert(false, "%s", type2name(to_elem_bt));
4805 }
4806 }
4807
4808 /*
4809 * Algorithm for vector D2L and F2I conversions:-
4810 * a) Perform vector D2L/F2I cast.
4811 * b) Choose fast path if none of the result vector lanes contains the 0x80000000 value.
4812 * It signifies that the source value could be any of the special floating point
4813 * values (NaN, -Inf, Inf, Max, -Min).
4814 * c) Set destination to zero if the source is a NaN value.
4815 * d) Replace 0x80000000 with MaxInt if source lane contains a +ve value.
4816 */ 4817 4818 void C2_MacroAssembler::vector_castF2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4819 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, 4820 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 4821 int to_elem_sz = type2aelembytes(to_elem_bt); 4822 assert(to_elem_sz <= 4, ""); 4823 vcvttps2dq(dst, src, vec_enc); 4824 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, rscratch, float_sign_flip, vec_enc); 4825 if (to_elem_sz < 4) { 4826 vpxor(xtmp4, xtmp4, xtmp4, vec_enc); 4827 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp3, rscratch, vec_enc); 4828 } 4829 } 4830 4831 void C2_MacroAssembler::vector_castF2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4832 XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip, 4833 Register rscratch, int vec_enc) { 4834 int to_elem_sz = type2aelembytes(to_elem_bt); 4835 assert(to_elem_sz <= 4, ""); 4836 vcvttps2dq(dst, src, vec_enc); 4837 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, float_sign_flip, vec_enc); 4838 switch(to_elem_bt) { 4839 case T_INT: 4840 break; 4841 case T_SHORT: 4842 evpmovdw(dst, dst, vec_enc); 4843 break; 4844 case T_BYTE: 4845 evpmovdb(dst, dst, vec_enc); 4846 break; 4847 default: assert(false, "%s", type2name(to_elem_bt)); 4848 } 4849 } 4850 4851 void C2_MacroAssembler::vector_castF2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2, 4852 KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip, 4853 Register rscratch, int vec_enc) { 4854 evcvttps2qq(dst, src, vec_enc); 4855 vector_cast_float_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, double_sign_flip, vec_enc); 4856 } 4857 4858 // Handling for downcasting from double to integer or sub-word types on AVX2. 4859 void C2_MacroAssembler::vector_castD2X_avx(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 4860 XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, 4861 AddressLiteral float_sign_flip, Register rscratch, int vec_enc) { 4862 int to_elem_sz = type2aelembytes(to_elem_bt); 4863 assert(to_elem_sz < 8, ""); 4864 vcvttpd2dq(dst, src, vec_enc); 4865 vector_cast_double_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, rscratch, 4866 float_sign_flip, vec_enc); 4867 if (to_elem_sz < 4) { 4868 // xtmp4 holds all zero lanes. 
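  // vector_cast_int_to_subword narrows each int lane with Java's truncating
  // cast semantics; per lane it is equivalent to this sketch (illustrative
  // only):
  //
  //   int16_t to_short(int32_t v) { return (int16_t)(v & 0xFFFF); }
  //   int8_t  to_byte (int32_t v) { return (int8_t )(v & 0xFF);   }
  //
  // The vpand with vector_int_to_short_mask / vector_int_to_byte_mask inside
  // that helper does the masking so that the unsigned saturating
  // vpackusdw/vpackuswb cannot clamp the value.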
4869 vector_cast_int_to_subword(to_elem_bt, dst, xtmp4, xtmp5, rscratch, Assembler::AVX_128bit); 4870 } 4871 } 4872 4873 void C2_MacroAssembler::vector_castD2X_evex(BasicType to_elem_bt, XMMRegister dst, XMMRegister src, 4874 XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, 4875 KRegister ktmp2, AddressLiteral sign_flip, 4876 Register rscratch, int vec_enc) { 4877 if (VM_Version::supports_avx512dq()) { 4878 evcvttpd2qq(dst, src, vec_enc); 4879 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 4880 switch(to_elem_bt) { 4881 case T_LONG: 4882 break; 4883 case T_INT: 4884 evpmovsqd(dst, dst, vec_enc); 4885 break; 4886 case T_SHORT: 4887 evpmovsqd(dst, dst, vec_enc); 4888 evpmovdw(dst, dst, vec_enc); 4889 break; 4890 case T_BYTE: 4891 evpmovsqd(dst, dst, vec_enc); 4892 evpmovdb(dst, dst, vec_enc); 4893 break; 4894 default: assert(false, "%s", type2name(to_elem_bt)); 4895 } 4896 } else { 4897 assert(type2aelembytes(to_elem_bt) <= 4, ""); 4898 vcvttpd2dq(dst, src, vec_enc); 4899 vector_cast_double_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, rscratch, sign_flip, vec_enc); 4900 switch(to_elem_bt) { 4901 case T_INT: 4902 break; 4903 case T_SHORT: 4904 evpmovdw(dst, dst, vec_enc); 4905 break; 4906 case T_BYTE: 4907 evpmovdb(dst, dst, vec_enc); 4908 break; 4909 default: assert(false, "%s", type2name(to_elem_bt)); 4910 } 4911 } 4912 } 4913 4914 #ifdef _LP64 4915 void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src, 4916 AddressLiteral double_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 4917 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 4918 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 4919 // and re-instantiate original MXCSR.RC mode after that. 4920 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 4921 4922 mov64(tmp, julong_cast(0.5L)); 4923 evpbroadcastq(xtmp1, tmp, vec_enc); 4924 vaddpd(xtmp1, src , xtmp1, vec_enc); 4925 evcvtpd2qq(dst, xtmp1, vec_enc); 4926 vector_cast_double_to_long_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 4927 double_sign_flip, vec_enc);; 4928 4929 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 4930 } 4931 4932 void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src, 4933 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 4934 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2) { 4935 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 4936 // and re-instantiate original MXCSR.RC mode after that. 
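  // Rounding the sum val + 0.5 towards negative infinity is what gives Java's
  // Math.round semantics; a scalar sketch of the in-range case (illustrative
  // only, special values are patched afterwards):
  //
  //   jint round(float f) { return (jint)floorf(f + 0.5f); }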
4937 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 4938 4939 movl(tmp, jint_cast(0.5)); 4940 movq(xtmp1, tmp); 4941 vbroadcastss(xtmp1, xtmp1, vec_enc); 4942 vaddps(xtmp1, src , xtmp1, vec_enc); 4943 vcvtps2dq(dst, xtmp1, vec_enc); 4944 vector_cast_float_to_int_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, tmp /*rscratch*/, 4945 float_sign_flip, vec_enc); 4946 4947 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 4948 } 4949 4950 void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src, 4951 AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc, 4952 Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4) { 4953 // Perform floor(val+0.5) operation under the influence of MXCSR.RC mode roundTowards -inf. 4954 // and re-instantiate original MXCSR.RC mode after that. 4955 ldmxcsr(new_mxcsr, tmp /*rscratch*/); 4956 4957 movl(tmp, jint_cast(0.5)); 4958 movq(xtmp1, tmp); 4959 vbroadcastss(xtmp1, xtmp1, vec_enc); 4960 vaddps(xtmp1, src , xtmp1, vec_enc); 4961 vcvtps2dq(dst, xtmp1, vec_enc); 4962 vector_cast_float_to_int_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, tmp /*rscratch*/, float_sign_flip, vec_enc); 4963 4964 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), tmp /*rscratch*/); 4965 } 4966 #endif // _LP64 4967 4968 void C2_MacroAssembler::vector_unsigned_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 4969 BasicType from_elem_bt, BasicType to_elem_bt) { 4970 switch (from_elem_bt) { 4971 case T_BYTE: 4972 switch (to_elem_bt) { 4973 case T_SHORT: vpmovzxbw(dst, src, vlen_enc); break; 4974 case T_INT: vpmovzxbd(dst, src, vlen_enc); break; 4975 case T_LONG: vpmovzxbq(dst, src, vlen_enc); break; 4976 default: ShouldNotReachHere(); 4977 } 4978 break; 4979 case T_SHORT: 4980 switch (to_elem_bt) { 4981 case T_INT: vpmovzxwd(dst, src, vlen_enc); break; 4982 case T_LONG: vpmovzxwq(dst, src, vlen_enc); break; 4983 default: ShouldNotReachHere(); 4984 } 4985 break; 4986 case T_INT: 4987 assert(to_elem_bt == T_LONG, ""); 4988 vpmovzxdq(dst, src, vlen_enc); 4989 break; 4990 default: 4991 ShouldNotReachHere(); 4992 } 4993 } 4994 4995 void C2_MacroAssembler::vector_signed_cast(XMMRegister dst, XMMRegister src, int vlen_enc, 4996 BasicType from_elem_bt, BasicType to_elem_bt) { 4997 switch (from_elem_bt) { 4998 case T_BYTE: 4999 switch (to_elem_bt) { 5000 case T_SHORT: vpmovsxbw(dst, src, vlen_enc); break; 5001 case T_INT: vpmovsxbd(dst, src, vlen_enc); break; 5002 case T_LONG: vpmovsxbq(dst, src, vlen_enc); break; 5003 default: ShouldNotReachHere(); 5004 } 5005 break; 5006 case T_SHORT: 5007 switch (to_elem_bt) { 5008 case T_INT: vpmovsxwd(dst, src, vlen_enc); break; 5009 case T_LONG: vpmovsxwq(dst, src, vlen_enc); break; 5010 default: ShouldNotReachHere(); 5011 } 5012 break; 5013 case T_INT: 5014 assert(to_elem_bt == T_LONG, ""); 5015 vpmovsxdq(dst, src, vlen_enc); 5016 break; 5017 default: 5018 ShouldNotReachHere(); 5019 } 5020 } 5021 5022 void C2_MacroAssembler::vector_mask_cast(XMMRegister dst, XMMRegister src, 5023 BasicType dst_bt, BasicType src_bt, int vlen) { 5024 int vlen_enc = vector_length_encoding(MAX2(type2aelembytes(src_bt), type2aelembytes(dst_bt)) * vlen); 5025 assert(vlen_enc != AVX_512bit, ""); 5026 5027 int dst_bt_size = type2aelembytes(dst_bt); 5028 int src_bt_size = type2aelembytes(src_bt); 5029 if (dst_bt_size > src_bt_size) { 5030 switch (dst_bt_size / src_bt_size) { 5031 case 2: vpmovsxbw(dst, src, vlen_enc); break; 5032 case 4: vpmovsxbd(dst, src, 
vlen_enc); break; 5033 case 8: vpmovsxbq(dst, src, vlen_enc); break; 5034 default: ShouldNotReachHere(); 5035 } 5036 } else { 5037 assert(dst_bt_size < src_bt_size, ""); 5038 switch (src_bt_size / dst_bt_size) { 5039 case 2: { 5040 if (vlen_enc == AVX_128bit) { 5041 vpacksswb(dst, src, src, vlen_enc); 5042 } else { 5043 vpacksswb(dst, src, src, vlen_enc); 5044 vpermq(dst, dst, 0x08, vlen_enc); 5045 } 5046 break; 5047 } 5048 case 4: { 5049 if (vlen_enc == AVX_128bit) { 5050 vpackssdw(dst, src, src, vlen_enc); 5051 vpacksswb(dst, dst, dst, vlen_enc); 5052 } else { 5053 vpackssdw(dst, src, src, vlen_enc); 5054 vpermq(dst, dst, 0x08, vlen_enc); 5055 vpacksswb(dst, dst, dst, AVX_128bit); 5056 } 5057 break; 5058 } 5059 case 8: { 5060 if (vlen_enc == AVX_128bit) { 5061 vpshufd(dst, src, 0x08, vlen_enc); 5062 vpackssdw(dst, dst, dst, vlen_enc); 5063 vpacksswb(dst, dst, dst, vlen_enc); 5064 } else { 5065 vpshufd(dst, src, 0x08, vlen_enc); 5066 vpermq(dst, dst, 0x08, vlen_enc); 5067 vpackssdw(dst, dst, dst, AVX_128bit); 5068 vpacksswb(dst, dst, dst, AVX_128bit); 5069 } 5070 break; 5071 } 5072 default: ShouldNotReachHere(); 5073 } 5074 } 5075 } 5076 5077 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3, 5078 bool merge, BasicType bt, int vlen_enc) { 5079 if (bt == T_INT) { 5080 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 5081 } else { 5082 assert(bt == T_LONG, ""); 5083 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5084 } 5085 } 5086 5087 void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3, 5088 bool merge, BasicType bt, int vlen_enc) { 5089 if (bt == T_INT) { 5090 evpternlogd(dst, func, mask, src2, src3, merge, vlen_enc); 5091 } else { 5092 assert(bt == T_LONG, ""); 5093 evpternlogq(dst, func, mask, src2, src3, merge, vlen_enc); 5094 } 5095 } 5096 5097 #ifdef _LP64 5098 void C2_MacroAssembler::vector_long_to_maskvec(XMMRegister dst, Register src, Register rtmp1, 5099 Register rtmp2, XMMRegister xtmp, int mask_len, 5100 int vec_enc) { 5101 int index = 0; 5102 int vindex = 0; 5103 mov64(rtmp1, 0x0101010101010101L); 5104 pdepq(rtmp1, src, rtmp1); 5105 if (mask_len > 8) { 5106 movq(rtmp2, src); 5107 vpxor(xtmp, xtmp, xtmp, vec_enc); 5108 movq(xtmp, rtmp1); 5109 } 5110 movq(dst, rtmp1); 5111 5112 mask_len -= 8; 5113 while (mask_len > 0) { 5114 assert ((mask_len & 0x7) == 0, "mask must be multiple of 8"); 5115 index++; 5116 if ((index % 2) == 0) { 5117 pxor(xtmp, xtmp); 5118 } 5119 mov64(rtmp1, 0x0101010101010101L); 5120 shrq(rtmp2, 8); 5121 pdepq(rtmp1, rtmp2, rtmp1); 5122 pinsrq(xtmp, rtmp1, index % 2); 5123 vindex = index / 2; 5124 if (vindex) { 5125 // Write entire 16 byte vector when both 64 bit 5126 // lanes are update to save redundant instructions. 
5127 if (index % 2) { 5128 vinsertf128(dst, dst, xtmp, vindex); 5129 } 5130 } else { 5131 vmovdqu(dst, xtmp); 5132 } 5133 mask_len -= 8; 5134 } 5135 } 5136 5137 void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) { 5138 switch(opc) { 5139 case Op_VectorMaskTrueCount: 5140 popcntq(dst, tmp); 5141 break; 5142 case Op_VectorMaskLastTrue: 5143 if (VM_Version::supports_lzcnt()) { 5144 lzcntq(tmp, tmp); 5145 movl(dst, 63); 5146 subl(dst, tmp); 5147 } else { 5148 movl(dst, -1); 5149 bsrq(tmp, tmp); 5150 cmov32(Assembler::notZero, dst, tmp); 5151 } 5152 break; 5153 case Op_VectorMaskFirstTrue: 5154 if (VM_Version::supports_bmi1()) { 5155 if (masklen < 32) { 5156 orl(tmp, 1 << masklen); 5157 tzcntl(dst, tmp); 5158 } else if (masklen == 32) { 5159 tzcntl(dst, tmp); 5160 } else { 5161 assert(masklen == 64, ""); 5162 tzcntq(dst, tmp); 5163 } 5164 } else { 5165 if (masklen < 32) { 5166 orl(tmp, 1 << masklen); 5167 bsfl(dst, tmp); 5168 } else { 5169 assert(masklen == 32 || masklen == 64, ""); 5170 movl(dst, masklen); 5171 if (masklen == 32) { 5172 bsfl(tmp, tmp); 5173 } else { 5174 bsfq(tmp, tmp); 5175 } 5176 cmov32(Assembler::notZero, dst, tmp); 5177 } 5178 } 5179 break; 5180 case Op_VectorMaskToLong: 5181 assert(dst == tmp, "Dst and tmp should be the same for toLong operations"); 5182 break; 5183 default: assert(false, "Unhandled mask operation"); 5184 } 5185 } 5186 5187 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp, 5188 int masklen, int masksize, int vec_enc) { 5189 assert(VM_Version::supports_popcnt(), ""); 5190 5191 if(VM_Version::supports_avx512bw()) { 5192 kmovql(tmp, mask); 5193 } else { 5194 assert(masklen <= 16, ""); 5195 kmovwl(tmp, mask); 5196 } 5197 5198 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5199 // operations needs to be clipped. 5200 if (masksize < 16 && opc != Op_VectorMaskFirstTrue) { 5201 andq(tmp, (1 << masklen) - 1); 5202 } 5203 5204 vector_mask_operation_helper(opc, dst, tmp, masklen); 5205 } 5206 5207 void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp, 5208 Register tmp, int masklen, BasicType bt, int vec_enc) { 5209 assert(vec_enc == AVX_128bit && VM_Version::supports_avx() || 5210 vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4), ""); 5211 assert(VM_Version::supports_popcnt(), ""); 5212 5213 bool need_clip = false; 5214 switch(bt) { 5215 case T_BOOLEAN: 5216 // While masks of other types contain 0, -1; boolean masks contain lane values of 0, 1 5217 vpxor(xtmp, xtmp, xtmp, vec_enc); 5218 vpsubb(xtmp, xtmp, mask, vec_enc); 5219 vpmovmskb(tmp, xtmp, vec_enc); 5220 need_clip = masklen < 16; 5221 break; 5222 case T_BYTE: 5223 vpmovmskb(tmp, mask, vec_enc); 5224 need_clip = masklen < 16; 5225 break; 5226 case T_SHORT: 5227 vpacksswb(xtmp, mask, mask, vec_enc); 5228 if (masklen >= 16) { 5229 vpermpd(xtmp, xtmp, 8, vec_enc); 5230 } 5231 vpmovmskb(tmp, xtmp, Assembler::AVX_128bit); 5232 need_clip = masklen < 16; 5233 break; 5234 case T_INT: 5235 case T_FLOAT: 5236 vmovmskps(tmp, mask, vec_enc); 5237 need_clip = masklen < 4; 5238 break; 5239 case T_LONG: 5240 case T_DOUBLE: 5241 vmovmskpd(tmp, mask, vec_enc); 5242 need_clip = masklen < 2; 5243 break; 5244 default: assert(false, "Unhandled type, %s", type2name(bt)); 5245 } 5246 5247 // Mask generated out of partial vector comparisons/replicate/mask manipulation 5248 // operations needs to be clipped. 
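  // For example an 8-lane byte mask still produces 16 bits from vpmovmskb
  // above; the upper bits come from lanes that are not part of the vector and
  // must not be counted, hence:
  //
  //   tmp &= (1 << masklen) - 1;   // masklen == 8  ->  keep only bits 0..7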
5249 if (need_clip && opc != Op_VectorMaskFirstTrue) { 5250 // need_clip implies masklen < 32 5251 andq(tmp, (1 << masklen) - 1); 5252 } 5253 5254 vector_mask_operation_helper(opc, dst, tmp, masklen); 5255 } 5256 5257 void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Register rtmp1, 5258 Register rtmp2, int mask_len) { 5259 kmov(rtmp1, src); 5260 andq(rtmp1, (0xFFFFFFFFFFFFFFFFUL >> (64 - mask_len))); 5261 mov64(rtmp2, -1L); 5262 pextq(rtmp2, rtmp2, rtmp1); 5263 kmov(dst, rtmp2); 5264 } 5265 5266 void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask, 5267 bool merge, BasicType bt, int vec_enc) { 5268 if (opcode == Op_CompressV) { 5269 switch(bt) { 5270 case T_BYTE: 5271 evpcompressb(dst, mask, src, merge, vec_enc); 5272 break; 5273 case T_CHAR: 5274 case T_SHORT: 5275 evpcompressw(dst, mask, src, merge, vec_enc); 5276 break; 5277 case T_INT: 5278 evpcompressd(dst, mask, src, merge, vec_enc); 5279 break; 5280 case T_FLOAT: 5281 evcompressps(dst, mask, src, merge, vec_enc); 5282 break; 5283 case T_LONG: 5284 evpcompressq(dst, mask, src, merge, vec_enc); 5285 break; 5286 case T_DOUBLE: 5287 evcompresspd(dst, mask, src, merge, vec_enc); 5288 break; 5289 default: 5290 fatal("Unsupported type %s", type2name(bt)); 5291 break; 5292 } 5293 } else { 5294 assert(opcode == Op_ExpandV, ""); 5295 switch(bt) { 5296 case T_BYTE: 5297 evpexpandb(dst, mask, src, merge, vec_enc); 5298 break; 5299 case T_CHAR: 5300 case T_SHORT: 5301 evpexpandw(dst, mask, src, merge, vec_enc); 5302 break; 5303 case T_INT: 5304 evpexpandd(dst, mask, src, merge, vec_enc); 5305 break; 5306 case T_FLOAT: 5307 evexpandps(dst, mask, src, merge, vec_enc); 5308 break; 5309 case T_LONG: 5310 evpexpandq(dst, mask, src, merge, vec_enc); 5311 break; 5312 case T_DOUBLE: 5313 evexpandpd(dst, mask, src, merge, vec_enc); 5314 break; 5315 default: 5316 fatal("Unsupported type %s", type2name(bt)); 5317 break; 5318 } 5319 } 5320 } 5321 #endif 5322 5323 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5324 KRegister ktmp1, int vec_enc) { 5325 if (opcode == Op_SignumVD) { 5326 vsubpd(dst, zero, one, vec_enc); 5327 // if src < 0 ? -1 : 1 5328 evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5329 evblendmpd(dst, ktmp1, one, dst, true, vec_enc); 5330 // if src == NaN, -0.0 or 0.0 return src. 5331 evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5332 evblendmpd(dst, ktmp1, dst, src, true, vec_enc); 5333 } else { 5334 assert(opcode == Op_SignumVF, ""); 5335 vsubps(dst, zero, one, vec_enc); 5336 // if src < 0 ? -1 : 1 5337 evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc); 5338 evblendmps(dst, ktmp1, one, dst, true, vec_enc); 5339 // if src == NaN, -0.0 or 0.0 return src. 5340 evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc); 5341 evblendmps(dst, ktmp1, dst, src, true, vec_enc); 5342 } 5343 } 5344 5345 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one, 5346 XMMRegister xtmp1, int vec_enc) { 5347 if (opcode == Op_SignumVD) { 5348 vsubpd(dst, zero, one, vec_enc); 5349 // if src < 0 ? -1 : 1 5350 vblendvpd(dst, one, dst, src, vec_enc); 5351 // if src == NaN, -0.0 or 0.0 return src. 5352 vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc); 5353 vblendvpd(dst, dst, src, xtmp1, vec_enc); 5354 } else { 5355 assert(opcode == Op_SignumVF, ""); 5356 vsubps(dst, zero, one, vec_enc); 5357 // if src < 0 ? 
-1 : 1
5358 vblendvps(dst, one, dst, src, vec_enc);
5359 // if src == NaN, -0.0 or 0.0 return src.
5360 vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
5361 vblendvps(dst, dst, src, xtmp1, vec_enc);
5362 }
5363 }
5364
5365 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
5366 if (VM_Version::supports_avx512bw()) {
5367 if (mask_len > 32) {
5368 kmovql(dst, src);
5369 } else {
5370 kmovdl(dst, src);
5371 if (mask_len != 32) {
5372 kshiftrdl(dst, dst, 32 - mask_len);
5373 }
5374 }
5375 } else {
5376 assert(mask_len <= 16, "");
5377 kmovwl(dst, src);
5378 if (mask_len != 16) {
5379 kshiftrwl(dst, dst, 16 - mask_len);
5380 }
5381 }
5382 }
5383
5384 void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
5385 int lane_size = type2aelembytes(bt);
5386 bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
5387 if ((is_LP64 || lane_size < 8) &&
5388 ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
5389 (is_subword_type(bt) && VM_Version::supports_avx512vlbw()))) {
5390 movptr(rtmp, imm32);
5391 switch(lane_size) {
5392 case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
5393 case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
5394 case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
5395 case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
5396 default: fatal("Unsupported lane size %d", lane_size);
5397 break;
5398 }
5399 } else {
5400 movptr(rtmp, imm32);
5401 LP64_ONLY(movq(dst, rtmp)) NOT_LP64(movdl(dst, rtmp));
5402 switch(lane_size) {
5403 case 1 : vpbroadcastb(dst, dst, vec_enc); break;
5404 case 2 : vpbroadcastw(dst, dst, vec_enc); break;
5405 case 4 : vpbroadcastd(dst, dst, vec_enc); break;
5406 case 8 : vpbroadcastq(dst, dst, vec_enc); break;
5407 default: fatal("Unsupported lane size %d", lane_size);
5408 break;
5409 }
5410 }
5411 }
5412
5413 //
5414 // Following is a lookup table based popcount computation algorithm:-
5415 // Index Bit set count
5416 // [ 0000 -> 0,
5417 // 0001 -> 1,
5418 // 0010 -> 1,
5419 // 0011 -> 2,
5420 // 0100 -> 1,
5421 // 0101 -> 2,
5422 // 0110 -> 2,
5423 // 0111 -> 3,
5424 // 1000 -> 1,
5425 // 1001 -> 2,
5426 // 1010 -> 2,
5427 // 1011 -> 3,
5428 // 1100 -> 2,
5429 // 1101 -> 3,
5430 // 1110 -> 3, 1111 -> 4 ]
5431 // a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as
5432 // shuffle indices for lookup table access.
5433 // b. Right shift each byte of vector lane by 4 positions.
5434 // c. Count the number of 1s in 4 MSB bits of each byte. These bits are used as
5435 // shuffle indices for lookup table access.
5436 // d. Add the bitset count of upper and lower 4 bits of each byte.
5437 // e. Unpack double words to quad words and compute sum of absolute difference of bitset
5438 // count of all the bytes of a quadword.
5439 // f. Perform step e. for upper 128bit vector lane.
5440 // g. Pack the bitset count of quadwords back to double word.
5441 // h. Unpacking and packing operations are not needed for 64bit vector lane.
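//
// A scalar sketch of steps a. to d. for a single byte (illustrative only; the
// 16 entry table is the bit set count of each 4 bit index, which is what
// StubRoutines::x86::vector_popcount_lut() is expected to hold per lane):
//
//   uint8_t popcount_byte(uint8_t b) {
//     static const uint8_t lut[16] = { 0,1,1,2, 1,2,2,3, 1,2,2,3, 2,3,3,4 };
//     return lut[b & 0x0F] + lut[b >> 4];
//   }
//
// The vector code below performs both table lookups with vpshufb.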
5442 5443 void C2_MacroAssembler::vector_popcount_byte(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5444 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5445 assert((vec_enc == Assembler::AVX_512bit && VM_Version::supports_avx512bw()) || VM_Version::supports_avx2(), ""); 5446 vbroadcast(T_INT, xtmp1, 0x0F0F0F0F, rtmp, vec_enc); 5447 vpsrlw(dst, src, 4, vec_enc); 5448 vpand(dst, dst, xtmp1, vec_enc); 5449 vpand(xtmp1, src, xtmp1, vec_enc); 5450 vmovdqu(xtmp2, ExternalAddress(StubRoutines::x86::vector_popcount_lut()), vec_enc, noreg); 5451 vpshufb(xtmp1, xtmp2, xtmp1, vec_enc); 5452 vpshufb(dst, xtmp2, dst, vec_enc); 5453 vpaddb(dst, dst, xtmp1, vec_enc); 5454 } 5455 5456 void C2_MacroAssembler::vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5457 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5458 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5459 // Following code is as per steps e,f,g and h of above algorithm. 5460 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5461 vpunpckhdq(dst, xtmp1, xtmp2, vec_enc); 5462 vpsadbw(dst, dst, xtmp2, vec_enc); 5463 vpunpckldq(xtmp1, xtmp1, xtmp2, vec_enc); 5464 vpsadbw(xtmp1, xtmp1, xtmp2, vec_enc); 5465 vpackuswb(dst, xtmp1, dst, vec_enc); 5466 } 5467 5468 void C2_MacroAssembler::vector_popcount_short(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5469 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5470 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5471 // Add the popcount of upper and lower bytes of word. 5472 vbroadcast(T_INT, xtmp2, 0x00FF00FF, rtmp, vec_enc); 5473 vpsrlw(dst, xtmp1, 8, vec_enc); 5474 vpand(xtmp1, xtmp1, xtmp2, vec_enc); 5475 vpaddw(dst, dst, xtmp1, vec_enc); 5476 } 5477 5478 void C2_MacroAssembler::vector_popcount_long(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5479 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5480 vector_popcount_byte(xtmp1, src, dst, xtmp2, rtmp, vec_enc); 5481 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5482 vpsadbw(dst, xtmp1, xtmp2, vec_enc); 5483 } 5484 5485 void C2_MacroAssembler::vector_popcount_integral(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5486 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5487 switch(bt) { 5488 case T_LONG: 5489 vector_popcount_long(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5490 break; 5491 case T_INT: 5492 vector_popcount_int(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5493 break; 5494 case T_CHAR: 5495 case T_SHORT: 5496 vector_popcount_short(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5497 break; 5498 case T_BYTE: 5499 case T_BOOLEAN: 5500 vector_popcount_byte(dst, src, xtmp1, xtmp2, rtmp, vec_enc); 5501 break; 5502 default: 5503 fatal("Unsupported type %s", type2name(bt)); 5504 break; 5505 } 5506 } 5507 5508 void C2_MacroAssembler::vector_popcount_integral_evex(BasicType bt, XMMRegister dst, XMMRegister src, 5509 KRegister mask, bool merge, int vec_enc) { 5510 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 5511 switch(bt) { 5512 case T_LONG: 5513 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 5514 evpopcntq(dst, mask, src, merge, vec_enc); 5515 break; 5516 case T_INT: 5517 assert(VM_Version::supports_avx512_vpopcntdq(), ""); 5518 evpopcntd(dst, mask, src, merge, vec_enc); 5519 break; 5520 case T_CHAR: 5521 case T_SHORT: 5522 assert(VM_Version::supports_avx512_bitalg(), ""); 5523 evpopcntw(dst, mask, src, merge, vec_enc); 5524 break; 5525 case T_BYTE: 5526 case T_BOOLEAN: 5527 assert(VM_Version::supports_avx512_bitalg(), ""); 5528 evpopcntb(dst, mask, 
src, merge, vec_enc); 5529 break; 5530 default: 5531 fatal("Unsupported type %s", type2name(bt)); 5532 break; 5533 } 5534 } 5535 5536 #ifndef _LP64 5537 void C2_MacroAssembler::vector_maskall_operation32(KRegister dst, Register src, KRegister tmp, int mask_len) { 5538 assert(VM_Version::supports_avx512bw(), ""); 5539 kmovdl(tmp, src); 5540 kunpckdql(dst, tmp, tmp); 5541 } 5542 #endif 5543 5544 // Bit reversal algorithm first reverses the bits of each byte followed by 5545 // a byte level reversal for multi-byte primitive types (short/int/long). 5546 // Algorithm performs a lookup table access to get reverse bit sequence 5547 // corresponding to a 4 bit value. Thus a reverse bit sequence for a byte 5548 // is obtained by swapping the reverse bit sequences of upper and lower 5549 // nibble of a byte. 5550 void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5551 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5552 if (VM_Version::supports_avx512vlbw()) { 5553 5554 // Get the reverse bit sequence of lower nibble of each byte. 5555 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg); 5556 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 5557 evpandq(dst, xtmp2, src, vec_enc); 5558 vpshufb(dst, xtmp1, dst, vec_enc); 5559 vpsllq(dst, dst, 4, vec_enc); 5560 5561 // Get the reverse bit sequence of upper nibble of each byte. 5562 vpandn(xtmp2, xtmp2, src, vec_enc); 5563 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 5564 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 5565 5566 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 5567 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 5568 evporq(xtmp2, dst, xtmp2, vec_enc); 5569 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 5570 5571 } else if(vec_enc == Assembler::AVX_512bit) { 5572 // Shift based bit reversal. 5573 assert(bt == T_LONG || bt == T_INT, ""); 5574 5575 // Swap lower and upper nibble of each byte. 5576 vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc); 5577 5578 // Swap two least and most significant bits of each nibble. 5579 vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc); 5580 5581 // Swap adjacent pair of bits. 5582 evmovdqul(xtmp1, k0, dst, true, vec_enc); 5583 vector_swap_nbits(1, 0x55555555, dst, xtmp1, xtmp2, rtmp, vec_enc); 5584 5585 evmovdqul(xtmp1, k0, dst, true, vec_enc); 5586 vector_reverse_byte64(bt, dst, xtmp1, xtmp1, xtmp2, rtmp, vec_enc); 5587 } else { 5588 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, rtmp); 5589 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 5590 5591 // Get the reverse bit sequence of lower nibble of each byte. 5592 vpand(dst, xtmp2, src, vec_enc); 5593 vpshufb(dst, xtmp1, dst, vec_enc); 5594 vpsllq(dst, dst, 4, vec_enc); 5595 5596 // Get the reverse bit sequence of upper nibble of each byte. 5597 vpandn(xtmp2, xtmp2, src, vec_enc); 5598 vpsrlq(xtmp2, xtmp2, 4, vec_enc); 5599 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 5600 5601 // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and 5602 // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 
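    // Per byte the sequence is equivalent to this sketch (illustrative only;
    // rev4[] is the bit reversed value of each 4 bit index, which is what
    // vector_reverse_bit_lut() is expected to hold per lane):
    //
    //   uint8_t reverse_byte(uint8_t b) {
    //     static const uint8_t rev4[16] = { 0x0, 0x8, 0x4, 0xC, 0x2, 0xA, 0x6, 0xE,
    //                                       0x1, 0x9, 0x5, 0xD, 0x3, 0xB, 0x7, 0xF };
    //     return (uint8_t)((rev4[b & 0x0F] << 4) | rev4[b >> 4]);
    //   }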
5603 vpor(xtmp2, dst, xtmp2, vec_enc); 5604 vector_reverse_byte(bt, dst, xtmp2, vec_enc); 5605 } 5606 } 5607 5608 void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, XMMRegister src, AddressLiteral mask, int vec_enc, 5609 XMMRegister xtmp, Register rscratch) { 5610 assert(VM_Version::supports_gfni(), ""); 5611 assert(rscratch != noreg || always_reachable(mask), "missing"); 5612 5613 // Galois field instruction based bit reversal based on following algorithm. 5614 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 5615 vpbroadcastq(xtmp, mask, vec_enc, rscratch); 5616 vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc); 5617 vector_reverse_byte(bt, dst, xtmp, vec_enc); 5618 } 5619 5620 void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src, 5621 XMMRegister xtmp1, Register rtmp, int vec_enc) { 5622 vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc); 5623 evpandq(dst, xtmp1, src, vec_enc); 5624 vpsllq(dst, dst, nbits, vec_enc); 5625 vpandn(xtmp1, xtmp1, src, vec_enc); 5626 vpsrlq(xtmp1, xtmp1, nbits, vec_enc); 5627 evporq(dst, dst, xtmp1, vec_enc); 5628 } 5629 5630 void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5631 XMMRegister xtmp2, Register rtmp, int vec_enc) { 5632 // Shift based bit reversal. 5633 assert(VM_Version::supports_evex(), ""); 5634 switch(bt) { 5635 case T_LONG: 5636 // Swap upper and lower double word of each quad word. 5637 evprorq(xtmp1, k0, src, 32, true, vec_enc); 5638 evprord(xtmp1, k0, xtmp1, 16, true, vec_enc); 5639 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 5640 break; 5641 case T_INT: 5642 // Swap upper and lower word of each double word. 5643 evprord(xtmp1, k0, src, 16, true, vec_enc); 5644 vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc); 5645 break; 5646 case T_CHAR: 5647 case T_SHORT: 5648 // Swap upper and lower byte of each word. 5649 vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc); 5650 break; 5651 case T_BYTE: 5652 evmovdquq(dst, k0, src, true, vec_enc); 5653 break; 5654 default: 5655 fatal("Unsupported type %s", type2name(bt)); 5656 break; 5657 } 5658 } 5659 5660 void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) { 5661 if (bt == T_BYTE) { 5662 if (VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit) { 5663 evmovdquq(dst, k0, src, true, vec_enc); 5664 } else { 5665 vmovdqu(dst, src); 5666 } 5667 return; 5668 } 5669 // Perform byte reversal by shuffling the bytes of a multi-byte primitive type using 5670 // pre-computed shuffle indices. 
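  // For T_INT, for instance, each 128 bit lane of the permutation constant is
  // expected to list the bytes of every element in reverse order
  // (illustrative only):
  //
  //   { 3,2,1,0,  7,6,5,4,  11,10,9,8,  15,14,13,12 }
  //
  // so the single vpshufb at the end swaps the byte order of every element.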
5671 switch(bt) { 5672 case T_LONG: 5673 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg); 5674 break; 5675 case T_INT: 5676 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg); 5677 break; 5678 case T_CHAR: 5679 case T_SHORT: 5680 vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg); 5681 break; 5682 default: 5683 fatal("Unsupported type %s", type2name(bt)); 5684 break; 5685 } 5686 vpshufb(dst, src, dst, vec_enc); 5687 } 5688 5689 void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src, 5690 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 5691 KRegister ktmp, Register rtmp, bool merge, int vec_enc) { 5692 assert(is_integral_type(bt), ""); 5693 assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, ""); 5694 assert(VM_Version::supports_avx512cd(), ""); 5695 switch(bt) { 5696 case T_LONG: 5697 evplzcntq(dst, ktmp, src, merge, vec_enc); 5698 break; 5699 case T_INT: 5700 evplzcntd(dst, ktmp, src, merge, vec_enc); 5701 break; 5702 case T_SHORT: 5703 vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc); 5704 vpunpcklwd(xtmp2, xtmp1, src, vec_enc); 5705 evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc); 5706 vpunpckhwd(dst, xtmp1, src, vec_enc); 5707 evplzcntd(dst, ktmp, dst, merge, vec_enc); 5708 vpackusdw(dst, xtmp2, dst, vec_enc); 5709 break; 5710 case T_BYTE: 5711 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 5712 // accessing the lookup table. 5713 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 5714 // accessing the lookup table. 5715 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 5716 assert(VM_Version::supports_avx512bw(), ""); 5717 evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp); 5718 vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc); 5719 vpand(xtmp2, dst, src, vec_enc); 5720 vpshufb(xtmp2, xtmp1, xtmp2, vec_enc); 5721 vpsrlw(xtmp3, src, 4, vec_enc); 5722 vpand(xtmp3, dst, xtmp3, vec_enc); 5723 vpshufb(dst, xtmp1, xtmp3, vec_enc); 5724 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 5725 evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc); 5726 evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc); 5727 break; 5728 default: 5729 fatal("Unsupported type %s", type2name(bt)); 5730 break; 5731 } 5732 } 5733 5734 void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5735 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 5736 vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp); 5737 vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); 5738 // T1 = Compute leading zero counts of 4 LSB bits of each byte by 5739 // accessing the lookup table. 5740 vpand(dst, xtmp2, src, vec_enc); 5741 vpshufb(dst, xtmp1, dst, vec_enc); 5742 // T2 = Compute leading zero counts of 4 MSB bits of each byte by 5743 // accessing the lookup table. 5744 vpsrlw(xtmp3, src, 4, vec_enc); 5745 vpand(xtmp3, xtmp2, xtmp3, vec_enc); 5746 vpshufb(xtmp2, xtmp1, xtmp3, vec_enc); 5747 // Add T1 to T2 if 4 MSB bits of byte are all zeros. 
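  // Per byte this computes the following scalar sketch (illustrative only;
  // lzcnt4[] holds the leading zero count of each 4 bit index, with
  // lzcnt4[0] == 4, as vector_count_leading_zeros_lut() is expected to hold):
  //
  //   uint8_t clz_byte(uint8_t b) {
  //     static const uint8_t lzcnt4[16] = { 4,3,2,2, 1,1,1,1, 0,0,0,0, 0,0,0,0 };
  //     uint8_t hi = b >> 4;
  //     return hi == 0 ? lzcnt4[hi] + lzcnt4[b & 0x0F]   // T2 + T1 == 4 + T1
  //                    : lzcnt4[hi];                     // T2 only
  //   }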
5748 vpxor(xtmp1, xtmp1, xtmp1, vec_enc); 5749 vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc); 5750 vpaddb(dst, dst, xtmp2, vec_enc); 5751 vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc); 5752 } 5753 5754 void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5755 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 5756 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 5757 // Add zero counts of lower byte and upper byte of a word if 5758 // upper byte holds a zero value. 5759 vpsrlw(xtmp3, src, 8, vec_enc); 5760 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx. 5761 vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc); 5762 vpsllw(xtmp2, dst, 8, vec_enc); 5763 vpaddw(xtmp2, xtmp2, dst, vec_enc); 5764 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 5765 vpsrlw(dst, dst, 8, vec_enc); 5766 } 5767 5768 void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5769 XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) { 5770 // Since IEEE 754 floating point format represents mantissa in 1.0 format 5771 // hence biased exponent can be used to compute leading zero count as per 5772 // following formula:- 5773 // LZCNT = 32 - (biased_exp - 127) 5774 // Special handling has been introduced for Zero, Max_Int and -ve source values. 5775 5776 // Broadcast 0xFF 5777 vpcmpeqd(xtmp1, xtmp1, xtmp1, vec_enc); 5778 vpsrld(xtmp1, xtmp1, 24, vec_enc); 5779 5780 // Extract biased exponent. 5781 vcvtdq2ps(dst, src, vec_enc); 5782 vpsrld(dst, dst, 23, vec_enc); 5783 vpand(dst, dst, xtmp1, vec_enc); 5784 5785 // Broadcast 127. 5786 vpsrld(xtmp1, xtmp1, 1, vec_enc); 5787 // Exponent = biased_exp - 127 5788 vpsubd(dst, dst, xtmp1, vec_enc); 5789 5790 // Exponent = Exponent + 1 5791 vpsrld(xtmp3, xtmp1, 6, vec_enc); 5792 vpaddd(dst, dst, xtmp3, vec_enc); 5793 5794 // Replace -ve exponent with zero, exponent is -ve when src 5795 // lane contains a zero value. 5796 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5797 vblendvps(dst, dst, xtmp2, dst, vec_enc); 5798 5799 // Rematerialize broadcast 32. 5800 vpslld(xtmp1, xtmp3, 5, vec_enc); 5801 // Exponent is 32 if corresponding source lane contains max_int value. 5802 vpcmpeqd(xtmp2, dst, xtmp1, vec_enc); 5803 // LZCNT = 32 - exponent 5804 vpsubd(dst, xtmp1, dst, vec_enc); 5805 5806 // Replace LZCNT with a value 1 if corresponding source lane 5807 // contains max_int value. 5808 vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc); 5809 5810 // Replace biased_exp with 0 if source lane value is less than zero. 5811 vpxor(xtmp2, xtmp2, xtmp2, vec_enc); 5812 vblendvps(dst, dst, xtmp2, src, vec_enc); 5813 } 5814 5815 void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5816 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 5817 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 5818 // Add zero counts of lower word and upper word of a double word if 5819 // upper word holds a zero value. 5820 vpsrld(xtmp3, src, 16, vec_enc); 5821 // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx. 5822 vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc); 5823 vpslld(xtmp2, dst, 16, vec_enc); 5824 vpaddd(xtmp2, xtmp2, dst, vec_enc); 5825 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 5826 vpsrld(dst, dst, 16, vec_enc); 5827 // Add zero counts of lower doubleword and upper doubleword of a 5828 // quadword if upper doubleword holds a zero value. 
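  // This is the usual widening recurrence, sketched for one quadword
  // (illustrative only, clz32 being the doubleword count computed above):
  //
  //   uint32_t clz64(uint64_t x) {
  //     uint32_t hi = (uint32_t)(x >> 32);
  //     return hi == 0 ? 32 + clz32((uint32_t)x) : clz32(hi);
  //   }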
5829 vpsrlq(xtmp3, src, 32, vec_enc); 5830 vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc); 5831 vpsllq(xtmp2, dst, 32, vec_enc); 5832 vpaddq(xtmp2, xtmp2, dst, vec_enc); 5833 vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc); 5834 vpsrlq(dst, dst, 32, vec_enc); 5835 } 5836 5837 void C2_MacroAssembler::vector_count_leading_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, 5838 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 5839 Register rtmp, int vec_enc) { 5840 assert(is_integral_type(bt), "unexpected type"); 5841 assert(vec_enc < Assembler::AVX_512bit, ""); 5842 switch(bt) { 5843 case T_LONG: 5844 vector_count_leading_zeros_long_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 5845 break; 5846 case T_INT: 5847 vector_count_leading_zeros_int_avx(dst, src, xtmp1, xtmp2, xtmp3, vec_enc); 5848 break; 5849 case T_SHORT: 5850 vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 5851 break; 5852 case T_BYTE: 5853 vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc); 5854 break; 5855 default: 5856 fatal("Unsupported type %s", type2name(bt)); 5857 break; 5858 } 5859 } 5860 5861 void C2_MacroAssembler::vpsub(BasicType bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vec_enc) { 5862 switch(bt) { 5863 case T_BYTE: 5864 vpsubb(dst, src1, src2, vec_enc); 5865 break; 5866 case T_SHORT: 5867 vpsubw(dst, src1, src2, vec_enc); 5868 break; 5869 case T_INT: 5870 vpsubd(dst, src1, src2, vec_enc); 5871 break; 5872 case T_LONG: 5873 vpsubq(dst, src1, src2, vec_enc); 5874 break; 5875 default: 5876 fatal("Unsupported type %s", type2name(bt)); 5877 break; 5878 } 5879 } 5880 5881 // Trailing zero count computation is based on leading zero count operation as per 5882 // following equation. All AVX3 targets support AVX512CD feature which offers 5883 // direct vector instruction to compute leading zero count. 
5884 // CTZ = PRIM_TYPE_WIDHT - CLZ((x - 1) & ~x) 5885 void C2_MacroAssembler::vector_count_trailing_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src, 5886 XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, 5887 XMMRegister xtmp4, KRegister ktmp, Register rtmp, int vec_enc) { 5888 assert(is_integral_type(bt), ""); 5889 // xtmp = -1 5890 vpternlogd(xtmp4, 0xff, xtmp4, xtmp4, vec_enc); 5891 // xtmp = xtmp + src 5892 vpadd(bt, xtmp4, xtmp4, src, vec_enc); 5893 // xtmp = xtmp & ~src 5894 vpternlogd(xtmp4, 0x40, xtmp4, src, vec_enc); 5895 vector_count_leading_zeros_evex(bt, dst, xtmp4, xtmp1, xtmp2, xtmp3, ktmp, rtmp, true, vec_enc); 5896 vbroadcast(bt, xtmp4, 8 * type2aelembytes(bt), rtmp, vec_enc); 5897 vpsub(bt, dst, xtmp4, dst, vec_enc); 5898 } 5899 5900 // Trailing zero count computation for AVX2 targets is based on popcount operation as per following equation 5901 // CTZ = PRIM_TYPE_WIDHT - POPC(x | -x) 5902 void C2_MacroAssembler::vector_count_trailing_zeros_avx(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, 5903 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) { 5904 assert(is_integral_type(bt), ""); 5905 // xtmp = 0 5906 vpxor(xtmp3 , xtmp3, xtmp3, vec_enc); 5907 // xtmp = 0 - src 5908 vpsub(bt, xtmp3, xtmp3, src, vec_enc); 5909 // xtmp = xtmp | src 5910 vpor(xtmp3, xtmp3, src, vec_enc); 5911 vector_popcount_integral(bt, dst, xtmp3, xtmp1, xtmp2, rtmp, vec_enc); 5912 vbroadcast(bt, xtmp1, 8 * type2aelembytes(bt), rtmp, vec_enc); 5913 vpsub(bt, dst, xtmp1, dst, vec_enc); 5914 } 5915 5916 void C2_MacroAssembler::udivI(Register rax, Register divisor, Register rdx) { 5917 Label done; 5918 Label neg_divisor_fastpath; 5919 cmpl(divisor, 0); 5920 jccb(Assembler::less, neg_divisor_fastpath); 5921 xorl(rdx, rdx); 5922 divl(divisor); 5923 jmpb(done); 5924 bind(neg_divisor_fastpath); 5925 // Fastpath for divisor < 0: 5926 // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1) 5927 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned() 5928 movl(rdx, rax); 5929 subl(rdx, divisor); 5930 if (VM_Version::supports_bmi1()) { 5931 andnl(rax, rdx, rax); 5932 } else { 5933 notl(rdx); 5934 andl(rax, rdx); 5935 } 5936 shrl(rax, 31); 5937 bind(done); 5938 } 5939 5940 void C2_MacroAssembler::umodI(Register rax, Register divisor, Register rdx) { 5941 Label done; 5942 Label neg_divisor_fastpath; 5943 cmpl(divisor, 0); 5944 jccb(Assembler::less, neg_divisor_fastpath); 5945 xorl(rdx, rdx); 5946 divl(divisor); 5947 jmpb(done); 5948 bind(neg_divisor_fastpath); 5949 // Fastpath when divisor < 0: 5950 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor) 5951 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned() 5952 movl(rdx, rax); 5953 subl(rax, divisor); 5954 if (VM_Version::supports_bmi1()) { 5955 andnl(rax, rax, rdx); 5956 } else { 5957 notl(rax); 5958 andl(rax, rdx); 5959 } 5960 sarl(rax, 31); 5961 andl(rax, divisor); 5962 subl(rdx, rax); 5963 bind(done); 5964 } 5965 5966 void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, Register tmp) { 5967 Label done; 5968 Label neg_divisor_fastpath; 5969 5970 cmpl(divisor, 0); 5971 jccb(Assembler::less, neg_divisor_fastpath); 5972 xorl(rdx, rdx); 5973 divl(divisor); 5974 jmpb(done); 5975 bind(neg_divisor_fastpath); 5976 // Fastpath for divisor < 0: 5977 // quotient = (dividend & ~(dividend - divisor)) >>> (Integer.SIZE - 1) 5978 // 
remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Integer.SIZE - 1)) & divisor) 5979 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in 5980 // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned() 5981 movl(rdx, rax); 5982 subl(rax, divisor); 5983 if (VM_Version::supports_bmi1()) { 5984 andnl(rax, rax, rdx); 5985 } else { 5986 notl(rax); 5987 andl(rax, rdx); 5988 } 5989 movl(tmp, rax); 5990 shrl(rax, 31); // quotient 5991 sarl(tmp, 31); 5992 andl(tmp, divisor); 5993 subl(rdx, tmp); // remainder 5994 bind(done); 5995 } 5996 5997 #ifdef _LP64 5998 void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1, 5999 XMMRegister xtmp2, Register rtmp) { 6000 if(VM_Version::supports_gfni()) { 6001 // Galois field instruction based bit reversal based on following algorithm. 6002 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 6003 mov64(rtmp, 0x8040201008040201L); 6004 movq(xtmp1, src); 6005 movq(xtmp2, rtmp); 6006 gf2p8affineqb(xtmp1, xtmp2, 0); 6007 movq(dst, xtmp1); 6008 } else { 6009 // Swap even and odd numbered bits. 6010 movl(rtmp, src); 6011 andl(rtmp, 0x55555555); 6012 shll(rtmp, 1); 6013 movl(dst, src); 6014 andl(dst, 0xAAAAAAAA); 6015 shrl(dst, 1); 6016 orl(dst, rtmp); 6017 6018 // Swap LSB and MSB 2 bits of each nibble. 6019 movl(rtmp, dst); 6020 andl(rtmp, 0x33333333); 6021 shll(rtmp, 2); 6022 andl(dst, 0xCCCCCCCC); 6023 shrl(dst, 2); 6024 orl(dst, rtmp); 6025 6026 // Swap LSB and MSB 4 bits of each byte. 6027 movl(rtmp, dst); 6028 andl(rtmp, 0x0F0F0F0F); 6029 shll(rtmp, 4); 6030 andl(dst, 0xF0F0F0F0); 6031 shrl(dst, 4); 6032 orl(dst, rtmp); 6033 } 6034 bswapl(dst); 6035 } 6036 6037 void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1, 6038 XMMRegister xtmp2, Register rtmp1, Register rtmp2) { 6039 if(VM_Version::supports_gfni()) { 6040 // Galois field instruction based bit reversal based on following algorithm. 6041 // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html 6042 mov64(rtmp1, 0x8040201008040201L); 6043 movq(xtmp1, src); 6044 movq(xtmp2, rtmp1); 6045 gf2p8affineqb(xtmp1, xtmp2, 0); 6046 movq(dst, xtmp1); 6047 } else { 6048 // Swap even and odd numbered bits. 6049 movq(rtmp1, src); 6050 mov64(rtmp2, 0x5555555555555555L); 6051 andq(rtmp1, rtmp2); 6052 shlq(rtmp1, 1); 6053 movq(dst, src); 6054 notq(rtmp2); 6055 andq(dst, rtmp2); 6056 shrq(dst, 1); 6057 orq(dst, rtmp1); 6058 6059 // Swap LSB and MSB 2 bits of each nibble. 6060 movq(rtmp1, dst); 6061 mov64(rtmp2, 0x3333333333333333L); 6062 andq(rtmp1, rtmp2); 6063 shlq(rtmp1, 2); 6064 notq(rtmp2); 6065 andq(dst, rtmp2); 6066 shrq(dst, 2); 6067 orq(dst, rtmp1); 6068 6069 // Swap LSB and MSB 4 bits of each byte. 
6070 movq(rtmp1, dst); 6071 mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL); 6072 andq(rtmp1, rtmp2); 6073 shlq(rtmp1, 4); 6074 notq(rtmp2); 6075 andq(dst, rtmp2); 6076 shrq(dst, 4); 6077 orq(dst, rtmp1); 6078 } 6079 bswapq(dst); 6080 } 6081 6082 void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) { 6083 Label done; 6084 Label neg_divisor_fastpath; 6085 cmpq(divisor, 0); 6086 jccb(Assembler::less, neg_divisor_fastpath); 6087 xorl(rdx, rdx); 6088 divq(divisor); 6089 jmpb(done); 6090 bind(neg_divisor_fastpath); 6091 // Fastpath for divisor < 0: 6092 // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1) 6093 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.divideUnsigned() 6094 movq(rdx, rax); 6095 subq(rdx, divisor); 6096 if (VM_Version::supports_bmi1()) { 6097 andnq(rax, rdx, rax); 6098 } else { 6099 notq(rdx); 6100 andq(rax, rdx); 6101 } 6102 shrq(rax, 63); 6103 bind(done); 6104 } 6105 6106 void C2_MacroAssembler::umodL(Register rax, Register divisor, Register rdx) { 6107 Label done; 6108 Label neg_divisor_fastpath; 6109 cmpq(divisor, 0); 6110 jccb(Assembler::less, neg_divisor_fastpath); 6111 xorq(rdx, rdx); 6112 divq(divisor); 6113 jmp(done); 6114 bind(neg_divisor_fastpath); 6115 // Fastpath when divisor < 0: 6116 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor) 6117 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in java.lang.Long.remainderUnsigned() 6118 movq(rdx, rax); 6119 subq(rax, divisor); 6120 if (VM_Version::supports_bmi1()) { 6121 andnq(rax, rax, rdx); 6122 } else { 6123 notq(rax); 6124 andq(rax, rdx); 6125 } 6126 sarq(rax, 63); 6127 andq(rax, divisor); 6128 subq(rdx, rax); 6129 bind(done); 6130 } 6131 6132 void C2_MacroAssembler::udivmodL(Register rax, Register divisor, Register rdx, Register tmp) { 6133 Label done; 6134 Label neg_divisor_fastpath; 6135 cmpq(divisor, 0); 6136 jccb(Assembler::less, neg_divisor_fastpath); 6137 xorq(rdx, rdx); 6138 divq(divisor); 6139 jmp(done); 6140 bind(neg_divisor_fastpath); 6141 // Fastpath for divisor < 0: 6142 // quotient = (dividend & ~(dividend - divisor)) >>> (Long.SIZE - 1) 6143 // remainder = dividend - (((dividend & ~(dividend - divisor)) >> (Long.SIZE - 1)) & divisor) 6144 // See Hacker's Delight (2nd ed), section 9.3 which is implemented in 6145 // java.lang.Long.divideUnsigned() and java.lang.Long.remainderUnsigned() 6146 movq(rdx, rax); 6147 subq(rax, divisor); 6148 if (VM_Version::supports_bmi1()) { 6149 andnq(rax, rax, rdx); 6150 } else { 6151 notq(rax); 6152 andq(rax, rdx); 6153 } 6154 movq(tmp, rax); 6155 shrq(rax, 63); // quotient 6156 sarq(tmp, 63); 6157 andq(tmp, divisor); 6158 subq(rdx, tmp); // remainder 6159 bind(done); 6160 } 6161 #endif 6162 6163 void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1, 6164 XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp, 6165 int vlen_enc) { 6166 assert(VM_Version::supports_avx512bw(), ""); 6167 // Byte shuffles are inlane operations and indices are determined using 6168 // lower 4 bit of each shuffle lane, thus all shuffle indices are 6169 // normalized to index range 0-15. This makes sure that all the multiples 6170 // of an index value are placed at same relative position in 128 bit 6171 // lane i.e. elements corresponding to shuffle indices 16, 32 and 64 6172 // will be 16th element in their respective 128 bit lanes. 
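  // A scalar sketch of the selection performed below for a 64 byte vector
  // (illustrative only):
  //
  //   for (int i = 0; i < 64; i++) {
  //     int idx = shuffle[i];                            // assumed in 0..63
  //     dst[i]  = src[(idx >> 4) * 16 + (idx & 0x0F)];   // == src[idx]
  //   }
  //
  // Each of the four rounds below broadcasts one 128 bit source lane and uses
  // a masked vpshufb that updates only the bytes whose index selects that lane.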
6173 movl(rtmp, 16);
6174 evpbroadcastb(xtmp1, rtmp, vlen_enc);
6175
6176 // Compute a mask for shuffle vector by comparing indices with expression INDEX < 16.
6177 // Broadcast first 128 bit lane across entire vector, shuffle the vector lanes using
6178 // original shuffle indices and move the shuffled lanes corresponding to true
6179 // mask to destination vector.
6180 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
6181 evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
6182 evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);
6183
6184 // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
6185 // and broadcasting second 128 bit lane.
6186 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
6187 vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
6188 evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
6189 evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
6190 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6191
6192 // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
6193 // and broadcasting third 128 bit lane.
6194 evpcmpb(ktmp, k0, shuffle, xtmp2, Assembler::nlt, true, vlen_enc);
6195 vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
6196 evpcmpb(ktmp, ktmp, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
6197 evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
6198 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6199
6200 // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
6201 // and broadcasting fourth 128 bit lane.
6202 evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
6203 vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
6204 evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
6205 evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
6206 evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
6207 }
6208
6209 void C2_MacroAssembler::vector_rearrange_int_float(BasicType bt, XMMRegister dst,
6210 XMMRegister shuffle, XMMRegister src, int vlen_enc) {
6211 if (vlen_enc == AVX_128bit) {
6212 vpermilps(dst, src, shuffle, vlen_enc);
6213 } else if (bt == T_INT) {
6214 vpermd(dst, shuffle, src, vlen_enc);
6215 } else {
6216 assert(bt == T_FLOAT, "");
6217 vpermps(dst, shuffle, src, vlen_enc);
6218 }
6219 }